Files
PureRL-1.5B-v7-s2-l2-maskon/trainer_state.json
ModelHub XC 40eab7c35a 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l2-maskon
Source: Original Platform
2026-06-04 16:48:33 +08:00

11528 lines
457 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.7654397487640381,
"adv/mean_abs_reasoning": 0.424932599067688,
"adv/mean_abs_step_conf": 0.7687661647796631,
"adv/ratio_final_to_reasoning": 1.801320375145213,
"adv/ratio_step_to_reasoning": 1.8091484778206095,
"adv/std_final_conf": 0.9287529587745667,
"adv/std_reasoning": 0.7013161778450012,
"adv/std_step_conf": 0.9334626793861389,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 5.33984375,
"calib/ece": 0.21378906250000007,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.3046875,
"calib/gap": 0.00016580667354659795,
"calib/mean_conf": 0.8817578125000001,
"calib/mu_c": 0.8818128654970762,
"calib/mu_w": 0.8816470588235296,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21378906250000007,
"calib/std_conf": 0.048946278921025696,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8082866741321388,
"calib/step_q_c_n": 893.0,
"calib/step_q_gap": 0.019531399870535426,
"calib/step_q_w": 0.7887552742616034,
"calib/step_q_w_n": 474.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1315.0,
"completions/max_terminated_length": 1315.0,
"completions/mean_length": 451.0703125,
"completions/mean_terminated_length": 452.8392333984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.0010666666666666667,
"grad_norm": 1.8525424003601074,
"kl": 0.00033098459243774414,
"learning_rate": 0.0,
"loss": 0.0618,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03566828370094299,
"mask/share_reasoning": 0.8323470950126648,
"mask/share_step_conf": 0.1280784010887146,
"num_tokens": 223058.0,
"reward": 0.40438759326934814,
"reward_std": 0.16610386967658997,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.7295855283737183,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.253622829914093,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7925335168838501,
"adv/mean_abs_reasoning": 0.4465929865837097,
"adv/mean_abs_step_conf": 0.7833997011184692,
"adv/ratio_final_to_reasoning": 1.7746215025597967,
"adv/ratio_step_to_reasoning": 1.754169287590521,
"adv/std_final_conf": 0.9316501617431641,
"adv/std_reasoning": 0.7013890147209167,
"adv/std_step_conf": 0.9330483675003052,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 4.7890625,
"calib/ece": 0.2887058823529413,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2549019607843137,
"calib/gap": -0.004035076611371591,
"calib/mean_conf": 0.8730196078431373,
"calib/mu_c": 0.8713422818791945,
"calib/mu_w": 0.8753773584905661,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2887058823529413,
"calib/std_conf": 0.04379877793804094,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7949142857142858,
"calib/step_q_c_n": 700.0,
"calib/step_q_gap": 0.006359152634437981,
"calib/step_q_w": 0.7885551330798478,
"calib/step_q_w_n": 526.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1720.0,
"completions/max_terminated_length": 1720.0,
"completions/mean_length": 490.6640625,
"completions/mean_terminated_length": 492.5882568359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0021333333333333334,
"grad_norm": 1.1731619834899902,
"kl": 0.0003865659236907959,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.022,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.033253151923418045,
"mask/share_reasoning": 0.8548593521118164,
"mask/share_step_conf": 0.10798123478889465,
"num_tokens": 451956.0,
"reward": 0.3687654733657837,
"reward_std": 0.16945478320121765,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6672624945640564,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.24535658955574036,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.7793192863464355,
"adv/mean_abs_reasoning": 0.5044786930084229,
"adv/mean_abs_step_conf": 0.7530263662338257,
"adv/ratio_final_to_reasoning": 1.5448011920959839,
"adv/ratio_step_to_reasoning": 1.492682201785781,
"adv/std_final_conf": 0.9305400252342224,
"adv/std_reasoning": 0.757487416267395,
"adv/std_step_conf": 0.9341217875480652,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 5.109375,
"calib/ece": 0.22257812500000002,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.34765625,
"calib/gap": 0.003928571428571503,
"calib/mean_conf": 0.878828125,
"calib/mu_c": 0.8801785714285715,
"calib/mu_w": 0.87625,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22257812500000002,
"calib/std_conf": 0.07099318952536486,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7881730769230769,
"calib/step_q_c_n": 832.0,
"calib/step_q_gap": 0.022878959276018018,
"calib/step_q_w": 0.7652941176470589,
"calib/step_q_w_n": 476.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1350.0,
"completions/max_terminated_length": 1350.0,
"completions/mean_length": 487.234375,
"completions/mean_terminated_length": 489.1451416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.0032,
"grad_norm": 1.6487758159637451,
"kl": 0.00039315223693847656,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0013,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.033384665846824646,
"mask/share_reasoning": 0.8496053218841553,
"mask/share_step_conf": 0.11310373246669769,
"num_tokens": 681944.0,
"reward": 0.40173280239105225,
"reward_std": 0.19228878617286682,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7216054797172546,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.24938993155956268,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.7682288885116577,
"adv/mean_abs_reasoning": 0.45164400339126587,
"adv/mean_abs_step_conf": 0.7649694681167603,
"adv/ratio_final_to_reasoning": 1.7009611170374153,
"adv/ratio_step_to_reasoning": 1.693744326001946,
"adv/std_final_conf": 0.930860161781311,
"adv/std_reasoning": 0.7206089496612549,
"adv/std_step_conf": 0.9333772659301758,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 4.8515625,
"calib/ece": 0.27234126984126983,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.23412698412698413,
"calib/gap": 0.006078947368421184,
"calib/mean_conf": 0.8741666666666666,
"calib/mu_c": 0.8765789473684211,
"calib/mu_w": 0.8704999999999999,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.27166666666666667,
"calib/std_conf": 0.050195056039351635,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.8030393487109906,
"calib/step_q_c_n": 737.0,
"calib/step_q_gap": 0.0183858833644559,
"calib/step_q_w": 0.7846534653465347,
"calib/step_q_w_n": 505.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2249.0,
"completions/max_terminated_length": 2249.0,
"completions/mean_length": 503.55078125,
"completions/mean_terminated_length": 507.5157470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.004266666666666667,
"grad_norm": 1.3600562810897827,
"kl": 0.0003751814365386963,
"learning_rate": 7.5e-07,
"loss": 0.0245,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03287990763783455,
"mask/share_reasoning": 0.8468308448791504,
"mask/share_step_conf": 0.11247673630714417,
"num_tokens": 917021.0,
"reward": 0.3587590754032135,
"reward_std": 0.17810939252376556,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6715430021286011,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.2673061192035675,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.776305079460144,
"adv/mean_abs_reasoning": 0.3951473832130432,
"adv/mean_abs_step_conf": 0.7590612173080444,
"adv/ratio_final_to_reasoning": 1.9645962808808484,
"adv/ratio_step_to_reasoning": 1.9209572163579216,
"adv/std_final_conf": 0.9308260083198547,
"adv/std_reasoning": 0.6612785458564758,
"adv/std_step_conf": 0.9333109855651855,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 4.8203125,
"calib/ece": 0.35124,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.368,
"calib/gap": -0.00788702525544649,
"calib/mean_conf": 0.88324,
"calib/mu_c": 0.879548872180451,
"calib/mu_w": 0.8874358974358975,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.35124,
"calib/std_conf": 0.04294534200585671,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.805139092240117,
"calib/step_q_c_n": 683.0,
"calib/step_q_gap": 0.0162280214597178,
"calib/step_q_w": 0.7889110707803992,
"calib/step_q_w_n": 551.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2471.0,
"completions/max_terminated_length": 2471.0,
"completions/mean_length": 512.76953125,
"completions/mean_terminated_length": 512.76953125,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.005333333333333333,
"grad_norm": 1.6314057111740112,
"kl": 0.0002815425395965576,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.035,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.033720463514328,
"mask/share_reasoning": 0.8549020290374756,
"mask/share_step_conf": 0.11137749999761581,
"num_tokens": 1154978.0,
"reward": 0.3144644796848297,
"reward_std": 0.14418430626392365,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6046000123023987,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": -0.27254605293273926,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.7703990936279297,
"adv/mean_abs_reasoning": 0.4255276918411255,
"adv/mean_abs_step_conf": 0.7566233277320862,
"adv/ratio_final_to_reasoning": 1.810455837303216,
"adv/ratio_step_to_reasoning": 1.7780824661690364,
"adv/std_final_conf": 0.9292229413986206,
"adv/std_reasoning": 0.7013127207756042,
"adv/std_step_conf": 0.9343317747116089,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 5.15625,
"calib/ece": 0.295275590551181,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.30708661417322836,
"calib/gap": 0.0041955896452540165,
"calib/mean_conf": 0.8818897637795275,
"calib/mu_c": 0.8836241610738255,
"calib/mu_w": 0.8794285714285714,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.295275590551181,
"calib/std_conf": 0.03836690428010434,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.799431968295905,
"calib/step_q_c_n": 757.0,
"calib/step_q_gap": -0.002575136499832098,
"calib/step_q_w": 0.8020071047957371,
"calib/step_q_w_n": 563.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2261.0,
"completions/max_terminated_length": 2261.0,
"completions/mean_length": 447.765625,
"completions/mean_terminated_length": 447.765625,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.0064,
"grad_norm": 1.4825108051300049,
"kl": 0.0004347562789916992,
"learning_rate": 1.25e-06,
"loss": 0.0162,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.036845672875642776,
"mask/share_reasoning": 0.8360493183135986,
"mask/share_step_conf": 0.12710505723953247,
"num_tokens": 1375558.0,
"reward": 0.34460651874542236,
"reward_std": 0.16137650609016418,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6606269478797913,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.28469520807266235,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7863481044769287,
"adv/mean_abs_reasoning": 0.469906747341156,
"adv/mean_abs_step_conf": 0.769094705581665,
"adv/ratio_final_to_reasoning": 1.6734130951008324,
"adv/ratio_step_to_reasoning": 1.6366964508030275,
"adv/std_final_conf": 0.9292763471603394,
"adv/std_reasoning": 0.7206797003746033,
"adv/std_step_conf": 0.9334774017333984,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 5.2578125,
"calib/ece": 0.25059288537549407,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.3438735177865613,
"calib/gap": 0.00951075268817192,
"calib/mean_conf": 0.88300395256917,
"calib/mu_c": 0.8865000000000001,
"calib/mu_w": 0.8769892473118281,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.25059288537549407,
"calib/std_conf": 0.045860977239795646,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7959679408138101,
"calib/step_q_c_n": 811.0,
"calib/step_q_gap": 0.051538034271754074,
"calib/step_q_w": 0.744429906542056,
"calib/step_q_w_n": 535.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2026.0,
"completions/max_terminated_length": 2026.0,
"completions/mean_length": 544.1171875,
"completions/mean_terminated_length": 546.2510375976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.007466666666666667,
"grad_norm": 2.106478452682495,
"kl": 0.00030663609504699707,
"learning_rate": 1.5e-06,
"loss": 0.0688,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.029625695198774338,
"mask/share_reasoning": 0.8616422414779663,
"mask/share_step_conf": 0.10482584685087204,
"num_tokens": 1622276.0,
"reward": 0.3923969268798828,
"reward_std": 0.17531530559062958,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6933324337005615,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.22885112464427948,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7503417730331421,
"adv/mean_abs_reasoning": 0.37843891978263855,
"adv/mean_abs_step_conf": 0.7550092935562134,
"adv/ratio_final_to_reasoning": 1.9827288732990542,
"adv/ratio_step_to_reasoning": 1.9950624898460843,
"adv/std_final_conf": 0.9293138980865479,
"adv/std_reasoning": 0.6815049052238464,
"adv/std_step_conf": 0.9338960647583008,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 4.83984375,
"calib/ece": 0.30968127490039843,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.32669322709163345,
"calib/gap": -0.007965991692627239,
"calib/mean_conf": 0.8807569721115538,
"calib/mu_c": 0.8773611111111111,
"calib/mu_w": 0.8853271028037384,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3083665338645418,
"calib/std_conf": 0.07351602734159984,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8087121212121212,
"calib/step_q_c_n": 660.0,
"calib/step_q_gap": 0.025499685978960573,
"calib/step_q_w": 0.7832124352331606,
"calib/step_q_w_n": 579.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2435.0,
"completions/max_terminated_length": 2435.0,
"completions/mean_length": 557.890625,
"completions/mean_terminated_length": 557.890625,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.008533333333333334,
"grad_norm": 1.6290005445480347,
"kl": 0.0003802478313446045,
"learning_rate": 1.75e-06,
"loss": 0.065,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03190384805202484,
"mask/share_reasoning": 0.8644559979438782,
"mask/share_step_conf": 0.10364013910293579,
"num_tokens": 1871608.0,
"reward": 0.3481142520904541,
"reward_std": 0.1533786803483963,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.637712836265564,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": -0.2492968589067459,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.7698661088943481,
"adv/mean_abs_reasoning": 0.47346532344818115,
"adv/mean_abs_step_conf": 0.7808018326759338,
"adv/ratio_final_to_reasoning": 1.626024274148573,
"adv/ratio_step_to_reasoning": 1.6491214752317325,
"adv/std_final_conf": 0.9296618700027466,
"adv/std_reasoning": 0.7394227385520935,
"adv/std_step_conf": 0.9349858164787292,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 5.1796875,
"calib/ece": 0.3006827309236948,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": 0.0015334485968878653,
"calib/mean_conf": 0.8842971887550201,
"calib/mu_c": 0.884931506849315,
"calib/mu_w": 0.8833980582524271,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2993172690763052,
"calib/std_conf": 0.04356143277983223,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7792553191489362,
"calib/step_q_c_n": 752.0,
"calib/step_q_gap": 0.03448179998517309,
"calib/step_q_w": 0.7447735191637631,
"calib/step_q_w_n": 574.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2802.0,
"completions/max_terminated_length": 2802.0,
"completions/mean_length": 526.42578125,
"completions/mean_terminated_length": 528.490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.0096,
"grad_norm": 1.390598177909851,
"kl": 0.0004108846187591553,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0238,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03303675353527069,
"mask/share_reasoning": 0.8570089340209961,
"mask/share_step_conf": 0.10604804754257202,
"num_tokens": 2113909.0,
"reward": 0.33846351504325867,
"reward_std": 0.20083707571029663,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6485316753387451,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": -0.27941715717315674,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.7514711618423462,
"adv/mean_abs_reasoning": 0.47852805256843567,
"adv/mean_abs_step_conf": 0.758353054523468,
"adv/ratio_final_to_reasoning": 1.5703805823063552,
"adv/ratio_step_to_reasoning": 1.5847619600420684,
"adv/std_final_conf": 0.9294180870056152,
"adv/std_reasoning": 0.7574200630187988,
"adv/std_step_conf": 0.9331981539726257,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 4.94921875,
"calib/ece": 0.30456692913385847,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.43700787401574803,
"calib/gap": 0.008992307692307744,
"calib/mean_conf": 0.8951181102362206,
"calib/mu_c": 0.8987999999999999,
"calib/mu_w": 0.8898076923076922,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30456692913385847,
"calib/std_conf": 0.04612318565208294,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954441260744987,
"calib/step_q_c_n": 698.0,
"calib/step_q_gap": 0.015725321153584826,
"calib/step_q_w": 0.7797188049209138,
"calib/step_q_w_n": 569.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1447.0,
"completions/max_terminated_length": 1447.0,
"completions/mean_length": 502.77734375,
"completions/mean_terminated_length": 504.7490539550781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.010666666666666666,
"grad_norm": 1.6552625894546509,
"kl": 0.0006910562515258789,
"learning_rate": 2.25e-06,
"loss": 0.0153,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03246258199214935,
"mask/share_reasoning": 0.8544189929962158,
"mask/share_step_conf": 0.10921218991279602,
"num_tokens": 2349420.0,
"reward": 0.3554859757423401,
"reward_std": 0.17813757061958313,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6623660326004028,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.2662378251552582,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7631397247314453,
"adv/mean_abs_reasoning": 0.448530912399292,
"adv/mean_abs_step_conf": 0.7707614898681641,
"adv/ratio_final_to_reasoning": 1.7014205791285166,
"adv/ratio_step_to_reasoning": 1.7184133101221246,
"adv/std_final_conf": 0.9275975227355957,
"adv/std_reasoning": 0.7205783724784851,
"adv/std_step_conf": 0.9346963763237,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 5.390625,
"calib/ece": 0.33494023904382475,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5537848605577689,
"calib/gap": -0.013346128822381287,
"calib/mean_conf": 0.9037051792828684,
"calib/mu_c": 0.8980689655172414,
"calib/mu_w": 0.9114150943396226,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.33047808764940245,
"calib/std_conf": 0.06001507723137061,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7798165137614679,
"calib/step_q_c_n": 763.0,
"calib/step_q_gap": -0.00830342140871032,
"calib/step_q_w": 0.7881199351701782,
"calib/step_q_w_n": 617.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2697.0,
"completions/max_terminated_length": 2697.0,
"completions/mean_length": 544.1875,
"completions/mean_terminated_length": 544.1875,
"completions/min_length": 176.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.011733333333333333,
"grad_norm": 1.134965419769287,
"kl": 0.0011256933212280273,
"learning_rate": 2.5e-06,
"loss": 0.0807,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03155206888914108,
"mask/share_reasoning": 0.8536930084228516,
"mask/share_step_conf": 0.11475491523742676,
"num_tokens": 2593212.0,
"reward": 0.3331599533557892,
"reward_std": 0.16903448104858398,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6271425485610962,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.27019768953323364,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.7283312678337097,
"adv/mean_abs_reasoning": 0.48717015981674194,
"adv/mean_abs_step_conf": 0.7630271315574646,
"adv/ratio_final_to_reasoning": 1.4950243834878658,
"adv/ratio_step_to_reasoning": 1.5662435725630062,
"adv/std_final_conf": 0.9263463020324707,
"adv/std_reasoning": 0.7575986981391907,
"adv/std_step_conf": 0.9330752491950989,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 5.65234375,
"calib/ece": 0.21960000000000007,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.592,
"calib/gap": -0.008522383545069512,
"calib/mean_conf": 0.9092,
"calib/mu_c": 0.906609195402299,
"calib/mu_w": 0.9151315789473685,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2164000000000001,
"calib/std_conf": 0.04393813833106724,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7855875831485588,
"calib/step_q_c_n": 902.0,
"calib/step_q_gap": 0.015312353790760569,
"calib/step_q_w": 0.7702752293577982,
"calib/step_q_w_n": 545.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2986.0,
"completions/max_terminated_length": 2986.0,
"completions/mean_length": 503.01171875,
"completions/mean_terminated_length": 503.01171875,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.0128,
"grad_norm": 1.2536349296569824,
"kl": 0.0021218061447143555,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0789,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03501136228442192,
"mask/share_reasoning": 0.839159369468689,
"mask/share_step_conf": 0.12582923471927643,
"num_tokens": 2826159.0,
"reward": 0.4238912761211395,
"reward_std": 0.18918515741825104,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7163012027740479,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.19898740947246552,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.737905740737915,
"adv/mean_abs_reasoning": 0.41767221689224243,
"adv/mean_abs_step_conf": 0.7694048285484314,
"adv/ratio_final_to_reasoning": 1.7667101398997085,
"adv/ratio_step_to_reasoning": 1.8421259481258108,
"adv/std_final_conf": 0.9262003898620605,
"adv/std_reasoning": 0.70124351978302,
"adv/std_step_conf": 0.9342942833900452,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 5.12109375,
"calib/ece": 0.3045098039215687,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.008605283605283565,
"calib/mean_conf": 0.9162745098039217,
"calib/mu_c": 0.9196153846153845,
"calib/mu_w": 0.9110101010101009,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3045098039215687,
"calib/std_conf": 0.03954450381054809,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7824812030075189,
"calib/step_q_c_n": 798.0,
"calib/step_q_gap": 0.018387635756056953,
"calib/step_q_w": 0.7640935672514619,
"calib/step_q_w_n": 513.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1323.0,
"completions/max_terminated_length": 1323.0,
"completions/mean_length": 482.05078125,
"completions/mean_terminated_length": 483.9411926269531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.013866666666666666,
"grad_norm": 1.1540279388427734,
"kl": 0.0023784637451171875,
"learning_rate": 3e-06,
"loss": 0.0281,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03376149386167526,
"mask/share_reasoning": 0.8471423983573914,
"mask/share_step_conf": 0.11518988013267517,
"num_tokens": 3054156.0,
"reward": 0.3762282729148865,
"reward_std": 0.17012429237365723,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6658226847648621,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.23445990681648254,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7835434079170227,
"adv/mean_abs_reasoning": 0.5537022352218628,
"adv/mean_abs_step_conf": 0.7725571393966675,
"adv/ratio_final_to_reasoning": 1.4150988709718049,
"adv/ratio_step_to_reasoning": 1.3952573969420798,
"adv/std_final_conf": 0.924622118473053,
"adv/std_reasoning": 0.7754649519920349,
"adv/std_step_conf": 0.9346968531608582,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 5.1953125,
"calib/ece": 0.37145161290322576,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.907258064516129,
"calib/gap": -0.0037807383840392506,
"calib/mean_conf": 0.94,
"calib/mu_c": 0.9383687943262411,
"calib/mu_w": 0.9421495327102803,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.37145161290322576,
"calib/std_conf": 0.027119805879978955,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7759945504087195,
"calib/step_q_c_n": 734.0,
"calib/step_q_gap": 0.027471060475833586,
"calib/step_q_w": 0.7485234899328859,
"calib/step_q_w_n": 596.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2941.0,
"completions/max_terminated_length": 2941.0,
"completions/mean_length": 547.07421875,
"completions/mean_terminated_length": 551.3818969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.9899579882621765,
"kl": 0.005403995513916016,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.017,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.031641483306884766,
"mask/share_reasoning": 0.8485899567604065,
"mask/share_step_conf": 0.11195607483386993,
"num_tokens": 3299607.0,
"reward": 0.32880502939224243,
"reward_std": 0.21657250821590424,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5910734534263611,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": -0.23658838868141174,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.7555526494979858,
"adv/mean_abs_reasoning": 0.4017358422279358,
"adv/mean_abs_step_conf": 0.7688524127006531,
"adv/ratio_final_to_reasoning": 1.8807200405815483,
"adv/ratio_step_to_reasoning": 1.9138257827252159,
"adv/std_final_conf": 0.906000554561615,
"adv/std_reasoning": 0.6815659999847412,
"adv/std_step_conf": 0.9339501857757568,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 4.83203125,
"calib/ece": 0.3578823529411764,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9176470588235294,
"calib/gap": 0.006999999999999784,
"calib/mean_conf": 0.9461176470588234,
"calib/mu_c": 0.9489999999999997,
"calib/mu_w": 0.942,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3578823529411764,
"calib/std_conf": 0.05374801302622813,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7532305630026809,
"calib/step_q_c_n": 746.0,
"calib/step_q_gap": -0.001677787302818201,
"calib/step_q_w": 0.7549083503054991,
"calib/step_q_w_n": 491.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2360.0,
"completions/max_terminated_length": 2360.0,
"completions/mean_length": 472.23046875,
"completions/mean_terminated_length": 474.0823669433594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.016,
"grad_norm": 1.7005994319915771,
"kl": 0.009075164794921875,
"learning_rate": 3.5e-06,
"loss": 0.0522,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.033927664160728455,
"mask/share_reasoning": 0.8490947484970093,
"mask/share_step_conf": 0.11307130753993988,
"num_tokens": 3528378.0,
"reward": 0.35110145807266235,
"reward_std": 0.18108385801315308,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6277461051940918,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.24194946885108948,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7394810914993286,
"adv/mean_abs_reasoning": 0.43682926893234253,
"adv/mean_abs_step_conf": 0.7458778619766235,
"adv/ratio_final_to_reasoning": 1.6928377837563393,
"adv/ratio_step_to_reasoning": 1.7074814235768332,
"adv/std_final_conf": 0.8953023552894592,
"adv/std_reasoning": 0.7014684677124023,
"adv/std_step_conf": 0.9345353841781616,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 6.390625,
"calib/ece": 0.3287351778656126,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9723320158102767,
"calib/gap": 0.0006128730095009693,
"calib/mean_conf": 0.957193675889328,
"calib/mu_c": 0.9574213836477988,
"calib/mu_w": 0.9568085106382979,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3287351778656126,
"calib/std_conf": 0.019470004281586368,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7212183692596064,
"calib/step_q_c_n": 1067.0,
"calib/step_q_gap": -0.011488133376597487,
"calib/step_q_w": 0.7327065026362038,
"calib/step_q_w_n": 569.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2635.0,
"completions/max_terminated_length": 2635.0,
"completions/mean_length": 650.8515625,
"completions/mean_terminated_length": 650.8515625,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.017066666666666667,
"grad_norm": 1.4386796951293945,
"kl": 0.010073661804199219,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0856,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.025310292840003967,
"mask/share_reasoning": 0.8630415201187134,
"mask/share_step_conf": 0.11164823174476624,
"num_tokens": 3803844.0,
"reward": 0.3730993866920471,
"reward_std": 0.18799588084220886,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6497913599014282,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.22390510141849518,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.7376053929328918,
"adv/mean_abs_reasoning": 0.44774770736694336,
"adv/mean_abs_step_conf": 0.7699299454689026,
"adv/ratio_final_to_reasoning": 1.647368329969808,
"adv/ratio_step_to_reasoning": 1.719562005122498,
"adv/std_final_conf": 0.8852930665016174,
"adv/std_reasoning": 0.7014374732971191,
"adv/std_step_conf": 0.9343675971031189,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 5.65625,
"calib/ece": 0.24218253968253975,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9801587301587301,
"calib/gap": 0.010055555555555484,
"calib/mean_conf": 0.9549603174603174,
"calib/mu_c": 0.9578333333333332,
"calib/mu_w": 0.9477777777777777,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.2414285714285715,
"calib/std_conf": 0.05771095200091226,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.7282984790874525,
"calib/step_q_c_n": 1052.0,
"calib/step_q_gap": 0.0283489841379575,
"calib/step_q_w": 0.699949494949495,
"calib/step_q_w_n": 396.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2638.0,
"completions/max_terminated_length": 2638.0,
"completions/mean_length": 528.109375,
"completions/mean_terminated_length": 532.2677001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.9054005146026611,
"kl": 0.016378402709960938,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0752,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.031545430421829224,
"mask/share_reasoning": 0.8398832082748413,
"mask/share_step_conf": 0.12075883150100708,
"num_tokens": 4042568.0,
"reward": 0.43897920846939087,
"reward_std": 0.18807503581047058,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7148038744926453,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": -0.17043927311897278,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.7131336331367493,
"adv/mean_abs_reasoning": 0.40515708923339844,
"adv/mean_abs_step_conf": 0.7440693378448486,
"adv/ratio_final_to_reasoning": 1.7601410714201648,
"adv/ratio_step_to_reasoning": 1.8364959113826917,
"adv/std_final_conf": 0.8951680660247803,
"adv/std_reasoning": 0.7013642191886902,
"adv/std_step_conf": 0.9341275691986084,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 5.1328125,
"calib/ece": 0.42048387096774187,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9959677419354839,
"calib/gap": -0.003303834808259798,
"calib/mean_conf": 0.9648387096774195,
"calib/mu_c": 0.9633333333333332,
"calib/mu_w": 0.966637168141593,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.42048387096774187,
"calib/std_conf": 0.015527492318140311,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6932142857142858,
"calib/step_q_c_n": 672.0,
"calib/step_q_gap": 0.002466622162883847,
"calib/step_q_w": 0.6907476635514019,
"calib/step_q_w_n": 642.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2797.0,
"completions/max_terminated_length": 2797.0,
"completions/mean_length": 552.58984375,
"completions/mean_terminated_length": 556.94091796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.0192,
"grad_norm": 1.1966159343719482,
"kl": 0.016684532165527344,
"learning_rate": 4.25e-06,
"loss": 0.0388,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.030978208407759666,
"mask/share_reasoning": 0.8578979969024658,
"mask/share_step_conf": 0.10331130772829056,
"num_tokens": 4294751.0,
"reward": 0.3016693890094757,
"reward_std": 0.1707407385110855,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.5470273494720459,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": -0.23978236317634583,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.7240345478057861,
"adv/mean_abs_reasoning": 0.41203486919403076,
"adv/mean_abs_step_conf": 0.7667274475097656,
"adv/ratio_final_to_reasoning": 1.7572166870780832,
"adv/ratio_step_to_reasoning": 1.860831460719668,
"adv/std_final_conf": 0.8740739822387695,
"adv/std_reasoning": 0.7013679146766663,
"adv/std_step_conf": 0.9340393543243408,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 4.6484375,
"calib/ece": 0.3597628458498022,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0004545454545452632,
"calib/mean_conf": 0.9684584980237153,
"calib/mu_c": 0.9686363636363635,
"calib/mu_w": 0.9681818181818183,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3597628458498022,
"calib/std_conf": 0.010576058843868474,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7163469187675069,
"calib/step_q_c_n": 714.0,
"calib/step_q_gap": 0.0026074229691875894,
"calib/step_q_w": 0.7137394957983193,
"calib/step_q_w_n": 476.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 958.0,
"completions/max_terminated_length": 958.0,
"completions/mean_length": 496.23828125,
"completions/mean_terminated_length": 498.1843566894531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.020266666666666665,
"grad_norm": 1.6525537967681885,
"kl": 0.0213470458984375,
"learning_rate": 4.5e-06,
"loss": -0.0181,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.030582614243030548,
"mask/share_reasoning": 0.8619511723518372,
"mask/share_step_conf": 0.1035599410533905,
"num_tokens": 4526548.0,
"reward": 0.3647327721118927,
"reward_std": 0.187033012509346,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6250780820846558,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.21358135342597961,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.705518364906311,
"adv/mean_abs_reasoning": 0.42651718854904175,
"adv/mean_abs_step_conf": 0.7572118043899536,
"adv/ratio_final_to_reasoning": 1.6541381774235089,
"adv/ratio_step_to_reasoning": 1.7753371369765745,
"adv/std_final_conf": 0.8713658452033997,
"adv/std_reasoning": 0.7204932570457458,
"adv/std_step_conf": 0.9350305199623108,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 5.42578125,
"calib/ece": 0.42270916334661346,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9960159362549801,
"calib/gap": -0.001246633320508006,
"calib/mean_conf": 0.970996015936255,
"calib/mu_c": 0.9704347826086958,
"calib/mu_w": 0.9716814159292038,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.42195219123505967,
"calib/std_conf": 0.015773672656279998,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6602646239554317,
"calib/step_q_c_n": 718.0,
"calib/step_q_gap": 0.00811857328479082,
"calib/step_q_w": 0.6521460506706409,
"calib/step_q_w_n": 671.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2399.0,
"completions/max_terminated_length": 2399.0,
"completions/mean_length": 500.640625,
"completions/mean_terminated_length": 502.60394287109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.9578653573989868,
"kl": 0.029819488525390625,
"learning_rate": 4.75e-06,
"loss": 0.0468,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03496409207582474,
"mask/share_reasoning": 0.8334662318229675,
"mask/share_step_conf": 0.12766346335411072,
"num_tokens": 4759584.0,
"reward": 0.3266947865486145,
"reward_std": 0.17774325609207153,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5629937648773193,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.2135103940963745,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.7249806523323059,
"adv/mean_abs_reasoning": 0.5042846202850342,
"adv/mean_abs_step_conf": 0.7544502019882202,
"adv/ratio_final_to_reasoning": 1.43764180617392,
"adv/ratio_step_to_reasoning": 1.4960801334012255,
"adv/std_final_conf": 0.8839868903160095,
"adv/std_reasoning": 0.757595956325531,
"adv/std_step_conf": 0.9354380965232849,
"calib/answer_extract_rate": 0.98828125,
"calib/avg_num_step_conf": 5.6796875,
"calib/ece": 0.35011857707509875,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0021065552016986677,
"calib/mean_conf": 0.9706719367588932,
"calib/mu_c": 0.9698726114649681,
"calib/mu_w": 0.9719791666666667,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.35011857707509875,
"calib/std_conf": 0.01113797242974695,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6385108958837773,
"calib/step_q_c_n": 826.0,
"calib/step_q_gap": -0.01804961367036284,
"calib/step_q_w": 0.6565605095541401,
"calib/step_q_w_n": 628.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2919.0,
"completions/max_terminated_length": 2919.0,
"completions/mean_length": 536.6796875,
"completions/mean_terminated_length": 536.6796875,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.0224,
"grad_norm": 0.8330198526382446,
"kl": 0.030071258544921875,
"learning_rate": 5e-06,
"loss": 0.0666,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03229574114084244,
"mask/share_reasoning": 0.84485924243927,
"mask/share_step_conf": 0.12284499406814575,
"num_tokens": 4999934.0,
"reward": 0.3799642324447632,
"reward_std": 0.2192906141281128,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6333242058753967,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": -0.19370830059051514,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.699131965637207,
"adv/mean_abs_reasoning": 0.37140166759490967,
"adv/mean_abs_step_conf": 0.7832655906677246,
"adv/ratio_final_to_reasoning": 1.8824147187183744,
"adv/ratio_step_to_reasoning": 2.108944732908517,
"adv/std_final_conf": 0.8496929407119751,
"adv/std_reasoning": 0.6402788758277893,
"adv/std_step_conf": 0.9349262714385986,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 5.4765625,
"calib/ece": 0.3171764705882353,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00029667936853572385,
"calib/mean_conf": 0.972078431372549,
"calib/mu_c": 0.9719760479041916,
"calib/mu_w": 0.9722727272727273,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3171764705882353,
"calib/std_conf": 0.011681799380022513,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6708108108108108,
"calib/step_q_c_n": 888.0,
"calib/step_q_gap": 0.033048164896413934,
"calib/step_q_w": 0.6377626459143969,
"calib/step_q_w_n": 514.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2611.0,
"completions/max_terminated_length": 2611.0,
"completions/mean_length": 501.46875,
"completions/mean_terminated_length": 501.46875,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.023466666666666667,
"grad_norm": 1.0240269899368286,
"kl": 0.044139862060546875,
"learning_rate": 4.9722222222222224e-06,
"loss": 0.0309,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03162510693073273,
"mask/share_reasoning": 0.8491679430007935,
"mask/share_step_conf": 0.1192069873213768,
"num_tokens": 5230126.0,
"reward": 0.4074317514896393,
"reward_std": 0.1798522025346756,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.6704937219619751,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": -0.185317724943161,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.7653679847717285,
"adv/mean_abs_reasoning": 0.5065703392028809,
"adv/mean_abs_step_conf": 0.7755952477455139,
"adv/ratio_final_to_reasoning": 1.5108819556551247,
"adv/ratio_step_to_reasoning": 1.5310711814788842,
"adv/std_final_conf": 0.88029944896698,
"adv/std_reasoning": 0.7393872737884521,
"adv/std_step_conf": 0.9353004693984985,
"calib/answer_extract_rate": 0.9921875,
"calib/avg_num_step_conf": 5.5234375,
"calib/ece": 0.41898039215686267,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 4.362457933226871e-06,
"calib/mean_conf": 0.9758431372549019,
"calib/mu_c": 0.975845070422535,
"calib/mu_w": 0.9758407079646018,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41898039215686267,
"calib/std_conf": 0.011747961940927244,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6223529411764707,
"calib/step_q_c_n": 765.0,
"calib/step_q_gap": 0.018685760899120885,
"calib/step_q_w": 0.6036671802773498,
"calib/step_q_w_n": 649.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1440.0,
"completions/max_terminated_length": 1440.0,
"completions/mean_length": 524.51171875,
"completions/mean_terminated_length": 526.5686645507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.024533333333333334,
"grad_norm": 1.0188655853271484,
"kl": 0.0431671142578125,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0684,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0323941633105278,
"mask/share_reasoning": 0.841258704662323,
"mask/share_step_conf": 0.1224408820271492,
"num_tokens": 5468337.0,
"reward": 0.3318406045436859,
"reward_std": 0.2227940410375595,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5750659704208374,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.2207598090171814,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7638611197471619,
"adv/mean_abs_reasoning": 0.6413958072662354,
"adv/mean_abs_step_conf": 0.7876379489898682,
"adv/ratio_final_to_reasoning": 1.1909356299083083,
"adv/ratio_step_to_reasoning": 1.2280060768512782,
"adv/std_final_conf": 0.9069669842720032,
"adv/std_reasoning": 0.8429003357887268,
"adv/std_step_conf": 0.9353696703910828,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 5.875,
"calib/ece": 0.4378968253968255,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0004918864097362974,
"calib/mean_conf": 0.9775793650793652,
"calib/mu_c": 0.9773529411764708,
"calib/mu_w": 0.977844827586207,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.4378968253968255,
"calib/std_conf": 0.011551027697426637,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6079061371841155,
"calib/step_q_c_n": 831.0,
"calib/step_q_gap": 0.04093734075023725,
"calib/step_q_w": 0.5669687964338782,
"calib/step_q_w_n": 673.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2887.0,
"completions/max_terminated_length": 2887.0,
"completions/mean_length": 565.125,
"completions/mean_terminated_length": 567.3411865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.0256,
"grad_norm": 1.043018102645874,
"kl": 0.03503227233886719,
"learning_rate": 4.9166666666666665e-06,
"loss": 0.0101,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.030982669442892075,
"mask/share_reasoning": 0.84433513879776,
"mask/share_step_conf": 0.12077593803405762,
"num_tokens": 5717521.0,
"reward": 0.32456642389297485,
"reward_std": 0.24878433346748352,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.5507019758224487,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": -0.20469412207603455,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.6897413730621338,
"adv/mean_abs_reasoning": 0.4197363257408142,
"adv/mean_abs_step_conf": 0.7734383344650269,
"adv/ratio_final_to_reasoning": 1.6432730044147925,
"adv/ratio_step_to_reasoning": 1.8426766687394658,
"adv/std_final_conf": 0.8677418231964111,
"adv/std_reasoning": 0.7012581825256348,
"adv/std_step_conf": 0.9352645874023438,
"calib/answer_extract_rate": 1.0,
"calib/avg_num_step_conf": 5.62890625,
"calib/ece": 0.38632812500000013,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0030465587044534725,
"calib/mean_conf": 0.9800781250000001,
"calib/mu_c": 0.9813157894736844,
"calib/mu_w": 0.9782692307692309,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.38632812500000013,
"calib/std_conf": 0.010532397470869356,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6058703481392557,
"calib/step_q_c_n": 833.0,
"calib/step_q_gap": 0.05295916392872935,
"calib/step_q_w": 0.5529111842105263,
"calib/step_q_w_n": 608.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1054.0,
"completions/max_terminated_length": 1054.0,
"completions/mean_length": 486.05859375,
"completions/mean_terminated_length": 487.9647216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.9559767842292786,
"kl": 0.03623199462890625,
"learning_rate": 4.888888888888889e-06,
"loss": 0.0248,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.032504092901945114,
"mask/share_reasoning": 0.8402522206306458,
"mask/share_step_conf": 0.12333747744560242,
"num_tokens": 5945176.0,
"reward": 0.3616005778312683,
"reward_std": 0.18630656599998474,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6108984351158142,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": -0.2064473032951355,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.6565980911254883,
"adv/mean_abs_reasoning": 0.3444192707538605,
"adv/mean_abs_step_conf": 0.767449140548706,
"adv/ratio_final_to_reasoning": 1.906391851095715,
"adv/ratio_step_to_reasoning": 2.228241000768985,
"adv/std_final_conf": 0.8418591022491455,
"adv/std_reasoning": 0.6611375212669373,
"adv/std_step_conf": 0.9351149201393127,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 4.921875,
"calib/ece": 0.33865079365079376,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 9.876543209874633e-05,
"calib/mean_conf": 0.9815079365079367,
"calib/mu_c": 0.9815432098765432,
"calib/mu_w": 0.9814444444444445,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.33865079365079376,
"calib/std_conf": 0.010842850055649682,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6031436314363143,
"calib/step_q_c_n": 738.0,
"calib/step_q_gap": 0.025001868984206976,
"calib/step_q_w": 0.5781417624521074,
"calib/step_q_w_n": 522.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1885.0,
"completions/max_terminated_length": 1885.0,
"completions/mean_length": 483.078125,
"completions/mean_terminated_length": 484.9725646972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 225.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.9195064306259155,
"kl": 0.03989410400390625,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0103,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.031222233548760414,
"mask/share_reasoning": 0.8557568788528442,
"mask/share_step_conf": 0.10911465436220169,
"num_tokens": 6174084.0,
"reward": 0.387053519487381,
"reward_std": 0.143305242061615,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.641502320766449,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.1900515854358673,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.7401793003082275,
"adv/mean_abs_reasoning": 0.5729160308837891,
"adv/mean_abs_step_conf": 0.7543942928314209,
"adv/ratio_final_to_reasoning": 1.291950757891022,
"adv/ratio_step_to_reasoning": 1.3167624087384686,
"adv/std_final_conf": 0.9016159176826477,
"adv/std_reasoning": 0.8098499774932861,
"adv/std_step_conf": 0.9356675148010254,
"calib/answer_extract_rate": 0.99609375,
"calib/avg_num_step_conf": 5.6875,
"calib/ece": 0.47172549019607846,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9921568627450981,
"calib/gap": -0.0069422556020685144,
"calib/mean_conf": 0.9763529411764706,
"calib/mu_c": 0.9729770992366412,
"calib/mu_w": 0.9799193548387097,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4671764705882353,
"calib/std_conf": 0.06286331999786045,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5627042253521127,
"calib/step_q_c_n": 710.0,
"calib/step_q_gap": -0.017269903334214298,
"calib/step_q_w": 0.579974128686327,
"calib/step_q_w_n": 746.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1910.0,
"completions/max_terminated_length": 1910.0,
"completions/mean_length": 512.48828125,
"completions/mean_terminated_length": 512.48828125,
"completions/min_length": 208.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.0288,
"grad_norm": 0.8841757774353027,
"kl": 0.05023193359375,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0316,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03174913674592972,
"mask/share_reasoning": 0.84727942943573,
"mask/share_step_conf": 0.1209714263677597,
"num_tokens": 6410497.0,
"reward": 0.3019501566886902,
"reward_std": 0.2588861584663391,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5266007781028748,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": -0.22348174452781677,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.7030376195907593,
"adv/mean_abs_reasoning": 0.42762160301208496,
"adv/mean_abs_step_conf": 0.7977121472358704,
"adv/ratio_final_to_reasoning": 1.644064786808469,
"adv/ratio_step_to_reasoning": 1.865462693224426,
"adv/std_final_conf": 0.850563108921051,
"adv/std_reasoning": 0.7014147043228149,
"adv/std_step_conf": 0.9355003237724304,
"calib/answer_extract_rate": 0.96875,
"calib/avg_num_step_conf": 4.77734375,
"calib/ece": 0.3548790322580647,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9959677419354839,
"calib/gap": -0.0013978494623654303,
"calib/mean_conf": 0.9798790322580647,
"calib/mu_c": 0.9793548387096775,
"calib/mu_w": 0.980752688172043,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.3548790322580647,
"calib/std_conf": 0.012296139403274755,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5986900662251655,
"calib/step_q_c_n": 755.0,
"calib/step_q_gap": 0.023647331182430453,
"calib/step_q_w": 0.5750427350427351,
"calib/step_q_w_n": 468.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2213.0,
"completions/max_terminated_length": 2213.0,
"completions/mean_length": 557.98046875,
"completions/mean_terminated_length": 560.1686401367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.7727410197257996,
"kl": 0.034938812255859375,
"learning_rate": 4.805555555555556e-06,
"loss": 0.0384,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.02986774779856205,
"mask/share_reasoning": 0.8633028268814087,
"mask/share_step_conf": 0.10292316228151321,
"num_tokens": 6660284.0,
"reward": 0.3603275418281555,
"reward_std": 0.18742801249027252,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6148539185523987,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": -0.20748008787631989,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.7449028491973877,
"adv/mean_abs_reasoning": 0.4773523807525635,
"adv/mean_abs_step_conf": 0.7774635553359985,
"adv/ratio_final_to_reasoning": 1.5604883922921284,
"adv/ratio_step_to_reasoning": 1.6286994402548047,
"adv/std_final_conf": 0.8901979923248291,
"adv/std_reasoning": 0.7394049167633057,
"adv/std_step_conf": 0.935321033000946,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 5.66796875,
"calib/ece": 0.4923015873015873,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.996031746031746,
"calib/gap": -0.0033163168840989465,
"calib/mean_conf": 0.9803968253968254,
"calib/mu_c": 0.9786991869918701,
"calib/mu_w": 0.982015503875969,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.4923015873015873,
"calib/std_conf": 0.012176010348925173,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5727431059506531,
"calib/step_q_c_n": 689.0,
"calib/step_q_gap": 0.04853050752545629,
"calib/step_q_w": 0.5242125984251969,
"calib/step_q_w_n": 762.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2197.0,
"completions/max_terminated_length": 2197.0,
"completions/mean_length": 585.17578125,
"completions/mean_terminated_length": 585.17578125,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.8804644346237183,
"kl": 0.03813934326171875,
"learning_rate": 4.777777777777778e-06,
"loss": -0.0372,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.02827608771622181,
"mask/share_reasoning": 0.8626615405082703,
"mask/share_step_conf": 0.10906237363815308,
"num_tokens": 6917217.0,
"reward": 0.2944219708442688,
"reward_std": 0.21288272738456726,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.4901074171066284,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.19188852608203888,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7416458129882812,
"adv/mean_abs_reasoning": 0.5515131950378418,
"adv/mean_abs_step_conf": 0.760871946811676,
"adv/ratio_final_to_reasoning": 1.3447471786008558,
"adv/ratio_step_to_reasoning": 1.379607874584885,
"adv/std_final_conf": 0.8862450122833252,
"adv/std_reasoning": 0.7928551435470581,
"adv/std_step_conf": 0.9354478716850281,
"calib/answer_extract_rate": 0.98046875,
"calib/avg_num_step_conf": 5.1328125,
"calib/ece": 0.4210714285714285,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9920634920634921,
"calib/gap": 0.00931240848029502,
"calib/mean_conf": 0.9726587301587302,
"calib/mu_c": 0.9768345323741006,
"calib/mu_w": 0.9675221238938055,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.4210714285714285,
"calib/std_conf": 0.057172762787333534,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5627169274537696,
"calib/step_q_c_n": 703.0,
"calib/step_q_gap": 0.011031166406306414,
"calib/step_q_w": 0.5516857610474631,
"calib/step_q_w_n": 611.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2822.0,
"completions/max_terminated_length": 2822.0,
"completions/mean_length": 593.671875,
"completions/mean_terminated_length": 596.0000610351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.032,
"grad_norm": 0.8357536196708679,
"kl": 0.035503387451171875,
"learning_rate": 4.75e-06,
"loss": -0.0279,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.027449211105704308,
"mask/share_reasoning": 0.8674349784851074,
"mask/share_step_conf": 0.1012095957994461,
"num_tokens": 7176181.0,
"reward": 0.31889694929122925,
"reward_std": 0.22138473391532898,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5547477006912231,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": -0.21617251634597778,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.7557934522628784,
"adv/mean_abs_reasoning": 0.476267009973526,
"adv/mean_abs_step_conf": 0.7879382371902466,
"adv/ratio_final_to_reasoning": 1.586911199885313,
"adv/ratio_step_to_reasoning": 1.6544044006618164,
"adv/std_final_conf": 0.8960928916931152,
"adv/std_reasoning": 0.7394196391105652,
"adv/std_step_conf": 0.9349035024642944,
"calib/answer_extract_rate": 0.96484375,
"calib/avg_num_step_conf": 5.90625,
"calib/ece": 0.5464081632653063,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9959183673469387,
"calib/gap": 0.0024523809523813167,
"calib/mean_conf": 0.9749795918367347,
"calib/mu_c": 0.9763809523809525,
"calib/mu_w": 0.9739285714285711,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.5464081632653063,
"calib/std_conf": 0.014863308879714873,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5909352517985611,
"calib/step_q_c_n": 556.0,
"calib/step_q_gap": 0.03200219740525567,
"calib/step_q_w": 0.5589330543933054,
"calib/step_q_w_n": 956.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2991.0,
"completions/max_terminated_length": 2991.0,
"completions/mean_length": 618.41015625,
"completions/mean_terminated_length": 623.279541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.7795594930648804,
"kl": 0.03357505798339844,
"learning_rate": 4.722222222222222e-06,
"loss": 0.0062,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.02844122424721718,
"mask/share_reasoning": 0.8597810864448547,
"mask/share_step_conf": 0.10396520793437958,
"num_tokens": 7440406.0,
"reward": 0.2383827120065689,
"reward_std": 0.2029481828212738,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.4378613233566284,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": -0.23609593510627747,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.6904056668281555,
"adv/mean_abs_reasoning": 0.43361717462539673,
"adv/mean_abs_step_conf": 0.7783961892127991,
"adv/ratio_final_to_reasoning": 1.592200925677354,
"adv/ratio_step_to_reasoning": 1.7951230596095693,
"adv/std_final_conf": 0.8829302191734314,
"adv/std_reasoning": 0.7205833196640015,
"adv/std_step_conf": 0.9355403184890747,
"calib/answer_extract_rate": 0.9765625,
"calib/avg_num_step_conf": 4.8671875,
"calib/ece": 0.47612000000000015,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.972,
"calib/gap": 0.007701811663785918,
"calib/mean_conf": 0.9681200000000001,
"calib/mu_c": 0.9720325203252034,
"calib/mu_w": 0.9643307086614175,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.47612000000000015,
"calib/std_conf": 0.060432322477296865,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5733505154639176,
"calib/step_q_c_n": 582.0,
"calib/step_q_gap": 0.02332039498198979,
"calib/step_q_w": 0.5500301204819278,
"calib/step_q_w_n": 664.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2556.0,
"completions/max_terminated_length": 2556.0,
"completions/mean_length": 514.97265625,
"completions/mean_terminated_length": 523.1468505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.8769909739494324,
"kl": 0.0423736572265625,
"learning_rate": 4.694444444444445e-06,
"loss": 0.0049,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.030418388545513153,
"mask/share_reasoning": 0.8499599695205688,
"mask/share_step_conf": 0.1039966493844986,
"num_tokens": 7678943.0,
"reward": 0.29528889060020447,
"reward_std": 0.19717194139957428,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.5112226009368896,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": -0.21126985549926758,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.686201274394989,
"adv/mean_abs_reasoning": 0.5028985738754272,
"adv/mean_abs_step_conf": 0.7896302938461304,
"adv/ratio_final_to_reasoning": 1.3644923848302015,
"adv/ratio_step_to_reasoning": 1.5701581489108165,
"adv/std_final_conf": 0.8819124698638916,
"adv/std_reasoning": 0.775467574596405,
"adv/std_step_conf": 0.9355705380439758,
"calib/answer_extract_rate": 0.9609375,
"calib/avg_num_step_conf": 5.3125,
"calib/ece": 0.46453441295546577,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9757085020242915,
"calib/gap": 0.026363278688524838,
"calib/mean_conf": 0.9584615384615386,
"calib/mu_c": 0.9718032786885249,
"calib/mu_w": 0.9454400000000001,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.46453441295546577,
"calib/std_conf": 0.10703239103764979,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5873088685015291,
"calib/step_q_c_n": 654.0,
"calib/step_q_gap": 0.037124732524191906,
"calib/step_q_w": 0.5501841359773372,
"calib/step_q_w_n": 706.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2995.0,
"completions/max_terminated_length": 2995.0,
"completions/mean_length": 562.0234375,
"completions/mean_terminated_length": 562.0234375,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.0352,
"grad_norm": 0.8330094814300537,
"kl": 0.039539337158203125,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0501,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.029015641659498215,
"mask/share_reasoning": 0.8654122948646545,
"mask/share_step_conf": 0.10557205975055695,
"num_tokens": 7929693.0,
"reward": 0.29371070861816406,
"reward_std": 0.21325403451919556,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.512758195400238,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": -0.21127429604530334,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7370667457580566,
"adv/mean_abs_reasoning": 0.5098717212677002,
"adv/mean_abs_step_conf": 0.7471521496772766,
"adv/ratio_final_to_reasoning": 1.44559251869368,
"adv/ratio_step_to_reasoning": 1.4653727957683615,
"adv/std_final_conf": 0.9055343270301819,
"adv/std_reasoning": 0.757660984992981,
"adv/std_step_conf": 0.9352141618728638,
"calib/answer_extract_rate": 0.984375,
"calib/avg_num_step_conf": 5.61328125,
"calib/ece": 0.3965476190476191,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9682539682539683,
"calib/gap": -0.0023428939735739007,
"calib/mean_conf": 0.9599603174603174,
"calib/mu_c": 0.9589655172413794,
"calib/mu_w": 0.9613084112149533,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3905555555555556,
"calib/std_conf": 0.071727537913533,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48412466843501323,
"calib/step_q_c_n": 754.0,
"calib/step_q_gap": 0.010698606941601796,
"calib/step_q_w": 0.47342606149341143,
"calib/step_q_w_n": 683.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2751.0,
"completions/max_terminated_length": 2751.0,
"completions/mean_length": 488.24609375,
"completions/mean_terminated_length": 490.16082763671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.707058310508728,
"kl": 0.05702972412109375,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0112,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.033386290073394775,
"mask/share_reasoning": 0.8376938104629517,
"mask/share_step_conf": 0.12501364946365356,
"num_tokens": 8159796.0,
"reward": 0.36031782627105713,
"reward_std": 0.20753872394561768,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5918011665344238,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": -0.180540531873703,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.6628189086914062,
"adv/mean_abs_reasoning": 0.46881189942359924,
"adv/mean_abs_step_conf": 0.7591454982757568,
"adv/ratio_final_to_reasoning": 1.4138269730489716,
"adv/ratio_step_to_reasoning": 1.6192965648037532,
"adv/std_final_conf": 0.8670271039009094,
"adv/std_reasoning": 0.72071373462677,
"adv/std_step_conf": 0.9348909854888916,
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 5.55859375,
"calib/ece": 0.4212826446280994,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.9380165289256198,
"calib/gap": 0.0023757575757575866,
"calib/mean_conf": 0.9474776859504134,
"calib/mu_c": 0.9485575757575757,
"calib/mu_w": 0.9461818181818181,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.4116528925619837,
"calib/std_conf": 0.11901983547972321,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.49606580829756797,
"calib/step_q_c_n": 699.0,
"calib/step_q_gap": 0.07246083592187746,
"calib/step_q_w": 0.4236049723756905,
"calib/step_q_w_n": 724.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2929.0,
"completions/max_terminated_length": 2929.0,
"completions/mean_length": 633.07421875,
"completions/mean_terminated_length": 633.07421875,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.7305748462677002,
"kl": 0.0447235107421875,
"learning_rate": 4.611111111111112e-06,
"loss": -0.0196,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.027223873883485794,
"mask/share_reasoning": 0.8737444877624512,
"mask/share_step_conf": 0.09903167188167572,
"num_tokens": 8431119.0,
"reward": 0.3382720947265625,
"reward_std": 0.18578404188156128,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5458762645721436,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": -0.16230084002017975,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.701039731502533,
"adv/mean_abs_reasoning": 0.5507223606109619,
"adv/mean_abs_step_conf": 0.7414873242378235,
"adv/ratio_final_to_reasoning": 1.2729458283204835,
"adv/ratio_step_to_reasoning": 1.3463904451150852,
"adv/std_final_conf": 0.9193742871284485,
"adv/std_reasoning": 0.8100778460502625,
"adv/std_step_conf": 0.9348658919334412,
"calib/answer_extract_rate": 0.94140625,
"calib/avg_num_step_conf": 5.28515625,
"calib/ece": 0.24663865546218486,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.9369747899159664,
"calib/gap": -0.0024999999999999467,
"calib/mean_conf": 0.9580672268907563,
"calib/mu_c": 0.9573529411764706,
"calib/mu_w": 0.9598529411764706,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.24521008403361344,
"calib/std_conf": 0.04645006917027477,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.445568669527897,
"calib/step_q_c_n": 932.0,
"calib/step_q_gap": -0.018136793655000816,
"calib/step_q_w": 0.4637054631828978,
"calib/step_q_w_n": 421.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3035.0,
"completions/max_terminated_length": 3035.0,
"completions/mean_length": 521.45703125,
"completions/mean_terminated_length": 521.45703125,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.0384,
"grad_norm": 1.119794249534607,
"kl": 0.064483642578125,
"learning_rate": 4.583333333333333e-06,
"loss": 0.0233,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.03289055451750755,
"mask/share_reasoning": 0.8433002233505249,
"mask/share_step_conf": 0.12380918860435486,
"num_tokens": 8667324.0,
"reward": 0.42279964685440063,
"reward_std": 0.19623100757598877,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.6815191507339478,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l2_reward": -0.15466980636119843,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.6451778411865234,
"adv/mean_abs_reasoning": 0.47128838300704956,
"adv/mean_abs_step_conf": 0.7719460725784302,
"adv/ratio_final_to_reasoning": 1.3689661456749143,
"adv/ratio_step_to_reasoning": 1.6379484417864028,
"adv/std_final_conf": 0.8708301782608032,
"adv/std_reasoning": 0.7394452691078186,
"adv/std_step_conf": 0.9352310299873352,
"calib/answer_extract_rate": 0.9296875,
"calib/avg_num_step_conf": 5.203125,
"calib/ece": 0.5019915254237289,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.9449152542372882,
"calib/gap": -0.014523809523809605,
"calib/mean_conf": 0.9527542372881357,
"calib/mu_c": 0.945,
"calib/mu_w": 0.9595238095238096,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.49432203389830515,
"calib/std_conf": 0.0742336481384533,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.47167597765363134,
"calib/step_q_c_n": 537.0,
"calib/step_q_gap": 0.034392958785706806,
"calib/step_q_w": 0.43728301886792453,
"calib/step_q_w_n": 795.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2603.0,
"completions/max_terminated_length": 2603.0,
"completions/mean_length": 564.5625,
"completions/mean_terminated_length": 571.2569580078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.039466666666666664,
"grad_norm": 1.040130615234375,
"kl": 0.05255126953125,
"learning_rate": 4.555555555555556e-06,
"loss": -0.0745,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.028845518827438354,
"mask/share_reasoning": 0.8554621934890747,
"mask/share_step_conf": 0.10397352278232574,
"num_tokens": 8918948.0,
"reward": 0.2830325961112976,
"reward_std": 0.15572383999824524,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.46216249465942383,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l2_reward": -0.16640979051589966,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.7251741290092468,
"adv/mean_abs_reasoning": 0.5883480906486511,
"adv/mean_abs_step_conf": 0.7738329172134399,
"adv/ratio_final_to_reasoning": 1.2325596709419855,
"adv/ratio_step_to_reasoning": 1.315263752042252,
"adv/std_final_conf": 0.9110890626907349,
"adv/std_reasoning": 0.8268707394599915,
"adv/std_step_conf": 0.934368908405304,
"calib/answer_extract_rate": 0.91015625,
"calib/avg_num_step_conf": 5.234375,
"calib/ece": 0.40309012875536493,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.9699570815450643,
"calib/gap": 0.0037602688573562526,
"calib/mean_conf": 0.961030042918455,
"calib/mu_c": 0.9626923076923075,
"calib/mu_w": 0.9589320388349513,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.40309012875536493,
"calib/std_conf": 0.04745690566931552,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.48682027649769577,
"calib/step_q_c_n": 651.0,
"calib/step_q_gap": 0.02317731568492365,
"calib/step_q_w": 0.4636429608127721,
"calib/step_q_w_n": 689.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3053.0,
"completions/max_terminated_length": 3053.0,
"completions/mean_length": 553.5859375,
"completions/mean_terminated_length": 555.7568969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.9234789609909058,
"kl": 0.050746917724609375,
"learning_rate": 4.527777777777778e-06,
"loss": -0.1114,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.029032420367002487,
"mask/share_reasoning": 0.8594563007354736,
"mask/share_step_conf": 0.10760502517223358,
"num_tokens": 9167554.0,
"reward": 0.32611435651779175,
"reward_std": 0.21093352138996124,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5374273657798767,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l2_reward": -0.16879235208034515,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.7383555173873901,
"adv/mean_abs_reasoning": 0.6348795890808105,
"adv/mean_abs_step_conf": 0.7644864916801453,
"adv/ratio_final_to_reasoning": 1.1629851236143751,
"adv/ratio_step_to_reasoning": 1.2041440689359408,
"adv/std_final_conf": 0.9126340746879578,
"adv/std_reasoning": 0.8593624234199524,
"adv/std_step_conf": 0.9349706172943115,
"calib/answer_extract_rate": 0.8828125,
"calib/avg_num_step_conf": 5.8203125,
"calib/ece": 0.44205357142857155,
"calib/final_conf_rate": 0.875,
"calib/format_rate": 0.875,
"calib/frac_conf_gt_0.9": 0.9151785714285714,
"calib/gap": 0.02399266523160315,
"calib/mean_conf": 0.9465178571428573,
"calib/mu_c": 0.9584070796460177,
"calib/mu_w": 0.9344144144144145,
"calib/nonempty_final_conf_rate": 0.875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.44205357142857155,
"calib/std_conf": 0.07916088749940765,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4808552631578948,
"calib/step_q_c_n": 608.0,
"calib/step_q_gap": 0.02749925408760001,
"calib/step_q_w": 0.4533560090702948,
"calib/step_q_w_n": 882.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2507.0,
"completions/max_terminated_length": 2507.0,
"completions/mean_length": 561.3046875,
"completions/mean_terminated_length": 565.7244262695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.0416,
"grad_norm": 1.360388159751892,
"kl": 0.048961639404296875,
"learning_rate": 4.5e-06,
"loss": -0.1002,
"mask/has_final_conf_rate": 0.875,
"mask/share_final_conf": 0.02830466441810131,
"mask/share_reasoning": 0.8525670766830444,
"mask/share_step_conf": 0.11131571978330612,
"num_tokens": 9417336.0,
"reward": 0.3034813106060028,
"reward_std": 0.217860609292984,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.49029532074928284,
"rewards/format_reward_step": 0.875,
"rewards/step_l2_reward": -0.1466139405965805,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7583894729614258,
"adv/mean_abs_reasoning": 0.6187953352928162,
"adv/mean_abs_step_conf": 0.7595531940460205,
"adv/ratio_final_to_reasoning": 1.2255901583397573,
"adv/ratio_step_to_reasoning": 1.2274707818968889,
"adv/std_final_conf": 0.9252941012382507,
"adv/std_reasoning": 0.85927414894104,
"adv/std_step_conf": 0.9349480271339417,
"calib/answer_extract_rate": 0.88671875,
"calib/avg_num_step_conf": 5.12890625,
"calib/ece": 0.4759911894273128,
"calib/final_conf_rate": 0.88671875,
"calib/format_rate": 0.88671875,
"calib/frac_conf_gt_0.9": 0.8149779735682819,
"calib/gap": 0.02241496062992121,
"calib/mean_conf": 0.9095594713656387,
"calib/mu_c": 0.9221,
"calib/mu_w": 0.8996850393700788,
"calib/nonempty_final_conf_rate": 0.88671875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.47251101321585903,
"calib/std_conf": 0.14125235871308942,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.450984251968504,
"calib/step_q_c_n": 508.0,
"calib/step_q_gap": 0.005580525260429459,
"calib/step_q_w": 0.44540372670807454,
"calib/step_q_w_n": 805.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1913.0,
"completions/max_terminated_length": 1913.0,
"completions/mean_length": 556.08984375,
"completions/mean_terminated_length": 556.08984375,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.042666666666666665,
"grad_norm": 1.4263373613357544,
"kl": 0.0540618896484375,
"learning_rate": 4.472222222222223e-06,
"loss": -0.1697,
"mask/has_final_conf_rate": 0.88671875,
"mask/share_final_conf": 0.029433563351631165,
"mask/share_reasoning": 0.8610671758651733,
"mask/share_step_conf": 0.10949921607971191,
"num_tokens": 9666455.0,
"reward": 0.2889021635055542,
"reward_std": 0.21637332439422607,
"rewards/accuracy_reward_step": 0.390625,
"rewards/final_brier_reward_step": 0.46521133184432983,
"rewards/format_reward_step": 0.88671875,
"rewards/step_l2_reward": -0.1428757607936859,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.7723698616027832,
"adv/mean_abs_reasoning": 0.6076771020889282,
"adv/mean_abs_step_conf": 0.7377324104309082,
"adv/ratio_final_to_reasoning": 1.271020183165884,
"adv/ratio_step_to_reasoning": 1.2140204195532573,
"adv/std_final_conf": 0.9363389611244202,
"adv/std_reasoning": 0.8595664501190186,
"adv/std_step_conf": 0.9345912933349609,
"calib/answer_extract_rate": 0.8515625,
"calib/avg_num_step_conf": 5.2578125,
"calib/ece": 0.15857798165137627,
"calib/final_conf_rate": 0.8515625,
"calib/format_rate": 0.84375,
"calib/frac_conf_gt_0.9": 0.48623853211009177,
"calib/gap": 0.06253179190751423,
"calib/mean_conf": 0.7936238532110093,
"calib/mu_c": 0.8065317919075143,
"calib/mu_w": 0.7440000000000001,
"calib/nonempty_final_conf_rate": 0.8515625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.07931192660550469,
"calib/std_conf": 0.21936770224630264,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4551341890315052,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.07533615861675663,
"calib/step_q_w": 0.5304703476482618,
"calib/step_q_w_n": 489.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1056.0,
"completions/max_terminated_length": 1056.0,
"completions/mean_length": 472.890625,
"completions/mean_terminated_length": 474.7451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.04373333333333333,
"grad_norm": 1.5240364074707031,
"kl": 0.0609893798828125,
"learning_rate": 4.444444444444444e-06,
"loss": -0.2322,
"mask/has_final_conf_rate": 0.8515625,
"mask/share_final_conf": 0.030579429119825363,
"mask/share_reasoning": 0.845970630645752,
"mask/share_step_conf": 0.11954362690448761,
"num_tokens": 9894763.0,
"reward": 0.4309816360473633,
"reward_std": 0.23047995567321777,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.6809769868850708,
"rewards/format_reward_step": 0.84375,
"rewards/step_l2_reward": -0.12291993945837021,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.7998873591423035,
"adv/mean_abs_reasoning": 0.5951753854751587,
"adv/mean_abs_step_conf": 0.757778525352478,
"adv/ratio_final_to_reasoning": 1.3439523519671648,
"adv/ratio_step_to_reasoning": 1.2732020574868113,
"adv/std_final_conf": 0.935519814491272,
"adv/std_reasoning": 0.8270787596702576,
"adv/std_step_conf": 0.9347375631332397,
"calib/answer_extract_rate": 0.82421875,
"calib/avg_num_step_conf": 5.34765625,
"calib/ece": 0.25783018867924523,
"calib/final_conf_rate": 0.828125,
"calib/format_rate": 0.82421875,
"calib/frac_conf_gt_0.9": 0.42924528301886794,
"calib/gap": 0.056300287356321754,
"calib/mean_conf": 0.7922641509433961,
"calib/mu_c": 0.817758620689655,
"calib/mu_w": 0.7614583333333332,
"calib/nonempty_final_conf_rate": 0.828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2514622641509433,
"calib/std_conf": 0.1821920147105221,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48056506849315067,
"calib/step_q_c_n": 584.0,
"calib/step_q_gap": 0.009609654480411844,
"calib/step_q_w": 0.4709554140127388,
"calib/step_q_w_n": 785.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2805.0,
"completions/max_terminated_length": 2805.0,
"completions/mean_length": 455.0390625,
"completions/mean_terminated_length": 455.0390625,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.0448,
"grad_norm": 0.8913173079490662,
"kl": 0.06402587890625,
"learning_rate": 4.416666666666667e-06,
"loss": -0.2165,
"mask/has_final_conf_rate": 0.828125,
"mask/share_final_conf": 0.03204452618956566,
"mask/share_reasoning": 0.8392115831375122,
"mask/share_step_conf": 0.1287439465522766,
"num_tokens": 10115621.0,
"reward": 0.3496522307395935,
"reward_std": 0.2136489748954773,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.5659554600715637,
"rewards/format_reward_step": 0.82421875,
"rewards/step_l2_reward": -0.12211980670690536,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.852458655834198,
"adv/mean_abs_reasoning": 0.8048035502433777,
"adv/mean_abs_step_conf": 0.7613873481750488,
"adv/ratio_final_to_reasoning": 1.0592133391762613,
"adv/ratio_step_to_reasoning": 0.9460536648288895,
"adv/std_final_conf": 0.9367731809616089,
"adv/std_reasoning": 0.9359363913536072,
"adv/std_step_conf": 0.9346553683280945,
"calib/answer_extract_rate": 0.71484375,
"calib/avg_num_step_conf": 4.671875,
"calib/ece": 0.20497267759562837,
"calib/final_conf_rate": 0.71484375,
"calib/format_rate": 0.70703125,
"calib/frac_conf_gt_0.9": 0.2568306010928962,
"calib/gap": 0.08441549127420511,
"calib/mean_conf": 0.7023497267759564,
"calib/mu_c": 0.7434042553191489,
"calib/mu_w": 0.6589887640449438,
"calib/nonempty_final_conf_rate": 0.71484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19683060109289613,
"calib/std_conf": 0.20942139411753757,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4550574712643678,
"calib/step_q_c_n": 435.0,
"calib/step_q_gap": -0.008792725844699256,
"calib/step_q_w": 0.46385019710906705,
"calib/step_q_w_n": 761.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1955.0,
"completions/max_terminated_length": 1955.0,
"completions/mean_length": 480.94140625,
"completions/mean_terminated_length": 480.94140625,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.04586666666666667,
"grad_norm": 1.3835124969482422,
"kl": 0.0644989013671875,
"learning_rate": 4.388888888888889e-06,
"loss": -0.4611,
"mask/has_final_conf_rate": 0.71484375,
"mask/share_final_conf": 0.025213249027729034,
"mask/share_reasoning": 0.8631592392921448,
"mask/share_step_conf": 0.11162751913070679,
"num_tokens": 10343966.0,
"reward": 0.3104777932167053,
"reward_std": 0.2599700689315796,
"rewards/accuracy_reward_step": 0.3671875,
"rewards/final_brier_reward_step": 0.507054328918457,
"rewards/format_reward_step": 0.70703125,
"rewards/step_l2_reward": -0.1009424701333046,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.8195953369140625,
"adv/mean_abs_reasoning": 0.8131282925605774,
"adv/mean_abs_step_conf": 0.7810917496681213,
"adv/ratio_final_to_reasoning": 1.0079532890598606,
"adv/ratio_step_to_reasoning": 0.9606008754269618,
"adv/std_final_conf": 0.9367455840110779,
"adv/std_reasoning": 0.9358900785446167,
"adv/std_step_conf": 0.9341350793838501,
"calib/answer_extract_rate": 0.53125,
"calib/avg_num_step_conf": 4.80859375,
"calib/ece": 0.2119402985074627,
"calib/final_conf_rate": 0.5234375,
"calib/format_rate": 0.5234375,
"calib/frac_conf_gt_0.9": 0.15671641791044777,
"calib/gap": 0.026965240641711463,
"calib/mean_conf": 0.663134328358209,
"calib/mu_c": 0.6768181818181819,
"calib/mu_w": 0.6498529411764704,
"calib/nonempty_final_conf_rate": 0.5234375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1912686567164179,
"calib/std_conf": 0.20979171586467474,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4602666666666667,
"calib/step_q_c_n": 300.0,
"calib/step_q_gap": -0.002343430003580338,
"calib/step_q_w": 0.46261009667024705,
"calib/step_q_w_n": 931.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3045.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 496.44921875,
"completions/mean_terminated_length": 498.3961181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.046933333333333334,
"grad_norm": 1.691990852355957,
"kl": 0.06795501708984375,
"learning_rate": 4.361111111111112e-06,
"loss": -0.6901,
"mask/has_final_conf_rate": 0.5234375,
"mask/share_final_conf": 0.019329769536852837,
"mask/share_reasoning": 0.8671712875366211,
"mask/share_step_conf": 0.10959267616271973,
"num_tokens": 10577377.0,
"reward": 0.22037556767463684,
"reward_std": 0.24473640322685242,
"rewards/accuracy_reward_step": 0.26171875,
"rewards/final_brier_reward_step": 0.3613913953304291,
"rewards/format_reward_step": 0.5234375,
"rewards/step_l2_reward": -0.07767152786254883,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.6867586374282837,
"adv/mean_abs_reasoning": 0.7163244485855103,
"adv/mean_abs_step_conf": 0.6675255298614502,
"adv/ratio_final_to_reasoning": 0.9587256707269888,
"adv/ratio_step_to_reasoning": 0.9318759553433911,
"adv/std_final_conf": 0.8759419322013855,
"adv/std_reasoning": 0.8753003478050232,
"adv/std_step_conf": 0.8735520243644714,
"calib/answer_extract_rate": 0.2734375,
"calib/avg_num_step_conf": 4.5,
"calib/ece": 0.26422535211267606,
"calib/final_conf_rate": 0.27734375,
"calib/format_rate": 0.2734375,
"calib/frac_conf_gt_0.9": 0.11267605633802817,
"calib/gap": 0.017556089743589798,
"calib/mean_conf": 0.6481690140845071,
"calib/mu_c": 0.6578125,
"calib/mu_w": 0.6402564102564102,
"calib/nonempty_final_conf_rate": 0.27734375,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.23084507042253524,
"calib/std_conf": 0.2252279587575002,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.4766923076923077,
"calib/step_q_c_n": 130.0,
"calib/step_q_gap": 0.011105223543579679,
"calib/step_q_w": 0.465587084148728,
"calib/step_q_w_n": 1022.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2360.0,
"completions/max_terminated_length": 2360.0,
"completions/mean_length": 430.234375,
"completions/mean_terminated_length": 430.234375,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.048,
"grad_norm": 2.014629364013672,
"kl": 0.08644866943359375,
"learning_rate": 4.333333333333334e-06,
"loss": -1.075,
"mask/has_final_conf_rate": 0.27734375,
"mask/share_final_conf": 0.012363248504698277,
"mask/share_reasoning": 0.8680437803268433,
"mask/share_step_conf": 0.11959296464920044,
"num_tokens": 10792565.0,
"reward": 0.10859501361846924,
"reward_std": 0.17341598868370056,
"rewards/accuracy_reward_step": 0.125,
"rewards/final_brier_reward_step": 0.18230313062667847,
"rewards/format_reward_step": 0.2734375,
"rewards/step_l2_reward": -0.04480060189962387,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.2659332752227783,
"adv/mean_abs_reasoning": 0.27259397506713867,
"adv/mean_abs_step_conf": 0.2573040723800659,
"adv/ratio_final_to_reasoning": 0.9755654913402989,
"adv/ratio_step_to_reasoning": 0.9439096088484461,
"adv/std_final_conf": 0.5735336542129517,
"adv/std_reasoning": 0.5728173851966858,
"adv/std_step_conf": 0.5705223083496094,
"calib/answer_extract_rate": 0.08203125,
"calib/avg_num_step_conf": 3.69140625,
"calib/ece": 0.27476190476190476,
"calib/final_conf_rate": 0.08203125,
"calib/format_rate": 0.08203125,
"calib/frac_conf_gt_0.9": 0.14285714285714285,
"calib/gap": 0.16142857142857125,
"calib/mean_conf": 0.6080952380952381,
"calib/mu_c": 0.7157142857142856,
"calib/mu_w": 0.5542857142857144,
"calib/nonempty_final_conf_rate": 0.08203125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.27476190476190476,
"calib/std_conf": 0.22402421313903184,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.4428571428571428,
"calib/step_q_c_n": 28.0,
"calib/step_q_gap": -0.0036641221374046906,
"calib/step_q_w": 0.4465212649945475,
"calib/step_q_w_n": 917.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2995.0,
"completions/max_terminated_length": 2995.0,
"completions/mean_length": 388.27734375,
"completions/mean_terminated_length": 388.27734375,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.04906666666666667,
"grad_norm": 3.7936737537384033,
"kl": 0.1002655029296875,
"learning_rate": 4.305555555555556e-06,
"loss": -0.6942,
"mask/has_final_conf_rate": 0.08203125,
"mask/share_final_conf": 0.003270457498729229,
"mask/share_reasoning": 0.8765566349029541,
"mask/share_step_conf": 0.1201729029417038,
"num_tokens": 10996732.0,
"reward": 0.03558116778731346,
"reward_std": 0.07172537595033646,
"rewards/accuracy_reward_step": 0.02734375,
"rewards/final_brier_reward_step": 0.05937773361802101,
"rewards/format_reward_step": 0.08203125,
"rewards/step_l2_reward": -0.010090397670865059,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.09661008417606354,
"adv/mean_abs_reasoning": 0.14102500677108765,
"adv/mean_abs_step_conf": 0.09573409706354141,
"adv/ratio_final_to_reasoning": 0.6850564051585646,
"adv/ratio_step_to_reasoning": 0.6788448322426772,
"adv/std_final_conf": 0.37023141980171204,
"adv/std_reasoning": 0.43742695450782776,
"adv/std_step_conf": 0.3668881356716156,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 3.09375,
"calib/ece": 0.4033333333333334,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": 0.15166666666666667,
"calib/mean_conf": 0.5877777777777777,
"calib/mu_c": 0.6383333333333333,
"calib/mu_w": 0.48666666666666664,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.1622222222222222,
"calib/std_conf": 0.3198186214360946,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.49692307692307697,
"calib/step_q_c_n": 26.0,
"calib/step_q_gap": 0.09089174533038769,
"calib/step_q_w": 0.4060313315926893,
"calib/step_q_w_n": 766.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1854.0,
"completions/max_terminated_length": 1854.0,
"completions/mean_length": 311.859375,
"completions/mean_terminated_length": 311.859375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.050133333333333335,
"grad_norm": 1.8240582942962646,
"kl": 0.11304473876953125,
"learning_rate": 4.277777777777778e-06,
"loss": -0.3307,
"mask/has_final_conf_rate": 0.03515625,
"mask/share_final_conf": 0.0008738382603041828,
"mask/share_reasoning": 0.882652759552002,
"mask/share_step_conf": 0.11647340655326843,
"num_tokens": 11182544.0,
"reward": 0.011280306614935398,
"reward_std": 0.03114240989089012,
"rewards/accuracy_reward_step": 0.0234375,
"rewards/final_brier_reward_step": 0.016706641763448715,
"rewards/format_reward_step": 0.01953125,
"rewards/step_l2_reward": -0.002739777322858572,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.019286589697003365,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.019292207434773445,
"adv/ratio_final_to_reasoning": 1.000320062725725,
"adv/ratio_step_to_reasoning": 1.0006114328376434,
"adv/std_final_conf": 0.16526895761489868,
"adv/std_reasoning": 0.16521607339382172,
"adv/std_step_conf": 0.165317103266716,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 1.62109375,
"calib/ece": 0.88,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.88,
"calib/mu_c": NaN,
"calib/mu_w": 0.88,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.88,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_w": 0.3338875502008032,
"calib/step_q_w_n": 415.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1733.0,
"completions/max_terminated_length": 1733.0,
"completions/mean_length": 210.82421875,
"completions/mean_terminated_length": 210.82421875,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.0512,
"grad_norm": 0.3917446732521057,
"kl": 0.161651611328125,
"learning_rate": 4.25e-06,
"loss": -0.0596,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.000414299254771322,
"mask/share_reasoning": 0.8767637014389038,
"mask/share_step_conf": 0.12282195687294006,
"num_tokens": 11340203.0,
"reward": 0.00033257147879339755,
"reward_std": 0.000940654135774821,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0008812500163912773,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.0009973570704460144,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 1.26953125,
"calib/ece": 0.975,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.975,
"calib/mu_c": NaN,
"calib/mu_w": 0.975,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.975,
"calib/std_conf": 0.0050000000000000044,
"calib/step_conf_rate": 0.96875,
"calib/step_q_w": 0.2965538461538461,
"calib/step_q_w_n": 325.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1345.0,
"completions/max_terminated_length": 1345.0,
"completions/mean_length": 141.046875,
"completions/mean_terminated_length": 141.046875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.25012171268463135,
"kl": 0.2276153564453125,
"learning_rate": 4.222222222222223e-06,
"loss": 0.0187,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00011055837967433035,
"mask/share_reasoning": 0.8652602434158325,
"mask/share_step_conf": 0.13462916016578674,
"num_tokens": 11480847.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 1.13671875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_w": 0.335085910652921,
"calib/step_q_w_n": 291.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1318.0,
"completions/max_terminated_length": 1318.0,
"completions/mean_length": 153.07421875,
"completions/mean_terminated_length": 153.07421875,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.19393673539161682,
"kl": 0.192138671875,
"learning_rate": 4.194444444444445e-06,
"loss": 0.0201,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.8889811038970947,
"mask/share_step_conf": 0.11101890355348587,
"num_tokens": 11625394.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 1.0078125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_w": 0.3553875968992248,
"calib/step_q_w_n": 258.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 942.0,
"completions/max_terminated_length": 942.0,
"completions/mean_length": 130.890625,
"completions/mean_terminated_length": 130.890625,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.0544,
"grad_norm": 0.2684899568557739,
"kl": 0.2183380126953125,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0212,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.8706995248794556,
"mask/share_step_conf": 0.12930047512054443,
"num_tokens": 11768198.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.019299857318401337,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.019311558455228806,
"adv/ratio_final_to_reasoning": 1.0010082024164397,
"adv/ratio_step_to_reasoning": 1.0016150946721005,
"adv/std_final_conf": 0.16538265347480774,
"adv/std_reasoning": 0.16521607339382172,
"adv/std_step_conf": 0.16548292338848114,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 1.0,
"calib/ece": 0.83,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.83,
"calib/mu_c": NaN,
"calib/mu_w": 0.83,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.83,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_w": 0.4443359375,
"calib/step_q_w_n": 256.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1137.0,
"completions/max_terminated_length": 1137.0,
"completions/mean_length": 139.91015625,
"completions/mean_terminated_length": 140.45883178710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.41800710558891296,
"kl": 0.1961212158203125,
"learning_rate": 4.138888888888889e-06,
"loss": -0.0459,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.0002508600882720202,
"mask/share_reasoning": 0.8741176724433899,
"mask/share_step_conf": 0.12172523885965347,
"num_tokens": 11911967.0,
"reward": 8.606584742665291e-05,
"reward_std": 0.00024343098630197346,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.001215234398841858,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.0018243527738377452,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 1.01171875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_w": 0.515907335907336,
"calib/step_q_w_n": 259.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1001.0,
"completions/max_terminated_length": 1001.0,
"completions/mean_length": 125.6328125,
"completions/mean_terminated_length": 125.6328125,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.24187202751636505,
"kl": 0.2152557373046875,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0195,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.8754205107688904,
"mask/share_step_conf": 0.12457950413227081,
"num_tokens": 12049953.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 1.015625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_w": 0.6044230769230768,
"calib/step_q_w_n": 260.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 748.0,
"completions/max_terminated_length": 748.0,
"completions/mean_length": 134.953125,
"completions/mean_terminated_length": 134.953125,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.0576,
"grad_norm": 0.08608198165893555,
"kl": 0.1942138671875,
"learning_rate": 4.083333333333334e-06,
"loss": 0.0183,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.8727961778640747,
"mask/share_step_conf": 0.12720385193824768,
"num_tokens": 12190733.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.9921875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_w": 0.6146062992125984,
"calib/step_q_w_n": 254.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 977.0,
"completions/max_terminated_length": 977.0,
"completions/mean_length": 139.8125,
"completions/mean_terminated_length": 139.8125,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.3314754068851471,
"kl": 0.1821441650390625,
"learning_rate": 4.055555555555556e-06,
"loss": 0.0194,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.8783440589904785,
"mask/share_step_conf": 0.12165594100952148,
"num_tokens": 12334349.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.019313529133796692,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.019305992871522903,
"adv/ratio_final_to_reasoning": 1.0017173060707774,
"adv/ratio_step_to_reasoning": 1.001326429587746,
"adv/std_final_conf": 0.16549980640411377,
"adv/std_reasoning": 0.16521607339382172,
"adv/std_step_conf": 0.16543522477149963,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 1.015625,
"calib/ece": 0.7,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.7,
"calib/mu_c": NaN,
"calib/mu_w": 0.7,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.7,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.96875,
"calib/step_q_w": 0.6632307692307692,
"calib/step_q_w_n": 260.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 851.0,
"completions/max_terminated_length": 851.0,
"completions/mean_length": 125.453125,
"completions/mean_terminated_length": 125.453125,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.38354986906051636,
"kl": 0.2048492431640625,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0481,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00015233286831062287,
"mask/share_reasoning": 0.8705087900161743,
"mask/share_step_conf": 0.12933892011642456,
"num_tokens": 12473305.0,
"reward": 0.0006501090247184038,
"reward_std": 0.0018387859454378486,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.001992187462747097,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.0014732194831594825,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.019323885440826416,
"adv/mean_abs_reasoning": 0.01930764690041542,
"adv/mean_abs_step_conf": 0.01930798403918743,
"adv/ratio_final_to_reasoning": 1.0008410419195437,
"adv/ratio_step_to_reasoning": 1.0000174614117272,
"adv/std_final_conf": 0.16558855772018433,
"adv/std_reasoning": 0.16544939577579498,
"adv/std_step_conf": 0.1654522866010666,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 1.0078125,
"calib/ece": 0.10999999999999999,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.89,
"calib/mu_c": 0.89,
"calib/mu_w": NaN,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.895,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.20169140625000004,
"calib/step_q_w": 0.69330859375,
"calib/step_q_w_n": 256.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1275.0,
"completions/max_terminated_length": 1275.0,
"completions/mean_length": 148.18359375,
"completions/mean_terminated_length": 148.18359375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0608,
"grad_norm": 0.37517762184143066,
"kl": 0.1676788330078125,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0603,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00014583332813344896,
"mask/share_reasoning": 0.8833422660827637,
"mask/share_step_conf": 0.11651188880205154,
"num_tokens": 12618032.0,
"reward": 0.0019196701468899846,
"reward_std": 0.00542964693158865,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.00385898444801569,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.0015821442939341068,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 1.03125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_w": 0.6980719696969697,
"calib/step_q_w_n": 264.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 932.0,
"completions/max_terminated_length": 932.0,
"completions/mean_length": 137.72265625,
"completions/mean_terminated_length": 137.72265625,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.47784343361854553,
"kl": 0.1968841552734375,
"learning_rate": 3.972222222222223e-06,
"loss": 0.0183,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.8683435916900635,
"mask/share_step_conf": 0.1316564381122589,
"num_tokens": 12759609.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521607339382172,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 1.05078125,
"calib/ece": 0.25,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.75,
"calib/mu_c": 0.75,
"calib/mu_w": NaN,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_w": 0.7050929368029739,
"calib/step_q_w_n": 269.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 993.0,
"completions/max_terminated_length": 993.0,
"completions/mean_length": 155.48828125,
"completions/mean_terminated_length": 155.48828125,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.1444707214832306,
"kl": 0.1646575927734375,
"learning_rate": 3.944444444444445e-06,
"loss": 0.0064,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.0001254300441360101,
"mask/share_reasoning": 0.8840838670730591,
"mask/share_step_conf": 0.11579069495201111,
"num_tokens": 12905662.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 1.1953125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_w": 0.6973169934640524,
"calib/step_q_w_n": 306.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1015.0,
"completions/max_terminated_length": 1015.0,
"completions/mean_length": 162.34375,
"completions/mean_terminated_length": 162.34375,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.064,
"grad_norm": 0.1543644368648529,
"kl": 0.16278076171875,
"learning_rate": 3.916666666666667e-06,
"loss": 0.0151,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.8886805772781372,
"mask/share_step_conf": 0.111319400370121,
"num_tokens": 13056078.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 1.265625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_w": 0.7531172839506173,
"calib/step_q_w_n": 324.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 922.0,
"completions/max_terminated_length": 922.0,
"completions/mean_length": 149.2265625,
"completions/mean_terminated_length": 149.2265625,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.5520812273025513,
"kl": 0.1855621337890625,
"learning_rate": 3.88888888888889e-06,
"loss": 0.016,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.8732104301452637,
"mask/share_step_conf": 0.12678956985473633,
"num_tokens": 13198344.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.044541239738464355,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.23372872173786163,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 1.3828125,
"calib/ece": 0.22249999999999995,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.75,
"calib/gap": 0.17000000000000004,
"calib/mean_conf": 0.9275,
"calib/mu_c": 0.9700000000000001,
"calib/mu_w": 0.8,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.93359375,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.2,
"calib/std_conf": 0.07361215932167725,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_w": 0.7208813559322034,
"calib/step_q_w_n": 354.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 836.0,
"completions/max_terminated_length": 836.0,
"completions/mean_length": 200.24609375,
"completions/mean_terminated_length": 201.03138732910156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.15924867987632751,
"kl": 0.1466217041015625,
"learning_rate": 3.861111111111112e-06,
"loss": 0.0156,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.00020690049859695137,
"mask/share_reasoning": 0.8920506834983826,
"mask/share_step_conf": 0.10383619368076324,
"num_tokens": 13356687.0,
"reward": 0.0011718750465661287,
"reward_std": 0.002551448065787554,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 1.4296875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.92578125,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_w": 0.7488524590163935,
"calib/step_q_w_n": 366.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1489.0,
"completions/max_terminated_length": 1489.0,
"completions/mean_length": 221.94921875,
"completions/mean_terminated_length": 221.94921875,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.0672,
"grad_norm": 0.13147154450416565,
"kl": 0.12944793701171875,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0131,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.8959101438522339,
"mask/share_step_conf": 0.1040898859500885,
"num_tokens": 13522146.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.03812452405691147,
"adv/mean_abs_reasoning": 0.08312930166721344,
"adv/mean_abs_step_conf": 0.03789771348237991,
"adv/ratio_final_to_reasoning": 0.4586171577566367,
"adv/ratio_step_to_reasoning": 0.4558887506849698,
"adv/std_final_conf": 0.23102885484695435,
"adv/std_reasoning": 0.3306039571762085,
"adv/std_step_conf": 0.22967655956745148,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 1.68359375,
"calib/ece": 0.304,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.6,
"calib/gap": -0.15749999999999997,
"calib/mean_conf": 0.8639999999999999,
"calib/mu_c": 0.8325,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.18399999999999997,
"calib/std_conf": 0.1342534915747073,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.95,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.21518279069767432,
"calib/step_q_w": 0.7348172093023256,
"calib/step_q_w_n": 430.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2281.0,
"completions/max_terminated_length": 2281.0,
"completions/mean_length": 232.96484375,
"completions/mean_terminated_length": 232.96484375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.8708637952804565,
"kl": 0.1328582763671875,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.1549,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0008578735869377851,
"mask/share_reasoning": 0.889821469783783,
"mask/share_step_conf": 0.10932067036628723,
"num_tokens": 13685561.0,
"reward": 0.002973411697894335,
"reward_std": 0.01275265496224165,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.003977734129875898,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l2_reward": -0.0027184111531823874,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.02526082471013069,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16532622277736664,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 1.828125,
"calib/ece": 0.06499999999999995,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5,
"calib/mean_conf": 0.935,
"calib/mu_c": 0.935,
"calib/mu_w": NaN,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.0,
"calib/std_conf": 0.034999999999999976,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_w": 0.7642735042735043,
"calib/step_q_w_n": 468.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1282.0,
"completions/max_terminated_length": 1282.0,
"completions/mean_length": 210.89453125,
"completions/mean_terminated_length": 210.89453125,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.16085658967494965,
"kl": 0.15030670166015625,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0059,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00012572437117341906,
"mask/share_reasoning": 0.8844339847564697,
"mask/share_step_conf": 0.1154402494430542,
"num_tokens": 13844574.0,
"reward": 0.0007812500116415322,
"reward_std": 0.0014465939020738006,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 1.875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_w": 0.7479742361111111,
"calib/step_q_w_n": 480.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1680.0,
"completions/max_terminated_length": 1680.0,
"completions/mean_length": 259.40625,
"completions/mean_terminated_length": 259.40625,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.0704,
"grad_norm": 0.10541064292192459,
"kl": 0.1195220947265625,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0107,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.895097017288208,
"mask/share_step_conf": 0.10490301251411438,
"num_tokens": 14017334.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.038456253707408905,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.03827614337205887,
"adv/ratio_final_to_reasoning": 0.9972878234343494,
"adv/ratio_step_to_reasoning": 0.992617013695941,
"adv/std_final_conf": 0.23301896452903748,
"adv/std_reasoning": 0.2336508184671402,
"adv/std_step_conf": 0.2319279909133911,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 2.03515625,
"calib/ece": 0.9133333333333333,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/mean_conf": 0.9133333333333334,
"calib/mu_c": NaN,
"calib/mu_w": 0.9133333333333334,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.9133333333333333,
"calib/std_conf": 0.08013876853447535,
"calib/step_conf_rate": 0.96875,
"calib/step_q_w": 0.7468445297504799,
"calib/step_q_w_n": 521.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1351.0,
"completions/max_terminated_length": 1351.0,
"completions/mean_length": 240.91015625,
"completions/mean_terminated_length": 240.91015625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.07146666666666666,
"grad_norm": 1.064578890800476,
"kl": 0.127685546875,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0962,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0005408349097706378,
"mask/share_reasoning": 0.8941315412521362,
"mask/share_step_conf": 0.10532761365175247,
"num_tokens": 14184015.0,
"reward": 0.0013375489506870508,
"reward_std": 0.0037831594236195087,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0016371094388887286,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l2_reward": -0.0005245117354206741,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.08223120868206024,
"adv/mean_abs_reasoning": 0.10238249599933624,
"adv/mean_abs_step_conf": 0.0826721042394638,
"adv/ratio_final_to_reasoning": 0.8031764402637083,
"adv/ratio_step_to_reasoning": 0.8074827970593702,
"adv/std_final_conf": 0.32961374521255493,
"adv/std_reasoning": 0.3694836497306824,
"adv/std_step_conf": 0.32891473174095154,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 2.12109375,
"calib/ece": 0.8766666666666667,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.5,
"calib/mean_conf": 0.8766666666666666,
"calib/mu_c": NaN,
"calib/mu_w": 0.8766666666666666,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.8766666666666667,
"calib/std_conf": 0.10322575044801346,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_w": 0.7278688766114181,
"calib/step_q_w_n": 543.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1589.0,
"completions/max_terminated_length": 1589.0,
"completions/mean_length": 271.78515625,
"completions/mean_terminated_length": 271.78515625,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.07253333333333334,
"grad_norm": 1.744085669517517,
"kl": 0.11829376220703125,
"learning_rate": 3.694444444444445e-06,
"loss": -0.2893,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.001398720545694232,
"mask/share_reasoning": 0.8910526633262634,
"mask/share_step_conf": 0.10754863917827606,
"num_tokens": 14357680.0,
"reward": 0.00116417882964015,
"reward_std": 0.009257977828383446,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0035250000655651093,
"rewards/format_reward_step": 0.01953125,
"rewards/step_l2_reward": -0.005884142592549324,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.09641197323799133,
"adv/mean_abs_reasoning": 0.09645654261112213,
"adv/mean_abs_step_conf": 0.09608538448810577,
"adv/ratio_final_to_reasoning": 0.9995379331259002,
"adv/ratio_step_to_reasoning": 0.9961520689735611,
"adv/std_final_conf": 0.3694744110107422,
"adv/std_reasoning": 0.36964312195777893,
"adv/std_step_conf": 0.3682325482368469,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 2.21484375,
"calib/ece": 0.432,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.4,
"calib/gap": -0.0033333333333334103,
"calib/mean_conf": 0.8320000000000001,
"calib/mu_c": 0.83,
"calib/mu_w": 0.8333333333333334,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.432,
"calib/std_conf": 0.11855800268223146,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6866666666666666,
"calib/step_q_c_n": 3.0,
"calib/step_q_gap": -0.044130023640661986,
"calib/step_q_w": 0.7307966903073286,
"calib/step_q_w_n": 564.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1816.0,
"completions/max_terminated_length": 1816.0,
"completions/mean_length": 302.3203125,
"completions/mean_terminated_length": 302.3203125,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.0736,
"grad_norm": 1.0207089185714722,
"kl": 0.106475830078125,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.3893,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0009857293916866183,
"mask/share_reasoning": 0.9040440320968628,
"mask/share_step_conf": 0.0949702113866806,
"num_tokens": 14539570.0,
"reward": 0.005831425078213215,
"reward_std": 0.018426839262247086,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.010892968624830246,
"rewards/format_reward_step": 0.01953125,
"rewards/step_l2_reward": -0.004698868375271559,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.11544821411371231,
"adv/mean_abs_reasoning": 0.13496293127536774,
"adv/mean_abs_step_conf": 0.11570356786251068,
"adv/ratio_final_to_reasoning": 0.8554068366977068,
"adv/ratio_step_to_reasoning": 0.8572988654672759,
"adv/std_final_conf": 0.4038769602775574,
"adv/std_reasoning": 0.43712061643600464,
"adv/std_step_conf": 0.4047698676586151,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 2.4375,
"calib/ece": 0.77,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.7142857142857143,
"calib/gap": 0.019999999999999907,
"calib/mean_conf": 0.9128571428571429,
"calib/mu_c": 0.93,
"calib/mu_w": 0.9100000000000001,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.77,
"calib/std_conf": 0.05495824017620382,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_w": 0.724153205128205,
"calib/step_q_w_n": 624.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1243.0,
"completions/max_terminated_length": 1243.0,
"completions/mean_length": 272.37109375,
"completions/mean_terminated_length": 272.37109375,
"completions/min_length": 72.0,
"completions/min_terminated_length": 72.0,
"epoch": 0.07466666666666667,
"grad_norm": 1.4476022720336914,
"kl": 0.1175994873046875,
"learning_rate": 3.638888888888889e-06,
"loss": -0.3608,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.002013806952163577,
"mask/share_reasoning": 0.8816252946853638,
"mask/share_step_conf": 0.11636090278625488,
"num_tokens": 14716289.0,
"reward": 0.0008833690080791712,
"reward_std": 0.006656920071691275,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.003947656136006117,
"rewards/format_reward_step": 0.0234375,
"rewards/step_l2_reward": -0.007649668026715517,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.11568950116634369,
"adv/mean_abs_reasoning": 0.1157369613647461,
"adv/mean_abs_step_conf": 0.11430220305919647,
"adv/ratio_final_to_reasoning": 0.9995899304954721,
"adv/ratio_step_to_reasoning": 0.9876032834400416,
"adv/std_final_conf": 0.40472161769866943,
"adv/std_reasoning": 0.4048856496810913,
"adv/std_step_conf": 0.39993974566459656,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 2.6328125,
"calib/ece": 0.5416666666666666,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.14250000000000007,
"calib/mean_conf": 0.875,
"calib/mu_c": 0.97,
"calib/mu_w": 0.8274999999999999,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.5416666666666666,
"calib/std_conf": 0.11800423721205947,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.89,
"calib/step_q_c_n": 3.0,
"calib/step_q_gap": 0.1498345752608049,
"calib/step_q_w": 0.7401654247391951,
"calib/step_q_w_n": 671.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1238.0,
"completions/max_terminated_length": 1238.0,
"completions/mean_length": 264.23828125,
"completions/mean_terminated_length": 264.23828125,
"completions/min_length": 82.0,
"completions/min_terminated_length": 82.0,
"epoch": 0.07573333333333333,
"grad_norm": 1.3018159866333008,
"kl": 0.117828369140625,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.3776,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0016604708507657051,
"mask/share_reasoning": 0.8779726624488831,
"mask/share_step_conf": 0.1203669011592865,
"num_tokens": 14888342.0,
"reward": 0.005938471294939518,
"reward_std": 0.021040217950940132,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.012510547414422035,
"rewards/format_reward_step": 0.0234375,
"rewards/step_l2_reward": -0.006883603520691395,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.09619291871786118,
"adv/mean_abs_reasoning": 0.11568251252174377,
"adv/mean_abs_step_conf": 0.0960080623626709,
"adv/ratio_final_to_reasoning": 0.8315251512174814,
"adv/ratio_step_to_reasoning": 0.829927188386621,
"adv/std_final_conf": 0.36863529682159424,
"adv/std_reasoning": 0.404695063829422,
"adv/std_step_conf": 0.3679353594779968,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 2.67578125,
"calib/ece": 0.892,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.4,
"calib/mean_conf": 0.892,
"calib/mu_c": NaN,
"calib/mu_w": 0.892,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.892,
"calib/std_conf": 0.06764613810115104,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_w": 0.7401007785888079,
"calib/step_q_w_n": 685.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 958.0,
"completions/max_terminated_length": 958.0,
"completions/mean_length": 274.6875,
"completions/mean_terminated_length": 274.6875,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.0768,
"grad_norm": 1.2662255764007568,
"kl": 0.1224365234375,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.3751,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0011026529828086495,
"mask/share_reasoning": 0.8825006484985352,
"mask/share_step_conf": 0.1163966953754425,
"num_tokens": 15063070.0,
"reward": 0.0011323363287374377,
"reward_std": 0.007427211385220289,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0039015626534819603,
"rewards/format_reward_step": 0.01953125,
"rewards/step_l2_reward": -0.006324389949440956,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.08131413161754608,
"adv/mean_abs_reasoning": 0.10292495787143707,
"adv/mean_abs_step_conf": 0.07757073640823364,
"adv/ratio_final_to_reasoning": 0.7900331785329955,
"adv/ratio_step_to_reasoning": 0.7536630377359665,
"adv/std_final_conf": 0.32966530323028564,
"adv/std_reasoning": 0.36954551935195923,
"adv/std_step_conf": 0.329280823469162,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 2.29296875,
"calib/ece": 0.46285714285714286,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.7142857142857143,
"calib/gap": 0.13750000000000018,
"calib/mean_conf": 0.8914285714285715,
"calib/mu_c": 0.9700000000000001,
"calib/mu_w": 0.8324999999999999,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.46285714285714286,
"calib/std_conf": 0.12426108130275793,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.4166666666666667,
"calib/step_q_c_n": 6.0,
"calib/step_q_gap": -0.34818703384968447,
"calib/step_q_w": 0.7648537005163512,
"calib/step_q_w_n": 581.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 962.0,
"completions/max_terminated_length": 962.0,
"completions/mean_length": 263.05078125,
"completions/mean_terminated_length": 263.05078125,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.07786666666666667,
"grad_norm": 1.5457650423049927,
"kl": 0.1108245849609375,
"learning_rate": 3.555555555555556e-06,
"loss": -0.3003,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.0014520924305543303,
"mask/share_reasoning": 0.8960614204406738,
"mask/share_step_conf": 0.10248646140098572,
"num_tokens": 15237443.0,
"reward": 0.0062183234840631485,
"reward_std": 0.013650456443428993,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.008403124287724495,
"rewards/format_reward_step": 0.01953125,
"rewards/step_l2_reward": -0.0022164792753756046,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.09627045691013336,
"adv/mean_abs_reasoning": 0.11570973694324493,
"adv/mean_abs_step_conf": 0.08923877775669098,
"adv/ratio_final_to_reasoning": 0.8319996177793884,
"adv/ratio_step_to_reasoning": 0.7712296312665733,
"adv/std_final_conf": 0.3689347803592682,
"adv/std_reasoning": 0.40479037165641785,
"adv/std_step_conf": 0.3464997708797455,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 2.3671875,
"calib/ece": 0.6666666666666666,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.11714285714285722,
"calib/mean_conf": 0.8888888888888888,
"calib/mu_c": 0.98,
"calib/mu_w": 0.8628571428571428,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.6666666666666666,
"calib/std_conf": 0.1257373315416304,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.765,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.03894519867549673,
"calib/step_q_w": 0.7260548013245033,
"calib/step_q_w_n": 604.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1318.0,
"completions/max_terminated_length": 1318.0,
"completions/mean_length": 252.3203125,
"completions/mean_terminated_length": 252.3203125,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.07893333333333333,
"grad_norm": 2.045851707458496,
"kl": 0.12564849853515625,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.3596,
"mask/has_final_conf_rate": 0.03515625,
"mask/share_final_conf": 0.0016800765879452229,
"mask/share_reasoning": 0.8799179196357727,
"mask/share_step_conf": 0.11840201169252396,
"num_tokens": 15405965.0,
"reward": 0.004328823648393154,
"reward_std": 0.015819130465388298,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.008874218910932541,
"rewards/format_reward_step": 0.01953125,
"rewards/step_l2_reward": -0.0056853219866752625,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.038303278386592865,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.03767896816134453,
"adv/ratio_final_to_reasoning": 0.9933207072951502,
"adv/ratio_step_to_reasoning": 0.9771304410663373,
"adv/std_final_conf": 0.23209019005298615,
"adv/std_reasoning": 0.2336508184671402,
"adv/std_step_conf": 0.22837959229946136,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 2.40625,
"calib/ece": 0.97,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.97,
"calib/mu_c": NaN,
"calib/mu_w": 0.97,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.97,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.984375,
"calib/step_q_w": 0.7387186688311689,
"calib/step_q_w_n": 616.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2305.0,
"completions/max_terminated_length": 2305.0,
"completions/mean_length": 242.41015625,
"completions/mean_terminated_length": 242.41015625,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.08,
"grad_norm": 0.2904098927974701,
"kl": 0.1312713623046875,
"learning_rate": 3.5e-06,
"loss": -0.1373,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0005445777205750346,
"mask/share_reasoning": 0.8716053366661072,
"mask/share_step_conf": 0.12785005569458008,
"num_tokens": 15572774.0,
"reward": -4.18518902733922e-06,
"reward_std": 0.002756119705736637,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0004617187369149178,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l2_reward": -0.002032589167356491,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.07703585922718048,
"adv/mean_abs_reasoning": 0.09642931818962097,
"adv/mean_abs_step_conf": 0.0654100775718689,
"adv/ratio_final_to_reasoning": 0.7988842052755707,
"adv/ratio_step_to_reasoning": 0.6783214773254429,
"adv/std_final_conf": 0.3300666809082031,
"adv/std_reasoning": 0.3695387542247772,
"adv/std_step_conf": 0.29358813166618347,
"calib/answer_extract_rate": 0.02734375,
"calib/avg_num_step_conf": 2.02734375,
"calib/ece": 0.6114285714285713,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.5714285714285714,
"calib/gap": 0.05900000000000016,
"calib/mean_conf": 0.8228571428571428,
"calib/mu_c": 0.865,
"calib/mu_w": 0.8059999999999998,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.5742857142857142,
"calib/std_conf": 0.16900670195623144,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.965,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.22326034816247575,
"calib/step_q_w": 0.7417396518375242,
"calib/step_q_w_n": 517.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 952.0,
"completions/max_terminated_length": 952.0,
"completions/mean_length": 229.1796875,
"completions/mean_terminated_length": 229.1796875,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.08106666666666666,
"grad_norm": 1.202623963356018,
"kl": 0.13092041015625,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.2987,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.002270713448524475,
"mask/share_reasoning": 0.8843415975570679,
"mask/share_step_conf": 0.11338771134614944,
"num_tokens": 15734500.0,
"reward": 0.0039461469277739525,
"reward_std": 0.01371490303426981,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.007228515576571226,
"rewards/format_reward_step": 0.015625,
"rewards/step_l2_reward": -0.004023721441626549,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.07710777968168259,
"adv/mean_abs_reasoning": 0.08146252483129501,
"adv/mean_abs_step_conf": 0.076274573802948,
"adv/ratio_final_to_reasoning": 0.9465429636678842,
"adv/ratio_step_to_reasoning": 0.9363148755935197,
"adv/std_final_conf": 0.33037495613098145,
"adv/std_reasoning": 0.3306713402271271,
"adv/std_step_conf": 0.32685402035713196,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 2.1640625,
"calib/ece": 0.34666666666666673,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.11999999999999988,
"calib/mean_conf": 0.8466666666666666,
"calib/mu_c": 0.9066666666666666,
"calib/mu_w": 0.7866666666666667,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.34666666666666673,
"calib/std_conf": 0.13374935098492585,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.8500000000000001,
"calib/step_q_c_n": 4.0,
"calib/step_q_gap": 0.13186727272727294,
"calib/step_q_w": 0.7181327272727271,
"calib/step_q_w_n": 550.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2538.0,
"completions/max_terminated_length": 2538.0,
"completions/mean_length": 215.12109375,
"completions/mean_terminated_length": 215.12109375,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.08213333333333334,
"grad_norm": 14.370048522949219,
"kl": 1.21722412109375,
"learning_rate": 3.444444444444445e-06,
"loss": -0.2759,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0014718102756887674,
"mask/share_reasoning": 0.8603799343109131,
"mask/share_step_conf": 0.1381482481956482,
"num_tokens": 15894235.0,
"reward": 0.006166364997625351,
"reward_std": 0.016257690265774727,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.00962656270712614,
"rewards/format_reward_step": 0.015625,
"rewards/step_l2_reward": -0.002762582851573825,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.19243697822093964,
"adv/mean_abs_reasoning": 0.19285863637924194,
"adv/mean_abs_step_conf": 0.18977494537830353,
"adv/ratio_final_to_reasoning": 0.9978136412959326,
"adv/ratio_step_to_reasoning": 0.9840106149310598,
"adv/std_final_conf": 0.5214677453041077,
"adv/std_reasoning": 0.5226067304611206,
"adv/std_step_conf": 0.5145155787467957,
"calib/answer_extract_rate": 0.04296875,
"calib/avg_num_step_conf": 1.9921875,
"calib/ece": 0.6799999999999999,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 0.7,
"calib/gap": 0.11250000000000004,
"calib/mean_conf": 0.8800000000000001,
"calib/mu_c": 0.97,
"calib/mu_w": 0.8574999999999999,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.6799999999999999,
"calib/std_conf": 0.183412104289766,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.8049999999999999,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.05166907480314953,
"calib/step_q_w": 0.7533309251968504,
"calib/step_q_w_n": 508.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 923.0,
"completions/max_terminated_length": 923.0,
"completions/mean_length": 222.7734375,
"completions/mean_terminated_length": 222.7734375,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.0832,
"grad_norm": 1.918144941329956,
"kl": 0.1381072998046875,
"learning_rate": 3.416666666666667e-06,
"loss": -0.5302,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.0031930464319884777,
"mask/share_reasoning": 0.8872984647750854,
"mask/share_step_conf": 0.1095084697008133,
"num_tokens": 16059289.0,
"reward": 0.006953669711947441,
"reward_std": 0.030223235487937927,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.01484218705445528,
"rewards/format_reward_step": 0.0390625,
"rewards/step_l2_reward": -0.010309848934412003,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.0188005194067955,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.01921161264181137,
"adv/ratio_final_to_reasoning": 0.4875547478257375,
"adv/ratio_step_to_reasoning": 0.49821564788888056,
"adv/std_final_conf": 0.1611037701368332,
"adv/std_reasoning": 0.2336508184671402,
"adv/std_step_conf": 0.16462647914886475,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 1.83203125,
"calib/ece": 0.6333333333333335,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.06999999999999995,
"calib/mean_conf": 0.9666666666666668,
"calib/mu_c": 0.92,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.6333333333333335,
"calib/std_conf": 0.03299831645537219,
"calib/step_conf_rate": 0.96875,
"calib/step_q_w": 0.7336100213219616,
"calib/step_q_w_n": 469.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 772.0,
"completions/max_terminated_length": 772.0,
"completions/mean_length": 185.90234375,
"completions/mean_terminated_length": 185.90234375,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.08426666666666667,
"grad_norm": 1.044692039489746,
"kl": 0.153411865234375,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.0782,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0006982251070439816,
"mask/share_reasoning": 0.8826907277107239,
"mask/share_step_conf": 0.1166110560297966,
"num_tokens": 16213256.0,
"reward": 0.0006480214651674032,
"reward_std": 0.0018328814767301083,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 7.773437391733751e-05,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.00034419141593389213,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.13986647129058838,
"adv/mean_abs_reasoning": 0.1668010950088501,
"adv/mean_abs_step_conf": 0.14373824000358582,
"adv/ratio_final_to_reasoning": 0.8385225006056907,
"adv/ratio_step_to_reasoning": 0.8617343908681138,
"adv/std_final_conf": 0.43378275632858276,
"adv/std_reasoning": 0.4675740599632263,
"adv/std_step_conf": 0.4371723234653473,
"calib/answer_extract_rate": 0.046875,
"calib/avg_num_step_conf": 1.77734375,
"calib/ece": 0.53,
"calib/final_conf_rate": 0.046875,
"calib/format_rate": 0.0390625,
"calib/frac_conf_gt_0.9": 0.8333333333333334,
"calib/gap": -0.01828571428571435,
"calib/mean_conf": 0.9466666666666667,
"calib/mu_c": 0.9359999999999999,
"calib/mu_w": 0.9542857142857143,
"calib/nonempty_final_conf_rate": 0.046875,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.53,
"calib/std_conf": 0.05821416398857659,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.8175,
"calib/step_q_c_n": 4.0,
"calib/step_q_gap": 0.06217110125646719,
"calib/step_q_w": 0.7553288987435328,
"calib/step_q_w_n": 451.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 823.0,
"completions/max_terminated_length": 823.0,
"completions/mean_length": 173.30859375,
"completions/mean_terminated_length": 173.30859375,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.08533333333333333,
"grad_norm": 1.3374284505844116,
"kl": 0.19683837890625,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.4672,
"mask/has_final_conf_rate": 0.046875,
"mask/share_final_conf": 0.0058358209207654,
"mask/share_reasoning": 0.8643442988395691,
"mask/share_step_conf": 0.1298198699951172,
"num_tokens": 16359783.0,
"reward": 0.004866867791861296,
"reward_std": 0.029468756169080734,
"rewards/accuracy_reward_step": 0.01953125,
"rewards/final_brier_reward_step": 0.016436327248811722,
"rewards/format_reward_step": 0.0390625,
"rewards/step_l2_reward": -0.018421342596411705,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.08202240616083145,
"adv/mean_abs_reasoning": 0.10839013755321503,
"adv/mean_abs_step_conf": 0.08240969479084015,
"adv/ratio_final_to_reasoning": 0.7567331125542845,
"adv/ratio_step_to_reasoning": 0.7603062109814228,
"adv/std_final_conf": 0.32736995816230774,
"adv/std_reasoning": 0.36963728070259094,
"adv/std_step_conf": 0.3276873528957367,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 1.62890625,
"calib/ece": 0.6324999999999998,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.875,
"calib/gap": -0.09066666666666656,
"calib/mean_conf": 0.94,
"calib/mu_c": 0.8833333333333333,
"calib/mu_w": 0.9739999999999999,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.5987499999999999,
"calib/std_conf": 0.08015609770940699,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.94,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.20176201923076909,
"calib/step_q_w": 0.7382379807692309,
"calib/step_q_w_n": 416.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1605.0,
"completions/max_terminated_length": 1605.0,
"completions/mean_length": 197.1328125,
"completions/mean_terminated_length": 197.90589904785156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 49.0,
"epoch": 0.0864,
"grad_norm": 0.9779260158538818,
"kl": 0.1576995849609375,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.2706,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.0026308889500796795,
"mask/share_reasoning": 0.8815246224403381,
"mask/share_step_conf": 0.1119382381439209,
"num_tokens": 16516497.0,
"reward": 0.001145427580922842,
"reward_std": 0.014946578070521355,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.004672265611588955,
"rewards/format_reward_step": 0.01953125,
"rewards/step_l2_reward": -0.008631411008536816,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.037952158600091934,
"adv/mean_abs_reasoning": 0.05784125253558159,
"adv/mean_abs_step_conf": 0.03859909623861313,
"adv/ratio_final_to_reasoning": 0.6561434432414013,
"adv/ratio_step_to_reasoning": 0.6673281532910881,
"adv/std_final_conf": 0.22997251152992249,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.23388271033763885,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 1.5234375,
"calib/ece": 0.6924999999999999,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.75,
"calib/gap": 0.023333333333333428,
"calib/mean_conf": 0.9424999999999999,
"calib/mu_c": 0.96,
"calib/mu_w": 0.9366666666666665,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.6924999999999999,
"calib/std_conf": 0.054486236794258416,
"calib/step_conf_rate": 0.984375,
"calib/step_q_w": 0.7191292307692306,
"calib/step_q_w_n": 390.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1145.0,
"completions/max_terminated_length": 1145.0,
"completions/mean_length": 153.69921875,
"completions/mean_terminated_length": 153.69921875,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.7302096486091614,
"kl": 0.1923828125,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.1652,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.0009736621286720037,
"mask/share_reasoning": 0.8738419413566589,
"mask/share_step_conf": 0.12518437206745148,
"num_tokens": 16661396.0,
"reward": -1.4653633115813136e-05,
"reward_std": 0.0022511552087962627,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0003085937350988388,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l2_reward": -0.0026816511526703835,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.09552133083343506,
"adv/mean_abs_reasoning": 0.11570973694324493,
"adv/mean_abs_step_conf": 0.09623701125383377,
"adv/ratio_final_to_reasoning": 0.82552543421897,
"adv/ratio_step_to_reasoning": 0.831710569880887,
"adv/std_final_conf": 0.36607638001441956,
"adv/std_reasoning": 0.40479037165641785,
"adv/std_step_conf": 0.36880892515182495,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 1.47265625,
"calib/ece": 0.66,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.8333333333333334,
"calib/gap": -0.05500000000000016,
"calib/mean_conf": 0.9466666666666667,
"calib/mu_c": 0.9099999999999999,
"calib/mu_w": 0.9650000000000001,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.6366666666666667,
"calib/std_conf": 0.04678556282539399,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.875,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.14929253333333337,
"calib/step_q_w": 0.7257074666666666,
"calib/step_q_w_n": 375.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 761.0,
"completions/max_terminated_length": 761.0,
"completions/mean_length": 161.3984375,
"completions/mean_terminated_length": 162.03138732910156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.08853333333333334,
"grad_norm": 1.34121572971344,
"kl": 0.1849365234375,
"learning_rate": 3.277777777777778e-06,
"loss": -0.2845,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0027125701308250427,
"mask/share_reasoning": 0.8722102642059326,
"mask/share_step_conf": 0.12117096781730652,
"num_tokens": 16809978.0,
"reward": 0.0018548424122855067,
"reward_std": 0.012132625095546246,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.004958593752235174,
"rewards/format_reward_step": 0.01953125,
"rewards/step_l2_reward": -0.006717659067362547,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.019323859363794327,
"adv/mean_abs_reasoning": 0.01930764690041542,
"adv/mean_abs_step_conf": 0.01931280642747879,
"adv/ratio_final_to_reasoning": 1.0008396913131117,
"adv/ratio_step_to_reasoning": 1.0002672271297472,
"adv/std_final_conf": 0.16558833420276642,
"adv/std_reasoning": 0.16544939577579498,
"adv/std_step_conf": 0.16549362242221832,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 1.0625,
"calib/ece": 0.6866666666666666,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": -0.08999999999999997,
"calib/mean_conf": 0.94,
"calib/mu_c": 0.88,
"calib/mu_w": 0.97,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.6466666666666666,
"calib/std_conf": 0.04242640687119283,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.815,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.06681481481481477,
"calib/step_q_w": 0.7481851851851852,
"calib/step_q_w_n": 270.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2725.0,
"completions/max_terminated_length": 2725.0,
"completions/mean_length": 160.22265625,
"completions/mean_terminated_length": 160.22265625,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.0896,
"grad_norm": 0.5504854917526245,
"kl": 0.20377349853515625,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0583,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0006643373053520918,
"mask/share_reasoning": 0.8847761154174805,
"mask/share_step_conf": 0.11455954611301422,
"num_tokens": 16956915.0,
"reward": 0.0017426569247618318,
"reward_std": 0.004928978160023689,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0038499999791383743,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.001927186269313097,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.057187922298908234,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.057268816977739334,
"adv/ratio_final_to_reasoning": 0.9887047065676265,
"adv/ratio_step_to_reasoning": 0.9901032702237521,
"adv/std_final_conf": 0.2829303443431854,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.2833392918109894,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 1.0390625,
"calib/ece": 0.9299999999999999,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/mean_conf": 0.93,
"calib/mu_c": NaN,
"calib/mu_w": 0.93,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.9299999999999999,
"calib/std_conf": 0.07094598884597589,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_w": 0.7473684210526316,
"calib/step_q_w_n": 266.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2001.0,
"completions/max_terminated_length": 2001.0,
"completions/mean_length": 144.11328125,
"completions/mean_terminated_length": 144.11328125,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.09066666666666667,
"grad_norm": 1.0435339212417603,
"kl": 0.1903533935546875,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.2095,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.002138955984264612,
"mask/share_reasoning": 0.861068844795227,
"mask/share_step_conf": 0.13679219782352448,
"num_tokens": 17101632.0,
"reward": 0.0010343744652345777,
"reward_std": 0.0029256530106067657,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.00046406249748542905,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l2_reward": -0.0007390635437332094,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.038520447909832,
"adv/mean_abs_reasoning": 0.07714889943599701,
"adv/mean_abs_step_conf": 0.03856028616428375,
"adv/ratio_final_to_reasoning": 0.4993000313865617,
"adv/ratio_step_to_reasoning": 0.49981641275743016,
"adv/std_final_conf": 0.2334073781967163,
"adv/std_reasoning": 0.33054888248443604,
"adv/std_step_conf": 0.2336476892232895,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.92578125,
"calib/ece": 0.6614285714285715,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.8571428571428571,
"calib/gap": 0.03200000000000003,
"calib/mean_conf": 0.9471428571428572,
"calib/mu_c": 0.97,
"calib/mu_w": 0.938,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.91796875,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/pce": 0.6614285714285715,
"calib/std_conf": 0.02490799396308954,
"calib/step_conf_rate": 0.89453125,
"calib/step_q_c": 0.52,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": -0.16228813559322053,
"calib/step_q_w": 0.6822881355932205,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1061.0,
"completions/max_terminated_length": 1061.0,
"completions/mean_length": 154.296875,
"completions/mean_terminated_length": 154.296875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.8700916171073914,
"kl": 0.1883392333984375,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.1395,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.002657730830833316,
"mask/share_reasoning": 0.8664588928222656,
"mask/share_step_conf": 0.1308833360671997,
"num_tokens": 17246644.0,
"reward": 0.003074523527175188,
"reward_std": 0.009674238041043282,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.0042089843191206455,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l2_reward": -0.001966187497600913,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.07712167501449585,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.33043214678764343,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.03125,
"calib/avg_num_step_conf": 0.90234375,
"calib/ece": 0.5572727272727273,
"calib/final_conf_rate": 0.04296875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.6363636363636364,
"calib/gap": -0.004285714285714337,
"calib/mean_conf": 0.8827272727272727,
"calib/mu_c": 0.88,
"calib/mu_w": 0.8842857142857143,
"calib/nonempty_final_conf_rate": 0.04296875,
"calib/nonempty_reasoning_rate": 0.93359375,
"calib/nonempty_step_conf_rate": 0.90234375,
"calib/pce": 0.5381818181818182,
"calib/std_conf": 0.14007082034008295,
"calib/step_conf_rate": 0.90234375,
"calib/step_q_w": 0.7795238095238096,
"calib/step_q_w_n": 231.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 917.0,
"completions/max_terminated_length": 917.0,
"completions/mean_length": 138.26171875,
"completions/mean_terminated_length": 138.26171875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0928,
"grad_norm": 0.2774300277233124,
"kl": 0.21185302734375,
"learning_rate": 3.1666666666666667e-06,
"loss": 0.0056,
"mask/has_final_conf_rate": 0.04296875,
"mask/share_final_conf": 0.0029894590843468904,
"mask/share_reasoning": 0.881792426109314,
"mask/share_step_conf": 0.1152181625366211,
"num_tokens": 17387535.0,
"reward": 0.0015625000232830644,
"reward_std": 0.0044194171205163,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.03856541961431503,
"adv/mean_abs_reasoning": 0.13927656412124634,
"adv/mean_abs_step_conf": 0.03861725330352783,
"adv/ratio_final_to_reasoning": 0.2768981260963772,
"adv/ratio_step_to_reasoning": 0.2772702898522814,
"adv/std_final_conf": 0.23367911577224731,
"adv/std_reasoning": 0.437213271856308,
"adv/std_step_conf": 0.23399266600608826,
"calib/answer_extract_rate": 0.05859375,
"calib/avg_num_step_conf": 0.88671875,
"calib/ece": 0.6288888888888889,
"calib/final_conf_rate": 0.0703125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.8888888888888888,
"calib/gap": -0.013333333333333197,
"calib/mean_conf": 0.9622222222222222,
"calib/mu_c": 0.9533333333333335,
"calib/mu_w": 0.9666666666666667,
"calib/nonempty_final_conf_rate": 0.0703125,
"calib/nonempty_reasoning_rate": 0.92578125,
"calib/nonempty_step_conf_rate": 0.875,
"calib/pce": 0.6288888888888889,
"calib/std_conf": 0.03189488909868294,
"calib/step_conf_rate": 0.875,
"calib/step_q_c": 0.6,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": -0.19370442477876115,
"calib/step_q_w": 0.7937044247787611,
"calib/step_q_w_n": 226.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2740.0,
"completions/max_terminated_length": 2740.0,
"completions/mean_length": 140.62109375,
"completions/mean_terminated_length": 140.62109375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.5458513498306274,
"kl": 0.226104736328125,
"learning_rate": 3.138888888888889e-06,
"loss": -0.1198,
"mask/has_final_conf_rate": 0.0703125,
"mask/share_final_conf": 0.0038935919292271137,
"mask/share_reasoning": 0.8730115294456482,
"mask/share_step_conf": 0.12309487909078598,
"num_tokens": 17533382.0,
"reward": 0.003952160477638245,
"reward_std": 0.012706692330539227,
"rewards/accuracy_reward_step": 0.02734375,
"rewards/final_brier_reward_step": 0.004346875008195639,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l2_reward": -0.003473804332315922,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.10836289823055267,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.36953291296958923,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.08203125,
"calib/avg_num_step_conf": 0.796875,
"calib/ece": 0.6340909090909089,
"calib/final_conf_rate": 0.0859375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.9090909090909091,
"calib/gap": 0.03857142857142837,
"calib/mean_conf": 0.952272727272727,
"calib/mu_c": 0.9785714285714285,
"calib/mu_w": 0.9400000000000002,
"calib/nonempty_final_conf_rate": 0.0859375,
"calib/nonempty_reasoning_rate": 0.87890625,
"calib/nonempty_step_conf_rate": 0.796875,
"calib/pce": 0.6340909090909089,
"calib/std_conf": 0.09214276562052143,
"calib/step_conf_rate": 0.796875,
"calib/step_q_w": 0.7481210784313724,
"calib/step_q_w_n": 204.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 884.0,
"completions/max_terminated_length": 884.0,
"completions/mean_length": 159.33984375,
"completions/mean_terminated_length": 159.33984375,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.18698692321777344,
"kl": 0.219879150390625,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.0224,
"mask/has_final_conf_rate": 0.0859375,
"mask/share_final_conf": 0.01074138842523098,
"mask/share_reasoning": 0.8866013288497925,
"mask/share_step_conf": 0.10265731811523438,
"num_tokens": 17683061.0,
"reward": 0.002734375186264515,
"reward_std": 0.006207750178873539,
"rewards/accuracy_reward_step": 0.02734375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.15290415287017822,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.43724557757377625,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.09375,
"calib/avg_num_step_conf": 0.8125,
"calib/ece": 0.5904347826086954,
"calib/final_conf_rate": 0.08984375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.9565217391304348,
"calib/gap": -0.09423076923076923,
"calib/mean_conf": 0.9382608695652171,
"calib/mu_c": 0.885,
"calib/mu_w": 0.9792307692307692,
"calib/nonempty_final_conf_rate": 0.08984375,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.8125,
"calib/pce": 0.5469565217391302,
"calib/std_conf": 0.2004701468377696,
"calib/step_conf_rate": 0.8125,
"calib/step_q_w": 0.76063125,
"calib/step_q_w_n": 208.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1778.0,
"completions/max_terminated_length": 1778.0,
"completions/mean_length": 132.74609375,
"completions/mean_terminated_length": 132.74609375,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.096,
"grad_norm": 0.3043246865272522,
"kl": 0.2758331298828125,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.081,
"mask/has_final_conf_rate": 0.08984375,
"mask/share_final_conf": 0.015118993818759918,
"mask/share_reasoning": 0.855556070804596,
"mask/share_step_conf": 0.12932494282722473,
"num_tokens": 17820364.0,
"reward": 0.00390625,
"reward_std": 0.00875919871032238,
"rewards/accuracy_reward_step": 0.0390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.2954920530319214,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.5959096550941467,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.18359375,
"calib/avg_num_step_conf": 0.65625,
"calib/ece": 0.46977272727272706,
"calib/final_conf_rate": 0.171875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.9318181818181818,
"calib/gap": 0.06453416149068314,
"calib/mean_conf": 0.933409090909091,
"calib/mu_c": 0.9671428571428569,
"calib/mu_w": 0.9026086956521737,
"calib/nonempty_final_conf_rate": 0.171875,
"calib/nonempty_reasoning_rate": 0.83984375,
"calib/nonempty_step_conf_rate": 0.65625,
"calib/pce": 0.46295454545454523,
"calib/std_conf": 0.20092901484918615,
"calib/step_conf_rate": 0.65625,
"calib/step_q_w": 0.7510714285714286,
"calib/step_q_w_n": 168.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 838.0,
"completions/max_terminated_length": 838.0,
"completions/mean_length": 163.19921875,
"completions/mean_terminated_length": 163.19921875,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.2645391821861267,
"kl": 0.241119384765625,
"learning_rate": 3.055555555555556e-06,
"loss": -0.036,
"mask/has_final_conf_rate": 0.171875,
"mask/share_final_conf": 0.020244672894477844,
"mask/share_reasoning": 0.8839057087898254,
"mask/share_step_conf": 0.09584958851337433,
"num_tokens": 17969855.0,
"reward": 0.008593750186264515,
"reward_std": 0.016925785690546036,
"rewards/accuracy_reward_step": 0.0859375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.39354297518730164,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.6814802885055542,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.30078125,
"calib/avg_num_step_conf": 0.55078125,
"calib/ece": 0.4719444444444445,
"calib/final_conf_rate": 0.28125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.875,
"calib/gap": 0.03213899613899618,
"calib/mean_conf": 0.9580555555555557,
"calib/mu_c": 0.9745714285714285,
"calib/mu_w": 0.9424324324324324,
"calib/nonempty_final_conf_rate": 0.28125,
"calib/nonempty_reasoning_rate": 0.8515625,
"calib/nonempty_step_conf_rate": 0.55078125,
"calib/pce": 0.4719444444444445,
"calib/std_conf": 0.06317169920341635,
"calib/step_conf_rate": 0.55078125,
"calib/step_q_w": 0.8117730496453901,
"calib/step_q_w_n": 141.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 978.0,
"completions/max_terminated_length": 978.0,
"completions/mean_length": 172.0390625,
"completions/mean_terminated_length": 172.0390625,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.4520159065723419,
"kl": 0.23626708984375,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.0759,
"mask/has_final_conf_rate": 0.28125,
"mask/share_final_conf": 0.04713796079158783,
"mask/share_reasoning": 0.8745333552360535,
"mask/share_step_conf": 0.07832865417003632,
"num_tokens": 18120617.0,
"reward": 0.01484375074505806,
"reward_std": 0.022541169077157974,
"rewards/accuracy_reward_step": 0.1484375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.40045166015625,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.6815094947814941,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.35546875,
"calib/avg_num_step_conf": 0.51171875,
"calib/ece": 0.5515384615384616,
"calib/final_conf_rate": 0.35546875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.8901098901098901,
"calib/gap": 0.04241414141414146,
"calib/mean_conf": 0.9471428571428571,
"calib/mu_c": 0.9727777777777779,
"calib/mu_w": 0.9303636363636364,
"calib/nonempty_final_conf_rate": 0.35546875,
"calib/nonempty_reasoning_rate": 0.8671875,
"calib/nonempty_step_conf_rate": 0.51171875,
"calib/pce": 0.5515384615384616,
"calib/std_conf": 0.1252889908977975,
"calib/step_conf_rate": 0.51171875,
"calib/step_q_w": 0.7734198473282442,
"calib/step_q_w_n": 131.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1179.0,
"completions/max_terminated_length": 1179.0,
"completions/mean_length": 174.6484375,
"completions/mean_terminated_length": 174.6484375,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.0992,
"grad_norm": 0.4297439455986023,
"kl": 0.2293701171875,
"learning_rate": 3e-06,
"loss": -0.0498,
"mask/has_final_conf_rate": 0.35546875,
"mask/share_final_conf": 0.058933135122060776,
"mask/share_reasoning": 0.8597003221511841,
"mask/share_step_conf": 0.08136658370494843,
"num_tokens": 18271103.0,
"reward": 0.014062500558793545,
"reward_std": 0.02293594926595688,
"rewards/accuracy_reward_step": 0.140625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.560976505279541,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.792765736579895,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.54296875,
"calib/avg_num_step_conf": 0.34765625,
"calib/ece": 0.5181870503597124,
"calib/final_conf_rate": 0.54296875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.9064748201438849,
"calib/gap": 0.016617065994114832,
"calib/mean_conf": 0.9570359712230215,
"calib/mu_c": 0.9663606557377049,
"calib/mu_w": 0.9497435897435901,
"calib/nonempty_final_conf_rate": 0.54296875,
"calib/nonempty_reasoning_rate": 0.890625,
"calib/nonempty_step_conf_rate": 0.34765625,
"calib/pce": 0.5181870503597124,
"calib/std_conf": 0.09007282505830924,
"calib/step_conf_rate": 0.34765625,
"calib/step_q_w": 0.7235955056179776,
"calib/step_q_w_n": 89.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 557.0,
"completions/max_terminated_length": 557.0,
"completions/mean_length": 149.36328125,
"completions/mean_terminated_length": 149.94903564453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.47062671184539795,
"kl": 0.300323486328125,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.0837,
"mask/has_final_conf_rate": 0.5390625,
"mask/share_final_conf": 0.10760974884033203,
"mask/share_reasoning": 0.831997275352478,
"mask/share_step_conf": 0.056486740708351135,
"num_tokens": 18418020.0,
"reward": 0.02421875111758709,
"reward_std": 0.032127510756254196,
"rewards/accuracy_reward_step": 0.2421875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5798459053039551,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7928367853164673,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.61328125,
"calib/avg_num_step_conf": 0.21875,
"calib/ece": 0.5654774193548388,
"calib/final_conf_rate": 0.60546875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.8193548387096774,
"calib/gap": 0.03835517364840679,
"calib/mean_conf": 0.9125741935483872,
"calib/mu_c": 0.9368245614035088,
"calib/mu_w": 0.898469387755102,
"calib/nonempty_final_conf_rate": 0.60546875,
"calib/nonempty_reasoning_rate": 0.83203125,
"calib/nonempty_step_conf_rate": 0.21875,
"calib/pce": 0.5551548387096775,
"calib/std_conf": 0.20260869479120286,
"calib/step_conf_rate": 0.21875,
"calib/step_q_w": 0.7480517857142858,
"calib/step_q_w_n": 56.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1212.0,
"completions/max_terminated_length": 1212.0,
"completions/mean_length": 188.37890625,
"completions/mean_terminated_length": 188.37890625,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.6134114265441895,
"kl": 0.2447509765625,
"learning_rate": 2.944444444444445e-06,
"loss": 0.0036,
"mask/has_final_conf_rate": 0.60546875,
"mask/share_final_conf": 0.13950452208518982,
"mask/share_reasoning": 0.8184226155281067,
"mask/share_step_conf": 0.04207289218902588,
"num_tokens": 18572373.0,
"reward": 0.0234375,
"reward_std": 0.03320576995611191,
"rewards/accuracy_reward_step": 0.234375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6374795436859131,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8428910374641418,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.5859375,
"calib/avg_num_step_conf": 0.25,
"calib/ece": 0.5187037037037037,
"calib/final_conf_rate": 0.6328125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.8580246913580247,
"calib/gap": 2.940721250577205e-05,
"calib/mean_conf": 0.9456172839506173,
"calib/mu_c": 0.9456338028169015,
"calib/mu_w": 0.9456043956043957,
"calib/nonempty_final_conf_rate": 0.6328125,
"calib/nonempty_reasoning_rate": 0.8359375,
"calib/nonempty_step_conf_rate": 0.25,
"calib/pce": 0.5130246913580246,
"calib/std_conf": 0.09783420946055232,
"calib/step_conf_rate": 0.25,
"calib/step_q_w": 0.7690625,
"calib/step_q_w_n": 64.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 789.0,
"completions/max_terminated_length": 789.0,
"completions/mean_length": 174.91015625,
"completions/mean_terminated_length": 175.59608459472656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1024,
"grad_norm": 0.34501567482948303,
"kl": 0.276336669921875,
"learning_rate": 2.916666666666667e-06,
"loss": 0.0185,
"mask/has_final_conf_rate": 0.6328125,
"mask/share_final_conf": 0.1448785662651062,
"mask/share_reasoning": 0.8194688558578491,
"mask/share_step_conf": 0.03174634277820587,
"num_tokens": 18722966.0,
"reward": 0.02890624850988388,
"reward_std": 0.0365084670484066,
"rewards/accuracy_reward_step": 0.2890625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.4943743646144867,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7574437260627747,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.578125,
"calib/avg_num_step_conf": 0.30078125,
"calib/ece": 0.6255974842767297,
"calib/final_conf_rate": 0.62109375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.6981132075471698,
"calib/gap": 0.026420008267879158,
"calib/mean_conf": 0.8708805031446539,
"calib/mu_c": 0.8904878048780487,
"calib/mu_w": 0.8640677966101695,
"calib/nonempty_final_conf_rate": 0.62109375,
"calib/nonempty_reasoning_rate": 0.87890625,
"calib/nonempty_step_conf_rate": 0.30078125,
"calib/pce": 0.619308176100629,
"calib/std_conf": 0.23975454861615617,
"calib/step_conf_rate": 0.30078125,
"calib/step_q_w": 0.8445220779220781,
"calib/step_q_w_n": 77.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1062.0,
"completions/max_terminated_length": 1062.0,
"completions/mean_length": 172.87890625,
"completions/mean_terminated_length": 172.87890625,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.38908159732818604,
"kl": 0.296417236328125,
"learning_rate": 2.888888888888889e-06,
"loss": -0.0426,
"mask/has_final_conf_rate": 0.62109375,
"mask/share_final_conf": 0.16929784417152405,
"mask/share_reasoning": 0.7983481287956238,
"mask/share_step_conf": 0.03235398232936859,
"num_tokens": 18872295.0,
"reward": 0.01640625111758709,
"reward_std": 0.028315432369709015,
"rewards/accuracy_reward_step": 0.1640625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.45829716324806213,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7391588091850281,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.39453125,
"calib/avg_num_step_conf": 0.37890625,
"calib/ece": 0.6931932773109243,
"calib/final_conf_rate": 0.46484375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.7647058823529411,
"calib/gap": -0.006038314176245119,
"calib/mean_conf": 0.9200840336134454,
"calib/mu_c": 0.9155172413793105,
"calib/mu_w": 0.9215555555555556,
"calib/nonempty_final_conf_rate": 0.46484375,
"calib/nonempty_reasoning_rate": 0.7734375,
"calib/nonempty_step_conf_rate": 0.37890625,
"calib/pce": 0.6847899159663864,
"calib/std_conf": 0.1431547073947489,
"calib/step_conf_rate": 0.37890625,
"calib/step_q_w": 0.7460824742268042,
"calib/step_q_w_n": 97.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 970.0,
"completions/max_terminated_length": 970.0,
"completions/mean_length": 233.30859375,
"completions/mean_terminated_length": 234.22354125976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.2955537438392639,
"kl": 0.227935791015625,
"learning_rate": 2.861111111111111e-06,
"loss": -0.0498,
"mask/has_final_conf_rate": 0.4609375,
"mask/share_final_conf": 0.11774331331253052,
"mask/share_reasoning": 0.8449557423591614,
"mask/share_step_conf": 0.033394694328308105,
"num_tokens": 19038206.0,
"reward": 0.01328125037252903,
"reward_std": 0.02625075727701187,
"rewards/accuracy_reward_step": 0.1328125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.2998278737068176,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.6183291673660278,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.38671875,
"calib/avg_num_step_conf": 0.453125,
"calib/ece": 0.7344554455445544,
"calib/final_conf_rate": 0.39453125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.7227722772277227,
"calib/gap": 0.01857429718875503,
"calib/mean_conf": 0.8930693069306932,
"calib/mu_c": 0.9083333333333334,
"calib/mu_w": 0.8897590361445784,
"calib/nonempty_final_conf_rate": 0.39453125,
"calib/nonempty_reasoning_rate": 0.83984375,
"calib/nonempty_step_conf_rate": 0.453125,
"calib/pce": 0.7246534653465346,
"calib/std_conf": 0.1987771655225595,
"calib/step_conf_rate": 0.453125,
"calib/step_q_w": 0.8375862068965518,
"calib/step_q_w_n": 116.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1659.0,
"completions/max_terminated_length": 1659.0,
"completions/mean_length": 240.1171875,
"completions/mean_terminated_length": 241.058837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1056,
"grad_norm": 0.17655298113822937,
"kl": 0.2249603271484375,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.0271,
"mask/has_final_conf_rate": 0.39453125,
"mask/share_final_conf": 0.12888720631599426,
"mask/share_reasoning": 0.8282879590988159,
"mask/share_step_conf": 0.03891859948635101,
"num_tokens": 19205476.0,
"reward": 0.00742187537252903,
"reward_std": 0.017176657915115356,
"rewards/accuracy_reward_step": 0.07421875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.36230605840682983,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.6611034274101257,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.39453125,
"calib/avg_num_step_conf": 0.515625,
"calib/ece": 0.6748543689320389,
"calib/final_conf_rate": 0.40234375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.6504854368932039,
"calib/gap": -0.04204295704295724,
"calib/mean_conf": 0.87873786407767,
"calib/mu_c": 0.8473076923076922,
"calib/mu_w": 0.8893506493506494,
"calib/nonempty_final_conf_rate": 0.40234375,
"calib/nonempty_reasoning_rate": 0.91015625,
"calib/nonempty_step_conf_rate": 0.515625,
"calib/pce": 0.6505825242718447,
"calib/std_conf": 0.19604180063302273,
"calib/step_conf_rate": 0.515625,
"calib/step_q_w": 0.7578030303030303,
"calib/step_q_w_n": 132.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1726.0,
"completions/max_terminated_length": 1726.0,
"completions/mean_length": 201.21875,
"completions/mean_terminated_length": 201.21875,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.23728390038013458,
"kl": 0.248291015625,
"learning_rate": 2.805555555555556e-06,
"loss": 0.0129,
"mask/has_final_conf_rate": 0.40234375,
"mask/share_final_conf": 0.15121278166770935,
"mask/share_reasoning": 0.8124673962593079,
"mask/share_step_conf": 0.036319803446531296,
"num_tokens": 19364396.0,
"reward": 0.01015624962747097,
"reward_std": 0.020753080025315285,
"rewards/accuracy_reward_step": 0.1015625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.2316746860742569,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.5226800441741943,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.3125,
"calib/avg_num_step_conf": 0.50390625,
"calib/ece": 0.6648275862068965,
"calib/final_conf_rate": 0.33984375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5862068965517241,
"calib/gap": 0.04954334365325075,
"calib/mean_conf": 0.8639080459770117,
"calib/mu_c": 0.9026315789473683,
"calib/mu_w": 0.8530882352941176,
"calib/nonempty_final_conf_rate": 0.33984375,
"calib/nonempty_reasoning_rate": 0.81640625,
"calib/nonempty_step_conf_rate": 0.50390625,
"calib/pce": 0.6551724137931034,
"calib/std_conf": 0.20962668691341935,
"calib/step_conf_rate": 0.50390625,
"calib/step_q_w": 0.7727906976744185,
"calib/step_q_w_n": 129.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2881.0,
"completions/max_terminated_length": 2881.0,
"completions/mean_length": 272.4765625,
"completions/mean_terminated_length": 272.4765625,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.16580340266227722,
"kl": 0.199188232421875,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0138,
"mask/has_final_conf_rate": 0.33984375,
"mask/share_final_conf": 0.13236968219280243,
"mask/share_reasoning": 0.8355380892753601,
"mask/share_step_conf": 0.03209220618009567,
"num_tokens": 19541142.0,
"reward": 0.00742187537252903,
"reward_std": 0.013269728049635887,
"rewards/accuracy_reward_step": 0.07421875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.30013322830200195,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.572675883769989,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.30859375,
"calib/avg_num_step_conf": 0.60546875,
"calib/ece": 0.5996341463414635,
"calib/final_conf_rate": 0.3203125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.4878048780487805,
"calib/gap": -0.11944444444444435,
"calib/mean_conf": 0.818658536585366,
"calib/mu_c": 0.7399999999999999,
"calib/mu_w": 0.8594444444444442,
"calib/nonempty_final_conf_rate": 0.3203125,
"calib/nonempty_reasoning_rate": 0.9140625,
"calib/nonempty_step_conf_rate": 0.60546875,
"calib/pce": 0.5384146341463416,
"calib/std_conf": 0.23736748392972173,
"calib/step_conf_rate": 0.60546875,
"calib/step_q_w": 0.7538709677419354,
"calib/step_q_w_n": 155.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 768.0,
"completions/max_terminated_length": 768.0,
"completions/mean_length": 182.15234375,
"completions/mean_terminated_length": 182.86668395996094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1088,
"grad_norm": 0.24702408909797668,
"kl": 0.2725830078125,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0477,
"mask/has_final_conf_rate": 0.31640625,
"mask/share_final_conf": 0.13098107278347015,
"mask/share_reasoning": 0.8201836347579956,
"mask/share_step_conf": 0.044929005205631256,
"num_tokens": 19694469.0,
"reward": 0.011328124441206455,
"reward_std": 0.01718788966536522,
"rewards/accuracy_reward_step": 0.11328125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.2895159423351288,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.5958887934684753,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.40234375,
"calib/avg_num_step_conf": 0.5,
"calib/ece": 0.7086407766990289,
"calib/final_conf_rate": 0.40234375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5631067961165048,
"calib/gap": -0.04659774436090214,
"calib/mean_conf": 0.84747572815534,
"calib/mu_c": 0.8094736842105263,
"calib/mu_w": 0.8560714285714285,
"calib/nonempty_final_conf_rate": 0.40234375,
"calib/nonempty_reasoning_rate": 0.90234375,
"calib/nonempty_step_conf_rate": 0.5,
"calib/pce": 0.6858252427184464,
"calib/std_conf": 0.2311462297074407,
"calib/step_conf_rate": 0.5,
"calib/step_q_w": 0.741015625,
"calib/step_q_w_n": 128.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1722.0,
"completions/mean_length": 217.74609375,
"completions/mean_terminated_length": 217.74609375,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.1895759403705597,
"kl": 0.2274017333984375,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0327,
"mask/has_final_conf_rate": 0.40234375,
"mask/share_final_conf": 0.18485210835933685,
"mask/share_reasoning": 0.7832228541374207,
"mask/share_step_conf": 0.0319250263273716,
"num_tokens": 19854764.0,
"reward": 0.0078125,
"reward_std": 0.01658429019153118,
"rewards/accuracy_reward_step": 0.078125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.2904398739337921,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.5958819389343262,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.5546875,
"calib/avg_num_step_conf": 0.35546875,
"calib/ece": 0.6597841726618705,
"calib/final_conf_rate": 0.54296875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.43884892086330934,
"calib/gap": 0.0170217391304347,
"calib/mean_conf": 0.8084172661870503,
"calib/mu_c": 0.8224999999999999,
"calib/mu_w": 0.8054782608695652,
"calib/nonempty_final_conf_rate": 0.54296875,
"calib/nonempty_reasoning_rate": 0.91015625,
"calib/nonempty_step_conf_rate": 0.35546875,
"calib/pce": 0.6477697841726618,
"calib/std_conf": 0.2301618416917616,
"calib/step_conf_rate": 0.35546875,
"calib/step_q_w": 0.7660439560439561,
"calib/step_q_w_n": 91.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1096.0,
"completions/max_terminated_length": 1096.0,
"completions/mean_length": 152.58203125,
"completions/mean_terminated_length": 152.58203125,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.21708692610263824,
"kl": 0.3060302734375,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.0092,
"mask/has_final_conf_rate": 0.54296875,
"mask/share_final_conf": 0.26213765144348145,
"mask/share_reasoning": 0.7116431593894958,
"mask/share_step_conf": 0.026219181716442108,
"num_tokens": 20000505.0,
"reward": 0.00937500037252903,
"reward_std": 0.016637086868286133,
"rewards/accuracy_reward_step": 0.09375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.32168108224868774,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.6184209585189819,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.55078125,
"calib/avg_num_step_conf": 0.33984375,
"calib/ece": 0.6417605633802816,
"calib/final_conf_rate": 0.5546875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.39436619718309857,
"calib/gap": 0.0321286472148542,
"calib/mean_conf": 0.796830985915493,
"calib/mu_c": 0.8230769230769232,
"calib/mu_w": 0.790948275862069,
"calib/nonempty_final_conf_rate": 0.5546875,
"calib/nonempty_reasoning_rate": 0.890625,
"calib/nonempty_step_conf_rate": 0.33984375,
"calib/pce": 0.6277464788732394,
"calib/std_conf": 0.2340650256295708,
"calib/step_conf_rate": 0.33984375,
"calib/step_q_w": 0.7344827586206897,
"calib/step_q_w_n": 87.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2006.0,
"completions/max_terminated_length": 2006.0,
"completions/mean_length": 176.86328125,
"completions/mean_terminated_length": 176.86328125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.112,
"grad_norm": 0.26342976093292236,
"kl": 0.2776336669921875,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0817,
"mask/has_final_conf_rate": 0.5546875,
"mask/share_final_conf": 0.26337558031082153,
"mask/share_reasoning": 0.7090876698493958,
"mask/share_step_conf": 0.027536744251847267,
"num_tokens": 20151542.0,
"reward": 0.01132812537252903,
"reward_std": 0.01842541992664337,
"rewards/accuracy_reward_step": 0.11328125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.28838005661964417,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.5958743095397949,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.625,
"calib/avg_num_step_conf": 0.28125,
"calib/ece": 0.6691975308641975,
"calib/final_conf_rate": 0.6328125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.4382716049382716,
"calib/gap": -0.00977710233029383,
"calib/mean_conf": 0.7899382716049385,
"calib/mu_c": 0.7814285714285715,
"calib/mu_w": 0.7912056737588653,
"calib/nonempty_final_conf_rate": 0.6328125,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.28125,
"calib/pce": 0.6647530864197531,
"calib/std_conf": 0.2551869247433646,
"calib/step_conf_rate": 0.28125,
"calib/step_q_w": 0.7486111111111111,
"calib/step_q_w_n": 72.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 700.0,
"completions/max_terminated_length": 700.0,
"completions/mean_length": 135.83984375,
"completions/mean_terminated_length": 136.37255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.17964833974838257,
"kl": 0.297821044921875,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.0105,
"mask/has_final_conf_rate": 0.62890625,
"mask/share_final_conf": 0.308359295129776,
"mask/share_reasoning": 0.6718757152557373,
"mask/share_step_conf": 0.01585877686738968,
"num_tokens": 20290901.0,
"reward": 0.008593750186264515,
"reward_std": 0.01651938259601593,
"rewards/accuracy_reward_step": 0.0859375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.2720876932144165,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.5725226998329163,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.69921875,
"calib/avg_num_step_conf": 0.23046875,
"calib/ece": 0.6738212290502794,
"calib/final_conf_rate": 0.69921875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3240223463687151,
"calib/gap": -0.02180394736842106,
"calib/mean_conf": 0.7579106145251396,
"calib/mu_c": 0.7384210526315789,
"calib/mu_w": 0.7602249999999999,
"calib/nonempty_final_conf_rate": 0.69921875,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.23046875,
"calib/pce": 0.6627932960893855,
"calib/std_conf": 0.2682787528242608,
"calib/step_conf_rate": 0.23046875,
"calib/step_q_w": 0.7813559322033898,
"calib/step_q_w_n": 59.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 625.0,
"completions/max_terminated_length": 625.0,
"completions/mean_length": 107.0859375,
"completions/mean_terminated_length": 107.50588989257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.30144068598747253,
"kl": 0.359100341796875,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0394,
"mask/has_final_conf_rate": 0.6953125,
"mask/share_final_conf": 0.3688632845878601,
"mask/share_reasoning": 0.6099119186401367,
"mask/share_step_conf": 0.017318598926067352,
"num_tokens": 20422931.0,
"reward": 0.008593750186264515,
"reward_std": 0.015585274435579777,
"rewards/accuracy_reward_step": 0.0859375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.49251788854599,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7574393153190613,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.75,
"calib/avg_num_step_conf": 0.1875,
"calib/ece": 0.5028571428571429,
"calib/final_conf_rate": 0.73828125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3544973544973545,
"calib/gap": 0.019162937615161968,
"calib/mean_conf": 0.7874074074074073,
"calib/mu_c": 0.8006896551724139,
"calib/mu_w": 0.7815267175572519,
"calib/nonempty_final_conf_rate": 0.73828125,
"calib/nonempty_reasoning_rate": 0.9375,
"calib/nonempty_step_conf_rate": 0.1875,
"calib/pce": 0.4916931216931216,
"calib/std_conf": 0.22954325052258504,
"calib/step_conf_rate": 0.1875,
"calib/step_q_w": 0.7729166666666666,
"calib/step_q_w_n": 48.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1124.0,
"completions/max_terminated_length": 1124.0,
"completions/mean_length": 112.07421875,
"completions/mean_terminated_length": 112.51373291015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.1152,
"grad_norm": 0.32904762029647827,
"kl": 0.327239990234375,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.141,
"mask/has_final_conf_rate": 0.734375,
"mask/share_final_conf": 0.3876108229160309,
"mask/share_reasoning": 0.5916550159454346,
"mask/share_step_conf": 0.01682785153388977,
"num_tokens": 20554854.0,
"reward": 0.02265625074505806,
"reward_std": 0.028209349140524864,
"rewards/accuracy_reward_step": 0.2265625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.43973737955093384,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7013425230979919,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.8359375,
"calib/avg_num_step_conf": 0.1171875,
"calib/ece": 0.549514563106796,
"calib/final_conf_rate": 0.8046875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3786407766990291,
"calib/gap": 0.05893205080774444,
"calib/mean_conf": 0.8067961165048543,
"calib/mu_c": 0.850566037735849,
"calib/mu_w": 0.7916339869281046,
"calib/nonempty_final_conf_rate": 0.8046875,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.1171875,
"calib/pce": 0.549514563106796,
"calib/std_conf": 0.21122257892800225,
"calib/step_conf_rate": 0.1171875,
"calib/step_q_w": 0.7606666666666667,
"calib/step_q_w_n": 30.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 84.38671875,
"completions/mean_terminated_length": 84.38671875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.479681134223938,
"kl": 0.422088623046875,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.0318,
"mask/has_final_conf_rate": 0.8046875,
"mask/share_final_conf": 0.40282905101776123,
"mask/share_reasoning": 0.5846548080444336,
"mask/share_step_conf": 0.012516127899289131,
"num_tokens": 20681057.0,
"reward": 0.021484375,
"reward_std": 0.025183971971273422,
"rewards/accuracy_reward_step": 0.21484375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.26682794094085693,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.5482406616210938,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.875,
"calib/avg_num_step_conf": 0.08984375,
"calib/ece": 0.6419282511210762,
"calib/final_conf_rate": 0.87109375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3183856502242152,
"calib/gap": 0.027856381087806525,
"calib/mean_conf": 0.7719730941704035,
"calib/mu_c": 0.796206896551724,
"calib/mu_w": 0.7683505154639175,
"calib/nonempty_final_conf_rate": 0.87109375,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.08984375,
"calib/pce": 0.6419282511210762,
"calib/std_conf": 0.22947524304040517,
"calib/step_conf_rate": 0.08984375,
"calib/step_q_w": 0.6339130434782608,
"calib/step_q_w_n": 23.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 742.0,
"completions/max_terminated_length": 742.0,
"completions/mean_length": 77.94140625,
"completions/mean_terminated_length": 78.24706268310547,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.434518426656723,
"kl": 0.47857666015625,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.1372,
"mask/has_final_conf_rate": 0.8671875,
"mask/share_final_conf": 0.4286842346191406,
"mask/share_reasoning": 0.5606935024261475,
"mask/share_step_conf": 0.006716080941259861,
"num_tokens": 20805930.0,
"reward": 0.01132812537252903,
"reward_std": 0.015281605534255505,
"rewards/accuracy_reward_step": 0.11328125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.3461071252822876,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.5961334705352783,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.87890625,
"calib/avg_num_step_conf": 0.0703125,
"calib/ece": 0.5774269005847953,
"calib/final_conf_rate": 0.890625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.27631578947368424,
"calib/gap": -0.016889568764568597,
"calib/mean_conf": 0.7620760233918129,
"calib/mu_c": 0.7490384615384618,
"calib/mu_w": 0.7659280303030304,
"calib/nonempty_final_conf_rate": 0.890625,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.0703125,
"calib/pce": 0.5557163742690058,
"calib/std_conf": 0.2304349605063494,
"calib/step_conf_rate": 0.0703125,
"calib/step_q_w": 0.7294444444444445,
"calib/step_q_w_n": 18.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 623.0,
"completions/max_terminated_length": 623.0,
"completions/mean_length": 69.4140625,
"completions/mean_terminated_length": 69.4140625,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.1184,
"grad_norm": 0.5920610427856445,
"kl": 0.51239013671875,
"learning_rate": 2.5e-06,
"loss": -0.0325,
"mask/has_final_conf_rate": 0.890625,
"mask/share_final_conf": 0.4412933588027954,
"mask/share_reasoning": 0.5496131777763367,
"mask/share_step_conf": 0.009093429893255234,
"num_tokens": 20931108.0,
"reward": 0.02031250111758709,
"reward_std": 0.01981809176504612,
"rewards/accuracy_reward_step": 0.203125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.4163287281990051,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.6815744638442993,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.953125,
"calib/avg_num_step_conf": 0.01953125,
"calib/ece": 0.5136514522821576,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.26141078838174275,
"calib/gap": 0.026646881477293993,
"calib/mean_conf": 0.7226970954356847,
"calib/mu_c": 0.7429310344827585,
"calib/mu_w": 0.7162841530054646,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.01953125,
"calib/pce": 0.49784232365145226,
"calib/std_conf": 0.2611397527943396,
"calib/step_conf_rate": 0.01953125,
"calib/step_q_w": 0.7539999999999999,
"calib/step_q_w_n": 5.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 673.0,
"completions/max_terminated_length": 673.0,
"completions/mean_length": 54.40625,
"completions/mean_terminated_length": 54.40625,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.7337045073509216,
"kl": 0.593536376953125,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.0399,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.49154359102249146,
"mask/share_reasoning": 0.5072588324546814,
"mask/share_step_conf": 0.0011975823435932398,
"num_tokens": 21052956.0,
"reward": 0.02265625074505806,
"reward_std": 0.02384321764111519,
"rewards/accuracy_reward_step": 0.2265625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5207710862159729,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7752877473831177,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.9296875,
"calib/avg_num_step_conf": 0.0234375,
"calib/ece": 0.6300747899159664,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.31092436974789917,
"calib/gap": -0.020954728682170742,
"calib/mean_conf": 0.7815873949579831,
"calib/mu_c": 0.7644186046511627,
"calib/mu_w": 0.7853733333333335,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.0234375,
"calib/pce": 0.6154949579831932,
"calib/std_conf": 0.21596721197403,
"calib/step_conf_rate": 0.0234375,
"calib/step_q_w": 0.8350000000000001,
"calib/step_q_w_n": 6.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 754.0,
"completions/max_terminated_length": 754.0,
"completions/mean_length": 83.03515625,
"completions/mean_terminated_length": 83.03515625,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.8056948781013489,
"kl": 0.4501953125,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.1778,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.4397127032279968,
"mask/share_reasoning": 0.5587553381919861,
"mask/share_step_conf": 0.0015319802332669497,
"num_tokens": 21179413.0,
"reward": 0.01718750223517418,
"reward_std": 0.0298269335180521,
"rewards/accuracy_reward_step": 0.171875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6232556104660034,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8265672922134399,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.91015625,
"calib/avg_num_step_conf": 0.05859375,
"calib/ece": 0.4028260869565218,
"calib/final_conf_rate": 0.8984375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.25217391304347825,
"calib/gap": 0.01284474071777375,
"calib/mean_conf": 0.760304347826087,
"calib/mu_c": 0.7677319587628866,
"calib/mu_w": 0.7548872180451128,
"calib/nonempty_final_conf_rate": 0.8984375,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.05859375,
"calib/pce": 0.37069565217391304,
"calib/std_conf": 0.24044052591615153,
"calib/step_conf_rate": 0.05859375,
"calib/step_q_w": 0.7380000000000001,
"calib/step_q_w_n": 15.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 806.0,
"completions/max_terminated_length": 806.0,
"completions/mean_length": 149.109375,
"completions/mean_terminated_length": 149.69412231445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1216,
"grad_norm": 0.7283843755722046,
"kl": 0.28704833984375,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.1192,
"mask/has_final_conf_rate": 0.89453125,
"mask/share_final_conf": 0.29339420795440674,
"mask/share_reasoning": 0.6928345561027527,
"mask/share_step_conf": 0.009865010157227516,
"num_tokens": 21322609.0,
"reward": 0.03828125074505806,
"reward_std": 0.03569255769252777,
"rewards/accuracy_reward_step": 0.3828125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6219078302383423,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8099403381347656,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.78515625,
"calib/avg_num_step_conf": 0.109375,
"calib/ece": 0.3879310344827587,
"calib/final_conf_rate": 0.79296875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3448275862068966,
"calib/gap": 0.09248554336989034,
"calib/mean_conf": 0.7869458128078818,
"calib/mu_c": 0.8407058823529411,
"calib/mu_w": 0.7482203389830507,
"calib/nonempty_final_conf_rate": 0.79296875,
"calib/nonempty_reasoning_rate": 0.89453125,
"calib/nonempty_step_conf_rate": 0.109375,
"calib/pce": 0.3780788177339902,
"calib/std_conf": 0.22538315929172625,
"calib/step_conf_rate": 0.109375,
"calib/step_q_w": 0.7196428571428573,
"calib/step_q_w_n": 28.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2805.0,
"completions/max_terminated_length": 2805.0,
"completions/mean_length": 243.6328125,
"completions/mean_terminated_length": 243.6328125,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.5352555513381958,
"kl": 0.211669921875,
"learning_rate": 2.388888888888889e-06,
"loss": -0.0547,
"mask/has_final_conf_rate": 0.79296875,
"mask/share_final_conf": 0.16653913259506226,
"mask/share_reasoning": 0.8282657861709595,
"mask/share_step_conf": 0.0051950933411717415,
"num_tokens": 21490243.0,
"reward": 0.03359375149011612,
"reward_std": 0.035612430423498154,
"rewards/accuracy_reward_step": 0.3359375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5753067135810852,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7754773497581482,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.55078125,
"calib/avg_num_step_conf": 0.34375,
"calib/ece": 0.38651851851851843,
"calib/final_conf_rate": 0.52734375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3037037037037037,
"calib/gap": 0.0821271831616659,
"calib/mean_conf": 0.7817777777777777,
"calib/mu_c": 0.8286206896551724,
"calib/mu_w": 0.7464935064935065,
"calib/nonempty_final_conf_rate": 0.52734375,
"calib/nonempty_reasoning_rate": 0.89453125,
"calib/nonempty_step_conf_rate": 0.34375,
"calib/pce": 0.36933333333333324,
"calib/std_conf": 0.23689618500333814,
"calib/step_conf_rate": 0.34375,
"calib/step_q_w": 0.7093181818181818,
"calib/step_q_w_n": 88.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1235.0,
"completions/max_terminated_length": 1235.0,
"completions/mean_length": 330.046875,
"completions/mean_terminated_length": 330.046875,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.3845723569393158,
"kl": 0.1770172119140625,
"learning_rate": 2.361111111111111e-06,
"loss": 0.0159,
"mask/has_final_conf_rate": 0.52734375,
"mask/share_final_conf": 0.04086475074291229,
"mask/share_reasoning": 0.9291352033615112,
"mask/share_step_conf": 0.030000029131770134,
"num_tokens": 21679255.0,
"reward": 0.02460937574505806,
"reward_std": 0.032943278551101685,
"rewards/accuracy_reward_step": 0.24609375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.4300439953804016,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7204158902168274,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.390625,
"calib/avg_num_step_conf": 0.50390625,
"calib/ece": 0.481078431372549,
"calib/final_conf_rate": 0.3984375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.37254901960784315,
"calib/gap": -0.00723667377398729,
"calib/mean_conf": 0.7830392156862743,
"calib/mu_c": 0.7782857142857142,
"calib/mu_w": 0.7855223880597015,
"calib/nonempty_final_conf_rate": 0.3984375,
"calib/nonempty_reasoning_rate": 0.89453125,
"calib/nonempty_step_conf_rate": 0.50390625,
"calib/pce": 0.46049019607843134,
"calib/std_conf": 0.2285966960446325,
"calib/step_conf_rate": 0.50390625,
"calib/step_q_w": 0.7446434108527131,
"calib/step_q_w_n": 129.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1321.0,
"completions/max_terminated_length": 1321.0,
"completions/mean_length": 312.45703125,
"completions/mean_terminated_length": 312.45703125,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.1248,
"grad_norm": 0.1619957536458969,
"kl": 0.179473876953125,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0044,
"mask/has_final_conf_rate": 0.3984375,
"mask/share_final_conf": 0.02772274985909462,
"mask/share_reasoning": 0.9462649822235107,
"mask/share_step_conf": 0.02601221576333046,
"num_tokens": 21865844.0,
"reward": 0.014062500558793545,
"reward_std": 0.024633172899484634,
"rewards/accuracy_reward_step": 0.140625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.3409615159034729,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.6401100158691406,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.19140625,
"calib/avg_num_step_conf": 0.65625,
"calib/ece": 0.3719607843137256,
"calib/final_conf_rate": 0.19921875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.45098039215686275,
"calib/gap": 0.04656923076923081,
"calib/mean_conf": 0.8429411764705881,
"calib/mu_c": 0.8657692307692308,
"calib/mu_w": 0.8192,
"calib/nonempty_final_conf_rate": 0.19921875,
"calib/nonempty_reasoning_rate": 0.84765625,
"calib/nonempty_step_conf_rate": 0.65625,
"calib/pce": 0.35254901960784324,
"calib/std_conf": 0.1939506114376732,
"calib/step_conf_rate": 0.65625,
"calib/step_q_w": 0.7661904761904762,
"calib/step_q_w_n": 168.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1865.0,
"completions/max_terminated_length": 1865.0,
"completions/mean_length": 320.09375,
"completions/mean_terminated_length": 320.09375,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.15597763657569885,
"kl": 0.179046630859375,
"learning_rate": 2.305555555555556e-06,
"loss": 0.0162,
"mask/has_final_conf_rate": 0.19921875,
"mask/share_final_conf": 0.015014585107564926,
"mask/share_reasoning": 0.9491573572158813,
"mask/share_step_conf": 0.035828039050102234,
"num_tokens": 22051796.0,
"reward": 0.01015624962747097,
"reward_std": 0.019530273973941803,
"rewards/accuracy_reward_step": 0.1015625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.3127126395702362,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.6183814406394958,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.1875,
"calib/avg_num_step_conf": 0.66015625,
"calib/ece": 0.25979591836734695,
"calib/final_conf_rate": 0.19140625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3469387755102041,
"calib/gap": 0.15809364548494986,
"calib/mean_conf": 0.790408163265306,
"calib/mu_c": 0.8646153846153847,
"calib/mu_w": 0.7065217391304348,
"calib/nonempty_final_conf_rate": 0.19140625,
"calib/nonempty_reasoning_rate": 0.84765625,
"calib/nonempty_step_conf_rate": 0.66015625,
"calib/pce": 0.25979591836734695,
"calib/std_conf": 0.2042203610116495,
"calib/step_conf_rate": 0.66015625,
"calib/step_q_w": 0.7396449704142012,
"calib/step_q_w_n": 169.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2705.0,
"completions/max_terminated_length": 2705.0,
"completions/mean_length": 371.734375,
"completions/mean_terminated_length": 371.734375,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.14461444318294525,
"kl": 0.1634979248046875,
"learning_rate": 2.277777777777778e-06,
"loss": 0.0027,
"mask/has_final_conf_rate": 0.19140625,
"mask/share_final_conf": 0.007947621867060661,
"mask/share_reasoning": 0.9608822464942932,
"mask/share_step_conf": 0.03117012232542038,
"num_tokens": 22252024.0,
"reward": 0.010546875186264515,
"reward_std": 0.017912933602929115,
"rewards/accuracy_reward_step": 0.10546875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.24497468769550323,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.5481371879577637,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.1015625,
"calib/avg_num_step_conf": 0.75390625,
"calib/ece": 0.3674074074074074,
"calib/final_conf_rate": 0.10546875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2222222222222222,
"calib/gap": -0.0340588235294117,
"calib/mean_conf": 0.6755555555555555,
"calib/mu_c": 0.6629411764705884,
"calib/mu_w": 0.6970000000000001,
"calib/nonempty_final_conf_rate": 0.10546875,
"calib/nonempty_reasoning_rate": 0.85546875,
"calib/nonempty_step_conf_rate": 0.75390625,
"calib/pce": 0.20666666666666667,
"calib/std_conf": 0.2829518128499299,
"calib/step_conf_rate": 0.75390625,
"calib/step_q_w": 0.7301554404145078,
"calib/step_q_w_n": 193.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2377.0,
"completions/max_terminated_length": 2377.0,
"completions/mean_length": 304.0,
"completions/mean_terminated_length": 304.0,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.128,
"grad_norm": 0.1572761982679367,
"kl": 0.170440673828125,
"learning_rate": 2.25e-06,
"loss": 0.0256,
"mask/has_final_conf_rate": 0.10546875,
"mask/share_final_conf": 0.0060433279722929,
"mask/share_reasoning": 0.9575179815292358,
"mask/share_step_conf": 0.03643868863582611,
"num_tokens": 22436536.0,
"reward": 0.006640625186264515,
"reward_std": 0.014032842591404915,
"rewards/accuracy_reward_step": 0.06640625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.18342046439647675,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.495700865983963,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0859375,
"calib/avg_num_step_conf": 0.796875,
"calib/ece": 0.5049999999999999,
"calib/final_conf_rate": 0.0859375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.36363636363636365,
"calib/gap": -0.08766666666666678,
"calib/mean_conf": 0.7968181818181818,
"calib/mu_c": 0.749,
"calib/mu_w": 0.8366666666666668,
"calib/nonempty_final_conf_rate": 0.0859375,
"calib/nonempty_reasoning_rate": 0.8828125,
"calib/nonempty_step_conf_rate": 0.796875,
"calib/pce": 0.4236363636363636,
"calib/std_conf": 0.20567817676334602,
"calib/step_conf_rate": 0.796875,
"calib/step_q_w": 0.7502941176470589,
"calib/step_q_w_n": 204.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2400.0,
"completions/max_terminated_length": 2400.0,
"completions/mean_length": 355.26171875,
"completions/mean_terminated_length": 356.6549377441406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.10364165157079697,
"kl": 0.1559600830078125,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0065,
"mask/has_final_conf_rate": 0.0859375,
"mask/share_final_conf": 0.0035274566616863012,
"mask/share_reasoning": 0.9567550420761108,
"mask/share_step_conf": 0.035811252892017365,
"num_tokens": 22632539.0,
"reward": 0.004687500186264515,
"reward_std": 0.010509217157959938,
"rewards/accuracy_reward_step": 0.046875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.18548457324504852,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.4957216680049896,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.09375,
"calib/avg_num_step_conf": 0.78515625,
"calib/ece": 0.3540909090909091,
"calib/final_conf_rate": 0.0859375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.22727272727272727,
"calib/gap": 0.030000000000000027,
"calib/mean_conf": 0.7913636363636364,
"calib/mu_c": 0.8063636363636363,
"calib/mu_w": 0.7763636363636363,
"calib/nonempty_final_conf_rate": 0.0859375,
"calib/nonempty_reasoning_rate": 0.87890625,
"calib/nonempty_step_conf_rate": 0.78515625,
"calib/pce": 0.32272727272727275,
"calib/std_conf": 0.21880450914718483,
"calib/step_conf_rate": 0.78515625,
"calib/step_q_w": 0.739004975124378,
"calib/step_q_w_n": 201.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3031.0,
"completions/max_terminated_length": 3031.0,
"completions/mean_length": 330.5859375,
"completions/mean_terminated_length": 330.5859375,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.11776480823755264,
"kl": 0.1688232421875,
"learning_rate": 2.1944444444444445e-06,
"loss": 0.0041,
"mask/has_final_conf_rate": 0.0859375,
"mask/share_final_conf": 0.00397895835340023,
"mask/share_reasoning": 0.9588773250579834,
"mask/share_step_conf": 0.037143707275390625,
"num_tokens": 22824513.0,
"reward": 0.0042968750931322575,
"reward_std": 0.01062716729938984,
"rewards/accuracy_reward_step": 0.04296875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.1469237357378006,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.43720394372940063,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.08984375,
"calib/avg_num_step_conf": 0.7578125,
"calib/ece": 0.3704761904761904,
"calib/final_conf_rate": 0.08203125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.23809523809523808,
"calib/gap": 0.015833333333333366,
"calib/mean_conf": 0.7876190476190476,
"calib/mu_c": 0.7966666666666666,
"calib/mu_w": 0.7808333333333333,
"calib/nonempty_final_conf_rate": 0.08203125,
"calib/nonempty_reasoning_rate": 0.84765625,
"calib/nonempty_step_conf_rate": 0.7578125,
"calib/pce": 0.36476190476190473,
"calib/std_conf": 0.15556057653056427,
"calib/step_conf_rate": 0.7578125,
"calib/step_q_w": 0.7216494845360825,
"calib/step_q_w_n": 194.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1675.0,
"completions/max_terminated_length": 1675.0,
"completions/mean_length": 355.66015625,
"completions/mean_terminated_length": 355.66015625,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.1312,
"grad_norm": 0.09586463868618011,
"kl": 0.1557159423828125,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0006,
"mask/has_final_conf_rate": 0.08203125,
"mask/share_final_conf": 0.002528803190216422,
"mask/share_reasoning": 0.9620292782783508,
"mask/share_step_conf": 0.035441912710666656,
"num_tokens": 23020850.0,
"reward": 0.0035156249068677425,
"reward_std": 0.008417459204792976,
"rewards/accuracy_reward_step": 0.03515625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.17352376878261566,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.49564820528030396,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0625,
"calib/avg_num_step_conf": 0.7890625,
"calib/ece": 0.33187500000000003,
"calib/final_conf_rate": 0.0625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.4375,
"calib/gap": 0.04079365079365094,
"calib/mean_conf": 0.8943749999999999,
"calib/mu_c": 0.9122222222222223,
"calib/mu_w": 0.8714285714285713,
"calib/nonempty_final_conf_rate": 0.0625,
"calib/nonempty_reasoning_rate": 0.8515625,
"calib/nonempty_step_conf_rate": 0.7890625,
"calib/pce": 0.33187500000000003,
"calib/std_conf": 0.08276236690066324,
"calib/step_conf_rate": 0.7890625,
"calib/step_q_w": 0.7372772277227722,
"calib/step_q_w_n": 202.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1524.0,
"completions/max_terminated_length": 1524.0,
"completions/mean_length": 320.1875,
"completions/mean_terminated_length": 320.1875,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.1139259859919548,
"kl": 0.1735076904296875,
"learning_rate": 2.138888888888889e-06,
"loss": -0.0024,
"mask/has_final_conf_rate": 0.0625,
"mask/share_final_conf": 0.0033107856288552284,
"mask/share_reasoning": 0.9576960206031799,
"mask/share_step_conf": 0.03899314999580383,
"num_tokens": 23209634.0,
"reward": 0.003515625139698386,
"reward_std": 0.009943688288331032,
"rewards/accuracy_reward_step": 0.03515625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.04296875,
"calib/avg_num_step_conf": 0.79296875,
"calib/ece": 0.6881818181818182,
"calib/final_conf_rate": 0.04296875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.45454545454545453,
"calib/gap": -0.07000000000000006,
"calib/mean_conf": 0.860909090909091,
"calib/mu_c": 0.8099999999999999,
"calib/mu_w": 0.88,
"calib/nonempty_final_conf_rate": 0.04296875,
"calib/nonempty_reasoning_rate": 0.8359375,
"calib/nonempty_step_conf_rate": 0.79296875,
"calib/pce": 0.6381818181818182,
"calib/std_conf": 0.10457311886944293,
"calib/step_conf_rate": 0.79296875,
"calib/step_q_w": 0.7353201970443349,
"calib/step_q_w_n": 203.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1139.0,
"completions/max_terminated_length": 1139.0,
"completions/mean_length": 332.6484375,
"completions/mean_terminated_length": 333.9529724121094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.07615622878074646,
"kl": 0.156707763671875,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.003,
"mask/has_final_conf_rate": 0.04296875,
"mask/share_final_conf": 0.0015664431266486645,
"mask/share_reasoning": 0.9517641067504883,
"mask/share_step_conf": 0.04276318848133087,
"num_tokens": 23399600.0,
"reward": 0.0011718750465661287,
"reward_std": 0.003314562840387225,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.044541239738464355,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.23372872173786163,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0546875,
"calib/avg_num_step_conf": 0.73828125,
"calib/ece": 0.6771428571428572,
"calib/final_conf_rate": 0.0546875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2857142857142857,
"calib/gap": -0.16606060606060602,
"calib/mean_conf": 0.8371428571428571,
"calib/mu_c": 0.7066666666666667,
"calib/mu_w": 0.8727272727272727,
"calib/nonempty_final_conf_rate": 0.0546875,
"calib/nonempty_reasoning_rate": 0.79296875,
"calib/nonempty_step_conf_rate": 0.73828125,
"calib/pce": 0.65,
"calib/std_conf": 0.10470561258176396,
"calib/step_conf_rate": 0.73828125,
"calib/step_q_w": 0.737037037037037,
"calib/step_q_w_n": 189.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1928.0,
"completions/max_terminated_length": 1928.0,
"completions/mean_length": 332.74609375,
"completions/mean_terminated_length": 332.74609375,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.1344,
"grad_norm": 0.05615081265568733,
"kl": 0.1518402099609375,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0113,
"mask/has_final_conf_rate": 0.0546875,
"mask/share_final_conf": 0.002635817974805832,
"mask/share_reasoning": 0.9557403922080994,
"mask/share_step_conf": 0.041623782366514206,
"num_tokens": 23590247.0,
"reward": 0.0011718750465661287,
"reward_std": 0.002551448065787554,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.07712167501449585,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.33043214678764343,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0390625,
"calib/avg_num_step_conf": 0.84375,
"calib/ece": 0.43666666666666665,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2222222222222222,
"calib/gap": 0.12150000000000005,
"calib/mean_conf": 0.7100000000000001,
"calib/mu_c": 0.7775,
"calib/mu_w": 0.6559999999999999,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.8828125,
"calib/nonempty_step_conf_rate": 0.84375,
"calib/pce": 0.35111111111111104,
"calib/std_conf": 0.21974732965132682,
"calib/step_conf_rate": 0.84375,
"calib/step_q_w": 0.7301388888888888,
"calib/step_q_w_n": 216.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2504.0,
"completions/max_terminated_length": 2504.0,
"completions/mean_length": 303.4453125,
"completions/mean_terminated_length": 303.4453125,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.07867080718278885,
"kl": 0.1750335693359375,
"learning_rate": 2.0555555555555555e-06,
"loss": 0.0151,
"mask/has_final_conf_rate": 0.03515625,
"mask/share_final_conf": 0.00164020957890898,
"mask/share_reasoning": 0.9506195783615112,
"mask/share_step_conf": 0.04774019867181778,
"num_tokens": 23771601.0,
"reward": 0.0015625000232830644,
"reward_std": 0.0044194171205163,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.06382165849208832,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.2862262725830078,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.05078125,
"calib/avg_num_step_conf": 0.734375,
"calib/ece": 0.6200000000000001,
"calib/final_conf_rate": 0.0546875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5714285714285714,
"calib/gap": 0.02949999999999997,
"calib/mean_conf": 0.8314285714285715,
"calib/mu_c": 0.8525,
"calib/mu_w": 0.8230000000000001,
"calib/nonempty_final_conf_rate": 0.0546875,
"calib/nonempty_reasoning_rate": 0.78515625,
"calib/nonempty_step_conf_rate": 0.734375,
"calib/pce": 0.582857142857143,
"calib/std_conf": 0.21390442267708334,
"calib/step_conf_rate": 0.734375,
"calib/step_q_w": 0.7459042553191491,
"calib/step_q_w_n": 188.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2892.0,
"completions/max_terminated_length": 2892.0,
"completions/mean_length": 342.48046875,
"completions/mean_terminated_length": 343.82354736328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.06135067716240883,
"kl": 0.1589813232421875,
"learning_rate": 2.027777777777778e-06,
"loss": 0.0225,
"mask/has_final_conf_rate": 0.0546875,
"mask/share_final_conf": 0.001771321753039956,
"mask/share_reasoning": 0.9519810676574707,
"mask/share_step_conf": 0.04234137386083603,
"num_tokens": 23965940.0,
"reward": 0.0015625000232830644,
"reward_std": 0.003656302345916629,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.13063138723373413,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.4048003852367401,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.04296875,
"calib/avg_num_step_conf": 0.8203125,
"calib/ece": 0.15909090909090903,
"calib/final_conf_rate": 0.04296875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.36363636363636365,
"calib/gap": -0.01666666666666672,
"calib/mean_conf": 0.8663636363636363,
"calib/mu_c": 0.8633333333333333,
"calib/mu_w": 0.88,
"calib/nonempty_final_conf_rate": 0.04296875,
"calib/nonempty_reasoning_rate": 0.86328125,
"calib/nonempty_step_conf_rate": 0.8203125,
"calib/pce": 0.10363636363636357,
"calib/std_conf": 0.09334907383995227,
"calib/step_conf_rate": 0.8203125,
"calib/step_q_w": 0.743,
"calib/step_q_w_n": 210.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2653.0,
"completions/max_terminated_length": 2653.0,
"completions/mean_length": 301.328125,
"completions/mean_terminated_length": 301.328125,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.1376,
"grad_norm": 0.24197635054588318,
"kl": 0.2081451416015625,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0295,
"mask/has_final_conf_rate": 0.04296875,
"mask/share_final_conf": 0.002114505972713232,
"mask/share_reasoning": 0.9501706957817078,
"mask/share_step_conf": 0.04771481826901436,
"num_tokens": 24145464.0,
"reward": 0.0035156249068677425,
"reward_std": 0.007483351975679398,
"rewards/accuracy_reward_step": 0.03515625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.20476499199867249,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.5225287675857544,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.078125,
"calib/avg_num_step_conf": 0.76171875,
"calib/ece": 0.29700000000000004,
"calib/final_conf_rate": 0.078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.25,
"calib/gap": -0.053333333333333344,
"calib/mean_conf": 0.8229999999999998,
"calib/mu_c": 0.8016666666666666,
"calib/mu_w": 0.855,
"calib/nonempty_final_conf_rate": 0.078125,
"calib/nonempty_reasoning_rate": 0.83984375,
"calib/nonempty_step_conf_rate": 0.76171875,
"calib/pce": 0.26000000000000006,
"calib/std_conf": 0.13550276749941306,
"calib/step_conf_rate": 0.76171875,
"calib/step_q_w": 0.7591794871794871,
"calib/step_q_w_n": 195.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1085.0,
"completions/max_terminated_length": 1085.0,
"completions/mean_length": 310.625,
"completions/mean_terminated_length": 310.625,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.1571102887392044,
"kl": 0.165740966796875,
"learning_rate": 1.9722222222222224e-06,
"loss": -0.0206,
"mask/has_final_conf_rate": 0.078125,
"mask/share_final_conf": 0.0029551468323916197,
"mask/share_reasoning": 0.9555857181549072,
"mask/share_step_conf": 0.041459180414676666,
"num_tokens": 24330272.0,
"reward": 0.004687500186264515,
"reward_std": 0.01173202134668827,
"rewards/accuracy_reward_step": 0.046875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.13959984481334686,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.4048607349395752,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0703125,
"calib/avg_num_step_conf": 0.8125,
"calib/ece": 0.31166666666666665,
"calib/final_conf_rate": 0.0703125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2777777777777778,
"calib/gap": 0.027142857142857135,
"calib/mean_conf": 0.7994444444444444,
"calib/mu_c": 0.81,
"calib/mu_w": 0.7828571428571429,
"calib/nonempty_final_conf_rate": 0.0703125,
"calib/nonempty_reasoning_rate": 0.8828125,
"calib/nonempty_step_conf_rate": 0.8125,
"calib/pce": 0.25,
"calib/std_conf": 0.2205919365460483,
"calib/step_conf_rate": 0.8125,
"calib/step_q_w": 0.7614903846153847,
"calib/step_q_w_n": 208.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1010.0,
"completions/max_terminated_length": 1010.0,
"completions/mean_length": 267.1875,
"completions/mean_terminated_length": 267.1875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.11208275705575943,
"kl": 0.180572509765625,
"learning_rate": 1.944444444444445e-06,
"loss": 0.0063,
"mask/has_final_conf_rate": 0.0703125,
"mask/share_final_conf": 0.0038356492295861244,
"mask/share_reasoning": 0.9527290463447571,
"mask/share_step_conf": 0.04343531280755997,
"num_tokens": 24504880.0,
"reward": 0.0042968750931322575,
"reward_std": 0.007995839230716228,
"rewards/accuracy_reward_step": 0.04296875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.16620416939258575,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.46737951040267944,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.078125,
"calib/avg_num_step_conf": 0.7578125,
"calib/ece": 0.3385,
"calib/final_conf_rate": 0.078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.45,
"calib/gap": 0.14500000000000002,
"calib/mean_conf": 0.8384999999999998,
"calib/mu_c": 0.9109999999999999,
"calib/mu_w": 0.7659999999999999,
"calib/nonempty_final_conf_rate": 0.078125,
"calib/nonempty_reasoning_rate": 0.8359375,
"calib/nonempty_step_conf_rate": 0.7578125,
"calib/pce": 0.3385,
"calib/std_conf": 0.19799684340918167,
"calib/step_conf_rate": 0.7578125,
"calib/step_q_w": 0.7511340206185567,
"calib/step_q_w_n": 194.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1256.0,
"completions/max_terminated_length": 1256.0,
"completions/mean_length": 329.43359375,
"completions/mean_terminated_length": 329.43359375,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.1408,
"grad_norm": 0.1098121926188469,
"kl": 0.14825439453125,
"learning_rate": 1.916666666666667e-06,
"loss": 0.0104,
"mask/has_final_conf_rate": 0.078125,
"mask/share_final_conf": 0.002711489563807845,
"mask/share_reasoning": 0.948211669921875,
"mask/share_step_conf": 0.049076832830905914,
"num_tokens": 24694807.0,
"reward": 0.00390625,
"reward_std": 0.009522313252091408,
"rewards/accuracy_reward_step": 0.0390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.09640209376811981,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.3694343566894531,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.125,
"calib/avg_num_step_conf": 0.69140625,
"calib/ece": 0.7000000000000001,
"calib/final_conf_rate": 0.125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.375,
"calib/gap": 0.040000000000000036,
"calib/mean_conf": 0.85625,
"calib/mu_c": 0.89,
"calib/mu_w": 0.85,
"calib/nonempty_final_conf_rate": 0.125,
"calib/nonempty_reasoning_rate": 0.81640625,
"calib/nonempty_step_conf_rate": 0.69140625,
"calib/pce": 0.7000000000000001,
"calib/std_conf": 0.10400570897792102,
"calib/step_conf_rate": 0.69140625,
"calib/step_q_w": 0.743728813559322,
"calib/step_q_w_n": 177.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3002.0,
"completions/max_terminated_length": 3002.0,
"completions/mean_length": 392.83203125,
"completions/mean_terminated_length": 392.83203125,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.07171526551246643,
"kl": 0.135406494140625,
"learning_rate": 1.888888888888889e-06,
"loss": 0.0111,
"mask/has_final_conf_rate": 0.125,
"mask/share_final_conf": 0.004062256310135126,
"mask/share_reasoning": 0.9640213251113892,
"mask/share_step_conf": 0.03191642463207245,
"num_tokens": 24901716.0,
"reward": 0.001953125,
"reward_std": 0.005524271167814732,
"rewards/accuracy_reward_step": 0.01953125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.1396041214466095,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.4048749804496765,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0859375,
"calib/avg_num_step_conf": 0.671875,
"calib/ece": 0.3680952380952381,
"calib/final_conf_rate": 0.08203125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2857142857142857,
"calib/gap": 0.06981818181818189,
"calib/mean_conf": 0.7614285714285715,
"calib/mu_c": 0.798,
"calib/mu_w": 0.7281818181818182,
"calib/nonempty_final_conf_rate": 0.08203125,
"calib/nonempty_reasoning_rate": 0.7578125,
"calib/nonempty_step_conf_rate": 0.671875,
"calib/pce": 0.32666666666666666,
"calib/std_conf": 0.26380522602653567,
"calib/step_conf_rate": 0.671875,
"calib/step_q_w": 0.7805232558139535,
"calib/step_q_w_n": 172.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1758.0,
"completions/max_terminated_length": 1758.0,
"completions/mean_length": 369.51171875,
"completions/mean_terminated_length": 369.51171875,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.09373670071363449,
"kl": 0.136810302734375,
"learning_rate": 1.8611111111111113e-06,
"loss": 0.0003,
"mask/has_final_conf_rate": 0.08203125,
"mask/share_final_conf": 0.00392060587182641,
"mask/share_reasoning": 0.9603133201599121,
"mask/share_step_conf": 0.035766102373600006,
"num_tokens": 25105263.0,
"reward": 0.00390625,
"reward_std": 0.007996084168553352,
"rewards/accuracy_reward_step": 0.0390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.21806500852108002,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.5479929447174072,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.08984375,
"calib/avg_num_step_conf": 0.75390625,
"calib/ece": 0.3695238095238095,
"calib/final_conf_rate": 0.08203125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": -0.019722222222222308,
"calib/mean_conf": 0.8342857142857143,
"calib/mu_c": 0.8258333333333333,
"calib/mu_w": 0.8455555555555556,
"calib/nonempty_final_conf_rate": 0.08203125,
"calib/nonempty_reasoning_rate": 0.84375,
"calib/nonempty_step_conf_rate": 0.75390625,
"calib/pce": 0.31619047619047613,
"calib/std_conf": 0.11692779328943745,
"calib/step_conf_rate": 0.75390625,
"calib/step_q_w": 0.7612435233160623,
"calib/step_q_w_n": 193.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2106.0,
"completions/max_terminated_length": 2106.0,
"completions/mean_length": 349.265625,
"completions/mean_terminated_length": 352.0157470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.144,
"grad_norm": 0.12835043668746948,
"kl": 0.135101318359375,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.0023,
"mask/has_final_conf_rate": 0.08203125,
"mask/share_final_conf": 0.0038112555630505085,
"mask/share_reasoning": 0.9503839015960693,
"mask/share_step_conf": 0.03799235075712204,
"num_tokens": 25300555.0,
"reward": 0.004687500186264515,
"reward_std": 0.012495135888457298,
"rewards/accuracy_reward_step": 0.046875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.019303197041153908,
"adv/mean_abs_reasoning": 0.19146069884300232,
"adv/mean_abs_step_conf": 0.019317565485835075,
"adv/ratio_final_to_reasoning": 0.10082067577212031,
"adv/ratio_step_to_reasoning": 0.10089572221647154,
"adv/std_final_conf": 0.16541126370429993,
"adv/std_reasoning": 0.49574676156044006,
"adv/std_step_conf": 0.1655343919992447,
"calib/answer_extract_rate": 0.12890625,
"calib/avg_num_step_conf": 0.72265625,
"calib/ece": 0.4067741935483871,
"calib/final_conf_rate": 0.12109375,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.2903225806451613,
"calib/gap": 0.08273109243697474,
"calib/mean_conf": 0.8067741935483872,
"calib/mu_c": 0.8521428571428571,
"calib/mu_w": 0.7694117647058824,
"calib/nonempty_final_conf_rate": 0.12109375,
"calib/nonempty_reasoning_rate": 0.84765625,
"calib/nonempty_step_conf_rate": 0.72265625,
"calib/pce": 0.3809677419354839,
"calib/std_conf": 0.19141893081919178,
"calib/step_conf_rate": 0.72265625,
"calib/step_q_w": 0.7596216216216218,
"calib/step_q_w_n": 185.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 990.0,
"completions/max_terminated_length": 990.0,
"completions/mean_length": 302.9140625,
"completions/mean_terminated_length": 304.10198974609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.7878153324127197,
"kl": 0.1512451171875,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.0754,
"mask/has_final_conf_rate": 0.1171875,
"mask/share_final_conf": 0.00546366348862648,
"mask/share_reasoning": 0.9488362073898315,
"mask/share_step_conf": 0.04179389774799347,
"num_tokens": 25486589.0,
"reward": 0.005303375422954559,
"reward_std": 0.01033155806362629,
"rewards/accuracy_reward_step": 0.0546875,
"rewards/final_brier_reward_step": 0.001343359355814755,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.002455358626320958,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.33405715227127075,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.6400879621505737,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.13671875,
"calib/avg_num_step_conf": 0.703125,
"calib/ece": 0.26722222222222214,
"calib/final_conf_rate": 0.140625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.4166666666666667,
"calib/gap": 0.051839464882943,
"calib/mean_conf": 0.8338888888888889,
"calib/mu_c": 0.8526086956521738,
"calib/mu_w": 0.8007692307692308,
"calib/nonempty_final_conf_rate": 0.140625,
"calib/nonempty_reasoning_rate": 0.83984375,
"calib/nonempty_step_conf_rate": 0.703125,
"calib/pce": 0.23111111111111104,
"calib/std_conf": 0.19532420026682956,
"calib/step_conf_rate": 0.703125,
"calib/step_q_w": 0.7707222222222223,
"calib/step_q_w_n": 180.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2962.0,
"completions/max_terminated_length": 2962.0,
"completions/mean_length": 346.734375,
"completions/mean_terminated_length": 346.734375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.1472981870174408,
"kl": 0.14693450927734375,
"learning_rate": 1.777777777777778e-06,
"loss": 0.0257,
"mask/has_final_conf_rate": 0.140625,
"mask/share_final_conf": 0.007967180572450161,
"mask/share_reasoning": 0.9528160095214844,
"mask/share_step_conf": 0.03921680524945259,
"num_tokens": 25682337.0,
"reward": 0.008984374813735485,
"reward_std": 0.019135739654302597,
"rewards/accuracy_reward_step": 0.08984375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.3383844494819641,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.6609932780265808,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.125,
"calib/avg_num_step_conf": 0.75,
"calib/ece": 0.1984375000000001,
"calib/final_conf_rate": 0.125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.28125,
"calib/gap": -0.046454545454545526,
"calib/mean_conf": 0.8340625,
"calib/mu_c": 0.8195454545454545,
"calib/mu_w": 0.866,
"calib/nonempty_final_conf_rate": 0.125,
"calib/nonempty_reasoning_rate": 0.875,
"calib/nonempty_step_conf_rate": 0.75,
"calib/pce": 0.17250000000000004,
"calib/std_conf": 0.09512686841134843,
"calib/step_conf_rate": 0.75,
"calib/step_q_w": 0.7651562500000001,
"calib/step_q_w_n": 192.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2421.0,
"completions/max_terminated_length": 2421.0,
"completions/mean_length": 314.4140625,
"completions/mean_terminated_length": 314.4140625,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.1472,
"grad_norm": 0.1709873080253601,
"kl": 0.15203857421875,
"learning_rate": 1.75e-06,
"loss": 0.0176,
"mask/has_final_conf_rate": 0.125,
"mask/share_final_conf": 0.006228435784578323,
"mask/share_reasoning": 0.9524450898170471,
"mask/share_step_conf": 0.041326455771923065,
"num_tokens": 25867163.0,
"reward": 0.008593750186264515,
"reward_std": 0.01938612200319767,
"rewards/accuracy_reward_step": 0.0859375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.27115947008132935,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.572519838809967,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.12109375,
"calib/avg_num_step_conf": 0.81640625,
"calib/ece": 0.21266666666666662,
"calib/final_conf_rate": 0.1171875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3,
"calib/gap": 0.05301587301587296,
"calib/mean_conf": 0.8393333333333332,
"calib/mu_c": 0.8552380952380951,
"calib/mu_w": 0.8022222222222222,
"calib/nonempty_final_conf_rate": 0.1171875,
"calib/nonempty_reasoning_rate": 0.9375,
"calib/nonempty_step_conf_rate": 0.81640625,
"calib/pce": 0.176,
"calib/std_conf": 0.11093341346150952,
"calib/step_conf_rate": 0.81640625,
"calib/step_q_w": 0.7753574162679426,
"calib/step_q_w_n": 209.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1039.0,
"completions/max_terminated_length": 1039.0,
"completions/mean_length": 292.64453125,
"completions/mean_terminated_length": 292.64453125,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.13704362511634827,
"kl": 0.1703948974609375,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.0029,
"mask/has_final_conf_rate": 0.1171875,
"mask/share_final_conf": 0.00667218491435051,
"mask/share_reasoning": 0.9529383182525635,
"mask/share_step_conf": 0.040389493107795715,
"num_tokens": 26045176.0,
"reward": 0.008203125558793545,
"reward_std": 0.015532232820987701,
"rewards/accuracy_reward_step": 0.08203125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.43746131658554077,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7013096809387207,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.2265625,
"calib/avg_num_step_conf": 0.671875,
"calib/ece": 0.14245614035087711,
"calib/final_conf_rate": 0.22265625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2982456140350877,
"calib/gap": 0.0723323170731709,
"calib/mean_conf": 0.8414035087719298,
"calib/mu_c": 0.8617073170731708,
"calib/mu_w": 0.7893749999999999,
"calib/nonempty_final_conf_rate": 0.22265625,
"calib/nonempty_reasoning_rate": 0.8984375,
"calib/nonempty_step_conf_rate": 0.671875,
"calib/pce": 0.13228070175438586,
"calib/std_conf": 0.15132674763270382,
"calib/step_conf_rate": 0.671875,
"calib/step_q_w": 0.7836627906976744,
"calib/step_q_w_n": 172.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1428.0,
"completions/max_terminated_length": 1428.0,
"completions/mean_length": 321.37109375,
"completions/mean_terminated_length": 321.37109375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.3020021319389343,
"kl": 0.177581787109375,
"learning_rate": 1.6944444444444446e-06,
"loss": 0.0181,
"mask/has_final_conf_rate": 0.22265625,
"mask/share_final_conf": 0.012400241568684578,
"mask/share_reasoning": 0.951915979385376,
"mask/share_step_conf": 0.03568378463387489,
"num_tokens": 26232463.0,
"reward": 0.01640624925494194,
"reward_std": 0.02505391091108322,
"rewards/accuracy_reward_step": 0.1640625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5413906574249268,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8096631765365601,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.3046875,
"calib/avg_num_step_conf": 0.55078125,
"calib/ece": 0.31581081081081086,
"calib/final_conf_rate": 0.2890625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.40540540540540543,
"calib/gap": 0.016208425720620867,
"calib/mean_conf": 0.8471621621621621,
"calib/mu_c": 0.8543902439024389,
"calib/mu_w": 0.838181818181818,
"calib/nonempty_final_conf_rate": 0.2890625,
"calib/nonempty_reasoning_rate": 0.85546875,
"calib/nonempty_step_conf_rate": 0.55078125,
"calib/pce": 0.3044594594594595,
"calib/std_conf": 0.16851914856526806,
"calib/step_conf_rate": 0.55078125,
"calib/step_q_w": 0.7974468085106383,
"calib/step_q_w_n": 141.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1516.0,
"completions/max_terminated_length": 1516.0,
"completions/mean_length": 335.48046875,
"completions/mean_terminated_length": 336.7961120605469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.1504,
"grad_norm": 0.17797711491584778,
"kl": 0.150115966796875,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.0117,
"mask/has_final_conf_rate": 0.2890625,
"mask/share_final_conf": 0.01352442055940628,
"mask/share_reasoning": 0.9501509666442871,
"mask/share_step_conf": 0.032418347895145416,
"num_tokens": 26425442.0,
"reward": 0.01679687574505806,
"reward_std": 0.031011424958705902,
"rewards/accuracy_reward_step": 0.16796875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.01929006353020668,
"adv/mean_abs_reasoning": 0.5003547668457031,
"adv/mean_abs_step_conf": 0.019317567348480225,
"adv/ratio_final_to_reasoning": 0.038552772569378266,
"adv/ratio_step_to_reasoning": 0.03860774120382724,
"adv/std_final_conf": 0.1652987152338028,
"adv/std_reasoning": 0.7574677467346191,
"adv/std_step_conf": 0.16553440690040588,
"calib/answer_extract_rate": 0.296875,
"calib/avg_num_step_conf": 0.578125,
"calib/ece": 0.2581818181818183,
"calib/final_conf_rate": 0.30078125,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.3246753246753247,
"calib/gap": 0.019607293127629544,
"calib/mean_conf": 0.8555844155844157,
"calib/mu_c": 0.8634782608695651,
"calib/mu_w": 0.8438709677419356,
"calib/nonempty_final_conf_rate": 0.30078125,
"calib/nonempty_reasoning_rate": 0.87109375,
"calib/nonempty_step_conf_rate": 0.578125,
"calib/pce": 0.2581818181818183,
"calib/std_conf": 0.12876530050548884,
"calib/step_conf_rate": 0.578125,
"calib/step_q_w": 0.7908108108108107,
"calib/step_q_w_n": 148.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2038.0,
"completions/max_terminated_length": 2038.0,
"completions/mean_length": 369.17578125,
"completions/mean_terminated_length": 370.6235656738281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.26313072443008423,
"kl": 0.1321258544921875,
"learning_rate": 1.638888888888889e-06,
"loss": 0.009,
"mask/has_final_conf_rate": 0.30078125,
"mask/share_final_conf": 0.013783589005470276,
"mask/share_reasoning": 0.9493716955184937,
"mask/share_step_conf": 0.03293842077255249,
"num_tokens": 26625111.0,
"reward": 0.017606385052204132,
"reward_std": 0.028986668214201927,
"rewards/accuracy_reward_step": 0.1796875,
"rewards/final_brier_reward_step": 0.000949609384406358,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.002455588662996888,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.4870590269565582,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7392624616622925,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.390625,
"calib/avg_num_step_conf": 0.4765625,
"calib/ece": 0.39294736842105266,
"calib/final_conf_rate": 0.37109375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3473684210526316,
"calib/gap": -0.003908199643493981,
"calib/mean_conf": 0.8350526315789472,
"calib/mu_c": 0.8329545454545453,
"calib/mu_w": 0.8368627450980393,
"calib/nonempty_final_conf_rate": 0.37109375,
"calib/nonempty_reasoning_rate": 0.8671875,
"calib/nonempty_step_conf_rate": 0.4765625,
"calib/pce": 0.382421052631579,
"calib/std_conf": 0.16809066278091542,
"calib/step_conf_rate": 0.4765625,
"calib/step_q_w": 0.7695081967213114,
"calib/step_q_w_n": 122.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2310.0,
"completions/max_terminated_length": 2310.0,
"completions/mean_length": 355.2265625,
"completions/mean_terminated_length": 356.61962890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.14126311242580414,
"kl": 0.14892578125,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0178,
"mask/has_final_conf_rate": 0.37109375,
"mask/share_final_conf": 0.02051900513470173,
"mask/share_reasoning": 0.951433002948761,
"mask/share_step_conf": 0.024141736328601837,
"num_tokens": 26823385.0,
"reward": 0.01796875149011612,
"reward_std": 0.02789430133998394,
"rewards/accuracy_reward_step": 0.1796875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5897383689880371,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7928624153137207,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.49609375,
"calib/avg_num_step_conf": 0.390625,
"calib/ece": 0.34152000000000005,
"calib/final_conf_rate": 0.48828125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.344,
"calib/gap": -0.001836945304437343,
"calib/mean_conf": 0.8567199999999999,
"calib/mu_c": 0.8558823529411765,
"calib/mu_w": 0.8577192982456139,
"calib/nonempty_final_conf_rate": 0.48828125,
"calib/nonempty_reasoning_rate": 0.88671875,
"calib/nonempty_step_conf_rate": 0.390625,
"calib/pce": 0.32712,
"calib/std_conf": 0.13055436262339148,
"calib/step_conf_rate": 0.390625,
"calib/step_q_w": 0.7783,
"calib/step_q_w_n": 100.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2882.0,
"completions/max_terminated_length": 2882.0,
"completions/mean_length": 337.01171875,
"completions/mean_terminated_length": 337.01171875,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.1536,
"grad_norm": 0.18569757044315338,
"kl": 0.173431396484375,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.0453,
"mask/has_final_conf_rate": 0.48828125,
"mask/share_final_conf": 0.029239937663078308,
"mask/share_reasoning": 0.9489139318466187,
"mask/share_step_conf": 0.0218461062759161,
"num_tokens": 27013788.0,
"reward": 0.02734375186264515,
"reward_std": 0.033771052956581116,
"rewards/accuracy_reward_step": 0.2734375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.7038739919662476,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8747616410255432,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.4921875,
"calib/avg_num_step_conf": 0.421875,
"calib/ece": 0.24959016393442623,
"calib/final_conf_rate": 0.4765625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.319672131147541,
"calib/gap": 0.015188787185354768,
"calib/mean_conf": 0.845983606557377,
"calib/mu_c": 0.8517105263157895,
"calib/mu_w": 0.8365217391304347,
"calib/nonempty_final_conf_rate": 0.4765625,
"calib/nonempty_reasoning_rate": 0.9140625,
"calib/nonempty_step_conf_rate": 0.421875,
"calib/pce": 0.23631147540983605,
"calib/std_conf": 0.1444189025141314,
"calib/step_conf_rate": 0.421875,
"calib/step_q_w": 0.7674074074074073,
"calib/step_q_w_n": 108.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1629.0,
"completions/max_terminated_length": 1629.0,
"completions/mean_length": 311.83203125,
"completions/mean_terminated_length": 311.83203125,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.17270921170711517,
"kl": 0.1587677001953125,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.0137,
"mask/has_final_conf_rate": 0.4765625,
"mask/share_final_conf": 0.028850942850112915,
"mask/share_reasoning": 0.942456841468811,
"mask/share_step_conf": 0.028692251071333885,
"num_tokens": 27196321.0,
"reward": 0.03085937537252903,
"reward_std": 0.04030867666006088,
"rewards/accuracy_reward_step": 0.30859375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.4999437630176544,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7393062710762024,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.50390625,
"calib/avg_num_step_conf": 0.39453125,
"calib/ece": 0.44346456692913383,
"calib/final_conf_rate": 0.49609375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.4409448818897638,
"calib/gap": 0.0002205128205128304,
"calib/mean_conf": 0.8510236220472442,
"calib/mu_c": 0.8511538461538461,
"calib/mu_w": 0.8509333333333333,
"calib/nonempty_final_conf_rate": 0.49609375,
"calib/nonempty_reasoning_rate": 0.8984375,
"calib/nonempty_step_conf_rate": 0.39453125,
"calib/pce": 0.44251968503937,
"calib/std_conf": 0.1380222203134937,
"calib/step_conf_rate": 0.39453125,
"calib/step_q_w": 0.7799009900990098,
"calib/step_q_w_n": 101.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1190.0,
"completions/max_terminated_length": 1190.0,
"completions/mean_length": 333.91015625,
"completions/mean_terminated_length": 333.91015625,
"completions/min_length": 54.0,
"completions/min_terminated_length": 54.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.18089455366134644,
"kl": 0.1577911376953125,
"learning_rate": 1.527777777777778e-06,
"loss": -0.0297,
"mask/has_final_conf_rate": 0.49609375,
"mask/share_final_conf": 0.0274009071290493,
"mask/share_reasoning": 0.9498142600059509,
"mask/share_step_conf": 0.02278483659029007,
"num_tokens": 27389018.0,
"reward": 0.02031249925494194,
"reward_std": 0.0286305770277977,
"rewards/accuracy_reward_step": 0.203125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5508720874786377,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.792724072933197,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.59375,
"calib/avg_num_step_conf": 0.328125,
"calib/ece": 0.37421052631578944,
"calib/final_conf_rate": 0.59375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.4144736842105263,
"calib/gap": 0.003397402597402399,
"calib/mean_conf": 0.850921052631579,
"calib/mu_c": 0.8525974025974025,
"calib/mu_w": 0.8492000000000001,
"calib/nonempty_final_conf_rate": 0.59375,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.328125,
"calib/pce": 0.35927631578947367,
"calib/std_conf": 0.1498084534152911,
"calib/step_conf_rate": 0.328125,
"calib/step_q_w": 0.7676309523809524,
"calib/step_q_w_n": 84.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2313.0,
"completions/max_terminated_length": 2313.0,
"completions/mean_length": 351.90234375,
"completions/mean_terminated_length": 351.90234375,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.1568,
"grad_norm": 0.2241380661725998,
"kl": 0.1469268798828125,
"learning_rate": 1.5e-06,
"loss": -0.0022,
"mask/has_final_conf_rate": 0.59375,
"mask/share_final_conf": 0.03285910189151764,
"mask/share_reasoning": 0.9477626085281372,
"mask/share_step_conf": 0.01937827840447426,
"num_tokens": 27582785.0,
"reward": 0.03046875074505806,
"reward_std": 0.03155011311173439,
"rewards/accuracy_reward_step": 0.3046875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6207718849182129,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.809929609298706,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.59375,
"calib/avg_num_step_conf": 0.29296875,
"calib/ece": 0.2514093959731543,
"calib/final_conf_rate": 0.58203125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3624161073825503,
"calib/gap": 0.003634792626728389,
"calib/mean_conf": 0.8363758389261745,
"calib/mu_c": 0.8377419354838711,
"calib/mu_w": 0.8341071428571427,
"calib/nonempty_final_conf_rate": 0.58203125,
"calib/nonempty_reasoning_rate": 0.88671875,
"calib/nonempty_step_conf_rate": 0.29296875,
"calib/pce": 0.23181208053691266,
"calib/std_conf": 0.15188398418247342,
"calib/step_conf_rate": 0.29296875,
"calib/step_q_w": 0.7673333333333335,
"calib/step_q_w_n": 75.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1097.0,
"completions/max_terminated_length": 1097.0,
"completions/mean_length": 327.73046875,
"completions/mean_terminated_length": 329.0157165527344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.2297629415988922,
"kl": 0.1496734619140625,
"learning_rate": 1.4722222222222225e-06,
"loss": 0.0047,
"mask/has_final_conf_rate": 0.578125,
"mask/share_final_conf": 0.03815712779760361,
"mask/share_reasoning": 0.9385182857513428,
"mask/share_step_conf": 0.019418369978666306,
"num_tokens": 27771796.0,
"reward": 0.03750000149011612,
"reward_std": 0.03554752096533775,
"rewards/accuracy_reward_step": 0.375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5284003019332886,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.757583498954773,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.48828125,
"calib/avg_num_step_conf": 0.39453125,
"calib/ece": 0.31999999999999995,
"calib/final_conf_rate": 0.48046875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.36585365853658536,
"calib/gap": 0.01525935828877012,
"calib/mean_conf": 0.8486178861788619,
"calib/mu_c": 0.8554411764705881,
"calib/mu_w": 0.840181818181818,
"calib/nonempty_final_conf_rate": 0.48046875,
"calib/nonempty_reasoning_rate": 0.8828125,
"calib/nonempty_step_conf_rate": 0.39453125,
"calib/pce": 0.3078861788617886,
"calib/std_conf": 0.12848286212155452,
"calib/step_conf_rate": 0.39453125,
"calib/step_q_w": 0.7900396039603959,
"calib/step_q_w_n": 101.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2521.0,
"completions/max_terminated_length": 2521.0,
"completions/mean_length": 358.890625,
"completions/mean_terminated_length": 358.890625,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.17723193764686584,
"kl": 0.1447906494140625,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.002,
"mask/has_final_conf_rate": 0.48046875,
"mask/share_final_conf": 0.027915118262171745,
"mask/share_reasoning": 0.9477370977401733,
"mask/share_step_conf": 0.024347800761461258,
"num_tokens": 27968128.0,
"reward": 0.02656250074505806,
"reward_std": 0.030259788036346436,
"rewards/accuracy_reward_step": 0.265625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.019323909655213356,
"adv/mean_abs_reasoning": 0.628254771232605,
"adv/mean_abs_step_conf": 0.018246658146381378,
"adv/ratio_final_to_reasoning": 0.030758078633132853,
"adv/ratio_step_to_reasoning": 0.02904340560849595,
"adv/std_final_conf": 0.16558875143527985,
"adv/std_reasoning": 0.8266027569770813,
"adv/std_step_conf": 0.15635766088962555,
"calib/answer_extract_rate": 0.54296875,
"calib/avg_num_step_conf": 0.36328125,
"calib/ece": 0.28391304347826085,
"calib/final_conf_rate": 0.5390625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.35507246376811596,
"calib/gap": 0.034206896551724264,
"calib/mean_conf": 0.863623188405797,
"calib/mu_c": 0.8780000000000001,
"calib/mu_w": 0.8437931034482758,
"calib/nonempty_final_conf_rate": 0.5390625,
"calib/nonempty_reasoning_rate": 0.8984375,
"calib/nonempty_step_conf_rate": 0.36328125,
"calib/pce": 0.28391304347826085,
"calib/std_conf": 0.10754804810271376,
"calib/step_conf_rate": 0.36328125,
"calib/step_q_c": 0.9,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.09152173913043482,
"calib/step_q_w": 0.8084782608695652,
"calib/step_q_w_n": 92.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2527.0,
"completions/max_terminated_length": 2527.0,
"completions/mean_length": 324.875,
"completions/mean_terminated_length": 326.1490478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.16,
"grad_norm": 0.3182465434074402,
"kl": 0.1643829345703125,
"learning_rate": 1.4166666666666667e-06,
"loss": 0.0247,
"mask/has_final_conf_rate": 0.5390625,
"mask/share_final_conf": 0.03170134127140045,
"mask/share_reasoning": 0.9385929703712463,
"mask/share_step_conf": 0.025799430906772614,
"num_tokens": 28156256.0,
"reward": 0.03355569392442703,
"reward_std": 0.04173934459686279,
"rewards/accuracy_reward_step": 0.3125,
"rewards/final_brier_reward_step": 0.003867187537252903,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -3.704856499098241e-05,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.019322393462061882,
"adv/mean_abs_reasoning": 0.544094979763031,
"adv/mean_abs_step_conf": 0.01922667771577835,
"adv/ratio_final_to_reasoning": 0.03551290524767815,
"adv/ratio_step_to_reasoning": 0.035336987898973306,
"adv/std_final_conf": 0.1655757576227188,
"adv/std_reasoning": 0.7753925323486328,
"adv/std_step_conf": 0.16475556790828705,
"calib/answer_extract_rate": 0.55859375,
"calib/avg_num_step_conf": 0.33984375,
"calib/ece": 0.3914999999999999,
"calib/final_conf_rate": 0.546875,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.34285714285714286,
"calib/gap": 0.03502467105263163,
"calib/mean_conf": 0.848642857142857,
"calib/mu_c": 0.86765625,
"calib/mu_w": 0.8326315789473684,
"calib/nonempty_final_conf_rate": 0.546875,
"calib/nonempty_reasoning_rate": 0.89453125,
"calib/nonempty_step_conf_rate": 0.33984375,
"calib/pce": 0.3914999999999999,
"calib/std_conf": 0.1224640513683547,
"calib/step_conf_rate": 0.33984375,
"calib/step_q_c": 0.64,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": -0.1497674418604653,
"calib/step_q_w": 0.7897674418604653,
"calib/step_q_w_n": 86.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2525.0,
"completions/max_terminated_length": 2525.0,
"completions/mean_length": 369.09765625,
"completions/mean_terminated_length": 369.09765625,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.3350175619125366,
"kl": 0.1311187744140625,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.0041,
"mask/has_final_conf_rate": 0.546875,
"mask/share_final_conf": 0.030342375859618187,
"mask/share_reasoning": 0.9506062269210815,
"mask/share_step_conf": 0.019051434472203255,
"num_tokens": 28357769.0,
"reward": 0.027675680816173553,
"reward_std": 0.03598960116505623,
"rewards/accuracy_reward_step": 0.2578125,
"rewards/final_brier_reward_step": 0.0034000000450760126,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.000392390153137967,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5678851008415222,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7927908301353455,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.51171875,
"calib/avg_num_step_conf": 0.36328125,
"calib/ece": 0.403515625,
"calib/final_conf_rate": 0.5,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.265625,
"calib/gap": 0.024904867803311026,
"calib/mean_conf": 0.843203125,
"calib/mu_c": 0.857017543859649,
"calib/mu_w": 0.832112676056338,
"calib/nonempty_final_conf_rate": 0.5,
"calib/nonempty_reasoning_rate": 0.875,
"calib/nonempty_step_conf_rate": 0.36328125,
"calib/pce": 0.400703125,
"calib/std_conf": 0.10895765801555381,
"calib/step_conf_rate": 0.36328125,
"calib/step_q_w": 0.767956989247312,
"calib/step_q_w_n": 93.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1683.0,
"completions/max_terminated_length": 1683.0,
"completions/mean_length": 362.5625,
"completions/mean_terminated_length": 362.5625,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.17765770852565765,
"kl": 0.1453704833984375,
"learning_rate": 1.3611111111111112e-06,
"loss": 0.0062,
"mask/has_final_conf_rate": 0.5,
"mask/share_final_conf": 0.030476320534944534,
"mask/share_reasoning": 0.9500786066055298,
"mask/share_step_conf": 0.019445102661848068,
"num_tokens": 28555977.0,
"reward": 0.02265625074505806,
"reward_std": 0.0325222909450531,
"rewards/accuracy_reward_step": 0.2265625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.615715503692627,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7929527163505554,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.53125,
"calib/avg_num_step_conf": 0.359375,
"calib/ece": 0.3260629921259843,
"calib/final_conf_rate": 0.49609375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3779527559055118,
"calib/gap": -0.025260651629072872,
"calib/mean_conf": 0.8674803149606297,
"calib/mu_c": 0.856142857142857,
"calib/mu_w": 0.8814035087719299,
"calib/nonempty_final_conf_rate": 0.49609375,
"calib/nonempty_reasoning_rate": 0.890625,
"calib/nonempty_step_conf_rate": 0.359375,
"calib/pce": 0.32118110236220476,
"calib/std_conf": 0.09797129986251685,
"calib/step_conf_rate": 0.359375,
"calib/step_q_w": 0.7318478260869565,
"calib/step_q_w_n": 92.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2712.0,
"completions/max_terminated_length": 2712.0,
"completions/mean_length": 358.3671875,
"completions/mean_terminated_length": 358.3671875,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.1632,
"grad_norm": 0.15940669178962708,
"kl": 0.13603973388671875,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0379,
"mask/has_final_conf_rate": 0.49609375,
"mask/share_final_conf": 0.025707479566335678,
"mask/share_reasoning": 0.9523909687995911,
"mask/share_step_conf": 0.021901525557041168,
"num_tokens": 28755039.0,
"reward": 0.03007812611758709,
"reward_std": 0.03525547683238983,
"rewards/accuracy_reward_step": 0.30078125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5089123249053955,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7393391728401184,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.5234375,
"calib/avg_num_step_conf": 0.40625,
"calib/ece": 0.4067938931297711,
"calib/final_conf_rate": 0.51171875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3053435114503817,
"calib/gap": 0.03688474256022689,
"calib/mean_conf": 0.8396183206106871,
"calib/mu_c": 0.8601724137931035,
"calib/mu_w": 0.8232876712328766,
"calib/nonempty_final_conf_rate": 0.51171875,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.40625,
"calib/pce": 0.4018320610687024,
"calib/std_conf": 0.13053854731883202,
"calib/step_conf_rate": 0.40625,
"calib/step_q_w": 0.7641346153846154,
"calib/step_q_w_n": 104.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 962.0,
"completions/max_terminated_length": 962.0,
"completions/mean_length": 314.8828125,
"completions/mean_terminated_length": 316.11767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.17236383259296417,
"kl": 0.154296875,
"learning_rate": 1.3055555555555556e-06,
"loss": 0.0014,
"mask/has_final_conf_rate": 0.51171875,
"mask/share_final_conf": 0.027197513729333878,
"mask/share_reasoning": 0.9469050765037537,
"mask/share_step_conf": 0.02199118211865425,
"num_tokens": 28940089.0,
"reward": 0.0234375,
"reward_std": 0.029143065214157104,
"rewards/accuracy_reward_step": 0.234375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.03859548270702362,
"adv/mean_abs_reasoning": 0.540123462677002,
"adv/mean_abs_step_conf": 0.036589257419109344,
"adv/ratio_final_to_reasoning": 0.0714567786330438,
"adv/ratio_step_to_reasoning": 0.06774239585476036,
"adv/std_final_conf": 0.23386096954345703,
"adv/std_reasoning": 0.7753738760948181,
"adv/std_step_conf": 0.22205311059951782,
"calib/answer_extract_rate": 0.5390625,
"calib/avg_num_step_conf": 0.3984375,
"calib/ece": 0.4024812030075189,
"calib/final_conf_rate": 0.51953125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.39097744360902253,
"calib/gap": 0.022089041095890827,
"calib/mean_conf": 0.840375939849624,
"calib/mu_c": 0.8525000000000003,
"calib/mu_w": 0.8304109589041094,
"calib/nonempty_final_conf_rate": 0.51953125,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.3984375,
"calib/pce": 0.39586466165413536,
"calib/std_conf": 0.16421334928306924,
"calib/step_conf_rate": 0.3984375,
"calib/step_q_c": 0.93,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.15534653465346548,
"calib/step_q_w": 0.7746534653465346,
"calib/step_q_w_n": 101.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1054.0,
"completions/max_terminated_length": 1054.0,
"completions/mean_length": 301.796875,
"completions/mean_terminated_length": 301.796875,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.7394980788230896,
"kl": 0.1728668212890625,
"learning_rate": 1.2777777777777779e-06,
"loss": 0.0145,
"mask/has_final_conf_rate": 0.51953125,
"mask/share_final_conf": 0.029195178300142288,
"mask/share_reasoning": 0.9469771385192871,
"mask/share_step_conf": 0.02382766455411911,
"num_tokens": 29124565.0,
"reward": 0.026127008721232414,
"reward_std": 0.037890445441007614,
"rewards/accuracy_reward_step": 0.24609375,
"rewards/final_brier_reward_step": 0.004552734550088644,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l2_reward": -0.0030799706000834703,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.592730700969696,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7928775548934937,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.5078125,
"calib/avg_num_step_conf": 0.41015625,
"calib/ece": 0.4214876033057851,
"calib/final_conf_rate": 0.47265625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.36363636363636365,
"calib/gap": -0.0020784964068546463,
"calib/mean_conf": 0.8396694214876034,
"calib/mu_c": 0.8385185185185184,
"calib/mu_w": 0.8405970149253731,
"calib/nonempty_final_conf_rate": 0.47265625,
"calib/nonempty_reasoning_rate": 0.91796875,
"calib/nonempty_step_conf_rate": 0.41015625,
"calib/pce": 0.40743801652892564,
"calib/std_conf": 0.17125658042154718,
"calib/step_conf_rate": 0.41015625,
"calib/step_q_w": 0.767904761904762,
"calib/step_q_w_n": 105.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1159.0,
"completions/max_terminated_length": 1159.0,
"completions/mean_length": 336.09765625,
"completions/mean_terminated_length": 337.41571044921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.1664,
"grad_norm": 0.16044975817203522,
"kl": 0.149627685546875,
"learning_rate": 1.25e-06,
"loss": -0.0352,
"mask/has_final_conf_rate": 0.46875,
"mask/share_final_conf": 0.026372965425252914,
"mask/share_reasoning": 0.9464188814163208,
"mask/share_step_conf": 0.023301880806684494,
"num_tokens": 29315366.0,
"reward": 0.02265625074505806,
"reward_std": 0.033942047506570816,
"rewards/accuracy_reward_step": 0.2265625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.646448016166687,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8429199457168579,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.5078125,
"calib/avg_num_step_conf": 0.3828125,
"calib/ece": 0.2988281249999999,
"calib/final_conf_rate": 0.5,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3046875,
"calib/gap": 0.022956349206349258,
"calib/mean_conf": 0.848984375,
"calib/mu_c": 0.8590277777777778,
"calib/mu_w": 0.8360714285714286,
"calib/nonempty_final_conf_rate": 0.5,
"calib/nonempty_reasoning_rate": 0.890625,
"calib/nonempty_step_conf_rate": 0.3828125,
"calib/pce": 0.2926562499999999,
"calib/std_conf": 0.1319969213878088,
"calib/step_conf_rate": 0.3828125,
"calib/step_q_w": 0.7775510204081632,
"calib/step_q_w_n": 98.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1457.0,
"completions/max_terminated_length": 1457.0,
"completions/mean_length": 338.3359375,
"completions/mean_terminated_length": 338.3359375,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.15877458453178406,
"kl": 0.14886474609375,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.017,
"mask/has_final_conf_rate": 0.5,
"mask/share_final_conf": 0.029453996568918228,
"mask/share_reasoning": 0.9424052238464355,
"mask/share_step_conf": 0.02814079448580742,
"num_tokens": 29505708.0,
"reward": 0.02812500298023224,
"reward_std": 0.037020955234766006,
"rewards/accuracy_reward_step": 0.28125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.019062640145421028,
"adv/mean_abs_reasoning": 0.635623037815094,
"adv/mean_abs_step_conf": 0.019323568791151047,
"adv/ratio_final_to_reasoning": 0.029990480223856277,
"adv/ratio_step_to_reasoning": 0.030400988701690786,
"adv/std_final_conf": 0.163349911570549,
"adv/std_reasoning": 0.8428871035575867,
"adv/std_step_conf": 0.16558583080768585,
"calib/answer_extract_rate": 0.55859375,
"calib/avg_num_step_conf": 0.31640625,
"calib/ece": 0.24979020979020977,
"calib/final_conf_rate": 0.55859375,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.27972027972027974,
"calib/gap": 0.00042655014565140537,
"calib/mean_conf": 0.8448951048951049,
"calib/mu_c": 0.845056179775281,
"calib/mu_w": 0.8446296296296296,
"calib/nonempty_final_conf_rate": 0.55859375,
"calib/nonempty_reasoning_rate": 0.87109375,
"calib/nonempty_step_conf_rate": 0.31640625,
"calib/pce": 0.23615384615384613,
"calib/std_conf": 0.1255247287528856,
"calib/step_conf_rate": 0.31640625,
"calib/step_q_w": 0.7923456790123455,
"calib/step_q_w_n": 81.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1992.0,
"completions/max_terminated_length": 1992.0,
"completions/mean_length": 342.07421875,
"completions/mean_terminated_length": 342.07421875,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.4512387216091156,
"kl": 0.1579742431640625,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.0406,
"mask/has_final_conf_rate": 0.55859375,
"mask/share_final_conf": 0.03258911520242691,
"mask/share_reasoning": 0.943278431892395,
"mask/share_step_conf": 0.024132438004016876,
"num_tokens": 29698519.0,
"reward": 0.03335782513022423,
"reward_std": 0.039448127150535583,
"rewards/accuracy_reward_step": 0.34765625,
"rewards/final_brier_reward_step": 0.00015468749916180968,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.003751535667106509,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.7048021554946899,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.874763548374176,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.53125,
"calib/avg_num_step_conf": 0.375,
"calib/ece": 0.248840579710145,
"calib/final_conf_rate": 0.5390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2971014492753623,
"calib/gap": -0.02636818181818157,
"calib/mean_conf": 0.8589855072463768,
"calib/mu_c": 0.8494318181818182,
"calib/mu_w": 0.8757999999999998,
"calib/nonempty_final_conf_rate": 0.5390625,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.375,
"calib/pce": 0.235072463768116,
"calib/std_conf": 0.10756386613173807,
"calib/step_conf_rate": 0.375,
"calib/step_q_w": 0.7956249999999999,
"calib/step_q_w_n": 96.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2233.0,
"completions/max_terminated_length": 2233.0,
"completions/mean_length": 336.46484375,
"completions/mean_terminated_length": 336.46484375,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.1696,
"grad_norm": 0.17757733166217804,
"kl": 0.1535491943359375,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.0185,
"mask/has_final_conf_rate": 0.5390625,
"mask/share_final_conf": 0.030048977583646774,
"mask/share_reasoning": 0.9474559426307678,
"mask/share_step_conf": 0.02249506302177906,
"num_tokens": 29889438.0,
"reward": 0.03437500447034836,
"reward_std": 0.04036171734333038,
"rewards/accuracy_reward_step": 0.34375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6747011542320251,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8589940667152405,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.51953125,
"calib/avg_num_step_conf": 0.375,
"calib/ece": 0.2803007518796993,
"calib/final_conf_rate": 0.51953125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3308270676691729,
"calib/gap": 0.011589743589743406,
"calib/mean_conf": 0.8587969924812029,
"calib/mu_c": 0.8635897435897434,
"calib/mu_w": 0.852,
"calib/nonempty_final_conf_rate": 0.51953125,
"calib/nonempty_reasoning_rate": 0.89453125,
"calib/nonempty_step_conf_rate": 0.375,
"calib/pce": 0.2763157894736842,
"calib/std_conf": 0.10119613646808662,
"calib/step_conf_rate": 0.375,
"calib/step_q_w": 0.7814583333333333,
"calib/step_q_w_n": 96.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2984.0,
"completions/max_terminated_length": 2984.0,
"completions/mean_length": 368.30078125,
"completions/mean_terminated_length": 368.30078125,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.17369940876960754,
"kl": 0.14441680908203125,
"learning_rate": 1.138888888888889e-06,
"loss": 0.0014,
"mask/has_final_conf_rate": 0.51953125,
"mask/share_final_conf": 0.024254774674773216,
"mask/share_reasoning": 0.9520937204360962,
"mask/share_step_conf": 0.023651521652936935,
"num_tokens": 30088563.0,
"reward": 0.03046875074505806,
"reward_std": 0.03863853961229324,
"rewards/accuracy_reward_step": 0.3046875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6919175386428833,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8590571284294128,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.578125,
"calib/avg_num_step_conf": 0.3046875,
"calib/ece": 0.1623287671232877,
"calib/final_conf_rate": 0.5703125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2876712328767123,
"calib/gap": 0.016439393939394198,
"calib/mean_conf": 0.8367123287671233,
"calib/mu_c": 0.8416666666666668,
"calib/mu_w": 0.8252272727272726,
"calib/nonempty_final_conf_rate": 0.5703125,
"calib/nonempty_reasoning_rate": 0.8828125,
"calib/nonempty_step_conf_rate": 0.3046875,
"calib/pce": 0.1502054794520548,
"calib/std_conf": 0.12389848688939245,
"calib/step_conf_rate": 0.3046875,
"calib/step_q_w": 0.8043589743589742,
"calib/step_q_w_n": 78.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2270.0,
"completions/max_terminated_length": 2270.0,
"completions/mean_length": 347.9609375,
"completions/mean_terminated_length": 349.32550048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.18009351193904877,
"kl": 0.1587066650390625,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0317,
"mask/has_final_conf_rate": 0.5703125,
"mask/share_final_conf": 0.032218098640441895,
"mask/share_reasoning": 0.9410046935081482,
"mask/share_step_conf": 0.022870970889925957,
"num_tokens": 30281561.0,
"reward": 0.03984374925494194,
"reward_std": 0.039622340351343155,
"rewards/accuracy_reward_step": 0.3984375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.7094434499740601,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8591045141220093,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.5625,
"calib/avg_num_step_conf": 0.32421875,
"calib/ece": 0.18833333333333327,
"calib/final_conf_rate": 0.5625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2847222222222222,
"calib/gap": 0.014152524167561742,
"calib/mean_conf": 0.8474999999999999,
"calib/mu_c": 0.8523157894736842,
"calib/mu_w": 0.8381632653061225,
"calib/nonempty_final_conf_rate": 0.5625,
"calib/nonempty_reasoning_rate": 0.88671875,
"calib/nonempty_step_conf_rate": 0.32421875,
"calib/pce": 0.1880555555555555,
"calib/std_conf": 0.09173103800422915,
"calib/step_conf_rate": 0.32421875,
"calib/step_q_w": 0.7798795180722892,
"calib/step_q_w_n": 83.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1232.0,
"completions/max_terminated_length": 1232.0,
"completions/mean_length": 327.3828125,
"completions/mean_terminated_length": 328.66668701171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.1728,
"grad_norm": 0.20924623310565948,
"kl": 0.1875762939453125,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.0228,
"mask/has_final_conf_rate": 0.5625,
"mask/share_final_conf": 0.029927421361207962,
"mask/share_reasoning": 0.9480806589126587,
"mask/share_step_conf": 0.018085699528455734,
"num_tokens": 30469515.0,
"reward": 0.037109375,
"reward_std": 0.04062382131814957,
"rewards/accuracy_reward_step": 0.37109375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6055176854133606,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8265026807785034,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.55859375,
"calib/avg_num_step_conf": 0.26953125,
"calib/ece": 0.3418115942028985,
"calib/final_conf_rate": 0.5390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2971014492753623,
"calib/gap": 0.013413916333823828,
"calib/mean_conf": 0.8563043478260869,
"calib/mu_c": 0.8628169014084507,
"calib/mu_w": 0.8494029850746269,
"calib/nonempty_final_conf_rate": 0.5390625,
"calib/nonempty_reasoning_rate": 0.82421875,
"calib/nonempty_step_conf_rate": 0.265625,
"calib/pce": 0.3418115942028985,
"calib/std_conf": 0.10213713491635887,
"calib/step_conf_rate": 0.265625,
"calib/step_q_w": 0.7879710144927536,
"calib/step_q_w_n": 69.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1923.0,
"completions/max_terminated_length": 1923.0,
"completions/mean_length": 377.96875,
"completions/mean_terminated_length": 377.96875,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.15629848837852478,
"kl": 0.1473236083984375,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0299,
"mask/has_final_conf_rate": 0.5390625,
"mask/share_final_conf": 0.029228312894701958,
"mask/share_reasoning": 0.953479528427124,
"mask/share_step_conf": 0.017292149364948273,
"num_tokens": 30671107.0,
"reward": 0.02851562574505806,
"reward_std": 0.03467895835638046,
"rewards/accuracy_reward_step": 0.28515625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5345840454101562,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7753315567970276,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.515625,
"calib/avg_num_step_conf": 0.3359375,
"calib/ece": 0.31358778625954187,
"calib/final_conf_rate": 0.51171875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2824427480916031,
"calib/gap": 0.005791079812206279,
"calib/mean_conf": 0.8403053435114504,
"calib/mu_c": 0.842957746478873,
"calib/mu_w": 0.8371666666666667,
"calib/nonempty_final_conf_rate": 0.51171875,
"calib/nonempty_reasoning_rate": 0.8515625,
"calib/nonempty_step_conf_rate": 0.3359375,
"calib/pce": 0.30595419847328237,
"calib/std_conf": 0.12744068478859416,
"calib/step_conf_rate": 0.3359375,
"calib/step_q_w": 0.8046511627906977,
"calib/step_q_w_n": 86.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2152.0,
"completions/max_terminated_length": 2152.0,
"completions/mean_length": 374.9609375,
"completions/mean_terminated_length": 374.9609375,
"completions/min_length": 54.0,
"completions/min_terminated_length": 54.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.15789611637592316,
"kl": 0.1409149169921875,
"learning_rate": 1.0277777777777777e-06,
"loss": 0.0294,
"mask/has_final_conf_rate": 0.51171875,
"mask/share_final_conf": 0.026131562888622284,
"mask/share_reasoning": 0.9563469886779785,
"mask/share_step_conf": 0.017521433532238007,
"num_tokens": 30873233.0,
"reward": 0.02773437649011612,
"reward_std": 0.03061625175178051,
"rewards/accuracy_reward_step": 0.27734375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.019196441397070885,
"adv/mean_abs_reasoning": 0.5598448514938354,
"adv/mean_abs_step_conf": 0.019322002306580544,
"adv/ratio_final_to_reasoning": 0.03428885939711506,
"adv/ratio_step_to_reasoning": 0.03451313744338917,
"adv/std_final_conf": 0.16449646651744843,
"adv/std_reasoning": 0.7927621603012085,
"adv/std_step_conf": 0.1655724197626114,
"calib/answer_extract_rate": 0.59765625,
"calib/avg_num_step_conf": 0.25390625,
"calib/ece": 0.36934210526315775,
"calib/final_conf_rate": 0.59375,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.2631578947368421,
"calib/gap": 0.00613998613998612,
"calib/mean_conf": 0.851578947368421,
"calib/mu_c": 0.8547297297297297,
"calib/mu_w": 0.8485897435897436,
"calib/nonempty_final_conf_rate": 0.59375,
"calib/nonempty_reasoning_rate": 0.84765625,
"calib/nonempty_step_conf_rate": 0.25390625,
"calib/pce": 0.3670394736842104,
"calib/std_conf": 0.10792361616072617,
"calib/step_conf_rate": 0.25390625,
"calib/step_q_w": 0.7778461538461537,
"calib/step_q_w_n": 65.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2549.0,
"completions/max_terminated_length": 2549.0,
"completions/mean_length": 389.09375,
"completions/mean_terminated_length": 389.09375,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.176,
"grad_norm": 0.4201725721359253,
"kl": 0.1468963623046875,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0039,
"mask/has_final_conf_rate": 0.59375,
"mask/share_final_conf": 0.02832464501261711,
"mask/share_reasoning": 0.9569504857063293,
"mask/share_step_conf": 0.014724886044859886,
"num_tokens": 31078417.0,
"reward": 0.02780143730342388,
"reward_std": 0.03408287838101387,
"rewards/accuracy_reward_step": 0.2890625,
"rewards/final_brier_reward_step": 0.00030625000363215804,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.003297126619145274,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6967577934265137,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8590615391731262,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.55078125,
"calib/avg_num_step_conf": 0.27734375,
"calib/ece": 0.2533576642335767,
"calib/final_conf_rate": 0.53515625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.23357664233576642,
"calib/gap": -0.02477294007490627,
"calib/mean_conf": 0.8403649635036496,
"calib/mu_c": 0.831685393258427,
"calib/mu_w": 0.8564583333333333,
"calib/nonempty_final_conf_rate": 0.53515625,
"calib/nonempty_reasoning_rate": 0.828125,
"calib/nonempty_step_conf_rate": 0.27734375,
"calib/pce": 0.22204379562043802,
"calib/std_conf": 0.14374251492603518,
"calib/step_conf_rate": 0.27734375,
"calib/step_q_w": 0.7697183098591549,
"calib/step_q_w_n": 71.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2493.0,
"completions/max_terminated_length": 2493.0,
"completions/mean_length": 419.19921875,
"completions/mean_terminated_length": 420.8431701660156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.15491074323654175,
"kl": 0.120269775390625,
"learning_rate": 9.722222222222224e-07,
"loss": 0.003,
"mask/has_final_conf_rate": 0.53515625,
"mask/share_final_conf": 0.02692285180091858,
"mask/share_reasoning": 0.9514549374580383,
"mask/share_step_conf": 0.017715971916913986,
"num_tokens": 31291916.0,
"reward": 0.03593749925494194,
"reward_std": 0.03989892452955246,
"rewards/accuracy_reward_step": 0.359375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6552045941352844,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8266619443893433,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.53515625,
"calib/avg_num_step_conf": 0.31640625,
"calib/ece": 0.2191851851851852,
"calib/final_conf_rate": 0.52734375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2222222222222222,
"calib/gap": 0.014061624649859716,
"calib/mean_conf": 0.8399259259259261,
"calib/mu_c": 0.8452380952380951,
"calib/mu_w": 0.8311764705882354,
"calib/nonempty_final_conf_rate": 0.52734375,
"calib/nonempty_reasoning_rate": 0.8515625,
"calib/nonempty_step_conf_rate": 0.31640625,
"calib/pce": 0.21844444444444444,
"calib/std_conf": 0.0989687290402699,
"calib/step_conf_rate": 0.31640625,
"calib/step_q_w": 0.7817283950617283,
"calib/step_q_w_n": 81.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1431.0,
"completions/max_terminated_length": 1431.0,
"completions/mean_length": 362.19140625,
"completions/mean_terminated_length": 362.19140625,
"completions/min_length": 49.0,
"completions/min_terminated_length": 49.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.15898919105529785,
"kl": 0.14215087890625,
"learning_rate": 9.444444444444445e-07,
"loss": -0.0005,
"mask/has_final_conf_rate": 0.52734375,
"mask/share_final_conf": 0.024741780012845993,
"mask/share_reasoning": 0.956134557723999,
"mask/share_step_conf": 0.019123634323477745,
"num_tokens": 31490245.0,
"reward": 0.03281250223517418,
"reward_std": 0.037518225610256195,
"rewards/accuracy_reward_step": 0.328125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6289219856262207,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8588364720344543,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.52734375,
"calib/avg_num_step_conf": 0.3359375,
"calib/ece": 0.32492537313432834,
"calib/final_conf_rate": 0.5234375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.26865671641791045,
"calib/gap": 0.060958751393534016,
"calib/mean_conf": 0.8398507462686566,
"calib/mu_c": 0.8694202898550726,
"calib/mu_w": 0.8084615384615386,
"calib/nonempty_final_conf_rate": 0.5234375,
"calib/nonempty_reasoning_rate": 0.86328125,
"calib/nonempty_step_conf_rate": 0.3359375,
"calib/pce": 0.32492537313432834,
"calib/std_conf": 0.12406657501823976,
"calib/step_conf_rate": 0.3359375,
"calib/step_q_w": 0.7769767441860465,
"calib/step_q_w_n": 86.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2660.0,
"completions/max_terminated_length": 2660.0,
"completions/mean_length": 382.296875,
"completions/mean_terminated_length": 382.296875,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.1792,
"grad_norm": 0.20057442784309387,
"kl": 0.15277099609375,
"learning_rate": 9.166666666666666e-07,
"loss": -0.0156,
"mask/has_final_conf_rate": 0.5234375,
"mask/share_final_conf": 0.02479960396885872,
"mask/share_reasoning": 0.9573352932929993,
"mask/share_step_conf": 0.01786513812839985,
"num_tokens": 31692785.0,
"reward": 0.02695312723517418,
"reward_std": 0.036022573709487915,
"rewards/accuracy_reward_step": 0.26953125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6322240233421326,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8265968561172485,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.55859375,
"calib/avg_num_step_conf": 0.29296875,
"calib/ece": 0.3102222222222222,
"calib/final_conf_rate": 0.52734375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.25925925925925924,
"calib/gap": 0.005400974745237153,
"calib/mean_conf": 0.8583703703703703,
"calib/mu_c": 0.8608108108108109,
"calib/mu_w": 0.8554098360655737,
"calib/nonempty_final_conf_rate": 0.52734375,
"calib/nonempty_reasoning_rate": 0.8515625,
"calib/nonempty_step_conf_rate": 0.29296875,
"calib/pce": 0.3102222222222222,
"calib/std_conf": 0.08169688699275676,
"calib/step_conf_rate": 0.29296875,
"calib/step_q_w": 0.7885333333333333,
"calib/step_q_w_n": 75.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1808.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 377.17578125,
"completions/mean_terminated_length": 378.6549377441406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.15046626329421997,
"kl": 0.141265869140625,
"learning_rate": 8.88888888888889e-07,
"loss": 0.016,
"mask/has_final_conf_rate": 0.52734375,
"mask/share_final_conf": 0.024082526564598083,
"mask/share_reasoning": 0.9533636569976807,
"mask/share_step_conf": 0.018647566437721252,
"num_tokens": 31893526.0,
"reward": 0.02890625223517418,
"reward_std": 0.03620504215359688,
"rewards/accuracy_reward_step": 0.2890625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6629437208175659,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8589531183242798,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.515625,
"calib/avg_num_step_conf": 0.30859375,
"calib/ece": 0.23143939393939383,
"calib/final_conf_rate": 0.515625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.22727272727272727,
"calib/gap": 0.010275962236746317,
"calib/mean_conf": 0.8335606060606061,
"calib/mu_c": 0.8375308641975308,
"calib/mu_w": 0.8272549019607844,
"calib/nonempty_final_conf_rate": 0.515625,
"calib/nonempty_reasoning_rate": 0.8203125,
"calib/nonempty_step_conf_rate": 0.30859375,
"calib/pce": 0.22568181818181807,
"calib/std_conf": 0.10313372149168089,
"calib/step_conf_rate": 0.30859375,
"calib/step_q_c": 0.8,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.010512820512820653,
"calib/step_q_w": 0.7894871794871794,
"calib/step_q_w_n": 78.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2919.0,
"completions/max_terminated_length": 2919.0,
"completions/mean_length": 401.484375,
"completions/mean_terminated_length": 403.058837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 68.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.1561432182788849,
"kl": 0.1366424560546875,
"learning_rate": 8.611111111111112e-07,
"loss": 0.0187,
"mask/has_final_conf_rate": 0.515625,
"mask/share_final_conf": 0.02365146577358246,
"mask/share_reasoning": 0.9560713171958923,
"mask/share_step_conf": 0.016370952129364014,
"num_tokens": 32100458.0,
"reward": 0.03203124925494194,
"reward_std": 0.03796668350696564,
"rewards/accuracy_reward_step": 0.3203125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5713903903961182,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7754671573638916,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.484375,
"calib/avg_num_step_conf": 0.37109375,
"calib/ece": 0.2712096774193547,
"calib/final_conf_rate": 0.484375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.29838709677419356,
"calib/gap": 0.04846930640446456,
"calib/mean_conf": 0.843790322580645,
"calib/mu_c": 0.864507042253521,
"calib/mu_w": 0.8160377358490565,
"calib/nonempty_final_conf_rate": 0.484375,
"calib/nonempty_reasoning_rate": 0.85546875,
"calib/nonempty_step_conf_rate": 0.37109375,
"calib/pce": 0.2712096774193547,
"calib/std_conf": 0.10617691218217407,
"calib/step_conf_rate": 0.37109375,
"calib/step_q_w": 0.7707368421052632,
"calib/step_q_w_n": 95.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 988.0,
"completions/max_terminated_length": 988.0,
"completions/mean_length": 326.390625,
"completions/mean_terminated_length": 327.67059326171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.1824,
"grad_norm": 0.1624995768070221,
"kl": 0.14697265625,
"learning_rate": 8.333333333333333e-07,
"loss": 0.031,
"mask/has_final_conf_rate": 0.48046875,
"mask/share_final_conf": 0.025319751352071762,
"mask/share_reasoning": 0.9486113786697388,
"mask/share_step_conf": 0.02216263860464096,
"num_tokens": 32290910.0,
"reward": 0.02773437649011612,
"reward_std": 0.032719485461711884,
"rewards/accuracy_reward_step": 0.27734375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6793380975723267,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8430353999137878,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.45703125,
"calib/avg_num_step_conf": 0.39453125,
"calib/ece": 0.28189655172413786,
"calib/final_conf_rate": 0.453125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.28448275862068967,
"calib/gap": -0.029559050262103037,
"calib/mean_conf": 0.8560344827586207,
"calib/mu_c": 0.8440579710144928,
"calib/mu_w": 0.8736170212765958,
"calib/nonempty_final_conf_rate": 0.453125,
"calib/nonempty_reasoning_rate": 0.8515625,
"calib/nonempty_step_conf_rate": 0.39453125,
"calib/pce": 0.27155172413793094,
"calib/std_conf": 0.09696141527596773,
"calib/step_conf_rate": 0.39453125,
"calib/step_q_w": 0.7874257425742576,
"calib/step_q_w_n": 101.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1716.0,
"completions/max_terminated_length": 1716.0,
"completions/mean_length": 348.8828125,
"completions/mean_terminated_length": 348.8828125,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.1543116569519043,
"kl": 0.143310546875,
"learning_rate": 8.055555555555557e-07,
"loss": 0.0325,
"mask/has_final_conf_rate": 0.453125,
"mask/share_final_conf": 0.022162608802318573,
"mask/share_reasoning": 0.9561766386032104,
"mask/share_step_conf": 0.021660756319761276,
"num_tokens": 32483576.0,
"reward": 0.02734375,
"reward_std": 0.03890039771795273,
"rewards/accuracy_reward_step": 0.2734375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.669437050819397,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8429975509643555,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.5234375,
"calib/avg_num_step_conf": 0.33203125,
"calib/ece": 0.2572727272727272,
"calib/final_conf_rate": 0.515625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2803030303030303,
"calib/gap": 0.02381179842369241,
"calib/mean_conf": 0.8465151515151514,
"calib/mu_c": 0.8560759493670885,
"calib/mu_w": 0.8322641509433961,
"calib/nonempty_final_conf_rate": 0.515625,
"calib/nonempty_reasoning_rate": 0.85546875,
"calib/nonempty_step_conf_rate": 0.33203125,
"calib/pce": 0.2526515151515151,
"calib/std_conf": 0.10169759724586959,
"calib/step_conf_rate": 0.33203125,
"calib/step_q_w": 0.7796470588235294,
"calib/step_q_w_n": 85.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2597.0,
"completions/max_terminated_length": 2597.0,
"completions/mean_length": 412.375,
"completions/mean_terminated_length": 412.375,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.14386945962905884,
"kl": 0.133056640625,
"learning_rate": 7.777777777777779e-07,
"loss": -0.0083,
"mask/has_final_conf_rate": 0.515625,
"mask/share_final_conf": 0.024806944653391838,
"mask/share_reasoning": 0.9530544281005859,
"mask/share_step_conf": 0.022138644009828568,
"num_tokens": 32692304.0,
"reward": 0.03125,
"reward_std": 0.038334622979164124,
"rewards/accuracy_reward_step": 0.3125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5568525195121765,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7927470207214355,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.453125,
"calib/avg_num_step_conf": 0.38671875,
"calib/ece": 0.368157894736842,
"calib/final_conf_rate": 0.4453125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2894736842105263,
"calib/gap": 0.021827426810477557,
"calib/mean_conf": 0.8481578947368422,
"calib/mu_c": 0.8594545454545454,
"calib/mu_w": 0.8376271186440678,
"calib/nonempty_final_conf_rate": 0.4453125,
"calib/nonempty_reasoning_rate": 0.83984375,
"calib/nonempty_step_conf_rate": 0.38671875,
"calib/pce": 0.3669298245614034,
"calib/std_conf": 0.12635626179107026,
"calib/step_conf_rate": 0.38671875,
"calib/step_q_w": 0.7844444444444444,
"calib/step_q_w_n": 99.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2045.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 379.87109375,
"completions/mean_terminated_length": 381.3608093261719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.1856,
"grad_norm": 0.1287498027086258,
"kl": 0.12885284423828125,
"learning_rate": 7.5e-07,
"loss": 0.0281,
"mask/has_final_conf_rate": 0.4453125,
"mask/share_final_conf": 0.02070237696170807,
"mask/share_reasoning": 0.9495202302932739,
"mask/share_step_conf": 0.02587110549211502,
"num_tokens": 32893783.0,
"reward": 0.021484375,
"reward_std": 0.031891852617263794,
"rewards/accuracy_reward_step": 0.21484375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.019277554005384445,
"adv/mean_abs_reasoning": 0.4558134973049164,
"adv/mean_abs_step_conf": 0.019320236518979073,
"adv/ratio_final_to_reasoning": 0.042292635297916,
"adv/ratio_step_to_reasoning": 0.04238627560002858,
"adv/std_final_conf": 0.16519151628017426,
"adv/std_reasoning": 0.7205056548118591,
"adv/std_step_conf": 0.16555728018283844,
"calib/answer_extract_rate": 0.4296875,
"calib/avg_num_step_conf": 0.3515625,
"calib/ece": 0.43385321100917423,
"calib/final_conf_rate": 0.42578125,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.22935779816513763,
"calib/gap": 0.030764622973925237,
"calib/mean_conf": 0.8283486238532111,
"calib/mu_c": 0.8469767441860464,
"calib/mu_w": 0.8162121212121212,
"calib/nonempty_final_conf_rate": 0.42578125,
"calib/nonempty_reasoning_rate": 0.77734375,
"calib/nonempty_step_conf_rate": 0.3515625,
"calib/pce": 0.43385321100917423,
"calib/std_conf": 0.11379616221474019,
"calib/step_conf_rate": 0.3515625,
"calib/step_q_w": 0.7782222222222223,
"calib/step_q_w_n": 90.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2090.0,
"completions/max_terminated_length": 2090.0,
"completions/mean_length": 394.86328125,
"completions/mean_terminated_length": 394.86328125,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.5348300933837891,
"kl": 0.145294189453125,
"learning_rate": 7.222222222222222e-07,
"loss": 0.0214,
"mask/has_final_conf_rate": 0.42578125,
"mask/share_final_conf": 0.020652402192354202,
"mask/share_reasoning": 0.9563436508178711,
"mask/share_step_conf": 0.023003987967967987,
"num_tokens": 33100692.0,
"reward": 0.016107818111777306,
"reward_std": 0.026949819177389145,
"rewards/accuracy_reward_step": 0.16796875,
"rewards/final_brier_reward_step": 0.0007421874906867743,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.0029015520121902227,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6810925006866455,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.87469482421875,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.46484375,
"calib/avg_num_step_conf": 0.34765625,
"calib/ece": 0.25808333333333344,
"calib/final_conf_rate": 0.46875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.25,
"calib/gap": 0.023779821787870192,
"calib/mean_conf": 0.8414166666666667,
"calib/mu_c": 0.8511267605633803,
"calib/mu_w": 0.8273469387755101,
"calib/nonempty_final_conf_rate": 0.46875,
"calib/nonempty_reasoning_rate": 0.8125,
"calib/nonempty_step_conf_rate": 0.34765625,
"calib/pce": 0.2539166666666668,
"calib/std_conf": 0.10912756933465019,
"calib/step_conf_rate": 0.34765625,
"calib/step_q_w": 0.7676404494382022,
"calib/step_q_w_n": 89.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1901.0,
"completions/max_terminated_length": 1901.0,
"completions/mean_length": 363.68359375,
"completions/mean_terminated_length": 366.5472412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.17209787666797638,
"kl": 0.140167236328125,
"learning_rate": 6.944444444444446e-07,
"loss": -0.005,
"mask/has_final_conf_rate": 0.46875,
"mask/share_final_conf": 0.024472713470458984,
"mask/share_reasoning": 0.9470512270927429,
"mask/share_step_conf": 0.020663540810346603,
"num_tokens": 33297859.0,
"reward": 0.02812499925494194,
"reward_std": 0.03900687023997307,
"rewards/accuracy_reward_step": 0.28125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5837579369544983,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7928394675254822,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.4921875,
"calib/avg_num_step_conf": 0.3359375,
"calib/ece": 0.30134920634920626,
"calib/final_conf_rate": 0.4921875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.20634920634920634,
"calib/gap": 0.012134888438133928,
"calib/mean_conf": 0.8410317460317461,
"calib/mu_c": 0.8466176470588235,
"calib/mu_w": 0.8344827586206895,
"calib/nonempty_final_conf_rate": 0.4921875,
"calib/nonempty_reasoning_rate": 0.828125,
"calib/nonempty_step_conf_rate": 0.3359375,
"calib/pce": 0.30134920634920626,
"calib/std_conf": 0.09712395422475838,
"calib/step_conf_rate": 0.3359375,
"calib/step_q_w": 0.7973255813953486,
"calib/step_q_w_n": 86.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1311.0,
"completions/max_terminated_length": 1311.0,
"completions/mean_length": 357.1328125,
"completions/mean_terminated_length": 358.5333557128906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.1888,
"grad_norm": 0.14639951288700104,
"kl": 0.14056396484375,
"learning_rate": 6.666666666666667e-07,
"loss": -0.0141,
"mask/has_final_conf_rate": 0.4921875,
"mask/share_final_conf": 0.026274647563695908,
"mask/share_reasoning": 0.9458457827568054,
"mask/share_step_conf": 0.02397332713007927,
"num_tokens": 33493117.0,
"reward": 0.02695312723517418,
"reward_std": 0.03342931345105171,
"rewards/accuracy_reward_step": 0.26953125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6184110045433044,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8265557289123535,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.45703125,
"calib/avg_num_step_conf": 0.41796875,
"calib/ece": 0.263728813559322,
"calib/final_conf_rate": 0.4609375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2796610169491525,
"calib/gap": 0.01724303266406968,
"calib/mean_conf": 0.8518644067796611,
"calib/mu_c": 0.8587323943661971,
"calib/mu_w": 0.8414893617021274,
"calib/nonempty_final_conf_rate": 0.4609375,
"calib/nonempty_reasoning_rate": 0.875,
"calib/nonempty_step_conf_rate": 0.41796875,
"calib/pce": 0.2569491525423728,
"calib/std_conf": 0.12919535301858673,
"calib/step_conf_rate": 0.41796875,
"calib/step_q_w": 0.7772616822429906,
"calib/step_q_w_n": 107.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2151.0,
"completions/max_terminated_length": 2151.0,
"completions/mean_length": 349.08984375,
"completions/mean_terminated_length": 349.08984375,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.15279927849769592,
"kl": 0.1522064208984375,
"learning_rate": 6.388888888888889e-07,
"loss": 0.0331,
"mask/has_final_conf_rate": 0.4609375,
"mask/share_final_conf": 0.025390885770320892,
"mask/share_reasoning": 0.9491012096405029,
"mask/share_step_conf": 0.02550787664949894,
"num_tokens": 33688556.0,
"reward": 0.02773437649011612,
"reward_std": 0.03541572391986847,
"rewards/accuracy_reward_step": 0.27734375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.585411012172699,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7928422093391418,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.453125,
"calib/avg_num_step_conf": 0.39453125,
"calib/ece": 0.23249999999999998,
"calib/final_conf_rate": 0.453125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.22413793103448276,
"calib/gap": 0.010006259780907745,
"calib/mean_conf": 0.8445689655172414,
"calib/mu_c": 0.8484507042253523,
"calib/mu_w": 0.8384444444444445,
"calib/nonempty_final_conf_rate": 0.453125,
"calib/nonempty_reasoning_rate": 0.84765625,
"calib/nonempty_step_conf_rate": 0.39453125,
"calib/pce": 0.23249999999999998,
"calib/std_conf": 0.09401386766176015,
"calib/step_conf_rate": 0.39453125,
"calib/step_q_w": 0.7893069306930695,
"calib/step_q_w_n": 101.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2767.0,
"completions/max_terminated_length": 2767.0,
"completions/mean_length": 352.37890625,
"completions/mean_terminated_length": 352.37890625,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.15809383988380432,
"kl": 0.157135009765625,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0264,
"mask/has_final_conf_rate": 0.453125,
"mask/share_final_conf": 0.024328168481588364,
"mask/share_reasoning": 0.9527326226234436,
"mask/share_step_conf": 0.02293919399380684,
"num_tokens": 33885029.0,
"reward": 0.02773437649011612,
"reward_std": 0.03352377563714981,
"rewards/accuracy_reward_step": 0.27734375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5974774360656738,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8264751434326172,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.4921875,
"calib/avg_num_step_conf": 0.25,
"calib/ece": 0.22779527559055118,
"calib/final_conf_rate": 0.49609375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2204724409448819,
"calib/gap": 0.01615770042194098,
"calib/mean_conf": 0.8498425196850394,
"calib/mu_c": 0.8559493670886077,
"calib/mu_w": 0.8397916666666667,
"calib/nonempty_final_conf_rate": 0.49609375,
"calib/nonempty_reasoning_rate": 0.7421875,
"calib/nonempty_step_conf_rate": 0.25,
"calib/pce": 0.22779527559055118,
"calib/std_conf": 0.0795259998287937,
"calib/step_conf_rate": 0.25,
"calib/step_q_w": 0.770625,
"calib/step_q_w_n": 64.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2306.0,
"completions/max_terminated_length": 2306.0,
"completions/mean_length": 444.04296875,
"completions/mean_terminated_length": 445.7843322753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.192,
"grad_norm": 0.1322283148765564,
"kl": 0.1263427734375,
"learning_rate": 5.833333333333334e-07,
"loss": 0.01,
"mask/has_final_conf_rate": 0.49609375,
"mask/share_final_conf": 0.021843096241354942,
"mask/share_reasoning": 0.9585437178611755,
"mask/share_step_conf": 0.015706941485404968,
"num_tokens": 34102560.0,
"reward": 0.03125,
"reward_std": 0.03421951085329056,
"rewards/accuracy_reward_step": 0.3125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.019196441397070885,
"adv/mean_abs_reasoning": 0.5584971308708191,
"adv/mean_abs_step_conf": 0.01929621398448944,
"adv/ratio_final_to_reasoning": 0.03437160253113465,
"adv/ratio_step_to_reasoning": 0.03455024729384451,
"adv/std_final_conf": 0.16449646651744843,
"adv/std_reasoning": 0.7754106521606445,
"adv/std_step_conf": 0.16535142064094543,
"calib/answer_extract_rate": 0.48828125,
"calib/avg_num_step_conf": 0.33984375,
"calib/ece": 0.37024193548387097,
"calib/final_conf_rate": 0.484375,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.24193548387096775,
"calib/gap": 0.013681877444589263,
"calib/mean_conf": 0.8460483870967744,
"calib/mu_c": 0.8532203389830508,
"calib/mu_w": 0.8395384615384616,
"calib/nonempty_final_conf_rate": 0.484375,
"calib/nonempty_reasoning_rate": 0.82421875,
"calib/nonempty_step_conf_rate": 0.33984375,
"calib/pce": 0.37024193548387097,
"calib/std_conf": 0.10215278364820002,
"calib/step_conf_rate": 0.33984375,
"calib/step_q_w": 0.7693103448275863,
"calib/step_q_w_n": 87.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1841.0,
"completions/max_terminated_length": 1841.0,
"completions/mean_length": 343.0234375,
"completions/mean_terminated_length": 343.0234375,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.39187780022621155,
"kl": 0.1638336181640625,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0494,
"mask/has_final_conf_rate": 0.484375,
"mask/share_final_conf": 0.025939784944057465,
"mask/share_reasoning": 0.9525427222251892,
"mask/share_step_conf": 0.021517515182495117,
"num_tokens": 34296638.0,
"reward": 0.023430868983268738,
"reward_std": 0.030896620824933052,
"rewards/accuracy_reward_step": 0.234375,
"rewards/final_brier_reward_step": 0.00030625000363215804,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.0011007614666596055,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6490250825881958,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8266454935073853,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.4765625,
"calib/avg_num_step_conf": 0.35546875,
"calib/ece": 0.1580327868852458,
"calib/final_conf_rate": 0.4765625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.22131147540983606,
"calib/gap": 0.014944356120826785,
"calib/mean_conf": 0.8531147540983607,
"calib/mu_c": 0.8576470588235294,
"calib/mu_w": 0.8427027027027026,
"calib/nonempty_final_conf_rate": 0.4765625,
"calib/nonempty_reasoning_rate": 0.83203125,
"calib/nonempty_step_conf_rate": 0.35546875,
"calib/pce": 0.15721311475409824,
"calib/std_conf": 0.07827109224149895,
"calib/step_conf_rate": 0.35546875,
"calib/step_q_w": 0.7895604395604395,
"calib/step_q_w_n": 91.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2317.0,
"completions/max_terminated_length": 2317.0,
"completions/mean_length": 372.8203125,
"completions/mean_terminated_length": 372.8203125,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.16000139713287354,
"kl": 0.1412506103515625,
"learning_rate": 5.277777777777779e-07,
"loss": 0.0138,
"mask/has_final_conf_rate": 0.4765625,
"mask/share_final_conf": 0.02340656891465187,
"mask/share_reasoning": 0.9566706418991089,
"mask/share_step_conf": 0.01992282271385193,
"num_tokens": 34498240.0,
"reward": 0.03359375149011612,
"reward_std": 0.03716510534286499,
"rewards/accuracy_reward_step": 0.3359375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5607688426971436,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7927569150924683,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.5,
"calib/avg_num_step_conf": 0.2578125,
"calib/ece": 0.317109375,
"calib/final_conf_rate": 0.5,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2890625,
"calib/gap": 0.04363836554930267,
"calib/mean_conf": 0.840546875,
"calib/mu_c": 0.8613432835820896,
"calib/mu_w": 0.8177049180327869,
"calib/nonempty_final_conf_rate": 0.5,
"calib/nonempty_reasoning_rate": 0.7578125,
"calib/nonempty_step_conf_rate": 0.2578125,
"calib/pce": 0.317109375,
"calib/std_conf": 0.12856363085155295,
"calib/step_conf_rate": 0.2578125,
"calib/step_q_w": 0.793939393939394,
"calib/step_q_w_n": 66.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1457.0,
"completions/max_terminated_length": 1457.0,
"completions/mean_length": 401.2890625,
"completions/mean_terminated_length": 401.2890625,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"epoch": 0.1952,
"grad_norm": 0.14064468443393707,
"kl": 0.134674072265625,
"learning_rate": 5.000000000000001e-07,
"loss": -0.014,
"mask/has_final_conf_rate": 0.5,
"mask/share_final_conf": 0.022872116416692734,
"mask/share_reasoning": 0.9605451226234436,
"mask/share_step_conf": 0.016582757234573364,
"num_tokens": 34707650.0,
"reward": 0.02617187425494194,
"reward_std": 0.032115641981363297,
"rewards/accuracy_reward_step": 0.26171875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.617270827293396,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8265382647514343,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.56640625,
"calib/avg_num_step_conf": 0.30078125,
"calib/ece": 0.18647887323943663,
"calib/final_conf_rate": 0.5546875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.21830985915492956,
"calib/gap": 0.01684210526315766,
"calib/mean_conf": 0.8412676056338028,
"calib/mu_c": 0.8468421052631577,
"calib/mu_w": 0.8300000000000001,
"calib/nonempty_final_conf_rate": 0.5546875,
"calib/nonempty_reasoning_rate": 0.8671875,
"calib/nonempty_step_conf_rate": 0.30078125,
"calib/pce": 0.1793661971830986,
"calib/std_conf": 0.09578334674663085,
"calib/step_conf_rate": 0.30078125,
"calib/step_q_w": 0.7942857142857142,
"calib/step_q_w_n": 77.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2078.0,
"completions/max_terminated_length": 2078.0,
"completions/mean_length": 386.53515625,
"completions/mean_terminated_length": 386.53515625,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.14544470608234406,
"kl": 0.1382904052734375,
"learning_rate": 4.7222222222222226e-07,
"loss": 0.0319,
"mask/has_final_conf_rate": 0.5546875,
"mask/share_final_conf": 0.0245208777487278,
"mask/share_reasoning": 0.9588320255279541,
"mask/share_step_conf": 0.016647107899188995,
"num_tokens": 34911883.0,
"reward": 0.03750000149011612,
"reward_std": 0.03535057231783867,
"rewards/accuracy_reward_step": 0.375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5579884052276611,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7927578687667847,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.5078125,
"calib/avg_num_step_conf": 0.265625,
"calib/ece": 0.2853076923076923,
"calib/final_conf_rate": 0.5078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.27692307692307694,
"calib/gap": 0.018460606060606066,
"calib/mean_conf": 0.8459230769230769,
"calib/mu_c": 0.8537333333333333,
"calib/mu_w": 0.8352727272727273,
"calib/nonempty_final_conf_rate": 0.5078125,
"calib/nonempty_reasoning_rate": 0.7734375,
"calib/nonempty_step_conf_rate": 0.265625,
"calib/pce": 0.2771538461538462,
"calib/std_conf": 0.12126066109416558,
"calib/step_conf_rate": 0.265625,
"calib/step_q_w": 0.768970588235294,
"calib/step_q_w_n": 68.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2631.0,
"completions/max_terminated_length": 2631.0,
"completions/mean_length": 382.76953125,
"completions/mean_terminated_length": 385.7834777832031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 68.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.14634037017822266,
"kl": 0.1418609619140625,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0199,
"mask/has_final_conf_rate": 0.5078125,
"mask/share_final_conf": 0.024744169786572456,
"mask/share_reasoning": 0.9503279328346252,
"mask/share_step_conf": 0.017115432769060135,
"num_tokens": 35116792.0,
"reward": 0.02929687686264515,
"reward_std": 0.031956762075424194,
"rewards/accuracy_reward_step": 0.29296875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.019286589697003365,
"adv/mean_abs_reasoning": 0.6044795513153076,
"adv/mean_abs_step_conf": 0.019319185987114906,
"adv/ratio_final_to_reasoning": 0.03190610775010837,
"adv/ratio_step_to_reasoning": 0.03196003230395078,
"adv/std_final_conf": 0.16526895761489868,
"adv/std_reasoning": 0.7929074168205261,
"adv/std_step_conf": 0.16554827988147736,
"calib/answer_extract_rate": 0.5546875,
"calib/avg_num_step_conf": 0.2578125,
"calib/ece": 0.25496453900709215,
"calib/final_conf_rate": 0.55078125,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.23404255319148937,
"calib/gap": -0.01642367066895356,
"calib/mean_conf": 0.8439007092198583,
"calib/mu_c": 0.8377272727272728,
"calib/mu_w": 0.8541509433962263,
"calib/nonempty_final_conf_rate": 0.55078125,
"calib/nonempty_reasoning_rate": 0.80859375,
"calib/nonempty_step_conf_rate": 0.2578125,
"calib/pce": 0.23737588652482264,
"calib/std_conf": 0.11490219738558534,
"calib/step_conf_rate": 0.2578125,
"calib/step_q_w": 0.7868787878787877,
"calib/step_q_w_n": 66.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2897.0,
"completions/max_terminated_length": 2897.0,
"completions/mean_length": 391.9296875,
"completions/mean_terminated_length": 393.4666748046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.1984,
"grad_norm": 0.22775553166866302,
"kl": 0.134429931640625,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.0012,
"mask/has_final_conf_rate": 0.55078125,
"mask/share_final_conf": 0.02644348330795765,
"mask/share_reasoning": 0.9552706480026245,
"mask/share_step_conf": 0.014379597268998623,
"num_tokens": 35322166.0,
"reward": 0.03424294292926788,
"reward_std": 0.035134799778461456,
"rewards/accuracy_reward_step": 0.34765625,
"rewards/final_brier_reward_step": 0.0008812500163912773,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.0027078634593635798,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5996350646018982,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7928953170776367,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.51171875,
"calib/avg_num_step_conf": 0.3046875,
"calib/ece": 0.3649242424242423,
"calib/final_conf_rate": 0.515625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2727272727272727,
"calib/gap": -0.003750000000000031,
"calib/mean_conf": 0.8643181818181819,
"calib/mu_c": 0.8624999999999999,
"calib/mu_w": 0.86625,
"calib/nonempty_final_conf_rate": 0.515625,
"calib/nonempty_reasoning_rate": 0.81640625,
"calib/nonempty_step_conf_rate": 0.3046875,
"calib/pce": 0.35704545454545444,
"calib/std_conf": 0.08028541116030398,
"calib/step_conf_rate": 0.3046875,
"calib/step_q_w": 0.7730769230769231,
"calib/step_q_w_n": 78.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2431.0,
"completions/max_terminated_length": 2431.0,
"completions/mean_length": 410.00390625,
"completions/mean_terminated_length": 410.00390625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.14223752915859222,
"kl": 0.1377716064453125,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0046,
"mask/has_final_conf_rate": 0.515625,
"mask/share_final_conf": 0.022872790694236755,
"mask/share_reasoning": 0.9567241072654724,
"mask/share_step_conf": 0.020403096452355385,
"num_tokens": 35528671.0,
"reward": 0.02695312537252903,
"reward_std": 0.03433658182621002,
"rewards/accuracy_reward_step": 0.26953125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6717088222503662,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8589801788330078,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.51171875,
"calib/avg_num_step_conf": 0.33203125,
"calib/ece": 0.2512878787878786,
"calib/final_conf_rate": 0.515625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.29545454545454547,
"calib/gap": 0.005778846153846162,
"calib/mean_conf": 0.8473484848484849,
"calib/mu_c": 0.849625,
"calib/mu_w": 0.8438461538461538,
"calib/nonempty_final_conf_rate": 0.515625,
"calib/nonempty_reasoning_rate": 0.84375,
"calib/nonempty_step_conf_rate": 0.33203125,
"calib/pce": 0.2462878787878786,
"calib/std_conf": 0.09837975161590594,
"calib/step_conf_rate": 0.33203125,
"calib/step_q_w": 0.7765882352941176,
"calib/step_q_w_n": 85.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1527.0,
"completions/max_terminated_length": 1527.0,
"completions/mean_length": 437.89453125,
"completions/mean_terminated_length": 437.89453125,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.17902015149593353,
"kl": 0.115936279296875,
"learning_rate": 3.611111111111111e-07,
"loss": 0.0032,
"mask/has_final_conf_rate": 0.515625,
"mask/share_final_conf": 0.02250046841800213,
"mask/share_reasoning": 0.96071457862854,
"mask/share_step_conf": 0.016784997656941414,
"num_tokens": 35744844.0,
"reward": 0.03125,
"reward_std": 0.03846754878759384,
"rewards/accuracy_reward_step": 0.3125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5805579423904419,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7754928469657898,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.55859375,
"calib/avg_num_step_conf": 0.2578125,
"calib/ece": 0.2944366197183097,
"calib/final_conf_rate": 0.5546875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.22535211267605634,
"calib/gap": 0.007846774193548622,
"calib/mean_conf": 0.8523239436619718,
"calib/mu_c": 0.8557500000000001,
"calib/mu_w": 0.8479032258064515,
"calib/nonempty_final_conf_rate": 0.5546875,
"calib/nonempty_reasoning_rate": 0.81640625,
"calib/nonempty_step_conf_rate": 0.2578125,
"calib/pce": 0.2916901408450702,
"calib/std_conf": 0.08090740223589067,
"calib/step_conf_rate": 0.2578125,
"calib/step_q_w": 0.7725757575757575,
"calib/step_q_w_n": 66.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2571.0,
"completions/max_terminated_length": 2571.0,
"completions/mean_length": 361.56640625,
"completions/mean_terminated_length": 361.56640625,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.2016,
"grad_norm": 0.15060287714004517,
"kl": 0.1468963623046875,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0419,
"mask/has_final_conf_rate": 0.5546875,
"mask/share_final_conf": 0.028109043836593628,
"mask/share_reasoning": 0.9566246271133423,
"mask/share_step_conf": 0.015266265720129013,
"num_tokens": 35945173.0,
"reward": 0.03125,
"reward_std": 0.03324335068464279,
"rewards/accuracy_reward_step": 0.3125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5268406271934509,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7393897771835327,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.46484375,
"calib/avg_num_step_conf": 0.328125,
"calib/ece": 0.25932773109243684,
"calib/final_conf_rate": 0.46484375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.18487394957983194,
"calib/gap": 0.022919600938966944,
"calib/mean_conf": 0.855966386554622,
"calib/mu_c": 0.8652112676056337,
"calib/mu_w": 0.8422916666666668,
"calib/nonempty_final_conf_rate": 0.46484375,
"calib/nonempty_reasoning_rate": 0.79296875,
"calib/nonempty_step_conf_rate": 0.328125,
"calib/pce": 0.25932773109243684,
"calib/std_conf": 0.0751393028748793,
"calib/step_conf_rate": 0.328125,
"calib/step_q_w": 0.7607142857142858,
"calib/step_q_w_n": 84.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1479.0,
"completions/max_terminated_length": 1479.0,
"completions/mean_length": 418.125,
"completions/mean_terminated_length": 419.7647399902344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.1258547306060791,
"kl": 0.12890625,
"learning_rate": 3.055555555555556e-07,
"loss": 0.0258,
"mask/has_final_conf_rate": 0.46484375,
"mask/share_final_conf": 0.018940530717372894,
"mask/share_reasoning": 0.9564769268035889,
"mask/share_step_conf": 0.02067631483078003,
"num_tokens": 36157821.0,
"reward": 0.02812499925494194,
"reward_std": 0.03016754984855652,
"rewards/accuracy_reward_step": 0.28125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.01930162124335766,
"adv/mean_abs_reasoning": 0.5160157084465027,
"adv/mean_abs_step_conf": 0.01927414909005165,
"adv/ratio_final_to_reasoning": 0.03740510400636133,
"adv/ratio_step_to_reasoning": 0.03735186501991901,
"adv/std_final_conf": 0.1653977632522583,
"adv/std_reasoning": 0.7393521666526794,
"adv/std_step_conf": 0.1651623547077179,
"calib/answer_extract_rate": 0.48828125,
"calib/avg_num_step_conf": 0.359375,
"calib/ece": 0.37543307086614175,
"calib/final_conf_rate": 0.49609375,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.2440944881889764,
"calib/gap": 0.00689518132141087,
"calib/mean_conf": 0.8554330708661417,
"calib/mu_c": 0.859016393442623,
"calib/mu_w": 0.8521212121212122,
"calib/nonempty_final_conf_rate": 0.49609375,
"calib/nonempty_reasoning_rate": 0.8203125,
"calib/nonempty_step_conf_rate": 0.3359375,
"calib/pce": 0.3752755905511811,
"calib/std_conf": 0.09370548522597767,
"calib/step_conf_rate": 0.3359375,
"calib/step_q_w": 0.7830434782608695,
"calib/step_q_w_n": 92.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1628.0,
"completions/max_terminated_length": 1628.0,
"completions/mean_length": 359.9296875,
"completions/mean_terminated_length": 361.3411865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 51.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.845258891582489,
"kl": 0.1506805419921875,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.0766,
"mask/has_final_conf_rate": 0.49609375,
"mask/share_final_conf": 0.024587150663137436,
"mask/share_reasoning": 0.9499251246452332,
"mask/share_step_conf": 0.021581534296274185,
"num_tokens": 36354131.0,
"reward": 0.024508347734808922,
"reward_std": 0.030368084087967873,
"rewards/accuracy_reward_step": 0.23828125,
"rewards/final_brier_reward_step": 0.0012796875089406967,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l2_reward": -0.0007004928193055093,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5777860879898071,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7928310632705688,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.51171875,
"calib/avg_num_step_conf": 0.3203125,
"calib/ece": 0.2725757575757576,
"calib/final_conf_rate": 0.515625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.24242424242424243,
"calib/gap": 0.019642857142857184,
"calib/mean_conf": 0.8466666666666667,
"calib/mu_c": 0.8550000000000001,
"calib/mu_w": 0.8353571428571429,
"calib/nonempty_final_conf_rate": 0.515625,
"calib/nonempty_reasoning_rate": 0.83203125,
"calib/nonempty_step_conf_rate": 0.3203125,
"calib/pce": 0.27174242424242423,
"calib/std_conf": 0.08751334674542996,
"calib/step_conf_rate": 0.3203125,
"calib/step_q_w": 0.7858536585365853,
"calib/step_q_w_n": 82.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1652.0,
"completions/max_terminated_length": 1652.0,
"completions/mean_length": 395.37890625,
"completions/mean_terminated_length": 395.37890625,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.2048,
"grad_norm": 0.1448807716369629,
"kl": 0.1321868896484375,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.0108,
"mask/has_final_conf_rate": 0.515625,
"mask/share_final_conf": 0.02366378903388977,
"mask/share_reasoning": 0.9502225518226624,
"mask/share_step_conf": 0.026113644242286682,
"num_tokens": 36560324.0,
"reward": 0.02968749962747097,
"reward_std": 0.033088065683841705,
"rewards/accuracy_reward_step": 0.296875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6848012208938599,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8590258955955505,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.546875,
"calib/avg_num_step_conf": 0.24609375,
"calib/ece": 0.30107913669064745,
"calib/final_conf_rate": 0.54296875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.26618705035971224,
"calib/gap": 0.046375000000000055,
"calib/mean_conf": 0.8406474820143884,
"calib/mu_c": 0.8620000000000001,
"calib/mu_w": 0.815625,
"calib/nonempty_final_conf_rate": 0.54296875,
"calib/nonempty_reasoning_rate": 0.79296875,
"calib/nonempty_step_conf_rate": 0.24609375,
"calib/pce": 0.30107913669064745,
"calib/std_conf": 0.12079597302052868,
"calib/step_conf_rate": 0.24609375,
"calib/step_q_w": 0.7687301587301588,
"calib/step_q_w_n": 63.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1258.0,
"completions/max_terminated_length": 1258.0,
"completions/mean_length": 383.0859375,
"completions/mean_terminated_length": 383.0859375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.15478582680225372,
"kl": 0.134765625,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.0317,
"mask/has_final_conf_rate": 0.54296875,
"mask/share_final_conf": 0.02465350739657879,
"mask/share_reasoning": 0.9587289690971375,
"mask/share_step_conf": 0.016617517918348312,
"num_tokens": 36764106.0,
"reward": 0.029296875,
"reward_std": 0.03921569138765335,
"rewards/accuracy_reward_step": 0.29296875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6990294456481934,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8747507333755493,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.55078125,
"calib/avg_num_step_conf": 0.28515625,
"calib/ece": 0.23746376811594205,
"calib/final_conf_rate": 0.5390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.18115942028985507,
"calib/gap": 0.012724867724867583,
"calib/mean_conf": 0.8331159420289856,
"calib/mu_c": 0.8380952380952381,
"calib/mu_w": 0.8253703703703705,
"calib/nonempty_final_conf_rate": 0.5390625,
"calib/nonempty_reasoning_rate": 0.8359375,
"calib/nonempty_step_conf_rate": 0.28515625,
"calib/pce": 0.23094202898550725,
"calib/std_conf": 0.11233877803412114,
"calib/step_conf_rate": 0.28515625,
"calib/step_q_w": 0.7839726027397261,
"calib/step_q_w_n": 73.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1739.0,
"completions/max_terminated_length": 1739.0,
"completions/mean_length": 353.5234375,
"completions/mean_terminated_length": 353.5234375,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.1833369880914688,
"kl": 0.1504364013671875,
"learning_rate": 1.9444444444444447e-07,
"loss": 0.0135,
"mask/has_final_conf_rate": 0.5390625,
"mask/share_final_conf": 0.028572622686624527,
"mask/share_reasoning": 0.9558173418045044,
"mask/share_step_conf": 0.01561001781374216,
"num_tokens": 36960552.0,
"reward": 0.03359375149011612,
"reward_std": 0.04003185033798218,
"rewards/accuracy_reward_step": 0.3359375,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5826220512390137,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7928285002708435,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.48046875,
"calib/avg_num_step_conf": 0.31640625,
"calib/ece": 0.33798387096774196,
"calib/final_conf_rate": 0.484375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2661290322580645,
"calib/gap": 0.0128839634941329,
"calib/mean_conf": 0.8621774193548388,
"calib/mu_c": 0.8683076923076922,
"calib/mu_w": 0.8554237288135593,
"calib/nonempty_final_conf_rate": 0.484375,
"calib/nonempty_reasoning_rate": 0.796875,
"calib/nonempty_step_conf_rate": 0.31640625,
"calib/pce": 0.33798387096774196,
"calib/std_conf": 0.08323655063181201,
"calib/step_conf_rate": 0.31640625,
"calib/step_q_w": 0.7564197530864198,
"calib/step_q_w_n": 81.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2433.0,
"completions/max_terminated_length": 2433.0,
"completions/mean_length": 396.2890625,
"completions/mean_terminated_length": 397.8431701660156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.208,
"grad_norm": 0.14063632488250732,
"kl": 0.1334228515625,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.003,
"mask/has_final_conf_rate": 0.484375,
"mask/share_final_conf": 0.02396385371685028,
"mask/share_reasoning": 0.9530671834945679,
"mask/share_step_conf": 0.019062696024775505,
"num_tokens": 37167986.0,
"reward": 0.025390625,
"reward_std": 0.03336440399289131,
"rewards/accuracy_reward_step": 0.25390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5906708240509033,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.7928717732429504,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.51171875,
"calib/avg_num_step_conf": 0.3515625,
"calib/ece": 0.3270229007633587,
"calib/final_conf_rate": 0.51171875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.21374045801526717,
"calib/gap": 0.022651727357609674,
"calib/mean_conf": 0.8396946564885497,
"calib/mu_c": 0.8505882352941176,
"calib/mu_w": 0.827936507936508,
"calib/nonempty_final_conf_rate": 0.51171875,
"calib/nonempty_reasoning_rate": 0.86328125,
"calib/nonempty_step_conf_rate": 0.3515625,
"calib/pce": 0.32381679389312973,
"calib/std_conf": 0.10209215214059696,
"calib/step_conf_rate": 0.3515625,
"calib/step_q_w": 0.7533333333333333,
"calib/step_q_w_n": 90.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2749.0,
"completions/max_terminated_length": 2749.0,
"completions/mean_length": 319.04296875,
"completions/mean_terminated_length": 319.04296875,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.15355850756168365,
"kl": 0.1542510986328125,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0033,
"mask/has_final_conf_rate": 0.51171875,
"mask/share_final_conf": 0.026806414127349854,
"mask/share_reasoning": 0.9513841271400452,
"mask/share_step_conf": 0.021809469908475876,
"num_tokens": 37352205.0,
"reward": 0.02656250074505806,
"reward_std": 0.03382434323430061,
"rewards/accuracy_reward_step": 0.265625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6188219785690308,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8428393006324768,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.5703125,
"calib/avg_num_step_conf": 0.2578125,
"calib/ece": 0.3242758620689655,
"calib/final_conf_rate": 0.56640625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.25517241379310346,
"calib/gap": 0.03402555301296706,
"calib/mean_conf": 0.8484137931034483,
"calib/mu_c": 0.8646052631578947,
"calib/mu_w": 0.8305797101449276,
"calib/nonempty_final_conf_rate": 0.56640625,
"calib/nonempty_reasoning_rate": 0.828125,
"calib/nonempty_step_conf_rate": 0.2578125,
"calib/pce": 0.3242758620689655,
"calib/std_conf": 0.08987866044203009,
"calib/step_conf_rate": 0.2578125,
"calib/step_q_w": 0.7756060606060606,
"calib/step_q_w_n": 66.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1683.0,
"completions/max_terminated_length": 1683.0,
"completions/mean_length": 393.34765625,
"completions/mean_terminated_length": 393.34765625,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.15462549030780792,
"kl": 0.1384735107421875,
"learning_rate": 1.1111111111111112e-07,
"loss": 0.0193,
"mask/has_final_conf_rate": 0.56640625,
"mask/share_final_conf": 0.026654046028852463,
"mask/share_reasoning": 0.9595667719841003,
"mask/share_step_conf": 0.01377915684133768,
"num_tokens": 37557958.0,
"reward": 0.02968750149011612,
"reward_std": 0.03544231504201889,
"rewards/accuracy_reward_step": 0.296875,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.5998426675796509,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8098545670509338,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.47265625,
"calib/avg_num_step_conf": 0.3671875,
"calib/ece": 0.3226446280991735,
"calib/final_conf_rate": 0.47265625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.24793388429752067,
"calib/gap": 0.014246162280701746,
"calib/mean_conf": 0.8515702479338844,
"calib/mu_c": 0.85828125,
"calib/mu_w": 0.8440350877192982,
"calib/nonempty_final_conf_rate": 0.47265625,
"calib/nonempty_reasoning_rate": 0.83984375,
"calib/nonempty_step_conf_rate": 0.3671875,
"calib/pce": 0.3226446280991735,
"calib/std_conf": 0.08843925182618231,
"calib/step_conf_rate": 0.3671875,
"calib/step_q_w": 0.7740425531914893,
"calib/step_q_w_n": 94.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2778.0,
"completions/max_terminated_length": 2778.0,
"completions/mean_length": 333.3046875,
"completions/mean_terminated_length": 334.6117858886719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.2112,
"grad_norm": 0.19545160233974457,
"kl": 0.2003021240234375,
"learning_rate": 8.333333333333334e-08,
"loss": 0.0158,
"mask/has_final_conf_rate": 0.47265625,
"mask/share_final_conf": 0.02741253189742565,
"mask/share_reasoning": 0.9429019093513489,
"mask/share_step_conf": 0.02577931620180607,
"num_tokens": 37748668.0,
"reward": 0.02499999850988388,
"reward_std": 0.03435155376791954,
"rewards/accuracy_reward_step": 0.25,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6346948146820068,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8428850173950195,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.4609375,
"calib/avg_num_step_conf": 0.34765625,
"calib/ece": 0.3185217391304347,
"calib/final_conf_rate": 0.44921875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.20869565217391303,
"calib/gap": -0.03951899509803902,
"calib/mean_conf": 0.8348695652173913,
"calib/mu_c": 0.8173437499999999,
"calib/mu_w": 0.856862745098039,
"calib/nonempty_final_conf_rate": 0.44921875,
"calib/nonempty_reasoning_rate": 0.80859375,
"calib/nonempty_step_conf_rate": 0.34765625,
"calib/pce": 0.2984347826086956,
"calib/std_conf": 0.10425191244114926,
"calib/step_conf_rate": 0.34765625,
"calib/step_q_w": 0.7644943820224718,
"calib/step_q_w_n": 89.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2772.0,
"completions/max_terminated_length": 2772.0,
"completions/mean_length": 410.59765625,
"completions/mean_terminated_length": 412.2078552246094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.14168593287467957,
"kl": 0.1448822021484375,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0453,
"mask/has_final_conf_rate": 0.44921875,
"mask/share_final_conf": 0.023079898208379745,
"mask/share_reasoning": 0.9513913989067078,
"mask/share_step_conf": 0.021622436121106148,
"num_tokens": 37957981.0,
"reward": 0.025390625,
"reward_std": 0.0363493412733078,
"rewards/accuracy_reward_step": 0.25390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.6308720111846924,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.8099633455276489,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.5078125,
"calib/avg_num_step_conf": 0.28125,
"calib/ece": 0.23124999999999987,
"calib/final_conf_rate": 0.5,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.2109375,
"calib/gap": 0.02359080340997155,
"calib/mean_conf": 0.8484375000000001,
"calib/mu_c": 0.8574683544303797,
"calib/mu_w": 0.8338775510204082,
"calib/nonempty_final_conf_rate": 0.5,
"calib/nonempty_reasoning_rate": 0.7890625,
"calib/nonempty_step_conf_rate": 0.28125,
"calib/pce": 0.23124999999999987,
"calib/std_conf": 0.0858636045932734,
"calib/step_conf_rate": 0.28125,
"calib/step_q_w": 0.7734722222222222,
"calib/step_q_w_n": 72.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2057.0,
"completions/max_terminated_length": 2057.0,
"completions/mean_length": 389.375,
"completions/mean_terminated_length": 390.9019775390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 54.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.15176671743392944,
"kl": 0.129180908203125,
"learning_rate": 2.777777777777778e-08,
"loss": -0.0024,
"mask/has_final_conf_rate": 0.5,
"mask/share_final_conf": 0.02337362989783287,
"mask/share_reasoning": 0.9549142122268677,
"mask/share_step_conf": 0.017805900424718857,
"num_tokens": 38165709.0,
"reward": 0.03164062649011612,
"reward_std": 0.03612466901540756,
"rewards/accuracy_reward_step": 0.31640625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l2_reward": 0.0,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.0440832228428917,
"train_runtime": 5329.4705,
"train_samples_per_second": 9.607,
"train_steps_per_second": 0.038
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 38165709,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}