{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.7654397487640381, "adv/mean_abs_reasoning": 0.424932599067688, "adv/mean_abs_step_conf": 0.7687661647796631, "adv/ratio_final_to_reasoning": 1.801320375145213, "adv/ratio_step_to_reasoning": 1.8091484778206095, "adv/std_final_conf": 0.9287529587745667, "adv/std_reasoning": 0.7013161778450012, "adv/std_step_conf": 0.9334626793861389, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.33984375, "calib/ece": 0.21378906250000007, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.3046875, "calib/gap": 0.00016580667354659795, "calib/mean_conf": 0.8817578125000001, "calib/mu_c": 0.8818128654970762, "calib/mu_w": 0.8816470588235296, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21378906250000007, "calib/std_conf": 0.048946278921025696, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8082866741321388, "calib/step_q_c_n": 893.0, "calib/step_q_gap": 0.019531399870535426, "calib/step_q_w": 0.7887552742616034, "calib/step_q_w_n": 474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 451.0703125, "completions/mean_terminated_length": 452.8392333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.0010666666666666667, "grad_norm": 1.8525424003601074, "kl": 0.00033098459243774414, "learning_rate": 0.0, "loss": 0.0618, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03566828370094299, "mask/share_reasoning": 0.8323470950126648, "mask/share_step_conf": 0.1280784010887146, "num_tokens": 223058.0, "reward": 0.40438759326934814, "reward_std": 0.16610386967658997, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7295855283737183, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.253622829914093, "step": 1 }, { "adv/mean_abs_final_conf": 0.7925335168838501, "adv/mean_abs_reasoning": 0.4465929865837097, "adv/mean_abs_step_conf": 0.7833997011184692, "adv/ratio_final_to_reasoning": 1.7746215025597967, "adv/ratio_step_to_reasoning": 1.754169287590521, "adv/std_final_conf": 0.9316501617431641, "adv/std_reasoning": 0.7013890147209167, "adv/std_step_conf": 0.9330483675003052, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 4.7890625, "calib/ece": 0.2887058823529413, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2549019607843137, "calib/gap": -0.004035076611371591, "calib/mean_conf": 0.8730196078431373, "calib/mu_c": 0.8713422818791945, "calib/mu_w": 0.8753773584905661, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2887058823529413, "calib/std_conf": 0.04379877793804094, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7949142857142858, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.006359152634437981, "calib/step_q_w": 0.7885551330798478, "calib/step_q_w_n": 526.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 490.6640625, "completions/mean_terminated_length": 492.5882568359375, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0021333333333333334, "grad_norm": 1.1731619834899902, "kl": 0.0003865659236907959, "learning_rate": 2.5000000000000004e-07, "loss": 0.022, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033253151923418045, "mask/share_reasoning": 0.8548593521118164, "mask/share_step_conf": 0.10798123478889465, "num_tokens": 451956.0, "reward": 0.3687654733657837, "reward_std": 0.16945478320121765, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6672624945640564, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.24535658955574036, "step": 2 }, { "adv/mean_abs_final_conf": 0.7793192863464355, "adv/mean_abs_reasoning": 0.5044786930084229, "adv/mean_abs_step_conf": 0.7530263662338257, "adv/ratio_final_to_reasoning": 1.5448011920959839, "adv/ratio_step_to_reasoning": 1.492682201785781, "adv/std_final_conf": 0.9305400252342224, "adv/std_reasoning": 0.757487416267395, "adv/std_step_conf": 0.9341217875480652, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.109375, "calib/ece": 0.22257812500000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.34765625, "calib/gap": 0.003928571428571503, "calib/mean_conf": 0.878828125, "calib/mu_c": 0.8801785714285715, "calib/mu_w": 0.87625, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22257812500000002, "calib/std_conf": 0.07099318952536486, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7881730769230769, "calib/step_q_c_n": 832.0, "calib/step_q_gap": 0.022878959276018018, "calib/step_q_w": 0.7652941176470589, "calib/step_q_w_n": 476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 487.234375, "completions/mean_terminated_length": 489.1451416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.0032, "grad_norm": 1.6487758159637451, "kl": 0.00039315223693847656, "learning_rate": 5.000000000000001e-07, "loss": 0.0013, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033384665846824646, "mask/share_reasoning": 0.8496053218841553, "mask/share_step_conf": 0.11310373246669769, "num_tokens": 681944.0, "reward": 0.40173280239105225, "reward_std": 0.19228878617286682, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7216054797172546, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.24938993155956268, "step": 3 }, { "adv/mean_abs_final_conf": 0.7682288885116577, "adv/mean_abs_reasoning": 0.45164400339126587, "adv/mean_abs_step_conf": 0.7649694681167603, "adv/ratio_final_to_reasoning": 1.7009611170374153, "adv/ratio_step_to_reasoning": 1.693744326001946, "adv/std_final_conf": 0.930860161781311, "adv/std_reasoning": 0.7206089496612549, "adv/std_step_conf": 0.9333772659301758, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 4.8515625, "calib/ece": 0.27234126984126983, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.23412698412698413, "calib/gap": 0.006078947368421184, "calib/mean_conf": 0.8741666666666666, "calib/mu_c": 0.8765789473684211, "calib/mu_w": 0.8704999999999999, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.27166666666666667, "calib/std_conf": 0.050195056039351635, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8030393487109906, "calib/step_q_c_n": 737.0, "calib/step_q_gap": 0.0183858833644559, "calib/step_q_w": 0.7846534653465347, "calib/step_q_w_n": 505.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 503.55078125, "completions/mean_terminated_length": 507.5157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.004266666666666667, "grad_norm": 1.3600562810897827, "kl": 0.0003751814365386963, "learning_rate": 7.5e-07, "loss": 0.0245, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03287990763783455, "mask/share_reasoning": 0.8468308448791504, "mask/share_step_conf": 0.11247673630714417, "num_tokens": 917021.0, "reward": 0.3587590754032135, "reward_std": 0.17810939252376556, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6715430021286011, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.2673061192035675, "step": 4 }, { "adv/mean_abs_final_conf": 0.776305079460144, "adv/mean_abs_reasoning": 0.3951473832130432, "adv/mean_abs_step_conf": 0.7590612173080444, "adv/ratio_final_to_reasoning": 1.9645962808808484, "adv/ratio_step_to_reasoning": 1.9209572163579216, "adv/std_final_conf": 0.9308260083198547, "adv/std_reasoning": 0.6612785458564758, "adv/std_step_conf": 0.9333109855651855, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 4.8203125, "calib/ece": 0.35124, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.368, "calib/gap": -0.00788702525544649, "calib/mean_conf": 0.88324, "calib/mu_c": 0.879548872180451, "calib/mu_w": 0.8874358974358975, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.35124, "calib/std_conf": 0.04294534200585671, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.805139092240117, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.0162280214597178, "calib/step_q_w": 0.7889110707803992, "calib/step_q_w_n": 551.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2471.0, "completions/max_terminated_length": 2471.0, "completions/mean_length": 512.76953125, "completions/mean_terminated_length": 512.76953125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.005333333333333333, "grad_norm": 1.6314057111740112, "kl": 0.0002815425395965576, "learning_rate": 1.0000000000000002e-06, "loss": 0.035, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033720463514328, "mask/share_reasoning": 0.8549020290374756, "mask/share_step_conf": 0.11137749999761581, "num_tokens": 1154978.0, "reward": 0.3144644796848297, "reward_std": 0.14418430626392365, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6046000123023987, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": -0.27254605293273926, "step": 5 }, { "adv/mean_abs_final_conf": 0.7703990936279297, "adv/mean_abs_reasoning": 0.4255276918411255, "adv/mean_abs_step_conf": 0.7566233277320862, "adv/ratio_final_to_reasoning": 1.810455837303216, "adv/ratio_step_to_reasoning": 1.7780824661690364, "adv/std_final_conf": 0.9292229413986206, "adv/std_reasoning": 0.7013127207756042, "adv/std_step_conf": 0.9343317747116089, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.15625, "calib/ece": 0.295275590551181, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.30708661417322836, "calib/gap": 0.0041955896452540165, "calib/mean_conf": 0.8818897637795275, "calib/mu_c": 0.8836241610738255, "calib/mu_w": 0.8794285714285714, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.295275590551181, "calib/std_conf": 0.03836690428010434, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.799431968295905, "calib/step_q_c_n": 757.0, "calib/step_q_gap": -0.002575136499832098, "calib/step_q_w": 0.8020071047957371, "calib/step_q_w_n": 563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2261.0, "completions/max_terminated_length": 2261.0, "completions/mean_length": 447.765625, "completions/mean_terminated_length": 447.765625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.0064, "grad_norm": 1.4825108051300049, "kl": 0.0004347562789916992, "learning_rate": 1.25e-06, "loss": 0.0162, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.036845672875642776, "mask/share_reasoning": 0.8360493183135986, "mask/share_step_conf": 0.12710505723953247, "num_tokens": 1375558.0, "reward": 0.34460651874542236, "reward_std": 0.16137650609016418, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6606269478797913, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.28469520807266235, "step": 6 }, { "adv/mean_abs_final_conf": 0.7863481044769287, "adv/mean_abs_reasoning": 0.469906747341156, "adv/mean_abs_step_conf": 0.769094705581665, "adv/ratio_final_to_reasoning": 1.6734130951008324, "adv/ratio_step_to_reasoning": 1.6366964508030275, "adv/std_final_conf": 0.9292763471603394, "adv/std_reasoning": 0.7206797003746033, "adv/std_step_conf": 0.9334774017333984, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.2578125, "calib/ece": 0.25059288537549407, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.3438735177865613, "calib/gap": 0.00951075268817192, "calib/mean_conf": 0.88300395256917, "calib/mu_c": 0.8865000000000001, "calib/mu_w": 0.8769892473118281, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.25059288537549407, "calib/std_conf": 0.045860977239795646, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7959679408138101, "calib/step_q_c_n": 811.0, "calib/step_q_gap": 0.051538034271754074, "calib/step_q_w": 0.744429906542056, "calib/step_q_w_n": 535.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 544.1171875, "completions/mean_terminated_length": 546.2510375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.007466666666666667, "grad_norm": 2.106478452682495, "kl": 0.00030663609504699707, "learning_rate": 1.5e-06, "loss": 0.0688, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.029625695198774338, "mask/share_reasoning": 0.8616422414779663, "mask/share_step_conf": 0.10482584685087204, "num_tokens": 1622276.0, "reward": 0.3923969268798828, "reward_std": 0.17531530559062958, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6933324337005615, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.22885112464427948, "step": 7 }, { "adv/mean_abs_final_conf": 0.7503417730331421, "adv/mean_abs_reasoning": 0.37843891978263855, "adv/mean_abs_step_conf": 0.7550092935562134, "adv/ratio_final_to_reasoning": 1.9827288732990542, "adv/ratio_step_to_reasoning": 1.9950624898460843, "adv/std_final_conf": 0.9293138980865479, "adv/std_reasoning": 0.6815049052238464, "adv/std_step_conf": 0.9338960647583008, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 4.83984375, "calib/ece": 0.30968127490039843, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.32669322709163345, "calib/gap": -0.007965991692627239, "calib/mean_conf": 0.8807569721115538, "calib/mu_c": 0.8773611111111111, "calib/mu_w": 0.8853271028037384, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3083665338645418, "calib/std_conf": 0.07351602734159984, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8087121212121212, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.025499685978960573, "calib/step_q_w": 0.7832124352331606, "calib/step_q_w_n": 579.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2435.0, "completions/max_terminated_length": 2435.0, "completions/mean_length": 557.890625, "completions/mean_terminated_length": 557.890625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.008533333333333334, "grad_norm": 1.6290005445480347, "kl": 0.0003802478313446045, "learning_rate": 1.75e-06, "loss": 0.065, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03190384805202484, "mask/share_reasoning": 0.8644559979438782, "mask/share_step_conf": 0.10364013910293579, "num_tokens": 1871608.0, "reward": 0.3481142520904541, "reward_std": 0.1533786803483963, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.637712836265564, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.2492968589067459, "step": 8 }, { "adv/mean_abs_final_conf": 0.7698661088943481, "adv/mean_abs_reasoning": 0.47346532344818115, "adv/mean_abs_step_conf": 0.7808018326759338, "adv/ratio_final_to_reasoning": 1.626024274148573, "adv/ratio_step_to_reasoning": 1.6491214752317325, "adv/std_final_conf": 0.9296618700027466, "adv/std_reasoning": 0.7394227385520935, "adv/std_step_conf": 0.9349858164787292, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.1796875, "calib/ece": 0.3006827309236948, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.0015334485968878653, "calib/mean_conf": 0.8842971887550201, "calib/mu_c": 0.884931506849315, "calib/mu_w": 0.8833980582524271, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2993172690763052, "calib/std_conf": 0.04356143277983223, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7792553191489362, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.03448179998517309, "calib/step_q_w": 0.7447735191637631, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2802.0, "completions/max_terminated_length": 2802.0, "completions/mean_length": 526.42578125, "completions/mean_terminated_length": 528.490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.0096, "grad_norm": 1.390598177909851, "kl": 0.0004108846187591553, "learning_rate": 2.0000000000000003e-06, "loss": 0.0238, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03303675353527069, "mask/share_reasoning": 0.8570089340209961, "mask/share_step_conf": 0.10604804754257202, "num_tokens": 2113909.0, "reward": 0.33846351504325867, "reward_std": 0.20083707571029663, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6485316753387451, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": -0.27941715717315674, "step": 9 }, { "adv/mean_abs_final_conf": 0.7514711618423462, "adv/mean_abs_reasoning": 0.47852805256843567, "adv/mean_abs_step_conf": 0.758353054523468, "adv/ratio_final_to_reasoning": 1.5703805823063552, "adv/ratio_step_to_reasoning": 1.5847619600420684, "adv/std_final_conf": 0.9294180870056152, "adv/std_reasoning": 0.7574200630187988, "adv/std_step_conf": 0.9331981539726257, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 4.94921875, "calib/ece": 0.30456692913385847, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.43700787401574803, "calib/gap": 0.008992307692307744, "calib/mean_conf": 0.8951181102362206, "calib/mu_c": 0.8987999999999999, "calib/mu_w": 0.8898076923076922, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30456692913385847, "calib/std_conf": 0.04612318565208294, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954441260744987, "calib/step_q_c_n": 698.0, "calib/step_q_gap": 0.015725321153584826, "calib/step_q_w": 0.7797188049209138, "calib/step_q_w_n": 569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 502.77734375, "completions/mean_terminated_length": 504.7490539550781, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.010666666666666666, "grad_norm": 1.6552625894546509, "kl": 0.0006910562515258789, "learning_rate": 2.25e-06, "loss": 0.0153, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03246258199214935, "mask/share_reasoning": 0.8544189929962158, "mask/share_step_conf": 0.10921218991279602, "num_tokens": 2349420.0, "reward": 0.3554859757423401, "reward_std": 0.17813757061958313, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6623660326004028, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.2662378251552582, "step": 10 }, { "adv/mean_abs_final_conf": 0.7631397247314453, "adv/mean_abs_reasoning": 0.448530912399292, "adv/mean_abs_step_conf": 0.7707614898681641, "adv/ratio_final_to_reasoning": 1.7014205791285166, "adv/ratio_step_to_reasoning": 1.7184133101221246, "adv/std_final_conf": 0.9275975227355957, "adv/std_reasoning": 0.7205783724784851, "adv/std_step_conf": 0.9346963763237, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 5.390625, "calib/ece": 0.33494023904382475, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5537848605577689, "calib/gap": -0.013346128822381287, "calib/mean_conf": 0.9037051792828684, "calib/mu_c": 0.8980689655172414, "calib/mu_w": 0.9114150943396226, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.33047808764940245, "calib/std_conf": 0.06001507723137061, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7798165137614679, "calib/step_q_c_n": 763.0, "calib/step_q_gap": -0.00830342140871032, "calib/step_q_w": 0.7881199351701782, "calib/step_q_w_n": 617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2697.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 544.1875, "completions/mean_terminated_length": 544.1875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.011733333333333333, "grad_norm": 1.134965419769287, "kl": 0.0011256933212280273, "learning_rate": 2.5e-06, "loss": 0.0807, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03155206888914108, "mask/share_reasoning": 0.8536930084228516, "mask/share_step_conf": 0.11475491523742676, "num_tokens": 2593212.0, "reward": 0.3331599533557892, "reward_std": 0.16903448104858398, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6271425485610962, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.27019768953323364, "step": 11 }, { "adv/mean_abs_final_conf": 0.7283312678337097, "adv/mean_abs_reasoning": 0.48717015981674194, "adv/mean_abs_step_conf": 0.7630271315574646, "adv/ratio_final_to_reasoning": 1.4950243834878658, "adv/ratio_step_to_reasoning": 1.5662435725630062, "adv/std_final_conf": 0.9263463020324707, "adv/std_reasoning": 0.7575986981391907, "adv/std_step_conf": 0.9330752491950989, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.65234375, "calib/ece": 0.21960000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.592, "calib/gap": -0.008522383545069512, "calib/mean_conf": 0.9092, "calib/mu_c": 0.906609195402299, "calib/mu_w": 0.9151315789473685, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2164000000000001, "calib/std_conf": 0.04393813833106724, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7855875831485588, "calib/step_q_c_n": 902.0, "calib/step_q_gap": 0.015312353790760569, "calib/step_q_w": 0.7702752293577982, "calib/step_q_w_n": 545.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 503.01171875, "completions/mean_terminated_length": 503.01171875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.0128, "grad_norm": 1.2536349296569824, "kl": 0.0021218061447143555, "learning_rate": 2.7500000000000004e-06, "loss": 0.0789, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03501136228442192, "mask/share_reasoning": 0.839159369468689, "mask/share_step_conf": 0.12582923471927643, "num_tokens": 2826159.0, "reward": 0.4238912761211395, "reward_std": 0.18918515741825104, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7163012027740479, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.19898740947246552, "step": 12 }, { "adv/mean_abs_final_conf": 0.737905740737915, "adv/mean_abs_reasoning": 0.41767221689224243, "adv/mean_abs_step_conf": 0.7694048285484314, "adv/ratio_final_to_reasoning": 1.7667101398997085, "adv/ratio_step_to_reasoning": 1.8421259481258108, "adv/std_final_conf": 0.9262003898620605, "adv/std_reasoning": 0.70124351978302, "adv/std_step_conf": 0.9342942833900452, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.12109375, "calib/ece": 0.3045098039215687, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.008605283605283565, "calib/mean_conf": 0.9162745098039217, "calib/mu_c": 0.9196153846153845, "calib/mu_w": 0.9110101010101009, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3045098039215687, "calib/std_conf": 0.03954450381054809, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7824812030075189, "calib/step_q_c_n": 798.0, "calib/step_q_gap": 0.018387635756056953, "calib/step_q_w": 0.7640935672514619, "calib/step_q_w_n": 513.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 482.05078125, "completions/mean_terminated_length": 483.9411926269531, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.013866666666666666, "grad_norm": 1.1540279388427734, "kl": 0.0023784637451171875, "learning_rate": 3e-06, "loss": 0.0281, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03376149386167526, "mask/share_reasoning": 0.8471423983573914, "mask/share_step_conf": 0.11518988013267517, "num_tokens": 3054156.0, "reward": 0.3762282729148865, "reward_std": 0.17012429237365723, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6658226847648621, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.23445990681648254, "step": 13 }, { "adv/mean_abs_final_conf": 0.7835434079170227, "adv/mean_abs_reasoning": 0.5537022352218628, "adv/mean_abs_step_conf": 0.7725571393966675, "adv/ratio_final_to_reasoning": 1.4150988709718049, "adv/ratio_step_to_reasoning": 1.3952573969420798, "adv/std_final_conf": 0.924622118473053, "adv/std_reasoning": 0.7754649519920349, "adv/std_step_conf": 0.9346968531608582, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.1953125, "calib/ece": 0.37145161290322576, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.907258064516129, "calib/gap": -0.0037807383840392506, "calib/mean_conf": 0.94, "calib/mu_c": 0.9383687943262411, "calib/mu_w": 0.9421495327102803, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.37145161290322576, "calib/std_conf": 0.027119805879978955, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7759945504087195, "calib/step_q_c_n": 734.0, "calib/step_q_gap": 0.027471060475833586, "calib/step_q_w": 0.7485234899328859, "calib/step_q_w_n": 596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 547.07421875, "completions/mean_terminated_length": 551.3818969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.014933333333333333, "grad_norm": 0.9899579882621765, "kl": 0.005403995513916016, "learning_rate": 3.2500000000000002e-06, "loss": 0.017, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.031641483306884766, "mask/share_reasoning": 0.8485899567604065, "mask/share_step_conf": 0.11195607483386993, "num_tokens": 3299607.0, "reward": 0.32880502939224243, "reward_std": 0.21657250821590424, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5910734534263611, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": -0.23658838868141174, "step": 14 }, { "adv/mean_abs_final_conf": 0.7555526494979858, "adv/mean_abs_reasoning": 0.4017358422279358, "adv/mean_abs_step_conf": 0.7688524127006531, "adv/ratio_final_to_reasoning": 1.8807200405815483, "adv/ratio_step_to_reasoning": 1.9138257827252159, "adv/std_final_conf": 0.906000554561615, "adv/std_reasoning": 0.6815659999847412, "adv/std_step_conf": 0.9339501857757568, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 4.83203125, "calib/ece": 0.3578823529411764, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9176470588235294, "calib/gap": 0.006999999999999784, "calib/mean_conf": 0.9461176470588234, "calib/mu_c": 0.9489999999999997, "calib/mu_w": 0.942, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3578823529411764, "calib/std_conf": 0.05374801302622813, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7532305630026809, "calib/step_q_c_n": 746.0, "calib/step_q_gap": -0.001677787302818201, "calib/step_q_w": 0.7549083503054991, "calib/step_q_w_n": 491.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2360.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 472.23046875, "completions/mean_terminated_length": 474.0823669433594, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.016, "grad_norm": 1.7005994319915771, "kl": 0.009075164794921875, "learning_rate": 3.5e-06, "loss": 0.0522, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033927664160728455, "mask/share_reasoning": 0.8490947484970093, "mask/share_step_conf": 0.11307130753993988, "num_tokens": 3528378.0, "reward": 0.35110145807266235, "reward_std": 0.18108385801315308, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6277461051940918, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.24194946885108948, "step": 15 }, { "adv/mean_abs_final_conf": 0.7394810914993286, "adv/mean_abs_reasoning": 0.43682926893234253, "adv/mean_abs_step_conf": 0.7458778619766235, "adv/ratio_final_to_reasoning": 1.6928377837563393, "adv/ratio_step_to_reasoning": 1.7074814235768332, "adv/std_final_conf": 0.8953023552894592, "adv/std_reasoning": 0.7014684677124023, "adv/std_step_conf": 0.9345353841781616, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.390625, "calib/ece": 0.3287351778656126, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9723320158102767, "calib/gap": 0.0006128730095009693, "calib/mean_conf": 0.957193675889328, "calib/mu_c": 0.9574213836477988, "calib/mu_w": 0.9568085106382979, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3287351778656126, "calib/std_conf": 0.019470004281586368, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7212183692596064, "calib/step_q_c_n": 1067.0, "calib/step_q_gap": -0.011488133376597487, "calib/step_q_w": 0.7327065026362038, "calib/step_q_w_n": 569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2635.0, "completions/max_terminated_length": 2635.0, "completions/mean_length": 650.8515625, "completions/mean_terminated_length": 650.8515625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.017066666666666667, "grad_norm": 1.4386796951293945, "kl": 0.010073661804199219, "learning_rate": 3.7500000000000005e-06, "loss": 0.0856, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.025310292840003967, "mask/share_reasoning": 0.8630415201187134, "mask/share_step_conf": 0.11164823174476624, "num_tokens": 3803844.0, "reward": 0.3730993866920471, "reward_std": 0.18799588084220886, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6497913599014282, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.22390510141849518, "step": 16 }, { "adv/mean_abs_final_conf": 0.7376053929328918, "adv/mean_abs_reasoning": 0.44774770736694336, "adv/mean_abs_step_conf": 0.7699299454689026, "adv/ratio_final_to_reasoning": 1.647368329969808, "adv/ratio_step_to_reasoning": 1.719562005122498, "adv/std_final_conf": 0.8852930665016174, "adv/std_reasoning": 0.7014374732971191, "adv/std_step_conf": 0.9343675971031189, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 5.65625, "calib/ece": 0.24218253968253975, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9801587301587301, "calib/gap": 0.010055555555555484, "calib/mean_conf": 0.9549603174603174, "calib/mu_c": 0.9578333333333332, "calib/mu_w": 0.9477777777777777, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.2414285714285715, "calib/std_conf": 0.05771095200091226, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7282984790874525, "calib/step_q_c_n": 1052.0, "calib/step_q_gap": 0.0283489841379575, "calib/step_q_w": 0.699949494949495, "calib/step_q_w_n": 396.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 528.109375, "completions/mean_terminated_length": 532.2677001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.018133333333333335, "grad_norm": 0.9054005146026611, "kl": 0.016378402709960938, "learning_rate": 4.000000000000001e-06, "loss": 0.0752, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031545430421829224, "mask/share_reasoning": 0.8398832082748413, "mask/share_step_conf": 0.12075883150100708, "num_tokens": 4042568.0, "reward": 0.43897920846939087, "reward_std": 0.18807503581047058, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7148038744926453, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": -0.17043927311897278, "step": 17 }, { "adv/mean_abs_final_conf": 0.7131336331367493, "adv/mean_abs_reasoning": 0.40515708923339844, "adv/mean_abs_step_conf": 0.7440693378448486, "adv/ratio_final_to_reasoning": 1.7601410714201648, "adv/ratio_step_to_reasoning": 1.8364959113826917, "adv/std_final_conf": 0.8951680660247803, "adv/std_reasoning": 0.7013642191886902, "adv/std_step_conf": 0.9341275691986084, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.1328125, "calib/ece": 0.42048387096774187, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": -0.003303834808259798, "calib/mean_conf": 0.9648387096774195, "calib/mu_c": 0.9633333333333332, "calib/mu_w": 0.966637168141593, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.42048387096774187, "calib/std_conf": 0.015527492318140311, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6932142857142858, "calib/step_q_c_n": 672.0, "calib/step_q_gap": 0.002466622162883847, "calib/step_q_w": 0.6907476635514019, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 552.58984375, "completions/mean_terminated_length": 556.94091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.0192, "grad_norm": 1.1966159343719482, "kl": 0.016684532165527344, "learning_rate": 4.25e-06, "loss": 0.0388, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.030978208407759666, "mask/share_reasoning": 0.8578979969024658, "mask/share_step_conf": 0.10331130772829056, "num_tokens": 4294751.0, "reward": 0.3016693890094757, "reward_std": 0.1707407385110855, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5470273494720459, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": -0.23978236317634583, "step": 18 }, { "adv/mean_abs_final_conf": 0.7240345478057861, "adv/mean_abs_reasoning": 0.41203486919403076, "adv/mean_abs_step_conf": 0.7667274475097656, "adv/ratio_final_to_reasoning": 1.7572166870780832, "adv/ratio_step_to_reasoning": 1.860831460719668, "adv/std_final_conf": 0.8740739822387695, "adv/std_reasoning": 0.7013679146766663, "adv/std_step_conf": 0.9340393543243408, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 4.6484375, "calib/ece": 0.3597628458498022, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004545454545452632, "calib/mean_conf": 0.9684584980237153, "calib/mu_c": 0.9686363636363635, "calib/mu_w": 0.9681818181818183, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3597628458498022, "calib/std_conf": 0.010576058843868474, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7163469187675069, "calib/step_q_c_n": 714.0, "calib/step_q_gap": 0.0026074229691875894, "calib/step_q_w": 0.7137394957983193, "calib/step_q_w_n": 476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 496.23828125, "completions/mean_terminated_length": 498.1843566894531, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.020266666666666665, "grad_norm": 1.6525537967681885, "kl": 0.0213470458984375, "learning_rate": 4.5e-06, "loss": -0.0181, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030582614243030548, "mask/share_reasoning": 0.8619511723518372, "mask/share_step_conf": 0.1035599410533905, "num_tokens": 4526548.0, "reward": 0.3647327721118927, "reward_std": 0.187033012509346, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6250780820846558, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.21358135342597961, "step": 19 }, { "adv/mean_abs_final_conf": 0.705518364906311, "adv/mean_abs_reasoning": 0.42651718854904175, "adv/mean_abs_step_conf": 0.7572118043899536, "adv/ratio_final_to_reasoning": 1.6541381774235089, "adv/ratio_step_to_reasoning": 1.7753371369765745, "adv/std_final_conf": 0.8713658452033997, "adv/std_reasoning": 0.7204932570457458, "adv/std_step_conf": 0.9350305199623108, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 5.42578125, "calib/ece": 0.42270916334661346, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": -0.001246633320508006, "calib/mean_conf": 0.970996015936255, "calib/mu_c": 0.9704347826086958, "calib/mu_w": 0.9716814159292038, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42195219123505967, "calib/std_conf": 0.015773672656279998, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6602646239554317, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.00811857328479082, "calib/step_q_w": 0.6521460506706409, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 500.640625, "completions/mean_terminated_length": 502.60394287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.021333333333333333, "grad_norm": 0.9578653573989868, "kl": 0.029819488525390625, "learning_rate": 4.75e-06, "loss": 0.0468, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03496409207582474, "mask/share_reasoning": 0.8334662318229675, "mask/share_step_conf": 0.12766346335411072, "num_tokens": 4759584.0, "reward": 0.3266947865486145, "reward_std": 0.17774325609207153, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5629937648773193, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.2135103940963745, "step": 20 }, { "adv/mean_abs_final_conf": 0.7249806523323059, "adv/mean_abs_reasoning": 0.5042846202850342, "adv/mean_abs_step_conf": 0.7544502019882202, "adv/ratio_final_to_reasoning": 1.43764180617392, "adv/ratio_step_to_reasoning": 1.4960801334012255, "adv/std_final_conf": 0.8839868903160095, "adv/std_reasoning": 0.757595956325531, "adv/std_step_conf": 0.9354380965232849, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.6796875, "calib/ece": 0.35011857707509875, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0021065552016986677, "calib/mean_conf": 0.9706719367588932, "calib/mu_c": 0.9698726114649681, "calib/mu_w": 0.9719791666666667, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35011857707509875, "calib/std_conf": 0.01113797242974695, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6385108958837773, "calib/step_q_c_n": 826.0, "calib/step_q_gap": -0.01804961367036284, "calib/step_q_w": 0.6565605095541401, "calib/step_q_w_n": 628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2919.0, "completions/max_terminated_length": 2919.0, "completions/mean_length": 536.6796875, "completions/mean_terminated_length": 536.6796875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.0224, "grad_norm": 0.8330198526382446, "kl": 0.030071258544921875, "learning_rate": 5e-06, "loss": 0.0666, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03229574114084244, "mask/share_reasoning": 0.84485924243927, "mask/share_step_conf": 0.12284499406814575, "num_tokens": 4999934.0, "reward": 0.3799642324447632, "reward_std": 0.2192906141281128, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6333242058753967, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.19370830059051514, "step": 21 }, { "adv/mean_abs_final_conf": 0.699131965637207, "adv/mean_abs_reasoning": 0.37140166759490967, "adv/mean_abs_step_conf": 0.7832655906677246, "adv/ratio_final_to_reasoning": 1.8824147187183744, "adv/ratio_step_to_reasoning": 2.108944732908517, "adv/std_final_conf": 0.8496929407119751, "adv/std_reasoning": 0.6402788758277893, "adv/std_step_conf": 0.9349262714385986, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.4765625, "calib/ece": 0.3171764705882353, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00029667936853572385, "calib/mean_conf": 0.972078431372549, "calib/mu_c": 0.9719760479041916, "calib/mu_w": 0.9722727272727273, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3171764705882353, "calib/std_conf": 0.011681799380022513, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6708108108108108, "calib/step_q_c_n": 888.0, "calib/step_q_gap": 0.033048164896413934, "calib/step_q_w": 0.6377626459143969, "calib/step_q_w_n": 514.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 501.46875, "completions/mean_terminated_length": 501.46875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.023466666666666667, "grad_norm": 1.0240269899368286, "kl": 0.044139862060546875, "learning_rate": 4.9722222222222224e-06, "loss": 0.0309, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03162510693073273, "mask/share_reasoning": 0.8491679430007935, "mask/share_step_conf": 0.1192069873213768, "num_tokens": 5230126.0, "reward": 0.4074317514896393, "reward_std": 0.1798522025346756, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6704937219619751, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.185317724943161, "step": 22 }, { "adv/mean_abs_final_conf": 0.7653679847717285, "adv/mean_abs_reasoning": 0.5065703392028809, "adv/mean_abs_step_conf": 0.7755952477455139, "adv/ratio_final_to_reasoning": 1.5108819556551247, "adv/ratio_step_to_reasoning": 1.5310711814788842, "adv/std_final_conf": 0.88029944896698, "adv/std_reasoning": 0.7393872737884521, "adv/std_step_conf": 0.9353004693984985, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.5234375, "calib/ece": 0.41898039215686267, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 4.362457933226871e-06, "calib/mean_conf": 0.9758431372549019, "calib/mu_c": 0.975845070422535, "calib/mu_w": 0.9758407079646018, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41898039215686267, "calib/std_conf": 0.011747961940927244, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6223529411764707, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.018685760899120885, "calib/step_q_w": 0.6036671802773498, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1440.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 524.51171875, "completions/mean_terminated_length": 526.5686645507812, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.024533333333333334, "grad_norm": 1.0188655853271484, "kl": 0.0431671142578125, "learning_rate": 4.944444444444445e-06, "loss": 0.0684, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0323941633105278, "mask/share_reasoning": 0.841258704662323, "mask/share_step_conf": 0.1224408820271492, "num_tokens": 5468337.0, "reward": 0.3318406045436859, "reward_std": 0.2227940410375595, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5750659704208374, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.2207598090171814, "step": 23 }, { "adv/mean_abs_final_conf": 0.7638611197471619, "adv/mean_abs_reasoning": 0.6413958072662354, "adv/mean_abs_step_conf": 0.7876379489898682, "adv/ratio_final_to_reasoning": 1.1909356299083083, "adv/ratio_step_to_reasoning": 1.2280060768512782, "adv/std_final_conf": 0.9069669842720032, "adv/std_reasoning": 0.8429003357887268, "adv/std_step_conf": 0.9353696703910828, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.875, "calib/ece": 0.4378968253968255, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0004918864097362974, "calib/mean_conf": 0.9775793650793652, "calib/mu_c": 0.9773529411764708, "calib/mu_w": 0.977844827586207, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4378968253968255, "calib/std_conf": 0.011551027697426637, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6079061371841155, "calib/step_q_c_n": 831.0, "calib/step_q_gap": 0.04093734075023725, "calib/step_q_w": 0.5669687964338782, "calib/step_q_w_n": 673.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 565.125, "completions/mean_terminated_length": 567.3411865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.0256, "grad_norm": 1.043018102645874, "kl": 0.03503227233886719, "learning_rate": 4.9166666666666665e-06, "loss": 0.0101, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030982669442892075, "mask/share_reasoning": 0.84433513879776, "mask/share_step_conf": 0.12077593803405762, "num_tokens": 5717521.0, "reward": 0.32456642389297485, "reward_std": 0.24878433346748352, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5507019758224487, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.20469412207603455, "step": 24 }, { "adv/mean_abs_final_conf": 0.6897413730621338, "adv/mean_abs_reasoning": 0.4197363257408142, "adv/mean_abs_step_conf": 0.7734383344650269, "adv/ratio_final_to_reasoning": 1.6432730044147925, "adv/ratio_step_to_reasoning": 1.8426766687394658, "adv/std_final_conf": 0.8677418231964111, "adv/std_reasoning": 0.7012581825256348, "adv/std_step_conf": 0.9352645874023438, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.62890625, "calib/ece": 0.38632812500000013, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0030465587044534725, "calib/mean_conf": 0.9800781250000001, "calib/mu_c": 0.9813157894736844, "calib/mu_w": 0.9782692307692309, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38632812500000013, "calib/std_conf": 0.010532397470869356, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6058703481392557, "calib/step_q_c_n": 833.0, "calib/step_q_gap": 0.05295916392872935, "calib/step_q_w": 0.5529111842105263, "calib/step_q_w_n": 608.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 486.05859375, "completions/mean_terminated_length": 487.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.02666666666666667, "grad_norm": 0.9559767842292786, "kl": 0.03623199462890625, "learning_rate": 4.888888888888889e-06, "loss": 0.0248, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.032504092901945114, "mask/share_reasoning": 0.8402522206306458, "mask/share_step_conf": 0.12333747744560242, "num_tokens": 5945176.0, "reward": 0.3616005778312683, "reward_std": 0.18630656599998474, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6108984351158142, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.2064473032951355, "step": 25 }, { "adv/mean_abs_final_conf": 0.6565980911254883, "adv/mean_abs_reasoning": 0.3444192707538605, "adv/mean_abs_step_conf": 0.767449140548706, "adv/ratio_final_to_reasoning": 1.906391851095715, "adv/ratio_step_to_reasoning": 2.228241000768985, "adv/std_final_conf": 0.8418591022491455, "adv/std_reasoning": 0.6611375212669373, "adv/std_step_conf": 0.9351149201393127, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 4.921875, "calib/ece": 0.33865079365079376, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 9.876543209874633e-05, "calib/mean_conf": 0.9815079365079367, "calib/mu_c": 0.9815432098765432, "calib/mu_w": 0.9814444444444445, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.33865079365079376, "calib/std_conf": 0.010842850055649682, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6031436314363143, "calib/step_q_c_n": 738.0, "calib/step_q_gap": 0.025001868984206976, "calib/step_q_w": 0.5781417624521074, "calib/step_q_w_n": 522.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 483.078125, "completions/mean_terminated_length": 484.9725646972656, "completions/min_length": 0.0, "completions/min_terminated_length": 225.0, "epoch": 0.027733333333333332, "grad_norm": 0.9195064306259155, "kl": 0.03989410400390625, "learning_rate": 4.861111111111111e-06, "loss": 0.0103, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031222233548760414, "mask/share_reasoning": 0.8557568788528442, "mask/share_step_conf": 0.10911465436220169, "num_tokens": 6174084.0, "reward": 0.387053519487381, "reward_std": 0.143305242061615, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.641502320766449, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.1900515854358673, "step": 26 }, { "adv/mean_abs_final_conf": 0.7401793003082275, "adv/mean_abs_reasoning": 0.5729160308837891, "adv/mean_abs_step_conf": 0.7543942928314209, "adv/ratio_final_to_reasoning": 1.291950757891022, "adv/ratio_step_to_reasoning": 1.3167624087384686, "adv/std_final_conf": 0.9016159176826477, "adv/std_reasoning": 0.8098499774932861, "adv/std_step_conf": 0.9356675148010254, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.6875, "calib/ece": 0.47172549019607846, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9921568627450981, "calib/gap": -0.0069422556020685144, "calib/mean_conf": 0.9763529411764706, "calib/mu_c": 0.9729770992366412, "calib/mu_w": 0.9799193548387097, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4671764705882353, "calib/std_conf": 0.06286331999786045, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5627042253521127, "calib/step_q_c_n": 710.0, "calib/step_q_gap": -0.017269903334214298, "calib/step_q_w": 0.579974128686327, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 512.48828125, "completions/mean_terminated_length": 512.48828125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.0288, "grad_norm": 0.8841757774353027, "kl": 0.05023193359375, "learning_rate": 4.833333333333333e-06, "loss": 0.0316, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03174913674592972, "mask/share_reasoning": 0.84727942943573, "mask/share_step_conf": 0.1209714263677597, "num_tokens": 6410497.0, "reward": 0.3019501566886902, "reward_std": 0.2588861584663391, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5266007781028748, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.22348174452781677, "step": 27 }, { "adv/mean_abs_final_conf": 0.7030376195907593, "adv/mean_abs_reasoning": 0.42762160301208496, "adv/mean_abs_step_conf": 0.7977121472358704, "adv/ratio_final_to_reasoning": 1.644064786808469, "adv/ratio_step_to_reasoning": 1.865462693224426, "adv/std_final_conf": 0.850563108921051, "adv/std_reasoning": 0.7014147043228149, "adv/std_step_conf": 0.9355003237724304, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 4.77734375, "calib/ece": 0.3548790322580647, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": -0.0013978494623654303, "calib/mean_conf": 0.9798790322580647, "calib/mu_c": 0.9793548387096775, "calib/mu_w": 0.980752688172043, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3548790322580647, "calib/std_conf": 0.012296139403274755, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5986900662251655, "calib/step_q_c_n": 755.0, "calib/step_q_gap": 0.023647331182430453, "calib/step_q_w": 0.5750427350427351, "calib/step_q_w_n": 468.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2213.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 557.98046875, "completions/mean_terminated_length": 560.1686401367188, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.029866666666666666, "grad_norm": 0.7727410197257996, "kl": 0.034938812255859375, "learning_rate": 4.805555555555556e-06, "loss": 0.0384, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02986774779856205, "mask/share_reasoning": 0.8633028268814087, "mask/share_step_conf": 0.10292316228151321, "num_tokens": 6660284.0, "reward": 0.3603275418281555, "reward_std": 0.18742801249027252, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6148539185523987, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": -0.20748008787631989, "step": 28 }, { "adv/mean_abs_final_conf": 0.7449028491973877, "adv/mean_abs_reasoning": 0.4773523807525635, "adv/mean_abs_step_conf": 0.7774635553359985, "adv/ratio_final_to_reasoning": 1.5604883922921284, "adv/ratio_step_to_reasoning": 1.6286994402548047, "adv/std_final_conf": 0.8901979923248291, "adv/std_reasoning": 0.7394049167633057, "adv/std_step_conf": 0.935321033000946, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.4923015873015873, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": -0.0033163168840989465, "calib/mean_conf": 0.9803968253968254, "calib/mu_c": 0.9786991869918701, "calib/mu_w": 0.982015503875969, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.4923015873015873, "calib/std_conf": 0.012176010348925173, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5727431059506531, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.04853050752545629, "calib/step_q_w": 0.5242125984251969, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 585.17578125, "completions/mean_terminated_length": 585.17578125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.030933333333333334, "grad_norm": 0.8804644346237183, "kl": 0.03813934326171875, "learning_rate": 4.777777777777778e-06, "loss": -0.0372, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02827608771622181, "mask/share_reasoning": 0.8626615405082703, "mask/share_step_conf": 0.10906237363815308, "num_tokens": 6917217.0, "reward": 0.2944219708442688, "reward_std": 0.21288272738456726, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.4901074171066284, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.19188852608203888, "step": 29 }, { "adv/mean_abs_final_conf": 0.7416458129882812, "adv/mean_abs_reasoning": 0.5515131950378418, "adv/mean_abs_step_conf": 0.760871946811676, "adv/ratio_final_to_reasoning": 1.3447471786008558, "adv/ratio_step_to_reasoning": 1.379607874584885, "adv/std_final_conf": 0.8862450122833252, "adv/std_reasoning": 0.7928551435470581, "adv/std_step_conf": 0.9354478716850281, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 5.1328125, "calib/ece": 0.4210714285714285, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.00931240848029502, "calib/mean_conf": 0.9726587301587302, "calib/mu_c": 0.9768345323741006, "calib/mu_w": 0.9675221238938055, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.4210714285714285, "calib/std_conf": 0.057172762787333534, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5627169274537696, "calib/step_q_c_n": 703.0, "calib/step_q_gap": 0.011031166406306414, "calib/step_q_w": 0.5516857610474631, "calib/step_q_w_n": 611.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 593.671875, "completions/mean_terminated_length": 596.0000610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.032, "grad_norm": 0.8357536196708679, "kl": 0.035503387451171875, "learning_rate": 4.75e-06, "loss": -0.0279, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.027449211105704308, "mask/share_reasoning": 0.8674349784851074, "mask/share_step_conf": 0.1012095957994461, "num_tokens": 7176181.0, "reward": 0.31889694929122925, "reward_std": 0.22138473391532898, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5547477006912231, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": -0.21617251634597778, "step": 30 }, { "adv/mean_abs_final_conf": 0.7557934522628784, "adv/mean_abs_reasoning": 0.476267009973526, "adv/mean_abs_step_conf": 0.7879382371902466, "adv/ratio_final_to_reasoning": 1.586911199885313, "adv/ratio_step_to_reasoning": 1.6544044006618164, "adv/std_final_conf": 0.8960928916931152, "adv/std_reasoning": 0.7394196391105652, "adv/std_step_conf": 0.9349035024642944, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 5.90625, "calib/ece": 0.5464081632653063, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": 0.0024523809523813167, "calib/mean_conf": 0.9749795918367347, "calib/mu_c": 0.9763809523809525, "calib/mu_w": 0.9739285714285711, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.5464081632653063, "calib/std_conf": 0.014863308879714873, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5909352517985611, "calib/step_q_c_n": 556.0, "calib/step_q_gap": 0.03200219740525567, "calib/step_q_w": 0.5589330543933054, "calib/step_q_w_n": 956.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2991.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 618.41015625, "completions/mean_terminated_length": 623.279541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.03306666666666667, "grad_norm": 0.7795594930648804, "kl": 0.03357505798339844, "learning_rate": 4.722222222222222e-06, "loss": 0.0062, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02844122424721718, "mask/share_reasoning": 0.8597810864448547, "mask/share_step_conf": 0.10396520793437958, "num_tokens": 7440406.0, "reward": 0.2383827120065689, "reward_std": 0.2029481828212738, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.4378613233566284, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": -0.23609593510627747, "step": 31 }, { "adv/mean_abs_final_conf": 0.6904056668281555, "adv/mean_abs_reasoning": 0.43361717462539673, "adv/mean_abs_step_conf": 0.7783961892127991, "adv/ratio_final_to_reasoning": 1.592200925677354, "adv/ratio_step_to_reasoning": 1.7951230596095693, "adv/std_final_conf": 0.8829302191734314, "adv/std_reasoning": 0.7205833196640015, "adv/std_step_conf": 0.9355403184890747, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 4.8671875, "calib/ece": 0.47612000000000015, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.972, "calib/gap": 0.007701811663785918, "calib/mean_conf": 0.9681200000000001, "calib/mu_c": 0.9720325203252034, "calib/mu_w": 0.9643307086614175, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.47612000000000015, "calib/std_conf": 0.060432322477296865, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5733505154639176, "calib/step_q_c_n": 582.0, "calib/step_q_gap": 0.02332039498198979, "calib/step_q_w": 0.5500301204819278, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2556.0, "completions/max_terminated_length": 2556.0, "completions/mean_length": 514.97265625, "completions/mean_terminated_length": 523.1468505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.034133333333333335, "grad_norm": 0.8769909739494324, "kl": 0.0423736572265625, "learning_rate": 4.694444444444445e-06, "loss": 0.0049, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030418388545513153, "mask/share_reasoning": 0.8499599695205688, "mask/share_step_conf": 0.1039966493844986, "num_tokens": 7678943.0, "reward": 0.29528889060020447, "reward_std": 0.19717194139957428, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.5112226009368896, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.21126985549926758, "step": 32 }, { "adv/mean_abs_final_conf": 0.686201274394989, "adv/mean_abs_reasoning": 0.5028985738754272, "adv/mean_abs_step_conf": 0.7896302938461304, "adv/ratio_final_to_reasoning": 1.3644923848302015, "adv/ratio_step_to_reasoning": 1.5701581489108165, "adv/std_final_conf": 0.8819124698638916, "adv/std_reasoning": 0.775467574596405, "adv/std_step_conf": 0.9355705380439758, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 5.3125, "calib/ece": 0.46453441295546577, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9757085020242915, "calib/gap": 0.026363278688524838, "calib/mean_conf": 0.9584615384615386, "calib/mu_c": 0.9718032786885249, "calib/mu_w": 0.9454400000000001, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.46453441295546577, "calib/std_conf": 0.10703239103764979, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5873088685015291, "calib/step_q_c_n": 654.0, "calib/step_q_gap": 0.037124732524191906, "calib/step_q_w": 0.5501841359773372, "calib/step_q_w_n": 706.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2995.0, "completions/max_terminated_length": 2995.0, "completions/mean_length": 562.0234375, "completions/mean_terminated_length": 562.0234375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.0352, "grad_norm": 0.8330094814300537, "kl": 0.039539337158203125, "learning_rate": 4.666666666666667e-06, "loss": 0.0501, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.029015641659498215, "mask/share_reasoning": 0.8654122948646545, "mask/share_step_conf": 0.10557205975055695, "num_tokens": 7929693.0, "reward": 0.29371070861816406, "reward_std": 0.21325403451919556, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.512758195400238, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": -0.21127429604530334, "step": 33 }, { "adv/mean_abs_final_conf": 0.7370667457580566, "adv/mean_abs_reasoning": 0.5098717212677002, "adv/mean_abs_step_conf": 0.7471521496772766, "adv/ratio_final_to_reasoning": 1.44559251869368, "adv/ratio_step_to_reasoning": 1.4653727957683615, "adv/std_final_conf": 0.9055343270301819, "adv/std_reasoning": 0.757660984992981, "adv/std_step_conf": 0.9352141618728638, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.61328125, "calib/ece": 0.3965476190476191, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9682539682539683, "calib/gap": -0.0023428939735739007, "calib/mean_conf": 0.9599603174603174, "calib/mu_c": 0.9589655172413794, "calib/mu_w": 0.9613084112149533, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3905555555555556, "calib/std_conf": 0.071727537913533, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48412466843501323, "calib/step_q_c_n": 754.0, "calib/step_q_gap": 0.010698606941601796, "calib/step_q_w": 0.47342606149341143, "calib/step_q_w_n": 683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2751.0, "completions/max_terminated_length": 2751.0, "completions/mean_length": 488.24609375, "completions/mean_terminated_length": 490.16082763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.03626666666666667, "grad_norm": 0.707058310508728, "kl": 0.05702972412109375, "learning_rate": 4.638888888888889e-06, "loss": 0.0112, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033386290073394775, "mask/share_reasoning": 0.8376938104629517, "mask/share_step_conf": 0.12501364946365356, "num_tokens": 8159796.0, "reward": 0.36031782627105713, "reward_std": 0.20753872394561768, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5918011665344238, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.180540531873703, "step": 34 }, { "adv/mean_abs_final_conf": 0.6628189086914062, "adv/mean_abs_reasoning": 0.46881189942359924, "adv/mean_abs_step_conf": 0.7591454982757568, "adv/ratio_final_to_reasoning": 1.4138269730489716, "adv/ratio_step_to_reasoning": 1.6192965648037532, "adv/std_final_conf": 0.8670271039009094, "adv/std_reasoning": 0.72071373462677, "adv/std_step_conf": 0.9348909854888916, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 5.55859375, "calib/ece": 0.4212826446280994, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9380165289256198, "calib/gap": 0.0023757575757575866, "calib/mean_conf": 0.9474776859504134, "calib/mu_c": 0.9485575757575757, "calib/mu_w": 0.9461818181818181, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.4116528925619837, "calib/std_conf": 0.11901983547972321, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.49606580829756797, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.07246083592187746, "calib/step_q_w": 0.4236049723756905, "calib/step_q_w_n": 724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2929.0, "completions/max_terminated_length": 2929.0, "completions/mean_length": 633.07421875, "completions/mean_terminated_length": 633.07421875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.037333333333333336, "grad_norm": 0.7305748462677002, "kl": 0.0447235107421875, "learning_rate": 4.611111111111112e-06, "loss": -0.0196, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.027223873883485794, "mask/share_reasoning": 0.8737444877624512, "mask/share_step_conf": 0.09903167188167572, "num_tokens": 8431119.0, "reward": 0.3382720947265625, "reward_std": 0.18578404188156128, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5458762645721436, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": -0.16230084002017975, "step": 35 }, { "adv/mean_abs_final_conf": 0.701039731502533, "adv/mean_abs_reasoning": 0.5507223606109619, "adv/mean_abs_step_conf": 0.7414873242378235, "adv/ratio_final_to_reasoning": 1.2729458283204835, "adv/ratio_step_to_reasoning": 1.3463904451150852, "adv/std_final_conf": 0.9193742871284485, "adv/std_reasoning": 0.8100778460502625, "adv/std_step_conf": 0.9348658919334412, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 5.28515625, "calib/ece": 0.24663865546218486, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.9369747899159664, "calib/gap": -0.0024999999999999467, "calib/mean_conf": 0.9580672268907563, "calib/mu_c": 0.9573529411764706, "calib/mu_w": 0.9598529411764706, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24521008403361344, "calib/std_conf": 0.04645006917027477, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.445568669527897, "calib/step_q_c_n": 932.0, "calib/step_q_gap": -0.018136793655000816, "calib/step_q_w": 0.4637054631828978, "calib/step_q_w_n": 421.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 521.45703125, "completions/mean_terminated_length": 521.45703125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.0384, "grad_norm": 1.119794249534607, "kl": 0.064483642578125, "learning_rate": 4.583333333333333e-06, "loss": 0.0233, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.03289055451750755, "mask/share_reasoning": 0.8433002233505249, "mask/share_step_conf": 0.12380918860435486, "num_tokens": 8667324.0, "reward": 0.42279964685440063, "reward_std": 0.19623100757598877, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6815191507339478, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": -0.15466980636119843, "step": 36 }, { "adv/mean_abs_final_conf": 0.6451778411865234, "adv/mean_abs_reasoning": 0.47128838300704956, "adv/mean_abs_step_conf": 0.7719460725784302, "adv/ratio_final_to_reasoning": 1.3689661456749143, "adv/ratio_step_to_reasoning": 1.6379484417864028, "adv/std_final_conf": 0.8708301782608032, "adv/std_reasoning": 0.7394452691078186, "adv/std_step_conf": 0.9352310299873352, "calib/answer_extract_rate": 0.9296875, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.5019915254237289, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.9449152542372882, "calib/gap": -0.014523809523809605, "calib/mean_conf": 0.9527542372881357, "calib/mu_c": 0.945, "calib/mu_w": 0.9595238095238096, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.49432203389830515, "calib/std_conf": 0.0742336481384533, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.47167597765363134, "calib/step_q_c_n": 537.0, "calib/step_q_gap": 0.034392958785706806, "calib/step_q_w": 0.43728301886792453, "calib/step_q_w_n": 795.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 564.5625, "completions/mean_terminated_length": 571.2569580078125, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.039466666666666664, "grad_norm": 1.040130615234375, "kl": 0.05255126953125, "learning_rate": 4.555555555555556e-06, "loss": -0.0745, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.028845518827438354, "mask/share_reasoning": 0.8554621934890747, "mask/share_step_conf": 0.10397352278232574, "num_tokens": 8918948.0, "reward": 0.2830325961112976, "reward_std": 0.15572383999824524, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.46216249465942383, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": -0.16640979051589966, "step": 37 }, { "adv/mean_abs_final_conf": 0.7251741290092468, "adv/mean_abs_reasoning": 0.5883480906486511, "adv/mean_abs_step_conf": 0.7738329172134399, "adv/ratio_final_to_reasoning": 1.2325596709419855, "adv/ratio_step_to_reasoning": 1.315263752042252, "adv/std_final_conf": 0.9110890626907349, "adv/std_reasoning": 0.8268707394599915, "adv/std_step_conf": 0.934368908405304, "calib/answer_extract_rate": 0.91015625, "calib/avg_num_step_conf": 5.234375, "calib/ece": 0.40309012875536493, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.9699570815450643, "calib/gap": 0.0037602688573562526, "calib/mean_conf": 0.961030042918455, "calib/mu_c": 0.9626923076923075, "calib/mu_w": 0.9589320388349513, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.40309012875536493, "calib/std_conf": 0.04745690566931552, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.48682027649769577, "calib/step_q_c_n": 651.0, "calib/step_q_gap": 0.02317731568492365, "calib/step_q_w": 0.4636429608127721, "calib/step_q_w_n": 689.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 553.5859375, "completions/mean_terminated_length": 555.7568969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.04053333333333333, "grad_norm": 0.9234789609909058, "kl": 0.050746917724609375, "learning_rate": 4.527777777777778e-06, "loss": -0.1114, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.029032420367002487, "mask/share_reasoning": 0.8594563007354736, "mask/share_step_conf": 0.10760502517223358, "num_tokens": 9167554.0, "reward": 0.32611435651779175, "reward_std": 0.21093352138996124, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5374273657798767, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": -0.16879235208034515, "step": 38 }, { "adv/mean_abs_final_conf": 0.7383555173873901, "adv/mean_abs_reasoning": 0.6348795890808105, "adv/mean_abs_step_conf": 0.7644864916801453, "adv/ratio_final_to_reasoning": 1.1629851236143751, "adv/ratio_step_to_reasoning": 1.2041440689359408, "adv/std_final_conf": 0.9126340746879578, "adv/std_reasoning": 0.8593624234199524, "adv/std_step_conf": 0.9349706172943115, "calib/answer_extract_rate": 0.8828125, "calib/avg_num_step_conf": 5.8203125, "calib/ece": 0.44205357142857155, "calib/final_conf_rate": 0.875, "calib/format_rate": 0.875, "calib/frac_conf_gt_0.9": 0.9151785714285714, "calib/gap": 0.02399266523160315, "calib/mean_conf": 0.9465178571428573, "calib/mu_c": 0.9584070796460177, "calib/mu_w": 0.9344144144144145, "calib/nonempty_final_conf_rate": 0.875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.44205357142857155, "calib/std_conf": 0.07916088749940765, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4808552631578948, "calib/step_q_c_n": 608.0, "calib/step_q_gap": 0.02749925408760001, "calib/step_q_w": 0.4533560090702948, "calib/step_q_w_n": 882.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2507.0, "completions/max_terminated_length": 2507.0, "completions/mean_length": 561.3046875, "completions/mean_terminated_length": 565.7244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.0416, "grad_norm": 1.360388159751892, "kl": 0.048961639404296875, "learning_rate": 4.5e-06, "loss": -0.1002, "mask/has_final_conf_rate": 0.875, "mask/share_final_conf": 0.02830466441810131, "mask/share_reasoning": 0.8525670766830444, "mask/share_step_conf": 0.11131571978330612, "num_tokens": 9417336.0, "reward": 0.3034813106060028, "reward_std": 0.217860609292984, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.49029532074928284, "rewards/format_reward_step": 0.875, "rewards/step_l2_reward": -0.1466139405965805, "step": 39 }, { "adv/mean_abs_final_conf": 0.7583894729614258, "adv/mean_abs_reasoning": 0.6187953352928162, "adv/mean_abs_step_conf": 0.7595531940460205, "adv/ratio_final_to_reasoning": 1.2255901583397573, "adv/ratio_step_to_reasoning": 1.2274707818968889, "adv/std_final_conf": 0.9252941012382507, "adv/std_reasoning": 0.85927414894104, "adv/std_step_conf": 0.9349480271339417, "calib/answer_extract_rate": 0.88671875, "calib/avg_num_step_conf": 5.12890625, "calib/ece": 0.4759911894273128, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.88671875, "calib/frac_conf_gt_0.9": 0.8149779735682819, "calib/gap": 0.02241496062992121, "calib/mean_conf": 0.9095594713656387, "calib/mu_c": 0.9221, "calib/mu_w": 0.8996850393700788, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.47251101321585903, "calib/std_conf": 0.14125235871308942, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.450984251968504, "calib/step_q_c_n": 508.0, "calib/step_q_gap": 0.005580525260429459, "calib/step_q_w": 0.44540372670807454, "calib/step_q_w_n": 805.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1913.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 556.08984375, "completions/mean_terminated_length": 556.08984375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.042666666666666665, "grad_norm": 1.4263373613357544, "kl": 0.0540618896484375, "learning_rate": 4.472222222222223e-06, "loss": -0.1697, "mask/has_final_conf_rate": 0.88671875, "mask/share_final_conf": 0.029433563351631165, "mask/share_reasoning": 0.8610671758651733, "mask/share_step_conf": 0.10949921607971191, "num_tokens": 9666455.0, "reward": 0.2889021635055542, "reward_std": 0.21637332439422607, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.46521133184432983, "rewards/format_reward_step": 0.88671875, "rewards/step_l2_reward": -0.1428757607936859, "step": 40 }, { "adv/mean_abs_final_conf": 0.7723698616027832, "adv/mean_abs_reasoning": 0.6076771020889282, "adv/mean_abs_step_conf": 0.7377324104309082, "adv/ratio_final_to_reasoning": 1.271020183165884, "adv/ratio_step_to_reasoning": 1.2140204195532573, "adv/std_final_conf": 0.9363389611244202, "adv/std_reasoning": 0.8595664501190186, "adv/std_step_conf": 0.9345912933349609, "calib/answer_extract_rate": 0.8515625, "calib/avg_num_step_conf": 5.2578125, "calib/ece": 0.15857798165137627, "calib/final_conf_rate": 0.8515625, "calib/format_rate": 0.84375, "calib/frac_conf_gt_0.9": 0.48623853211009177, "calib/gap": 0.06253179190751423, "calib/mean_conf": 0.7936238532110093, "calib/mu_c": 0.8065317919075143, "calib/mu_w": 0.7440000000000001, "calib/nonempty_final_conf_rate": 0.8515625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.07931192660550469, "calib/std_conf": 0.21936770224630264, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4551341890315052, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.07533615861675663, "calib/step_q_w": 0.5304703476482618, "calib/step_q_w_n": 489.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 472.890625, "completions/mean_terminated_length": 474.7451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.04373333333333333, "grad_norm": 1.5240364074707031, "kl": 0.0609893798828125, "learning_rate": 4.444444444444444e-06, "loss": -0.2322, "mask/has_final_conf_rate": 0.8515625, "mask/share_final_conf": 0.030579429119825363, "mask/share_reasoning": 0.845970630645752, "mask/share_step_conf": 0.11954362690448761, "num_tokens": 9894763.0, "reward": 0.4309816360473633, "reward_std": 0.23047995567321777, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6809769868850708, "rewards/format_reward_step": 0.84375, "rewards/step_l2_reward": -0.12291993945837021, "step": 41 }, { "adv/mean_abs_final_conf": 0.7998873591423035, "adv/mean_abs_reasoning": 0.5951753854751587, "adv/mean_abs_step_conf": 0.757778525352478, "adv/ratio_final_to_reasoning": 1.3439523519671648, "adv/ratio_step_to_reasoning": 1.2732020574868113, "adv/std_final_conf": 0.935519814491272, "adv/std_reasoning": 0.8270787596702576, "adv/std_step_conf": 0.9347375631332397, "calib/answer_extract_rate": 0.82421875, "calib/avg_num_step_conf": 5.34765625, "calib/ece": 0.25783018867924523, "calib/final_conf_rate": 0.828125, "calib/format_rate": 0.82421875, "calib/frac_conf_gt_0.9": 0.42924528301886794, "calib/gap": 0.056300287356321754, "calib/mean_conf": 0.7922641509433961, "calib/mu_c": 0.817758620689655, "calib/mu_w": 0.7614583333333332, "calib/nonempty_final_conf_rate": 0.828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2514622641509433, "calib/std_conf": 0.1821920147105221, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48056506849315067, "calib/step_q_c_n": 584.0, "calib/step_q_gap": 0.009609654480411844, "calib/step_q_w": 0.4709554140127388, "calib/step_q_w_n": 785.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2805.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 455.0390625, "completions/mean_terminated_length": 455.0390625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.0448, "grad_norm": 0.8913173079490662, "kl": 0.06402587890625, "learning_rate": 4.416666666666667e-06, "loss": -0.2165, "mask/has_final_conf_rate": 0.828125, "mask/share_final_conf": 0.03204452618956566, "mask/share_reasoning": 0.8392115831375122, "mask/share_step_conf": 0.1287439465522766, "num_tokens": 10115621.0, "reward": 0.3496522307395935, "reward_std": 0.2136489748954773, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.5659554600715637, "rewards/format_reward_step": 0.82421875, "rewards/step_l2_reward": -0.12211980670690536, "step": 42 }, { "adv/mean_abs_final_conf": 0.852458655834198, "adv/mean_abs_reasoning": 0.8048035502433777, "adv/mean_abs_step_conf": 0.7613873481750488, "adv/ratio_final_to_reasoning": 1.0592133391762613, "adv/ratio_step_to_reasoning": 0.9460536648288895, "adv/std_final_conf": 0.9367731809616089, "adv/std_reasoning": 0.9359363913536072, "adv/std_step_conf": 0.9346553683280945, "calib/answer_extract_rate": 0.71484375, "calib/avg_num_step_conf": 4.671875, "calib/ece": 0.20497267759562837, "calib/final_conf_rate": 0.71484375, "calib/format_rate": 0.70703125, "calib/frac_conf_gt_0.9": 0.2568306010928962, "calib/gap": 0.08441549127420511, "calib/mean_conf": 0.7023497267759564, "calib/mu_c": 0.7434042553191489, "calib/mu_w": 0.6589887640449438, "calib/nonempty_final_conf_rate": 0.71484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19683060109289613, "calib/std_conf": 0.20942139411753757, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4550574712643678, "calib/step_q_c_n": 435.0, "calib/step_q_gap": -0.008792725844699256, "calib/step_q_w": 0.46385019710906705, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1955.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 480.94140625, "completions/mean_terminated_length": 480.94140625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.04586666666666667, "grad_norm": 1.3835124969482422, "kl": 0.0644989013671875, "learning_rate": 4.388888888888889e-06, "loss": -0.4611, "mask/has_final_conf_rate": 0.71484375, "mask/share_final_conf": 0.025213249027729034, "mask/share_reasoning": 0.8631592392921448, "mask/share_step_conf": 0.11162751913070679, "num_tokens": 10343966.0, "reward": 0.3104777932167053, "reward_std": 0.2599700689315796, "rewards/accuracy_reward_step": 0.3671875, "rewards/final_brier_reward_step": 0.507054328918457, "rewards/format_reward_step": 0.70703125, "rewards/step_l2_reward": -0.1009424701333046, "step": 43 }, { "adv/mean_abs_final_conf": 0.8195953369140625, "adv/mean_abs_reasoning": 0.8131282925605774, "adv/mean_abs_step_conf": 0.7810917496681213, "adv/ratio_final_to_reasoning": 1.0079532890598606, "adv/ratio_step_to_reasoning": 0.9606008754269618, "adv/std_final_conf": 0.9367455840110779, "adv/std_reasoning": 0.9358900785446167, "adv/std_step_conf": 0.9341350793838501, "calib/answer_extract_rate": 0.53125, "calib/avg_num_step_conf": 4.80859375, "calib/ece": 0.2119402985074627, "calib/final_conf_rate": 0.5234375, "calib/format_rate": 0.5234375, "calib/frac_conf_gt_0.9": 0.15671641791044777, "calib/gap": 0.026965240641711463, "calib/mean_conf": 0.663134328358209, "calib/mu_c": 0.6768181818181819, "calib/mu_w": 0.6498529411764704, "calib/nonempty_final_conf_rate": 0.5234375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1912686567164179, "calib/std_conf": 0.20979171586467474, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4602666666666667, "calib/step_q_c_n": 300.0, "calib/step_q_gap": -0.002343430003580338, "calib/step_q_w": 0.46261009667024705, "calib/step_q_w_n": 931.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 496.44921875, "completions/mean_terminated_length": 498.3961181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.046933333333333334, "grad_norm": 1.691990852355957, "kl": 0.06795501708984375, "learning_rate": 4.361111111111112e-06, "loss": -0.6901, "mask/has_final_conf_rate": 0.5234375, "mask/share_final_conf": 0.019329769536852837, "mask/share_reasoning": 0.8671712875366211, "mask/share_step_conf": 0.10959267616271973, "num_tokens": 10577377.0, "reward": 0.22037556767463684, "reward_std": 0.24473640322685242, "rewards/accuracy_reward_step": 0.26171875, "rewards/final_brier_reward_step": 0.3613913953304291, "rewards/format_reward_step": 0.5234375, "rewards/step_l2_reward": -0.07767152786254883, "step": 44 }, { "adv/mean_abs_final_conf": 0.6867586374282837, "adv/mean_abs_reasoning": 0.7163244485855103, "adv/mean_abs_step_conf": 0.6675255298614502, "adv/ratio_final_to_reasoning": 0.9587256707269888, "adv/ratio_step_to_reasoning": 0.9318759553433911, "adv/std_final_conf": 0.8759419322013855, "adv/std_reasoning": 0.8753003478050232, "adv/std_step_conf": 0.8735520243644714, "calib/answer_extract_rate": 0.2734375, "calib/avg_num_step_conf": 4.5, "calib/ece": 0.26422535211267606, "calib/final_conf_rate": 0.27734375, "calib/format_rate": 0.2734375, "calib/frac_conf_gt_0.9": 0.11267605633802817, "calib/gap": 0.017556089743589798, "calib/mean_conf": 0.6481690140845071, "calib/mu_c": 0.6578125, "calib/mu_w": 0.6402564102564102, "calib/nonempty_final_conf_rate": 0.27734375, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.23084507042253524, "calib/std_conf": 0.2252279587575002, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.4766923076923077, "calib/step_q_c_n": 130.0, "calib/step_q_gap": 0.011105223543579679, "calib/step_q_w": 0.465587084148728, "calib/step_q_w_n": 1022.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2360.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 430.234375, "completions/mean_terminated_length": 430.234375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.048, "grad_norm": 2.014629364013672, "kl": 0.08644866943359375, "learning_rate": 4.333333333333334e-06, "loss": -1.075, "mask/has_final_conf_rate": 0.27734375, "mask/share_final_conf": 0.012363248504698277, "mask/share_reasoning": 0.8680437803268433, "mask/share_step_conf": 0.11959296464920044, "num_tokens": 10792565.0, "reward": 0.10859501361846924, "reward_std": 0.17341598868370056, "rewards/accuracy_reward_step": 0.125, "rewards/final_brier_reward_step": 0.18230313062667847, "rewards/format_reward_step": 0.2734375, "rewards/step_l2_reward": -0.04480060189962387, "step": 45 }, { "adv/mean_abs_final_conf": 0.2659332752227783, "adv/mean_abs_reasoning": 0.27259397506713867, "adv/mean_abs_step_conf": 0.2573040723800659, "adv/ratio_final_to_reasoning": 0.9755654913402989, "adv/ratio_step_to_reasoning": 0.9439096088484461, "adv/std_final_conf": 0.5735336542129517, "adv/std_reasoning": 0.5728173851966858, "adv/std_step_conf": 0.5705223083496094, "calib/answer_extract_rate": 0.08203125, "calib/avg_num_step_conf": 3.69140625, "calib/ece": 0.27476190476190476, "calib/final_conf_rate": 0.08203125, "calib/format_rate": 0.08203125, "calib/frac_conf_gt_0.9": 0.14285714285714285, "calib/gap": 0.16142857142857125, "calib/mean_conf": 0.6080952380952381, "calib/mu_c": 0.7157142857142856, "calib/mu_w": 0.5542857142857144, "calib/nonempty_final_conf_rate": 0.08203125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.27476190476190476, "calib/std_conf": 0.22402421313903184, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.4428571428571428, "calib/step_q_c_n": 28.0, "calib/step_q_gap": -0.0036641221374046906, "calib/step_q_w": 0.4465212649945475, "calib/step_q_w_n": 917.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2995.0, "completions/max_terminated_length": 2995.0, "completions/mean_length": 388.27734375, "completions/mean_terminated_length": 388.27734375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.04906666666666667, "grad_norm": 3.7936737537384033, "kl": 0.1002655029296875, "learning_rate": 4.305555555555556e-06, "loss": -0.6942, "mask/has_final_conf_rate": 0.08203125, "mask/share_final_conf": 0.003270457498729229, "mask/share_reasoning": 0.8765566349029541, "mask/share_step_conf": 0.1201729029417038, "num_tokens": 10996732.0, "reward": 0.03558116778731346, "reward_std": 0.07172537595033646, "rewards/accuracy_reward_step": 0.02734375, "rewards/final_brier_reward_step": 0.05937773361802101, "rewards/format_reward_step": 0.08203125, "rewards/step_l2_reward": -0.010090397670865059, "step": 46 }, { "adv/mean_abs_final_conf": 0.09661008417606354, "adv/mean_abs_reasoning": 0.14102500677108765, "adv/mean_abs_step_conf": 0.09573409706354141, "adv/ratio_final_to_reasoning": 0.6850564051585646, "adv/ratio_step_to_reasoning": 0.6788448322426772, "adv/std_final_conf": 0.37023141980171204, "adv/std_reasoning": 0.43742695450782776, "adv/std_step_conf": 0.3668881356716156, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 3.09375, "calib/ece": 0.4033333333333334, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.15166666666666667, "calib/mean_conf": 0.5877777777777777, "calib/mu_c": 0.6383333333333333, "calib/mu_w": 0.48666666666666664, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.1622222222222222, "calib/std_conf": 0.3198186214360946, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.49692307692307697, "calib/step_q_c_n": 26.0, "calib/step_q_gap": 0.09089174533038769, "calib/step_q_w": 0.4060313315926893, "calib/step_q_w_n": 766.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 311.859375, "completions/mean_terminated_length": 311.859375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.050133333333333335, "grad_norm": 1.8240582942962646, "kl": 0.11304473876953125, "learning_rate": 4.277777777777778e-06, "loss": -0.3307, "mask/has_final_conf_rate": 0.03515625, "mask/share_final_conf": 0.0008738382603041828, "mask/share_reasoning": 0.882652759552002, "mask/share_step_conf": 0.11647340655326843, "num_tokens": 11182544.0, "reward": 0.011280306614935398, "reward_std": 0.03114240989089012, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.016706641763448715, "rewards/format_reward_step": 0.01953125, "rewards/step_l2_reward": -0.002739777322858572, "step": 47 }, { "adv/mean_abs_final_conf": 0.019286589697003365, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.019292207434773445, "adv/ratio_final_to_reasoning": 1.000320062725725, "adv/ratio_step_to_reasoning": 1.0006114328376434, "adv/std_final_conf": 0.16526895761489868, "adv/std_reasoning": 0.16521607339382172, "adv/std_step_conf": 0.165317103266716, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 1.62109375, "calib/ece": 0.88, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.88, "calib/mu_c": NaN, "calib/mu_w": 0.88, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.88, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.96484375, "calib/step_q_w": 0.3338875502008032, "calib/step_q_w_n": 415.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 210.82421875, "completions/mean_terminated_length": 210.82421875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0512, "grad_norm": 0.3917446732521057, "kl": 0.161651611328125, "learning_rate": 4.25e-06, "loss": -0.0596, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.000414299254771322, "mask/share_reasoning": 0.8767637014389038, "mask/share_step_conf": 0.12282195687294006, "num_tokens": 11340203.0, "reward": 0.00033257147879339755, "reward_std": 0.000940654135774821, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0008812500163912773, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.0009973570704460144, "step": 48 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 1.26953125, "calib/ece": 0.975, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.975, "calib/mu_c": NaN, "calib/mu_w": 0.975, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.975, "calib/std_conf": 0.0050000000000000044, "calib/step_conf_rate": 0.96875, "calib/step_q_w": 0.2965538461538461, "calib/step_q_w_n": 325.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 141.046875, "completions/mean_terminated_length": 141.046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.05226666666666667, "grad_norm": 0.25012171268463135, "kl": 0.2276153564453125, "learning_rate": 4.222222222222223e-06, "loss": 0.0187, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00011055837967433035, "mask/share_reasoning": 0.8652602434158325, "mask/share_step_conf": 0.13462916016578674, "num_tokens": 11480847.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 49 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 1.13671875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/step_conf_rate": 0.97265625, "calib/step_q_w": 0.335085910652921, "calib/step_q_w_n": 291.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 153.07421875, "completions/mean_terminated_length": 153.07421875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.05333333333333334, "grad_norm": 0.19393673539161682, "kl": 0.192138671875, "learning_rate": 4.194444444444445e-06, "loss": 0.0201, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.8889811038970947, "mask/share_step_conf": 0.11101890355348587, "num_tokens": 11625394.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 50 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 1.0078125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/step_conf_rate": 0.9765625, "calib/step_q_w": 0.3553875968992248, "calib/step_q_w_n": 258.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 130.890625, "completions/mean_terminated_length": 130.890625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.0544, "grad_norm": 0.2684899568557739, "kl": 0.2183380126953125, "learning_rate": 4.166666666666667e-06, "loss": 0.0212, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.8706995248794556, "mask/share_step_conf": 0.12930047512054443, "num_tokens": 11768198.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 51 }, { "adv/mean_abs_final_conf": 0.019299857318401337, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.019311558455228806, "adv/ratio_final_to_reasoning": 1.0010082024164397, "adv/ratio_step_to_reasoning": 1.0016150946721005, "adv/std_final_conf": 0.16538265347480774, "adv/std_reasoning": 0.16521607339382172, "adv/std_step_conf": 0.16548292338848114, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 1.0, "calib/ece": 0.83, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.83, "calib/mu_c": NaN, "calib/mu_w": 0.83, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.83, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.9765625, "calib/step_q_w": 0.4443359375, "calib/step_q_w_n": 256.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 139.91015625, "completions/mean_terminated_length": 140.45883178710938, "completions/min_length": 0.0, "completions/min_terminated_length": 34.0, "epoch": 0.055466666666666664, "grad_norm": 0.41800710558891296, "kl": 0.1961212158203125, "learning_rate": 4.138888888888889e-06, "loss": -0.0459, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.0002508600882720202, "mask/share_reasoning": 0.8741176724433899, "mask/share_step_conf": 0.12172523885965347, "num_tokens": 11911967.0, "reward": 8.606584742665291e-05, "reward_std": 0.00024343098630197346, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.001215234398841858, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.0018243527738377452, "step": 52 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 1.01171875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/step_conf_rate": 0.98046875, "calib/step_q_w": 0.515907335907336, "calib/step_q_w_n": 259.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 125.6328125, "completions/mean_terminated_length": 125.6328125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.05653333333333333, "grad_norm": 0.24187202751636505, "kl": 0.2152557373046875, "learning_rate": 4.111111111111111e-06, "loss": 0.0195, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.8754205107688904, "mask/share_step_conf": 0.12457950413227081, "num_tokens": 12049953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 53 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 1.015625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/step_conf_rate": 0.97265625, "calib/step_q_w": 0.6044230769230768, "calib/step_q_w_n": 260.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 134.953125, "completions/mean_terminated_length": 134.953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0576, "grad_norm": 0.08608198165893555, "kl": 0.1942138671875, "learning_rate": 4.083333333333334e-06, "loss": 0.0183, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.8727961778640747, "mask/share_step_conf": 0.12720385193824768, "num_tokens": 12190733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 54 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.9921875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/step_conf_rate": 0.97265625, "calib/step_q_w": 0.6146062992125984, "calib/step_q_w_n": 254.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 139.8125, "completions/mean_terminated_length": 139.8125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.058666666666666666, "grad_norm": 0.3314754068851471, "kl": 0.1821441650390625, "learning_rate": 4.055555555555556e-06, "loss": 0.0194, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.8783440589904785, "mask/share_step_conf": 0.12165594100952148, "num_tokens": 12334349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 55 }, { "adv/mean_abs_final_conf": 0.019313529133796692, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.019305992871522903, "adv/ratio_final_to_reasoning": 1.0017173060707774, "adv/ratio_step_to_reasoning": 1.001326429587746, "adv/std_final_conf": 0.16549980640411377, "adv/std_reasoning": 0.16521607339382172, "adv/std_step_conf": 0.16543522477149963, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 1.015625, "calib/ece": 0.7, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.7, "calib/mu_c": NaN, "calib/mu_w": 0.7, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.7, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.96875, "calib/step_q_w": 0.6632307692307692, "calib/step_q_w_n": 260.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 125.453125, "completions/mean_terminated_length": 125.453125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.05973333333333333, "grad_norm": 0.38354986906051636, "kl": 0.2048492431640625, "learning_rate": 4.027777777777779e-06, "loss": -0.0481, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00015233286831062287, "mask/share_reasoning": 0.8705087900161743, "mask/share_step_conf": 0.12933892011642456, "num_tokens": 12473305.0, "reward": 0.0006501090247184038, "reward_std": 0.0018387859454378486, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.001992187462747097, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.0014732194831594825, "step": 56 }, { "adv/mean_abs_final_conf": 0.019323885440826416, "adv/mean_abs_reasoning": 0.01930764690041542, "adv/mean_abs_step_conf": 0.01930798403918743, "adv/ratio_final_to_reasoning": 1.0008410419195437, "adv/ratio_step_to_reasoning": 1.0000174614117272, "adv/std_final_conf": 0.16558855772018433, "adv/std_reasoning": 0.16544939577579498, "adv/std_step_conf": 0.1654522866010666, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 1.0078125, "calib/ece": 0.10999999999999999, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.89, "calib/mu_c": 0.89, "calib/mu_w": NaN, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.0, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.895, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.20169140625000004, "calib/step_q_w": 0.69330859375, "calib/step_q_w_n": 256.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 148.18359375, "completions/mean_terminated_length": 148.18359375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0608, "grad_norm": 0.37517762184143066, "kl": 0.1676788330078125, "learning_rate": 4.000000000000001e-06, "loss": -0.0603, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.00014583332813344896, "mask/share_reasoning": 0.8833422660827637, "mask/share_step_conf": 0.11651188880205154, "num_tokens": 12618032.0, "reward": 0.0019196701468899846, "reward_std": 0.00542964693158865, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.00385898444801569, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.0015821442939341068, "step": 57 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 1.03125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/step_conf_rate": 0.98046875, "calib/step_q_w": 0.6980719696969697, "calib/step_q_w_n": 264.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 137.72265625, "completions/mean_terminated_length": 137.72265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.06186666666666667, "grad_norm": 0.47784343361854553, "kl": 0.1968841552734375, "learning_rate": 3.972222222222223e-06, "loss": 0.0183, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.8683435916900635, "mask/share_step_conf": 0.1316564381122589, "num_tokens": 12759609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 58 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.019280418753623962, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16521607339382172, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 1.05078125, "calib/ece": 0.25, "calib/final_conf_rate": 0.00390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.0, "calib/mean_conf": 0.75, "calib/mu_c": 0.75, "calib/mu_w": NaN, "calib/nonempty_final_conf_rate": 0.00390625, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.0, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.94921875, "calib/step_q_w": 0.7050929368029739, "calib/step_q_w_n": 269.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 155.48828125, "completions/mean_terminated_length": 155.48828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.06293333333333333, "grad_norm": 0.1444707214832306, "kl": 0.1646575927734375, "learning_rate": 3.944444444444445e-06, "loss": 0.0064, "mask/has_final_conf_rate": 0.00390625, "mask/share_final_conf": 0.0001254300441360101, "mask/share_reasoning": 0.8840838670730591, "mask/share_step_conf": 0.11579069495201111, "num_tokens": 12905662.0, "reward": 0.0003906250058207661, "reward_std": 0.001104854280129075, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 59 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 1.1953125, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/step_conf_rate": 0.96484375, "calib/step_q_w": 0.6973169934640524, "calib/step_q_w_n": 306.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 162.34375, "completions/mean_terminated_length": 162.34375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.064, "grad_norm": 0.1543644368648529, "kl": 0.16278076171875, "learning_rate": 3.916666666666667e-06, "loss": 0.0151, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.8886805772781372, "mask/share_step_conf": 0.111319400370121, "num_tokens": 13056078.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 60 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 1.265625, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/step_conf_rate": 0.9765625, "calib/step_q_w": 0.7531172839506173, "calib/step_q_w_n": 324.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 149.2265625, "completions/mean_terminated_length": 149.2265625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.06506666666666666, "grad_norm": 0.5520812273025513, "kl": 0.1855621337890625, "learning_rate": 3.88888888888889e-06, "loss": 0.016, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.8732104301452637, "mask/share_step_conf": 0.12678956985473633, "num_tokens": 13198344.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 61 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.044541239738464355, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.23372872173786163, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 1.3828125, "calib/ece": 0.22249999999999995, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.17000000000000004, "calib/mean_conf": 0.9275, "calib/mu_c": 0.9700000000000001, "calib/mu_w": 0.8, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.93359375, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.2, "calib/std_conf": 0.07361215932167725, "calib/step_conf_rate": 0.91796875, "calib/step_q_w": 0.7208813559322034, "calib/step_q_w_n": 354.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 200.24609375, "completions/mean_terminated_length": 201.03138732910156, "completions/min_length": 0.0, "completions/min_terminated_length": 36.0, "epoch": 0.06613333333333334, "grad_norm": 0.15924867987632751, "kl": 0.1466217041015625, "learning_rate": 3.861111111111112e-06, "loss": 0.0156, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.00020690049859695137, "mask/share_reasoning": 0.8920506834983826, "mask/share_step_conf": 0.10383619368076324, "num_tokens": 13356687.0, "reward": 0.0011718750465661287, "reward_std": 0.002551448065787554, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 62 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 1.4296875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.92578125, "calib/nonempty_step_conf_rate": 0.92578125, "calib/step_conf_rate": 0.92578125, "calib/step_q_w": 0.7488524590163935, "calib/step_q_w_n": 366.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 221.94921875, "completions/mean_terminated_length": 221.94921875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.0672, "grad_norm": 0.13147154450416565, "kl": 0.12944793701171875, "learning_rate": 3.833333333333334e-06, "loss": 0.0131, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.8959101438522339, "mask/share_step_conf": 0.1040898859500885, "num_tokens": 13522146.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 63 }, { "adv/mean_abs_final_conf": 0.03812452405691147, "adv/mean_abs_reasoning": 0.08312930166721344, "adv/mean_abs_step_conf": 0.03789771348237991, "adv/ratio_final_to_reasoning": 0.4586171577566367, "adv/ratio_step_to_reasoning": 0.4558887506849698, "adv/std_final_conf": 0.23102885484695435, "adv/std_reasoning": 0.3306039571762085, "adv/std_step_conf": 0.22967655956745148, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 1.68359375, "calib/ece": 0.304, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.6, "calib/gap": -0.15749999999999997, "calib/mean_conf": 0.8639999999999999, "calib/mu_c": 0.8325, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.18399999999999997, "calib/std_conf": 0.1342534915747073, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.95, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.21518279069767432, "calib/step_q_w": 0.7348172093023256, "calib/step_q_w_n": 430.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 232.96484375, "completions/mean_terminated_length": 232.96484375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.06826666666666667, "grad_norm": 0.8708637952804565, "kl": 0.1328582763671875, "learning_rate": 3.8055555555555556e-06, "loss": -0.1549, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0008578735869377851, "mask/share_reasoning": 0.889821469783783, "mask/share_step_conf": 0.10932067036628723, "num_tokens": 13685561.0, "reward": 0.002973411697894335, "reward_std": 0.01275265496224165, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.003977734129875898, "rewards/format_reward_step": 0.0078125, "rewards/step_l2_reward": -0.0027184111531823874, "step": 64 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.02526082471013069, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.16532622277736664, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 1.828125, "calib/ece": 0.06499999999999995, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.5, "calib/mean_conf": 0.935, "calib/mu_c": 0.935, "calib/mu_w": NaN, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.0, "calib/std_conf": 0.034999999999999976, "calib/step_conf_rate": 0.95703125, "calib/step_q_w": 0.7642735042735043, "calib/step_q_w_n": 468.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 210.89453125, "completions/mean_terminated_length": 210.89453125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.06933333333333333, "grad_norm": 0.16085658967494965, "kl": 0.15030670166015625, "learning_rate": 3.777777777777778e-06, "loss": 0.0059, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.00012572437117341906, "mask/share_reasoning": 0.8844339847564697, "mask/share_step_conf": 0.1154402494430542, "num_tokens": 13844574.0, "reward": 0.0007812500116415322, "reward_std": 0.0014465939020738006, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 65 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.00390625, "calib/avg_num_step_conf": 1.875, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/step_conf_rate": 0.95703125, "calib/step_q_w": 0.7479742361111111, "calib/step_q_w_n": 480.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1680.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 259.40625, "completions/mean_terminated_length": 259.40625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.0704, "grad_norm": 0.10541064292192459, "kl": 0.1195220947265625, "learning_rate": 3.7500000000000005e-06, "loss": 0.0107, "mask/has_final_conf_rate": 0.0, "mask/share_final_conf": 0.0, "mask/share_reasoning": 0.895097017288208, "mask/share_step_conf": 0.10490301251411438, "num_tokens": 14017334.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 66 }, { "adv/mean_abs_final_conf": 0.038456253707408905, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.03827614337205887, "adv/ratio_final_to_reasoning": 0.9972878234343494, "adv/ratio_step_to_reasoning": 0.992617013695941, "adv/std_final_conf": 0.23301896452903748, "adv/std_reasoning": 0.2336508184671402, "adv/std_step_conf": 0.2319279909133911, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 2.03515625, "calib/ece": 0.9133333333333333, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/mean_conf": 0.9133333333333334, "calib/mu_c": NaN, "calib/mu_w": 0.9133333333333334, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.9133333333333333, "calib/std_conf": 0.08013876853447535, "calib/step_conf_rate": 0.96875, "calib/step_q_w": 0.7468445297504799, "calib/step_q_w_n": 521.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 240.91015625, "completions/mean_terminated_length": 240.91015625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.07146666666666666, "grad_norm": 1.064578890800476, "kl": 0.127685546875, "learning_rate": 3.7222222222222225e-06, "loss": -0.0962, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0005408349097706378, "mask/share_reasoning": 0.8941315412521362, "mask/share_step_conf": 0.10532761365175247, "num_tokens": 14184015.0, "reward": 0.0013375489506870508, "reward_std": 0.0037831594236195087, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0016371094388887286, "rewards/format_reward_step": 0.0078125, "rewards/step_l2_reward": -0.0005245117354206741, "step": 67 }, { "adv/mean_abs_final_conf": 0.08223120868206024, "adv/mean_abs_reasoning": 0.10238249599933624, "adv/mean_abs_step_conf": 0.0826721042394638, "adv/ratio_final_to_reasoning": 0.8031764402637083, "adv/ratio_step_to_reasoning": 0.8074827970593702, "adv/std_final_conf": 0.32961374521255493, "adv/std_reasoning": 0.3694836497306824, "adv/std_step_conf": 0.32891473174095154, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 2.12109375, "calib/ece": 0.8766666666666667, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.5, "calib/mean_conf": 0.8766666666666666, "calib/mu_c": NaN, "calib/mu_w": 0.8766666666666666, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.8766666666666667, "calib/std_conf": 0.10322575044801346, "calib/step_conf_rate": 0.9609375, "calib/step_q_w": 0.7278688766114181, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 271.78515625, "completions/mean_terminated_length": 271.78515625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.07253333333333334, "grad_norm": 1.744085669517517, "kl": 0.11829376220703125, "learning_rate": 3.694444444444445e-06, "loss": -0.2893, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.001398720545694232, "mask/share_reasoning": 0.8910526633262634, "mask/share_step_conf": 0.10754863917827606, "num_tokens": 14357680.0, "reward": 0.00116417882964015, "reward_std": 0.009257977828383446, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0035250000655651093, "rewards/format_reward_step": 0.01953125, "rewards/step_l2_reward": -0.005884142592549324, "step": 68 }, { "adv/mean_abs_final_conf": 0.09641197323799133, "adv/mean_abs_reasoning": 0.09645654261112213, "adv/mean_abs_step_conf": 0.09608538448810577, "adv/ratio_final_to_reasoning": 0.9995379331259002, "adv/ratio_step_to_reasoning": 0.9961520689735611, "adv/std_final_conf": 0.3694744110107422, "adv/std_reasoning": 0.36964312195777893, "adv/std_step_conf": 0.3682325482368469, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 2.21484375, "calib/ece": 0.432, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.4, "calib/gap": -0.0033333333333334103, "calib/mean_conf": 0.8320000000000001, "calib/mu_c": 0.83, "calib/mu_w": 0.8333333333333334, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.432, "calib/std_conf": 0.11855800268223146, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.6866666666666666, "calib/step_q_c_n": 3.0, "calib/step_q_gap": -0.044130023640661986, "calib/step_q_w": 0.7307966903073286, "calib/step_q_w_n": 564.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 302.3203125, "completions/mean_terminated_length": 302.3203125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.0736, "grad_norm": 1.0207089185714722, "kl": 0.106475830078125, "learning_rate": 3.6666666666666666e-06, "loss": -0.3893, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0009857293916866183, "mask/share_reasoning": 0.9040440320968628, "mask/share_step_conf": 0.0949702113866806, "num_tokens": 14539570.0, "reward": 0.005831425078213215, "reward_std": 0.018426839262247086, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.010892968624830246, "rewards/format_reward_step": 0.01953125, "rewards/step_l2_reward": -0.004698868375271559, "step": 69 }, { "adv/mean_abs_final_conf": 0.11544821411371231, "adv/mean_abs_reasoning": 0.13496293127536774, "adv/mean_abs_step_conf": 0.11570356786251068, "adv/ratio_final_to_reasoning": 0.8554068366977068, "adv/ratio_step_to_reasoning": 0.8572988654672759, "adv/std_final_conf": 0.4038769602775574, "adv/std_reasoning": 0.43712061643600464, "adv/std_step_conf": 0.4047698676586151, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 2.4375, "calib/ece": 0.77, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.7142857142857143, "calib/gap": 0.019999999999999907, "calib/mean_conf": 0.9128571428571429, "calib/mu_c": 0.93, "calib/mu_w": 0.9100000000000001, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.77, "calib/std_conf": 0.05495824017620382, "calib/step_conf_rate": 0.9609375, "calib/step_q_w": 0.724153205128205, "calib/step_q_w_n": 624.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 272.37109375, "completions/mean_terminated_length": 272.37109375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.07466666666666667, "grad_norm": 1.4476022720336914, "kl": 0.1175994873046875, "learning_rate": 3.638888888888889e-06, "loss": -0.3608, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.002013806952163577, "mask/share_reasoning": 0.8816252946853638, "mask/share_step_conf": 0.11636090278625488, "num_tokens": 14716289.0, "reward": 0.0008833690080791712, "reward_std": 0.006656920071691275, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.003947656136006117, "rewards/format_reward_step": 0.0234375, "rewards/step_l2_reward": -0.007649668026715517, "step": 70 }, { "adv/mean_abs_final_conf": 0.11568950116634369, "adv/mean_abs_reasoning": 0.1157369613647461, "adv/mean_abs_step_conf": 0.11430220305919647, "adv/ratio_final_to_reasoning": 0.9995899304954721, "adv/ratio_step_to_reasoning": 0.9876032834400416, "adv/std_final_conf": 0.40472161769866943, "adv/std_reasoning": 0.4048856496810913, "adv/std_step_conf": 0.39993974566459656, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 2.6328125, "calib/ece": 0.5416666666666666, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.14250000000000007, "calib/mean_conf": 0.875, "calib/mu_c": 0.97, "calib/mu_w": 0.8274999999999999, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.5416666666666666, "calib/std_conf": 0.11800423721205947, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.89, "calib/step_q_c_n": 3.0, "calib/step_q_gap": 0.1498345752608049, "calib/step_q_w": 0.7401654247391951, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 264.23828125, "completions/mean_terminated_length": 264.23828125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.07573333333333333, "grad_norm": 1.3018159866333008, "kl": 0.117828369140625, "learning_rate": 3.6111111111111115e-06, "loss": -0.3776, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0016604708507657051, "mask/share_reasoning": 0.8779726624488831, "mask/share_step_conf": 0.1203669011592865, "num_tokens": 14888342.0, "reward": 0.005938471294939518, "reward_std": 0.021040217950940132, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.012510547414422035, "rewards/format_reward_step": 0.0234375, "rewards/step_l2_reward": -0.006883603520691395, "step": 71 }, { "adv/mean_abs_final_conf": 0.09619291871786118, "adv/mean_abs_reasoning": 0.11568251252174377, "adv/mean_abs_step_conf": 0.0960080623626709, "adv/ratio_final_to_reasoning": 0.8315251512174814, "adv/ratio_step_to_reasoning": 0.829927188386621, "adv/std_final_conf": 0.36863529682159424, "adv/std_reasoning": 0.404695063829422, "adv/std_step_conf": 0.3679353594779968, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 2.67578125, "calib/ece": 0.892, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.4, "calib/mean_conf": 0.892, "calib/mu_c": NaN, "calib/mu_w": 0.892, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.892, "calib/std_conf": 0.06764613810115104, "calib/step_conf_rate": 0.9453125, "calib/step_q_w": 0.7401007785888079, "calib/step_q_w_n": 685.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 274.6875, "completions/mean_terminated_length": 274.6875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.0768, "grad_norm": 1.2662255764007568, "kl": 0.1224365234375, "learning_rate": 3.5833333333333335e-06, "loss": -0.3751, "mask/has_final_conf_rate": 0.01953125, "mask/share_final_conf": 0.0011026529828086495, "mask/share_reasoning": 0.8825006484985352, "mask/share_step_conf": 0.1163966953754425, "num_tokens": 15063070.0, "reward": 0.0011323363287374377, "reward_std": 0.007427211385220289, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0039015626534819603, "rewards/format_reward_step": 0.01953125, "rewards/step_l2_reward": -0.006324389949440956, "step": 72 }, { "adv/mean_abs_final_conf": 0.08131413161754608, "adv/mean_abs_reasoning": 0.10292495787143707, "adv/mean_abs_step_conf": 0.07757073640823364, "adv/ratio_final_to_reasoning": 0.7900331785329955, "adv/ratio_step_to_reasoning": 0.7536630377359665, "adv/std_final_conf": 0.32966530323028564, "adv/std_reasoning": 0.36954551935195923, "adv/std_step_conf": 0.329280823469162, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 2.29296875, "calib/ece": 0.46285714285714286, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.7142857142857143, "calib/gap": 0.13750000000000018, "calib/mean_conf": 0.8914285714285715, "calib/mu_c": 0.9700000000000001, "calib/mu_w": 0.8324999999999999, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.46285714285714286, "calib/std_conf": 0.12426108130275793, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.4166666666666667, "calib/step_q_c_n": 6.0, "calib/step_q_gap": -0.34818703384968447, "calib/step_q_w": 0.7648537005163512, "calib/step_q_w_n": 581.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 263.05078125, "completions/mean_terminated_length": 263.05078125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.07786666666666667, "grad_norm": 1.5457650423049927, "kl": 0.1108245849609375, "learning_rate": 3.555555555555556e-06, "loss": -0.3003, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.0014520924305543303, "mask/share_reasoning": 0.8960614204406738, "mask/share_step_conf": 0.10248646140098572, "num_tokens": 15237443.0, "reward": 0.0062183234840631485, "reward_std": 0.013650456443428993, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.008403124287724495, "rewards/format_reward_step": 0.01953125, "rewards/step_l2_reward": -0.0022164792753756046, "step": 73 }, { "adv/mean_abs_final_conf": 0.09627045691013336, "adv/mean_abs_reasoning": 0.11570973694324493, "adv/mean_abs_step_conf": 0.08923877775669098, "adv/ratio_final_to_reasoning": 0.8319996177793884, "adv/ratio_step_to_reasoning": 0.7712296312665733, "adv/std_final_conf": 0.3689347803592682, "adv/std_reasoning": 0.40479037165641785, "adv/std_step_conf": 0.3464997708797455, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 2.3671875, "calib/ece": 0.6666666666666666, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.11714285714285722, "calib/mean_conf": 0.8888888888888888, "calib/mu_c": 0.98, "calib/mu_w": 0.8628571428571428, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.6666666666666666, "calib/std_conf": 0.1257373315416304, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.765, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.03894519867549673, "calib/step_q_w": 0.7260548013245033, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 252.3203125, "completions/mean_terminated_length": 252.3203125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.07893333333333333, "grad_norm": 2.045851707458496, "kl": 0.12564849853515625, "learning_rate": 3.5277777777777784e-06, "loss": -0.3596, "mask/has_final_conf_rate": 0.03515625, "mask/share_final_conf": 0.0016800765879452229, "mask/share_reasoning": 0.8799179196357727, "mask/share_step_conf": 0.11840201169252396, "num_tokens": 15405965.0, "reward": 0.004328823648393154, "reward_std": 0.015819130465388298, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.008874218910932541, "rewards/format_reward_step": 0.01953125, "rewards/step_l2_reward": -0.0056853219866752625, "step": 74 }, { "adv/mean_abs_final_conf": 0.038303278386592865, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.03767896816134453, "adv/ratio_final_to_reasoning": 0.9933207072951502, "adv/ratio_step_to_reasoning": 0.9771304410663373, "adv/std_final_conf": 0.23209019005298615, "adv/std_reasoning": 0.2336508184671402, "adv/std_step_conf": 0.22837959229946136, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 2.40625, "calib/ece": 0.97, "calib/final_conf_rate": 0.0078125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.97, "calib/mu_c": NaN, "calib/mu_w": 0.97, "calib/nonempty_final_conf_rate": 0.0078125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.97, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.984375, "calib/step_q_w": 0.7387186688311689, "calib/step_q_w_n": 616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 242.41015625, "completions/mean_terminated_length": 242.41015625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.08, "grad_norm": 0.2904098927974701, "kl": 0.1312713623046875, "learning_rate": 3.5e-06, "loss": -0.1373, "mask/has_final_conf_rate": 0.0078125, "mask/share_final_conf": 0.0005445777205750346, "mask/share_reasoning": 0.8716053366661072, "mask/share_step_conf": 0.12785005569458008, "num_tokens": 15572774.0, "reward": -4.18518902733922e-06, "reward_std": 0.002756119705736637, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0004617187369149178, "rewards/format_reward_step": 0.0078125, "rewards/step_l2_reward": -0.002032589167356491, "step": 75 }, { "adv/mean_abs_final_conf": 0.07703585922718048, "adv/mean_abs_reasoning": 0.09642931818962097, "adv/mean_abs_step_conf": 0.0654100775718689, "adv/ratio_final_to_reasoning": 0.7988842052755707, "adv/ratio_step_to_reasoning": 0.6783214773254429, "adv/std_final_conf": 0.3300666809082031, "adv/std_reasoning": 0.3695387542247772, "adv/std_step_conf": 0.29358813166618347, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 2.02734375, "calib/ece": 0.6114285714285713, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.5714285714285714, "calib/gap": 0.05900000000000016, "calib/mean_conf": 0.8228571428571428, "calib/mu_c": 0.865, "calib/mu_w": 0.8059999999999998, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.5742857142857142, "calib/std_conf": 0.16900670195623144, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.965, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.22326034816247575, "calib/step_q_w": 0.7417396518375242, "calib/step_q_w_n": 517.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 229.1796875, "completions/mean_terminated_length": 229.1796875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.08106666666666666, "grad_norm": 1.202623963356018, "kl": 0.13092041015625, "learning_rate": 3.4722222222222224e-06, "loss": -0.2987, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.002270713448524475, "mask/share_reasoning": 0.8843415975570679, "mask/share_step_conf": 0.11338771134614944, "num_tokens": 15734500.0, "reward": 0.0039461469277739525, "reward_std": 0.01371490303426981, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.007228515576571226, "rewards/format_reward_step": 0.015625, "rewards/step_l2_reward": -0.004023721441626549, "step": 76 }, { "adv/mean_abs_final_conf": 0.07710777968168259, "adv/mean_abs_reasoning": 0.08146252483129501, "adv/mean_abs_step_conf": 0.076274573802948, "adv/ratio_final_to_reasoning": 0.9465429636678842, "adv/ratio_step_to_reasoning": 0.9363148755935197, "adv/std_final_conf": 0.33037495613098145, "adv/std_reasoning": 0.3306713402271271, "adv/std_step_conf": 0.32685402035713196, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 2.1640625, "calib/ece": 0.34666666666666673, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.11999999999999988, "calib/mean_conf": 0.8466666666666666, "calib/mu_c": 0.9066666666666666, "calib/mu_w": 0.7866666666666667, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.34666666666666673, "calib/std_conf": 0.13374935098492585, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.8500000000000001, "calib/step_q_c_n": 4.0, "calib/step_q_gap": 0.13186727272727294, "calib/step_q_w": 0.7181327272727271, "calib/step_q_w_n": 550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 215.12109375, "completions/mean_terminated_length": 215.12109375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.08213333333333334, "grad_norm": 14.370048522949219, "kl": 1.21722412109375, "learning_rate": 3.444444444444445e-06, "loss": -0.2759, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0014718102756887674, "mask/share_reasoning": 0.8603799343109131, "mask/share_step_conf": 0.1381482481956482, "num_tokens": 15894235.0, "reward": 0.006166364997625351, "reward_std": 0.016257690265774727, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.00962656270712614, "rewards/format_reward_step": 0.015625, "rewards/step_l2_reward": -0.002762582851573825, "step": 77 }, { "adv/mean_abs_final_conf": 0.19243697822093964, "adv/mean_abs_reasoning": 0.19285863637924194, "adv/mean_abs_step_conf": 0.18977494537830353, "adv/ratio_final_to_reasoning": 0.9978136412959326, "adv/ratio_step_to_reasoning": 0.9840106149310598, "adv/std_final_conf": 0.5214677453041077, "adv/std_reasoning": 0.5226067304611206, "adv/std_step_conf": 0.5145155787467957, "calib/answer_extract_rate": 0.04296875, "calib/avg_num_step_conf": 1.9921875, "calib/ece": 0.6799999999999999, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.7, "calib/gap": 0.11250000000000004, "calib/mean_conf": 0.8800000000000001, "calib/mu_c": 0.97, "calib/mu_w": 0.8574999999999999, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.6799999999999999, "calib/std_conf": 0.183412104289766, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.8049999999999999, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.05166907480314953, "calib/step_q_w": 0.7533309251968504, "calib/step_q_w_n": 508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 222.7734375, "completions/mean_terminated_length": 222.7734375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.0832, "grad_norm": 1.918144941329956, "kl": 0.1381072998046875, "learning_rate": 3.416666666666667e-06, "loss": -0.5302, "mask/has_final_conf_rate": 0.0390625, "mask/share_final_conf": 0.0031930464319884777, "mask/share_reasoning": 0.8872984647750854, "mask/share_step_conf": 0.1095084697008133, "num_tokens": 16059289.0, "reward": 0.006953669711947441, "reward_std": 0.030223235487937927, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.01484218705445528, "rewards/format_reward_step": 0.0390625, "rewards/step_l2_reward": -0.010309848934412003, "step": 78 }, { "adv/mean_abs_final_conf": 0.0188005194067955, "adv/mean_abs_reasoning": 0.038560837507247925, "adv/mean_abs_step_conf": 0.01921161264181137, "adv/ratio_final_to_reasoning": 0.4875547478257375, "adv/ratio_step_to_reasoning": 0.49821564788888056, "adv/std_final_conf": 0.1611037701368332, "adv/std_reasoning": 0.2336508184671402, "adv/std_step_conf": 0.16462647914886475, "calib/answer_extract_rate": 0.01171875, "calib/avg_num_step_conf": 1.83203125, "calib/ece": 0.6333333333333335, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.06999999999999995, "calib/mean_conf": 0.9666666666666668, "calib/mu_c": 0.92, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.6333333333333335, "calib/std_conf": 0.03299831645537219, "calib/step_conf_rate": 0.96875, "calib/step_q_w": 0.7336100213219616, "calib/step_q_w_n": 469.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 185.90234375, "completions/mean_terminated_length": 185.90234375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.08426666666666667, "grad_norm": 1.044692039489746, "kl": 0.153411865234375, "learning_rate": 3.3888888888888893e-06, "loss": -0.0782, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0006982251070439816, "mask/share_reasoning": 0.8826907277107239, "mask/share_step_conf": 0.1166110560297966, "num_tokens": 16213256.0, "reward": 0.0006480214651674032, "reward_std": 0.0018328814767301083, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 7.773437391733751e-05, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.00034419141593389213, "step": 79 }, { "adv/mean_abs_final_conf": 0.13986647129058838, "adv/mean_abs_reasoning": 0.1668010950088501, "adv/mean_abs_step_conf": 0.14373824000358582, "adv/ratio_final_to_reasoning": 0.8385225006056907, "adv/ratio_step_to_reasoning": 0.8617343908681138, "adv/std_final_conf": 0.43378275632858276, "adv/std_reasoning": 0.4675740599632263, "adv/std_step_conf": 0.4371723234653473, "calib/answer_extract_rate": 0.046875, "calib/avg_num_step_conf": 1.77734375, "calib/ece": 0.53, "calib/final_conf_rate": 0.046875, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.8333333333333334, "calib/gap": -0.01828571428571435, "calib/mean_conf": 0.9466666666666667, "calib/mu_c": 0.9359999999999999, "calib/mu_w": 0.9542857142857143, "calib/nonempty_final_conf_rate": 0.046875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.53, "calib/std_conf": 0.05821416398857659, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.8175, "calib/step_q_c_n": 4.0, "calib/step_q_gap": 0.06217110125646719, "calib/step_q_w": 0.7553288987435328, "calib/step_q_w_n": 451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 173.30859375, "completions/mean_terminated_length": 173.30859375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.08533333333333333, "grad_norm": 1.3374284505844116, "kl": 0.19683837890625, "learning_rate": 3.3611111111111117e-06, "loss": -0.4672, "mask/has_final_conf_rate": 0.046875, "mask/share_final_conf": 0.0058358209207654, "mask/share_reasoning": 0.8643442988395691, "mask/share_step_conf": 0.1298198699951172, "num_tokens": 16359783.0, "reward": 0.004866867791861296, "reward_std": 0.029468756169080734, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.016436327248811722, "rewards/format_reward_step": 0.0390625, "rewards/step_l2_reward": -0.018421342596411705, "step": 80 }, { "adv/mean_abs_final_conf": 0.08202240616083145, "adv/mean_abs_reasoning": 0.10839013755321503, "adv/mean_abs_step_conf": 0.08240969479084015, "adv/ratio_final_to_reasoning": 0.7567331125542845, "adv/ratio_step_to_reasoning": 0.7603062109814228, "adv/std_final_conf": 0.32736995816230774, "adv/std_reasoning": 0.36963728070259094, "adv/std_step_conf": 0.3276873528957367, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 1.62890625, "calib/ece": 0.6324999999999998, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.875, "calib/gap": -0.09066666666666656, "calib/mean_conf": 0.94, "calib/mu_c": 0.8833333333333333, "calib/mu_w": 0.9739999999999999, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.5987499999999999, "calib/std_conf": 0.08015609770940699, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.94, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.20176201923076909, "calib/step_q_w": 0.7382379807692309, "calib/step_q_w_n": 416.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 197.1328125, "completions/mean_terminated_length": 197.90589904785156, "completions/min_length": 0.0, "completions/min_terminated_length": 49.0, "epoch": 0.0864, "grad_norm": 0.9779260158538818, "kl": 0.1576995849609375, "learning_rate": 3.3333333333333333e-06, "loss": -0.2706, "mask/has_final_conf_rate": 0.03125, "mask/share_final_conf": 0.0026308889500796795, "mask/share_reasoning": 0.8815246224403381, "mask/share_step_conf": 0.1119382381439209, "num_tokens": 16516497.0, "reward": 0.001145427580922842, "reward_std": 0.014946578070521355, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.004672265611588955, "rewards/format_reward_step": 0.01953125, "rewards/step_l2_reward": -0.008631411008536816, "step": 81 }, { "adv/mean_abs_final_conf": 0.037952158600091934, "adv/mean_abs_reasoning": 0.05784125253558159, "adv/mean_abs_step_conf": 0.03859909623861313, "adv/ratio_final_to_reasoning": 0.6561434432414013, "adv/ratio_step_to_reasoning": 0.6673281532910881, "adv/std_final_conf": 0.22997251152992249, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.23388271033763885, "calib/answer_extract_rate": 0.015625, "calib/avg_num_step_conf": 1.5234375, "calib/ece": 0.6924999999999999, "calib/final_conf_rate": 0.015625, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.023333333333333428, "calib/mean_conf": 0.9424999999999999, "calib/mu_c": 0.96, "calib/mu_w": 0.9366666666666665, "calib/nonempty_final_conf_rate": 0.015625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.6924999999999999, "calib/std_conf": 0.054486236794258416, "calib/step_conf_rate": 0.984375, "calib/step_q_w": 0.7191292307692306, "calib/step_q_w_n": 390.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 153.69921875, "completions/mean_terminated_length": 153.69921875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.08746666666666666, "grad_norm": 0.7302096486091614, "kl": 0.1923828125, "learning_rate": 3.3055555555555558e-06, "loss": -0.1652, "mask/has_final_conf_rate": 0.015625, "mask/share_final_conf": 0.0009736621286720037, "mask/share_reasoning": 0.8738419413566589, "mask/share_step_conf": 0.12518437206745148, "num_tokens": 16661396.0, "reward": -1.4653633115813136e-05, "reward_std": 0.0022511552087962627, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0003085937350988388, "rewards/format_reward_step": 0.0078125, "rewards/step_l2_reward": -0.0026816511526703835, "step": 82 }, { "adv/mean_abs_final_conf": 0.09552133083343506, "adv/mean_abs_reasoning": 0.11570973694324493, "adv/mean_abs_step_conf": 0.09623701125383377, "adv/ratio_final_to_reasoning": 0.82552543421897, "adv/ratio_step_to_reasoning": 0.831710569880887, "adv/std_final_conf": 0.36607638001441956, "adv/std_reasoning": 0.40479037165641785, "adv/std_step_conf": 0.36880892515182495, "calib/answer_extract_rate": 0.0234375, "calib/avg_num_step_conf": 1.47265625, "calib/ece": 0.66, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.8333333333333334, "calib/gap": -0.05500000000000016, "calib/mean_conf": 0.9466666666666667, "calib/mu_c": 0.9099999999999999, "calib/mu_w": 0.9650000000000001, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.6366666666666667, "calib/std_conf": 0.04678556282539399, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.875, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.14929253333333337, "calib/step_q_w": 0.7257074666666666, "calib/step_q_w_n": 375.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 161.3984375, "completions/mean_terminated_length": 162.03138732910156, "completions/min_length": 0.0, "completions/min_terminated_length": 42.0, "epoch": 0.08853333333333334, "grad_norm": 1.34121572971344, "kl": 0.1849365234375, "learning_rate": 3.277777777777778e-06, "loss": -0.2845, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.0027125701308250427, "mask/share_reasoning": 0.8722102642059326, "mask/share_step_conf": 0.12117096781730652, "num_tokens": 16809978.0, "reward": 0.0018548424122855067, "reward_std": 0.012132625095546246, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.004958593752235174, "rewards/format_reward_step": 0.01953125, "rewards/step_l2_reward": -0.006717659067362547, "step": 83 }, { "adv/mean_abs_final_conf": 0.019323859363794327, "adv/mean_abs_reasoning": 0.01930764690041542, "adv/mean_abs_step_conf": 0.01931280642747879, "adv/ratio_final_to_reasoning": 1.0008396913131117, "adv/ratio_step_to_reasoning": 1.0002672271297472, "adv/std_final_conf": 0.16558833420276642, "adv/std_reasoning": 0.16544939577579498, "adv/std_step_conf": 0.16549362242221832, "calib/answer_extract_rate": 0.0078125, "calib/avg_num_step_conf": 1.0625, "calib/ece": 0.6866666666666666, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": -0.08999999999999997, "calib/mean_conf": 0.94, "calib/mu_c": 0.88, "calib/mu_w": 0.97, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.6466666666666666, "calib/std_conf": 0.04242640687119283, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.815, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.06681481481481477, "calib/step_q_w": 0.7481851851851852, "calib/step_q_w_n": 270.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 160.22265625, "completions/mean_terminated_length": 160.22265625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0896, "grad_norm": 0.5504854917526245, "kl": 0.20377349853515625, "learning_rate": 3.2500000000000002e-06, "loss": -0.0583, "mask/has_final_conf_rate": 0.01171875, "mask/share_final_conf": 0.0006643373053520918, "mask/share_reasoning": 0.8847761154174805, "mask/share_step_conf": 0.11455954611301422, "num_tokens": 16956915.0, "reward": 0.0017426569247618318, "reward_std": 0.004928978160023689, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0038499999791383743, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.001927186269313097, "step": 84 }, { "adv/mean_abs_final_conf": 0.057187922298908234, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.057268816977739334, "adv/ratio_final_to_reasoning": 0.9887047065676265, "adv/ratio_step_to_reasoning": 0.9901032702237521, "adv/std_final_conf": 0.2829303443431854, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.2833392918109894, "calib/answer_extract_rate": 0.01953125, "calib/avg_num_step_conf": 1.0390625, "calib/ece": 0.9299999999999999, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/mean_conf": 0.93, "calib/mu_c": NaN, "calib/mu_w": 0.93, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.9299999999999999, "calib/std_conf": 0.07094598884597589, "calib/step_conf_rate": 0.93359375, "calib/step_q_w": 0.7473684210526316, "calib/step_q_w_n": 266.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2001.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 144.11328125, "completions/mean_terminated_length": 144.11328125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.09066666666666667, "grad_norm": 1.0435339212417603, "kl": 0.1903533935546875, "learning_rate": 3.2222222222222227e-06, "loss": -0.2095, "mask/has_final_conf_rate": 0.0234375, "mask/share_final_conf": 0.002138955984264612, "mask/share_reasoning": 0.861068844795227, "mask/share_step_conf": 0.13679219782352448, "num_tokens": 17101632.0, "reward": 0.0010343744652345777, "reward_std": 0.0029256530106067657, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.00046406249748542905, "rewards/format_reward_step": 0.01171875, "rewards/step_l2_reward": -0.0007390635437332094, "step": 85 }, { "adv/mean_abs_final_conf": 0.038520447909832, "adv/mean_abs_reasoning": 0.07714889943599701, "adv/mean_abs_step_conf": 0.03856028616428375, "adv/ratio_final_to_reasoning": 0.4993000313865617, "adv/ratio_step_to_reasoning": 0.49981641275743016, "adv/std_final_conf": 0.2334073781967163, "adv/std_reasoning": 0.33054888248443604, "adv/std_step_conf": 0.2336476892232895, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.92578125, "calib/ece": 0.6614285714285715, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.03200000000000003, "calib/mean_conf": 0.9471428571428572, "calib/mu_c": 0.97, "calib/mu_w": 0.938, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.91796875, "calib/nonempty_step_conf_rate": 0.89453125, "calib/pce": 0.6614285714285715, "calib/std_conf": 0.02490799396308954, "calib/step_conf_rate": 0.89453125, "calib/step_q_c": 0.52, "calib/step_q_c_n": 1.0, "calib/step_q_gap": -0.16228813559322053, "calib/step_q_w": 0.6822881355932205, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1061.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 154.296875, "completions/mean_terminated_length": 154.296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.09173333333333333, "grad_norm": 0.8700916171073914, "kl": 0.1883392333984375, "learning_rate": 3.1944444444444443e-06, "loss": -0.1395, "mask/has_final_conf_rate": 0.02734375, "mask/share_final_conf": 0.002657730830833316, "mask/share_reasoning": 0.8664588928222656, "mask/share_step_conf": 0.1308833360671997, "num_tokens": 17246644.0, "reward": 0.003074523527175188, "reward_std": 0.009674238041043282, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.0042089843191206455, "rewards/format_reward_step": 0.0078125, "rewards/step_l2_reward": -0.001966187497600913, "step": 86 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.07712167501449585, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.33043214678764343, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.90234375, "calib/ece": 0.5572727272727273, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.6363636363636364, "calib/gap": -0.004285714285714337, "calib/mean_conf": 0.8827272727272727, "calib/mu_c": 0.88, "calib/mu_w": 0.8842857142857143, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.93359375, "calib/nonempty_step_conf_rate": 0.90234375, "calib/pce": 0.5381818181818182, "calib/std_conf": 0.14007082034008295, "calib/step_conf_rate": 0.90234375, "calib/step_q_w": 0.7795238095238096, "calib/step_q_w_n": 231.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 138.26171875, "completions/mean_terminated_length": 138.26171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0928, "grad_norm": 0.2774300277233124, "kl": 0.21185302734375, "learning_rate": 3.1666666666666667e-06, "loss": 0.0056, "mask/has_final_conf_rate": 0.04296875, "mask/share_final_conf": 0.0029894590843468904, "mask/share_reasoning": 0.881792426109314, "mask/share_step_conf": 0.1152181625366211, "num_tokens": 17387535.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 87 }, { "adv/mean_abs_final_conf": 0.03856541961431503, "adv/mean_abs_reasoning": 0.13927656412124634, "adv/mean_abs_step_conf": 0.03861725330352783, "adv/ratio_final_to_reasoning": 0.2768981260963772, "adv/ratio_step_to_reasoning": 0.2772702898522814, "adv/std_final_conf": 0.23367911577224731, "adv/std_reasoning": 0.437213271856308, "adv/std_step_conf": 0.23399266600608826, "calib/answer_extract_rate": 0.05859375, "calib/avg_num_step_conf": 0.88671875, "calib/ece": 0.6288888888888889, "calib/final_conf_rate": 0.0703125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.8888888888888888, "calib/gap": -0.013333333333333197, "calib/mean_conf": 0.9622222222222222, "calib/mu_c": 0.9533333333333335, "calib/mu_w": 0.9666666666666667, "calib/nonempty_final_conf_rate": 0.0703125, "calib/nonempty_reasoning_rate": 0.92578125, "calib/nonempty_step_conf_rate": 0.875, "calib/pce": 0.6288888888888889, "calib/std_conf": 0.03189488909868294, "calib/step_conf_rate": 0.875, "calib/step_q_c": 0.6, "calib/step_q_c_n": 1.0, "calib/step_q_gap": -0.19370442477876115, "calib/step_q_w": 0.7937044247787611, "calib/step_q_w_n": 226.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 140.62109375, "completions/mean_terminated_length": 140.62109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.09386666666666667, "grad_norm": 0.5458513498306274, "kl": 0.226104736328125, "learning_rate": 3.138888888888889e-06, "loss": -0.1198, "mask/has_final_conf_rate": 0.0703125, "mask/share_final_conf": 0.0038935919292271137, "mask/share_reasoning": 0.8730115294456482, "mask/share_step_conf": 0.12309487909078598, "num_tokens": 17533382.0, "reward": 0.003952160477638245, "reward_std": 0.012706692330539227, "rewards/accuracy_reward_step": 0.02734375, "rewards/final_brier_reward_step": 0.004346875008195639, "rewards/format_reward_step": 0.0078125, "rewards/step_l2_reward": -0.003473804332315922, "step": 88 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.10836289823055267, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.36953291296958923, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.08203125, "calib/avg_num_step_conf": 0.796875, "calib/ece": 0.6340909090909089, "calib/final_conf_rate": 0.0859375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.9090909090909091, "calib/gap": 0.03857142857142837, "calib/mean_conf": 0.952272727272727, "calib/mu_c": 0.9785714285714285, "calib/mu_w": 0.9400000000000002, "calib/nonempty_final_conf_rate": 0.0859375, "calib/nonempty_reasoning_rate": 0.87890625, "calib/nonempty_step_conf_rate": 0.796875, "calib/pce": 0.6340909090909089, "calib/std_conf": 0.09214276562052143, "calib/step_conf_rate": 0.796875, "calib/step_q_w": 0.7481210784313724, "calib/step_q_w_n": 204.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 159.33984375, "completions/mean_terminated_length": 159.33984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.09493333333333333, "grad_norm": 0.18698692321777344, "kl": 0.219879150390625, "learning_rate": 3.1111111111111116e-06, "loss": -0.0224, "mask/has_final_conf_rate": 0.0859375, "mask/share_final_conf": 0.01074138842523098, "mask/share_reasoning": 0.8866013288497925, "mask/share_step_conf": 0.10265731811523438, "num_tokens": 17683061.0, "reward": 0.002734375186264515, "reward_std": 0.006207750178873539, "rewards/accuracy_reward_step": 0.02734375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 89 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.15290415287017822, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.43724557757377625, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.09375, "calib/avg_num_step_conf": 0.8125, "calib/ece": 0.5904347826086954, "calib/final_conf_rate": 0.08984375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.9565217391304348, "calib/gap": -0.09423076923076923, "calib/mean_conf": 0.9382608695652171, "calib/mu_c": 0.885, "calib/mu_w": 0.9792307692307692, "calib/nonempty_final_conf_rate": 0.08984375, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.8125, "calib/pce": 0.5469565217391302, "calib/std_conf": 0.2004701468377696, "calib/step_conf_rate": 0.8125, "calib/step_q_w": 0.76063125, "calib/step_q_w_n": 208.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 132.74609375, "completions/mean_terminated_length": 132.74609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.096, "grad_norm": 0.3043246865272522, "kl": 0.2758331298828125, "learning_rate": 3.0833333333333336e-06, "loss": -0.081, "mask/has_final_conf_rate": 0.08984375, "mask/share_final_conf": 0.015118993818759918, "mask/share_reasoning": 0.855556070804596, "mask/share_step_conf": 0.12932494282722473, "num_tokens": 17820364.0, "reward": 0.00390625, "reward_std": 0.00875919871032238, "rewards/accuracy_reward_step": 0.0390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 90 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.2954920530319214, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.5959096550941467, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.18359375, "calib/avg_num_step_conf": 0.65625, "calib/ece": 0.46977272727272706, "calib/final_conf_rate": 0.171875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.9318181818181818, "calib/gap": 0.06453416149068314, "calib/mean_conf": 0.933409090909091, "calib/mu_c": 0.9671428571428569, "calib/mu_w": 0.9026086956521737, "calib/nonempty_final_conf_rate": 0.171875, "calib/nonempty_reasoning_rate": 0.83984375, "calib/nonempty_step_conf_rate": 0.65625, "calib/pce": 0.46295454545454523, "calib/std_conf": 0.20092901484918615, "calib/step_conf_rate": 0.65625, "calib/step_q_w": 0.7510714285714286, "calib/step_q_w_n": 168.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 163.19921875, "completions/mean_terminated_length": 163.19921875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.09706666666666666, "grad_norm": 0.2645391821861267, "kl": 0.241119384765625, "learning_rate": 3.055555555555556e-06, "loss": -0.036, "mask/has_final_conf_rate": 0.171875, "mask/share_final_conf": 0.020244672894477844, "mask/share_reasoning": 0.8839057087898254, "mask/share_step_conf": 0.09584958851337433, "num_tokens": 17969855.0, "reward": 0.008593750186264515, "reward_std": 0.016925785690546036, "rewards/accuracy_reward_step": 0.0859375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 91 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.39354297518730164, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.6814802885055542, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.30078125, "calib/avg_num_step_conf": 0.55078125, "calib/ece": 0.4719444444444445, "calib/final_conf_rate": 0.28125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.875, "calib/gap": 0.03213899613899618, "calib/mean_conf": 0.9580555555555557, "calib/mu_c": 0.9745714285714285, "calib/mu_w": 0.9424324324324324, "calib/nonempty_final_conf_rate": 0.28125, "calib/nonempty_reasoning_rate": 0.8515625, "calib/nonempty_step_conf_rate": 0.55078125, "calib/pce": 0.4719444444444445, "calib/std_conf": 0.06317169920341635, "calib/step_conf_rate": 0.55078125, "calib/step_q_w": 0.8117730496453901, "calib/step_q_w_n": 141.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 172.0390625, "completions/mean_terminated_length": 172.0390625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.09813333333333334, "grad_norm": 0.4520159065723419, "kl": 0.23626708984375, "learning_rate": 3.0277777777777776e-06, "loss": -0.0759, "mask/has_final_conf_rate": 0.28125, "mask/share_final_conf": 0.04713796079158783, "mask/share_reasoning": 0.8745333552360535, "mask/share_step_conf": 0.07832865417003632, "num_tokens": 18120617.0, "reward": 0.01484375074505806, "reward_std": 0.022541169077157974, "rewards/accuracy_reward_step": 0.1484375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 92 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.40045166015625, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.6815094947814941, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.35546875, "calib/avg_num_step_conf": 0.51171875, "calib/ece": 0.5515384615384616, "calib/final_conf_rate": 0.35546875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.8901098901098901, "calib/gap": 0.04241414141414146, "calib/mean_conf": 0.9471428571428571, "calib/mu_c": 0.9727777777777779, "calib/mu_w": 0.9303636363636364, "calib/nonempty_final_conf_rate": 0.35546875, "calib/nonempty_reasoning_rate": 0.8671875, "calib/nonempty_step_conf_rate": 0.51171875, "calib/pce": 0.5515384615384616, "calib/std_conf": 0.1252889908977975, "calib/step_conf_rate": 0.51171875, "calib/step_q_w": 0.7734198473282442, "calib/step_q_w_n": 131.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 174.6484375, "completions/mean_terminated_length": 174.6484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0992, "grad_norm": 0.4297439455986023, "kl": 0.2293701171875, "learning_rate": 3e-06, "loss": -0.0498, "mask/has_final_conf_rate": 0.35546875, "mask/share_final_conf": 0.058933135122060776, "mask/share_reasoning": 0.8597003221511841, "mask/share_step_conf": 0.08136658370494843, "num_tokens": 18271103.0, "reward": 0.014062500558793545, "reward_std": 0.02293594926595688, "rewards/accuracy_reward_step": 0.140625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 93 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.560976505279541, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.792765736579895, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.54296875, "calib/avg_num_step_conf": 0.34765625, "calib/ece": 0.5181870503597124, "calib/final_conf_rate": 0.54296875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.9064748201438849, "calib/gap": 0.016617065994114832, "calib/mean_conf": 0.9570359712230215, "calib/mu_c": 0.9663606557377049, "calib/mu_w": 0.9497435897435901, "calib/nonempty_final_conf_rate": 0.54296875, "calib/nonempty_reasoning_rate": 0.890625, "calib/nonempty_step_conf_rate": 0.34765625, "calib/pce": 0.5181870503597124, "calib/std_conf": 0.09007282505830924, "calib/step_conf_rate": 0.34765625, "calib/step_q_w": 0.7235955056179776, "calib/step_q_w_n": 89.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 149.36328125, "completions/mean_terminated_length": 149.94903564453125, "completions/min_length": 0.0, "completions/min_terminated_length": 4.0, "epoch": 0.10026666666666667, "grad_norm": 0.47062671184539795, "kl": 0.300323486328125, "learning_rate": 2.9722222222222225e-06, "loss": -0.0837, "mask/has_final_conf_rate": 0.5390625, "mask/share_final_conf": 0.10760974884033203, "mask/share_reasoning": 0.831997275352478, "mask/share_step_conf": 0.056486740708351135, "num_tokens": 18418020.0, "reward": 0.02421875111758709, "reward_std": 0.032127510756254196, "rewards/accuracy_reward_step": 0.2421875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 94 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5798459053039551, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7928367853164673, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.61328125, "calib/avg_num_step_conf": 0.21875, "calib/ece": 0.5654774193548388, "calib/final_conf_rate": 0.60546875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.8193548387096774, "calib/gap": 0.03835517364840679, "calib/mean_conf": 0.9125741935483872, "calib/mu_c": 0.9368245614035088, "calib/mu_w": 0.898469387755102, "calib/nonempty_final_conf_rate": 0.60546875, "calib/nonempty_reasoning_rate": 0.83203125, "calib/nonempty_step_conf_rate": 0.21875, "calib/pce": 0.5551548387096775, "calib/std_conf": 0.20260869479120286, "calib/step_conf_rate": 0.21875, "calib/step_q_w": 0.7480517857142858, "calib/step_q_w_n": 56.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1212.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 188.37890625, "completions/mean_terminated_length": 188.37890625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.10133333333333333, "grad_norm": 0.6134114265441895, "kl": 0.2447509765625, "learning_rate": 2.944444444444445e-06, "loss": 0.0036, "mask/has_final_conf_rate": 0.60546875, "mask/share_final_conf": 0.13950452208518982, "mask/share_reasoning": 0.8184226155281067, "mask/share_step_conf": 0.04207289218902588, "num_tokens": 18572373.0, "reward": 0.0234375, "reward_std": 0.03320576995611191, "rewards/accuracy_reward_step": 0.234375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 95 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6374795436859131, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8428910374641418, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.5859375, "calib/avg_num_step_conf": 0.25, "calib/ece": 0.5187037037037037, "calib/final_conf_rate": 0.6328125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.8580246913580247, "calib/gap": 2.940721250577205e-05, "calib/mean_conf": 0.9456172839506173, "calib/mu_c": 0.9456338028169015, "calib/mu_w": 0.9456043956043957, "calib/nonempty_final_conf_rate": 0.6328125, "calib/nonempty_reasoning_rate": 0.8359375, "calib/nonempty_step_conf_rate": 0.25, "calib/pce": 0.5130246913580246, "calib/std_conf": 0.09783420946055232, "calib/step_conf_rate": 0.25, "calib/step_q_w": 0.7690625, "calib/step_q_w_n": 64.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 174.91015625, "completions/mean_terminated_length": 175.59608459472656, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.1024, "grad_norm": 0.34501567482948303, "kl": 0.276336669921875, "learning_rate": 2.916666666666667e-06, "loss": 0.0185, "mask/has_final_conf_rate": 0.6328125, "mask/share_final_conf": 0.1448785662651062, "mask/share_reasoning": 0.8194688558578491, "mask/share_step_conf": 0.03174634277820587, "num_tokens": 18722966.0, "reward": 0.02890624850988388, "reward_std": 0.0365084670484066, "rewards/accuracy_reward_step": 0.2890625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 96 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.4943743646144867, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7574437260627747, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.578125, "calib/avg_num_step_conf": 0.30078125, "calib/ece": 0.6255974842767297, "calib/final_conf_rate": 0.62109375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.6981132075471698, "calib/gap": 0.026420008267879158, "calib/mean_conf": 0.8708805031446539, "calib/mu_c": 0.8904878048780487, "calib/mu_w": 0.8640677966101695, "calib/nonempty_final_conf_rate": 0.62109375, "calib/nonempty_reasoning_rate": 0.87890625, "calib/nonempty_step_conf_rate": 0.30078125, "calib/pce": 0.619308176100629, "calib/std_conf": 0.23975454861615617, "calib/step_conf_rate": 0.30078125, "calib/step_q_w": 0.8445220779220781, "calib/step_q_w_n": 77.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 172.87890625, "completions/mean_terminated_length": 172.87890625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.10346666666666667, "grad_norm": 0.38908159732818604, "kl": 0.296417236328125, "learning_rate": 2.888888888888889e-06, "loss": -0.0426, "mask/has_final_conf_rate": 0.62109375, "mask/share_final_conf": 0.16929784417152405, "mask/share_reasoning": 0.7983481287956238, "mask/share_step_conf": 0.03235398232936859, "num_tokens": 18872295.0, "reward": 0.01640625111758709, "reward_std": 0.028315432369709015, "rewards/accuracy_reward_step": 0.1640625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 97 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.45829716324806213, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7391588091850281, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.39453125, "calib/avg_num_step_conf": 0.37890625, "calib/ece": 0.6931932773109243, "calib/final_conf_rate": 0.46484375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.7647058823529411, "calib/gap": -0.006038314176245119, "calib/mean_conf": 0.9200840336134454, "calib/mu_c": 0.9155172413793105, "calib/mu_w": 0.9215555555555556, "calib/nonempty_final_conf_rate": 0.46484375, "calib/nonempty_reasoning_rate": 0.7734375, "calib/nonempty_step_conf_rate": 0.37890625, "calib/pce": 0.6847899159663864, "calib/std_conf": 0.1431547073947489, "calib/step_conf_rate": 0.37890625, "calib/step_q_w": 0.7460824742268042, "calib/step_q_w_n": 97.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 233.30859375, "completions/mean_terminated_length": 234.22354125976562, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.10453333333333334, "grad_norm": 0.2955537438392639, "kl": 0.227935791015625, "learning_rate": 2.861111111111111e-06, "loss": -0.0498, "mask/has_final_conf_rate": 0.4609375, "mask/share_final_conf": 0.11774331331253052, "mask/share_reasoning": 0.8449557423591614, "mask/share_step_conf": 0.033394694328308105, "num_tokens": 19038206.0, "reward": 0.01328125037252903, "reward_std": 0.02625075727701187, "rewards/accuracy_reward_step": 0.1328125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 98 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.2998278737068176, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.6183291673660278, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.38671875, "calib/avg_num_step_conf": 0.453125, "calib/ece": 0.7344554455445544, "calib/final_conf_rate": 0.39453125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.7227722772277227, "calib/gap": 0.01857429718875503, "calib/mean_conf": 0.8930693069306932, "calib/mu_c": 0.9083333333333334, "calib/mu_w": 0.8897590361445784, "calib/nonempty_final_conf_rate": 0.39453125, "calib/nonempty_reasoning_rate": 0.83984375, "calib/nonempty_step_conf_rate": 0.453125, "calib/pce": 0.7246534653465346, "calib/std_conf": 0.1987771655225595, "calib/step_conf_rate": 0.453125, "calib/step_q_w": 0.8375862068965518, "calib/step_q_w_n": 116.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 240.1171875, "completions/mean_terminated_length": 241.058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.1056, "grad_norm": 0.17655298113822937, "kl": 0.2249603271484375, "learning_rate": 2.8333333333333335e-06, "loss": 0.0271, "mask/has_final_conf_rate": 0.39453125, "mask/share_final_conf": 0.12888720631599426, "mask/share_reasoning": 0.8282879590988159, "mask/share_step_conf": 0.03891859948635101, "num_tokens": 19205476.0, "reward": 0.00742187537252903, "reward_std": 0.017176657915115356, "rewards/accuracy_reward_step": 0.07421875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 99 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.36230605840682983, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.6611034274101257, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.39453125, "calib/avg_num_step_conf": 0.515625, "calib/ece": 0.6748543689320389, "calib/final_conf_rate": 0.40234375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.6504854368932039, "calib/gap": -0.04204295704295724, "calib/mean_conf": 0.87873786407767, "calib/mu_c": 0.8473076923076922, "calib/mu_w": 0.8893506493506494, "calib/nonempty_final_conf_rate": 0.40234375, "calib/nonempty_reasoning_rate": 0.91015625, "calib/nonempty_step_conf_rate": 0.515625, "calib/pce": 0.6505825242718447, "calib/std_conf": 0.19604180063302273, "calib/step_conf_rate": 0.515625, "calib/step_q_w": 0.7578030303030303, "calib/step_q_w_n": 132.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 201.21875, "completions/mean_terminated_length": 201.21875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.10666666666666667, "grad_norm": 0.23728390038013458, "kl": 0.248291015625, "learning_rate": 2.805555555555556e-06, "loss": 0.0129, "mask/has_final_conf_rate": 0.40234375, "mask/share_final_conf": 0.15121278166770935, "mask/share_reasoning": 0.8124673962593079, "mask/share_step_conf": 0.036319803446531296, "num_tokens": 19364396.0, "reward": 0.01015624962747097, "reward_std": 0.020753080025315285, "rewards/accuracy_reward_step": 0.1015625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 100 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.2316746860742569, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.5226800441741943, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.3125, "calib/avg_num_step_conf": 0.50390625, "calib/ece": 0.6648275862068965, "calib/final_conf_rate": 0.33984375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.5862068965517241, "calib/gap": 0.04954334365325075, "calib/mean_conf": 0.8639080459770117, "calib/mu_c": 0.9026315789473683, "calib/mu_w": 0.8530882352941176, "calib/nonempty_final_conf_rate": 0.33984375, "calib/nonempty_reasoning_rate": 0.81640625, "calib/nonempty_step_conf_rate": 0.50390625, "calib/pce": 0.6551724137931034, "calib/std_conf": 0.20962668691341935, "calib/step_conf_rate": 0.50390625, "calib/step_q_w": 0.7727906976744185, "calib/step_q_w_n": 129.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 272.4765625, "completions/mean_terminated_length": 272.4765625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.10773333333333333, "grad_norm": 0.16580340266227722, "kl": 0.199188232421875, "learning_rate": 2.7777777777777783e-06, "loss": 0.0138, "mask/has_final_conf_rate": 0.33984375, "mask/share_final_conf": 0.13236968219280243, "mask/share_reasoning": 0.8355380892753601, "mask/share_step_conf": 0.03209220618009567, "num_tokens": 19541142.0, "reward": 0.00742187537252903, "reward_std": 0.013269728049635887, "rewards/accuracy_reward_step": 0.07421875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 101 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.30013322830200195, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.572675883769989, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.30859375, "calib/avg_num_step_conf": 0.60546875, "calib/ece": 0.5996341463414635, "calib/final_conf_rate": 0.3203125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.4878048780487805, "calib/gap": -0.11944444444444435, "calib/mean_conf": 0.818658536585366, "calib/mu_c": 0.7399999999999999, "calib/mu_w": 0.8594444444444442, "calib/nonempty_final_conf_rate": 0.3203125, "calib/nonempty_reasoning_rate": 0.9140625, "calib/nonempty_step_conf_rate": 0.60546875, "calib/pce": 0.5384146341463416, "calib/std_conf": 0.23736748392972173, "calib/step_conf_rate": 0.60546875, "calib/step_q_w": 0.7538709677419354, "calib/step_q_w_n": 155.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 182.15234375, "completions/mean_terminated_length": 182.86668395996094, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.1088, "grad_norm": 0.24702408909797668, "kl": 0.2725830078125, "learning_rate": 2.7500000000000004e-06, "loss": 0.0477, "mask/has_final_conf_rate": 0.31640625, "mask/share_final_conf": 0.13098107278347015, "mask/share_reasoning": 0.8201836347579956, "mask/share_step_conf": 0.044929005205631256, "num_tokens": 19694469.0, "reward": 0.011328124441206455, "reward_std": 0.01718788966536522, "rewards/accuracy_reward_step": 0.11328125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 102 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.2895159423351288, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.5958887934684753, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.40234375, "calib/avg_num_step_conf": 0.5, "calib/ece": 0.7086407766990289, "calib/final_conf_rate": 0.40234375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.5631067961165048, "calib/gap": -0.04659774436090214, "calib/mean_conf": 0.84747572815534, "calib/mu_c": 0.8094736842105263, "calib/mu_w": 0.8560714285714285, "calib/nonempty_final_conf_rate": 0.40234375, "calib/nonempty_reasoning_rate": 0.90234375, "calib/nonempty_step_conf_rate": 0.5, "calib/pce": 0.6858252427184464, "calib/std_conf": 0.2311462297074407, "calib/step_conf_rate": 0.5, "calib/step_q_w": 0.741015625, "calib/step_q_w_n": 128.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 217.74609375, "completions/mean_terminated_length": 217.74609375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.10986666666666667, "grad_norm": 0.1895759403705597, "kl": 0.2274017333984375, "learning_rate": 2.7222222222222224e-06, "loss": 0.0327, "mask/has_final_conf_rate": 0.40234375, "mask/share_final_conf": 0.18485210835933685, "mask/share_reasoning": 0.7832228541374207, "mask/share_step_conf": 0.0319250263273716, "num_tokens": 19854764.0, "reward": 0.0078125, "reward_std": 0.01658429019153118, "rewards/accuracy_reward_step": 0.078125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 103 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.2904398739337921, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.5958819389343262, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.5546875, "calib/avg_num_step_conf": 0.35546875, "calib/ece": 0.6597841726618705, "calib/final_conf_rate": 0.54296875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.43884892086330934, "calib/gap": 0.0170217391304347, "calib/mean_conf": 0.8084172661870503, "calib/mu_c": 0.8224999999999999, "calib/mu_w": 0.8054782608695652, "calib/nonempty_final_conf_rate": 0.54296875, "calib/nonempty_reasoning_rate": 0.91015625, "calib/nonempty_step_conf_rate": 0.35546875, "calib/pce": 0.6477697841726618, "calib/std_conf": 0.2301618416917616, "calib/step_conf_rate": 0.35546875, "calib/step_q_w": 0.7660439560439561, "calib/step_q_w_n": 91.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 152.58203125, "completions/mean_terminated_length": 152.58203125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.11093333333333333, "grad_norm": 0.21708692610263824, "kl": 0.3060302734375, "learning_rate": 2.6944444444444444e-06, "loss": -0.0092, "mask/has_final_conf_rate": 0.54296875, "mask/share_final_conf": 0.26213765144348145, "mask/share_reasoning": 0.7116431593894958, "mask/share_step_conf": 0.026219181716442108, "num_tokens": 20000505.0, "reward": 0.00937500037252903, "reward_std": 0.016637086868286133, "rewards/accuracy_reward_step": 0.09375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 104 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.32168108224868774, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.6184209585189819, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.55078125, "calib/avg_num_step_conf": 0.33984375, "calib/ece": 0.6417605633802816, "calib/final_conf_rate": 0.5546875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.39436619718309857, "calib/gap": 0.0321286472148542, "calib/mean_conf": 0.796830985915493, "calib/mu_c": 0.8230769230769232, "calib/mu_w": 0.790948275862069, "calib/nonempty_final_conf_rate": 0.5546875, "calib/nonempty_reasoning_rate": 0.890625, "calib/nonempty_step_conf_rate": 0.33984375, "calib/pce": 0.6277464788732394, "calib/std_conf": 0.2340650256295708, "calib/step_conf_rate": 0.33984375, "calib/step_q_w": 0.7344827586206897, "calib/step_q_w_n": 87.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2006.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 176.86328125, "completions/mean_terminated_length": 176.86328125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.112, "grad_norm": 0.26342976093292236, "kl": 0.2776336669921875, "learning_rate": 2.666666666666667e-06, "loss": 0.0817, "mask/has_final_conf_rate": 0.5546875, "mask/share_final_conf": 0.26337558031082153, "mask/share_reasoning": 0.7090876698493958, "mask/share_step_conf": 0.027536744251847267, "num_tokens": 20151542.0, "reward": 0.01132812537252903, "reward_std": 0.01842541992664337, "rewards/accuracy_reward_step": 0.11328125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 105 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.28838005661964417, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.5958743095397949, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.625, "calib/avg_num_step_conf": 0.28125, "calib/ece": 0.6691975308641975, "calib/final_conf_rate": 0.6328125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.4382716049382716, "calib/gap": -0.00977710233029383, "calib/mean_conf": 0.7899382716049385, "calib/mu_c": 0.7814285714285715, "calib/mu_w": 0.7912056737588653, "calib/nonempty_final_conf_rate": 0.6328125, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.28125, "calib/pce": 0.6647530864197531, "calib/std_conf": 0.2551869247433646, "calib/step_conf_rate": 0.28125, "calib/step_q_w": 0.7486111111111111, "calib/step_q_w_n": 72.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 135.83984375, "completions/mean_terminated_length": 136.37255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.11306666666666666, "grad_norm": 0.17964833974838257, "kl": 0.297821044921875, "learning_rate": 2.6388888888888893e-06, "loss": -0.0105, "mask/has_final_conf_rate": 0.62890625, "mask/share_final_conf": 0.308359295129776, "mask/share_reasoning": 0.6718757152557373, "mask/share_step_conf": 0.01585877686738968, "num_tokens": 20290901.0, "reward": 0.008593750186264515, "reward_std": 0.01651938259601593, "rewards/accuracy_reward_step": 0.0859375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 106 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.2720876932144165, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.5725226998329163, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.69921875, "calib/avg_num_step_conf": 0.23046875, "calib/ece": 0.6738212290502794, "calib/final_conf_rate": 0.69921875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3240223463687151, "calib/gap": -0.02180394736842106, "calib/mean_conf": 0.7579106145251396, "calib/mu_c": 0.7384210526315789, "calib/mu_w": 0.7602249999999999, "calib/nonempty_final_conf_rate": 0.69921875, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.23046875, "calib/pce": 0.6627932960893855, "calib/std_conf": 0.2682787528242608, "calib/step_conf_rate": 0.23046875, "calib/step_q_w": 0.7813559322033898, "calib/step_q_w_n": 59.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 107.0859375, "completions/mean_terminated_length": 107.50588989257812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.11413333333333334, "grad_norm": 0.30144068598747253, "kl": 0.359100341796875, "learning_rate": 2.6111111111111113e-06, "loss": -0.0394, "mask/has_final_conf_rate": 0.6953125, "mask/share_final_conf": 0.3688632845878601, "mask/share_reasoning": 0.6099119186401367, "mask/share_step_conf": 0.017318598926067352, "num_tokens": 20422931.0, "reward": 0.008593750186264515, "reward_std": 0.015585274435579777, "rewards/accuracy_reward_step": 0.0859375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 107 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.49251788854599, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7574393153190613, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.75, "calib/avg_num_step_conf": 0.1875, "calib/ece": 0.5028571428571429, "calib/final_conf_rate": 0.73828125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3544973544973545, "calib/gap": 0.019162937615161968, "calib/mean_conf": 0.7874074074074073, "calib/mu_c": 0.8006896551724139, "calib/mu_w": 0.7815267175572519, "calib/nonempty_final_conf_rate": 0.73828125, "calib/nonempty_reasoning_rate": 0.9375, "calib/nonempty_step_conf_rate": 0.1875, "calib/pce": 0.4916931216931216, "calib/std_conf": 0.22954325052258504, "calib/step_conf_rate": 0.1875, "calib/step_q_w": 0.7729166666666666, "calib/step_q_w_n": 48.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 112.07421875, "completions/mean_terminated_length": 112.51373291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.1152, "grad_norm": 0.32904762029647827, "kl": 0.327239990234375, "learning_rate": 2.5833333333333337e-06, "loss": 0.141, "mask/has_final_conf_rate": 0.734375, "mask/share_final_conf": 0.3876108229160309, "mask/share_reasoning": 0.5916550159454346, "mask/share_step_conf": 0.01682785153388977, "num_tokens": 20554854.0, "reward": 0.02265625074505806, "reward_std": 0.028209349140524864, "rewards/accuracy_reward_step": 0.2265625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 108 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.43973737955093384, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7013425230979919, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.8359375, "calib/avg_num_step_conf": 0.1171875, "calib/ece": 0.549514563106796, "calib/final_conf_rate": 0.8046875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3786407766990291, "calib/gap": 0.05893205080774444, "calib/mean_conf": 0.8067961165048543, "calib/mu_c": 0.850566037735849, "calib/mu_w": 0.7916339869281046, "calib/nonempty_final_conf_rate": 0.8046875, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.1171875, "calib/pce": 0.549514563106796, "calib/std_conf": 0.21122257892800225, "calib/step_conf_rate": 0.1171875, "calib/step_q_w": 0.7606666666666667, "calib/step_q_w_n": 30.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 84.38671875, "completions/mean_terminated_length": 84.38671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.11626666666666667, "grad_norm": 0.479681134223938, "kl": 0.422088623046875, "learning_rate": 2.5555555555555557e-06, "loss": -0.0318, "mask/has_final_conf_rate": 0.8046875, "mask/share_final_conf": 0.40282905101776123, "mask/share_reasoning": 0.5846548080444336, "mask/share_step_conf": 0.012516127899289131, "num_tokens": 20681057.0, "reward": 0.021484375, "reward_std": 0.025183971971273422, "rewards/accuracy_reward_step": 0.21484375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 109 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.26682794094085693, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.5482406616210938, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.875, "calib/avg_num_step_conf": 0.08984375, "calib/ece": 0.6419282511210762, "calib/final_conf_rate": 0.87109375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3183856502242152, "calib/gap": 0.027856381087806525, "calib/mean_conf": 0.7719730941704035, "calib/mu_c": 0.796206896551724, "calib/mu_w": 0.7683505154639175, "calib/nonempty_final_conf_rate": 0.87109375, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.08984375, "calib/pce": 0.6419282511210762, "calib/std_conf": 0.22947524304040517, "calib/step_conf_rate": 0.08984375, "calib/step_q_w": 0.6339130434782608, "calib/step_q_w_n": 23.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 77.94140625, "completions/mean_terminated_length": 78.24706268310547, "completions/min_length": 0.0, "completions/min_terminated_length": 4.0, "epoch": 0.11733333333333333, "grad_norm": 0.434518426656723, "kl": 0.47857666015625, "learning_rate": 2.5277777777777778e-06, "loss": -0.1372, "mask/has_final_conf_rate": 0.8671875, "mask/share_final_conf": 0.4286842346191406, "mask/share_reasoning": 0.5606935024261475, "mask/share_step_conf": 0.006716080941259861, "num_tokens": 20805930.0, "reward": 0.01132812537252903, "reward_std": 0.015281605534255505, "rewards/accuracy_reward_step": 0.11328125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 110 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.3461071252822876, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.5961334705352783, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.87890625, "calib/avg_num_step_conf": 0.0703125, "calib/ece": 0.5774269005847953, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.27631578947368424, "calib/gap": -0.016889568764568597, "calib/mean_conf": 0.7620760233918129, "calib/mu_c": 0.7490384615384618, "calib/mu_w": 0.7659280303030304, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.0703125, "calib/pce": 0.5557163742690058, "calib/std_conf": 0.2304349605063494, "calib/step_conf_rate": 0.0703125, "calib/step_q_w": 0.7294444444444445, "calib/step_q_w_n": 18.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 69.4140625, "completions/mean_terminated_length": 69.4140625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.1184, "grad_norm": 0.5920610427856445, "kl": 0.51239013671875, "learning_rate": 2.5e-06, "loss": -0.0325, "mask/has_final_conf_rate": 0.890625, "mask/share_final_conf": 0.4412933588027954, "mask/share_reasoning": 0.5496131777763367, "mask/share_step_conf": 0.009093429893255234, "num_tokens": 20931108.0, "reward": 0.02031250111758709, "reward_std": 0.01981809176504612, "rewards/accuracy_reward_step": 0.203125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 111 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.4163287281990051, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.6815744638442993, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 0.01953125, "calib/ece": 0.5136514522821576, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.26141078838174275, "calib/gap": 0.026646881477293993, "calib/mean_conf": 0.7226970954356847, "calib/mu_c": 0.7429310344827585, "calib/mu_w": 0.7162841530054646, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.49784232365145226, "calib/std_conf": 0.2611397527943396, "calib/step_conf_rate": 0.01953125, "calib/step_q_w": 0.7539999999999999, "calib/step_q_w_n": 5.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 54.40625, "completions/mean_terminated_length": 54.40625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.11946666666666667, "grad_norm": 0.7337045073509216, "kl": 0.593536376953125, "learning_rate": 2.4722222222222226e-06, "loss": -0.0399, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.49154359102249146, "mask/share_reasoning": 0.5072588324546814, "mask/share_step_conf": 0.0011975823435932398, "num_tokens": 21052956.0, "reward": 0.02265625074505806, "reward_std": 0.02384321764111519, "rewards/accuracy_reward_step": 0.2265625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 112 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5207710862159729, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7752877473831177, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.9296875, "calib/avg_num_step_conf": 0.0234375, "calib/ece": 0.6300747899159664, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.31092436974789917, "calib/gap": -0.020954728682170742, "calib/mean_conf": 0.7815873949579831, "calib/mu_c": 0.7644186046511627, "calib/mu_w": 0.7853733333333335, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.6154949579831932, "calib/std_conf": 0.21596721197403, "calib/step_conf_rate": 0.0234375, "calib/step_q_w": 0.8350000000000001, "calib/step_q_w_n": 6.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 83.03515625, "completions/mean_terminated_length": 83.03515625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.12053333333333334, "grad_norm": 0.8056948781013489, "kl": 0.4501953125, "learning_rate": 2.4444444444444447e-06, "loss": -0.1778, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.4397127032279968, "mask/share_reasoning": 0.5587553381919861, "mask/share_step_conf": 0.0015319802332669497, "num_tokens": 21179413.0, "reward": 0.01718750223517418, "reward_std": 0.0298269335180521, "rewards/accuracy_reward_step": 0.171875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 113 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6232556104660034, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8265672922134399, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.91015625, "calib/avg_num_step_conf": 0.05859375, "calib/ece": 0.4028260869565218, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.25217391304347825, "calib/gap": 0.01284474071777375, "calib/mean_conf": 0.760304347826087, "calib/mu_c": 0.7677319587628866, "calib/mu_w": 0.7548872180451128, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.37069565217391304, "calib/std_conf": 0.24044052591615153, "calib/step_conf_rate": 0.05859375, "calib/step_q_w": 0.7380000000000001, "calib/step_q_w_n": 15.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 149.109375, "completions/mean_terminated_length": 149.69412231445312, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.1216, "grad_norm": 0.7283843755722046, "kl": 0.28704833984375, "learning_rate": 2.4166666666666667e-06, "loss": -0.1192, "mask/has_final_conf_rate": 0.89453125, "mask/share_final_conf": 0.29339420795440674, "mask/share_reasoning": 0.6928345561027527, "mask/share_step_conf": 0.009865010157227516, "num_tokens": 21322609.0, "reward": 0.03828125074505806, "reward_std": 0.03569255769252777, "rewards/accuracy_reward_step": 0.3828125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 114 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6219078302383423, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8099403381347656, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.78515625, "calib/avg_num_step_conf": 0.109375, "calib/ece": 0.3879310344827587, "calib/final_conf_rate": 0.79296875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3448275862068966, "calib/gap": 0.09248554336989034, "calib/mean_conf": 0.7869458128078818, "calib/mu_c": 0.8407058823529411, "calib/mu_w": 0.7482203389830507, "calib/nonempty_final_conf_rate": 0.79296875, "calib/nonempty_reasoning_rate": 0.89453125, "calib/nonempty_step_conf_rate": 0.109375, "calib/pce": 0.3780788177339902, "calib/std_conf": 0.22538315929172625, "calib/step_conf_rate": 0.109375, "calib/step_q_w": 0.7196428571428573, "calib/step_q_w_n": 28.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2805.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 243.6328125, "completions/mean_terminated_length": 243.6328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.12266666666666666, "grad_norm": 0.5352555513381958, "kl": 0.211669921875, "learning_rate": 2.388888888888889e-06, "loss": -0.0547, "mask/has_final_conf_rate": 0.79296875, "mask/share_final_conf": 0.16653913259506226, "mask/share_reasoning": 0.8282657861709595, "mask/share_step_conf": 0.0051950933411717415, "num_tokens": 21490243.0, "reward": 0.03359375149011612, "reward_std": 0.035612430423498154, "rewards/accuracy_reward_step": 0.3359375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 115 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5753067135810852, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7754773497581482, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.55078125, "calib/avg_num_step_conf": 0.34375, "calib/ece": 0.38651851851851843, "calib/final_conf_rate": 0.52734375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3037037037037037, "calib/gap": 0.0821271831616659, "calib/mean_conf": 0.7817777777777777, "calib/mu_c": 0.8286206896551724, "calib/mu_w": 0.7464935064935065, "calib/nonempty_final_conf_rate": 0.52734375, "calib/nonempty_reasoning_rate": 0.89453125, "calib/nonempty_step_conf_rate": 0.34375, "calib/pce": 0.36933333333333324, "calib/std_conf": 0.23689618500333814, "calib/step_conf_rate": 0.34375, "calib/step_q_w": 0.7093181818181818, "calib/step_q_w_n": 88.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 330.046875, "completions/mean_terminated_length": 330.046875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.12373333333333333, "grad_norm": 0.3845723569393158, "kl": 0.1770172119140625, "learning_rate": 2.361111111111111e-06, "loss": 0.0159, "mask/has_final_conf_rate": 0.52734375, "mask/share_final_conf": 0.04086475074291229, "mask/share_reasoning": 0.9291352033615112, "mask/share_step_conf": 0.030000029131770134, "num_tokens": 21679255.0, "reward": 0.02460937574505806, "reward_std": 0.032943278551101685, "rewards/accuracy_reward_step": 0.24609375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 116 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.4300439953804016, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7204158902168274, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.390625, "calib/avg_num_step_conf": 0.50390625, "calib/ece": 0.481078431372549, "calib/final_conf_rate": 0.3984375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.37254901960784315, "calib/gap": -0.00723667377398729, "calib/mean_conf": 0.7830392156862743, "calib/mu_c": 0.7782857142857142, "calib/mu_w": 0.7855223880597015, "calib/nonempty_final_conf_rate": 0.3984375, "calib/nonempty_reasoning_rate": 0.89453125, "calib/nonempty_step_conf_rate": 0.50390625, "calib/pce": 0.46049019607843134, "calib/std_conf": 0.2285966960446325, "calib/step_conf_rate": 0.50390625, "calib/step_q_w": 0.7446434108527131, "calib/step_q_w_n": 129.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 312.45703125, "completions/mean_terminated_length": 312.45703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1248, "grad_norm": 0.1619957536458969, "kl": 0.179473876953125, "learning_rate": 2.3333333333333336e-06, "loss": 0.0044, "mask/has_final_conf_rate": 0.3984375, "mask/share_final_conf": 0.02772274985909462, "mask/share_reasoning": 0.9462649822235107, "mask/share_step_conf": 0.02601221576333046, "num_tokens": 21865844.0, "reward": 0.014062500558793545, "reward_std": 0.024633172899484634, "rewards/accuracy_reward_step": 0.140625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 117 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.3409615159034729, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.6401100158691406, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.19140625, "calib/avg_num_step_conf": 0.65625, "calib/ece": 0.3719607843137256, "calib/final_conf_rate": 0.19921875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.45098039215686275, "calib/gap": 0.04656923076923081, "calib/mean_conf": 0.8429411764705881, "calib/mu_c": 0.8657692307692308, "calib/mu_w": 0.8192, "calib/nonempty_final_conf_rate": 0.19921875, "calib/nonempty_reasoning_rate": 0.84765625, "calib/nonempty_step_conf_rate": 0.65625, "calib/pce": 0.35254901960784324, "calib/std_conf": 0.1939506114376732, "calib/step_conf_rate": 0.65625, "calib/step_q_w": 0.7661904761904762, "calib/step_q_w_n": 168.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 320.09375, "completions/mean_terminated_length": 320.09375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.12586666666666665, "grad_norm": 0.15597763657569885, "kl": 0.179046630859375, "learning_rate": 2.305555555555556e-06, "loss": 0.0162, "mask/has_final_conf_rate": 0.19921875, "mask/share_final_conf": 0.015014585107564926, "mask/share_reasoning": 0.9491573572158813, "mask/share_step_conf": 0.035828039050102234, "num_tokens": 22051796.0, "reward": 0.01015624962747097, "reward_std": 0.019530273973941803, "rewards/accuracy_reward_step": 0.1015625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 118 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.3127126395702362, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.6183814406394958, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.1875, "calib/avg_num_step_conf": 0.66015625, "calib/ece": 0.25979591836734695, "calib/final_conf_rate": 0.19140625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3469387755102041, "calib/gap": 0.15809364548494986, "calib/mean_conf": 0.790408163265306, "calib/mu_c": 0.8646153846153847, "calib/mu_w": 0.7065217391304348, "calib/nonempty_final_conf_rate": 0.19140625, "calib/nonempty_reasoning_rate": 0.84765625, "calib/nonempty_step_conf_rate": 0.66015625, "calib/pce": 0.25979591836734695, "calib/std_conf": 0.2042203610116495, "calib/step_conf_rate": 0.66015625, "calib/step_q_w": 0.7396449704142012, "calib/step_q_w_n": 169.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 371.734375, "completions/mean_terminated_length": 371.734375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.12693333333333334, "grad_norm": 0.14461444318294525, "kl": 0.1634979248046875, "learning_rate": 2.277777777777778e-06, "loss": 0.0027, "mask/has_final_conf_rate": 0.19140625, "mask/share_final_conf": 0.007947621867060661, "mask/share_reasoning": 0.9608822464942932, "mask/share_step_conf": 0.03117012232542038, "num_tokens": 22252024.0, "reward": 0.010546875186264515, "reward_std": 0.017912933602929115, "rewards/accuracy_reward_step": 0.10546875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 119 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.24497468769550323, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.5481371879577637, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.1015625, "calib/avg_num_step_conf": 0.75390625, "calib/ece": 0.3674074074074074, "calib/final_conf_rate": 0.10546875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2222222222222222, "calib/gap": -0.0340588235294117, "calib/mean_conf": 0.6755555555555555, "calib/mu_c": 0.6629411764705884, "calib/mu_w": 0.6970000000000001, "calib/nonempty_final_conf_rate": 0.10546875, "calib/nonempty_reasoning_rate": 0.85546875, "calib/nonempty_step_conf_rate": 0.75390625, "calib/pce": 0.20666666666666667, "calib/std_conf": 0.2829518128499299, "calib/step_conf_rate": 0.75390625, "calib/step_q_w": 0.7301554404145078, "calib/step_q_w_n": 193.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 304.0, "completions/mean_terminated_length": 304.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.128, "grad_norm": 0.1572761982679367, "kl": 0.170440673828125, "learning_rate": 2.25e-06, "loss": 0.0256, "mask/has_final_conf_rate": 0.10546875, "mask/share_final_conf": 0.0060433279722929, "mask/share_reasoning": 0.9575179815292358, "mask/share_step_conf": 0.03643868863582611, "num_tokens": 22436536.0, "reward": 0.006640625186264515, "reward_std": 0.014032842591404915, "rewards/accuracy_reward_step": 0.06640625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 120 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.18342046439647675, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.495700865983963, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0859375, "calib/avg_num_step_conf": 0.796875, "calib/ece": 0.5049999999999999, "calib/final_conf_rate": 0.0859375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.36363636363636365, "calib/gap": -0.08766666666666678, "calib/mean_conf": 0.7968181818181818, "calib/mu_c": 0.749, "calib/mu_w": 0.8366666666666668, "calib/nonempty_final_conf_rate": 0.0859375, "calib/nonempty_reasoning_rate": 0.8828125, "calib/nonempty_step_conf_rate": 0.796875, "calib/pce": 0.4236363636363636, "calib/std_conf": 0.20567817676334602, "calib/step_conf_rate": 0.796875, "calib/step_q_w": 0.7502941176470589, "calib/step_q_w_n": 204.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2400.0, "completions/max_terminated_length": 2400.0, "completions/mean_length": 355.26171875, "completions/mean_terminated_length": 356.6549377441406, "completions/min_length": 0.0, "completions/min_terminated_length": 59.0, "epoch": 0.12906666666666666, "grad_norm": 0.10364165157079697, "kl": 0.1559600830078125, "learning_rate": 2.222222222222222e-06, "loss": 0.0065, "mask/has_final_conf_rate": 0.0859375, "mask/share_final_conf": 0.0035274566616863012, "mask/share_reasoning": 0.9567550420761108, "mask/share_step_conf": 0.035811252892017365, "num_tokens": 22632539.0, "reward": 0.004687500186264515, "reward_std": 0.010509217157959938, "rewards/accuracy_reward_step": 0.046875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 121 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.18548457324504852, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.4957216680049896, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.09375, "calib/avg_num_step_conf": 0.78515625, "calib/ece": 0.3540909090909091, "calib/final_conf_rate": 0.0859375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.22727272727272727, "calib/gap": 0.030000000000000027, "calib/mean_conf": 0.7913636363636364, "calib/mu_c": 0.8063636363636363, "calib/mu_w": 0.7763636363636363, "calib/nonempty_final_conf_rate": 0.0859375, "calib/nonempty_reasoning_rate": 0.87890625, "calib/nonempty_step_conf_rate": 0.78515625, "calib/pce": 0.32272727272727275, "calib/std_conf": 0.21880450914718483, "calib/step_conf_rate": 0.78515625, "calib/step_q_w": 0.739004975124378, "calib/step_q_w_n": 201.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 330.5859375, "completions/mean_terminated_length": 330.5859375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.13013333333333332, "grad_norm": 0.11776480823755264, "kl": 0.1688232421875, "learning_rate": 2.1944444444444445e-06, "loss": 0.0041, "mask/has_final_conf_rate": 0.0859375, "mask/share_final_conf": 0.00397895835340023, "mask/share_reasoning": 0.9588773250579834, "mask/share_step_conf": 0.037143707275390625, "num_tokens": 22824513.0, "reward": 0.0042968750931322575, "reward_std": 0.01062716729938984, "rewards/accuracy_reward_step": 0.04296875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 122 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.1469237357378006, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.43720394372940063, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.08984375, "calib/avg_num_step_conf": 0.7578125, "calib/ece": 0.3704761904761904, "calib/final_conf_rate": 0.08203125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.23809523809523808, "calib/gap": 0.015833333333333366, "calib/mean_conf": 0.7876190476190476, "calib/mu_c": 0.7966666666666666, "calib/mu_w": 0.7808333333333333, "calib/nonempty_final_conf_rate": 0.08203125, "calib/nonempty_reasoning_rate": 0.84765625, "calib/nonempty_step_conf_rate": 0.7578125, "calib/pce": 0.36476190476190473, "calib/std_conf": 0.15556057653056427, "calib/step_conf_rate": 0.7578125, "calib/step_q_w": 0.7216494845360825, "calib/step_q_w_n": 194.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 355.66015625, "completions/mean_terminated_length": 355.66015625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.1312, "grad_norm": 0.09586463868618011, "kl": 0.1557159423828125, "learning_rate": 2.166666666666667e-06, "loss": -0.0006, "mask/has_final_conf_rate": 0.08203125, "mask/share_final_conf": 0.002528803190216422, "mask/share_reasoning": 0.9620292782783508, "mask/share_step_conf": 0.035441912710666656, "num_tokens": 23020850.0, "reward": 0.0035156249068677425, "reward_std": 0.008417459204792976, "rewards/accuracy_reward_step": 0.03515625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 123 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.17352376878261566, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.49564820528030396, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0625, "calib/avg_num_step_conf": 0.7890625, "calib/ece": 0.33187500000000003, "calib/final_conf_rate": 0.0625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.4375, "calib/gap": 0.04079365079365094, "calib/mean_conf": 0.8943749999999999, "calib/mu_c": 0.9122222222222223, "calib/mu_w": 0.8714285714285713, "calib/nonempty_final_conf_rate": 0.0625, "calib/nonempty_reasoning_rate": 0.8515625, "calib/nonempty_step_conf_rate": 0.7890625, "calib/pce": 0.33187500000000003, "calib/std_conf": 0.08276236690066324, "calib/step_conf_rate": 0.7890625, "calib/step_q_w": 0.7372772277227722, "calib/step_q_w_n": 202.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 320.1875, "completions/mean_terminated_length": 320.1875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.13226666666666667, "grad_norm": 0.1139259859919548, "kl": 0.1735076904296875, "learning_rate": 2.138888888888889e-06, "loss": -0.0024, "mask/has_final_conf_rate": 0.0625, "mask/share_final_conf": 0.0033107856288552284, "mask/share_reasoning": 0.9576960206031799, "mask/share_step_conf": 0.03899314999580383, "num_tokens": 23209634.0, "reward": 0.003515625139698386, "reward_std": 0.009943688288331032, "rewards/accuracy_reward_step": 0.03515625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 124 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.05784125626087189, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.2861626148223877, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.04296875, "calib/avg_num_step_conf": 0.79296875, "calib/ece": 0.6881818181818182, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.45454545454545453, "calib/gap": -0.07000000000000006, "calib/mean_conf": 0.860909090909091, "calib/mu_c": 0.8099999999999999, "calib/mu_w": 0.88, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.8359375, "calib/nonempty_step_conf_rate": 0.79296875, "calib/pce": 0.6381818181818182, "calib/std_conf": 0.10457311886944293, "calib/step_conf_rate": 0.79296875, "calib/step_q_w": 0.7353201970443349, "calib/step_q_w_n": 203.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1139.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 332.6484375, "completions/mean_terminated_length": 333.9529724121094, "completions/min_length": 0.0, "completions/min_terminated_length": 40.0, "epoch": 0.13333333333333333, "grad_norm": 0.07615622878074646, "kl": 0.156707763671875, "learning_rate": 2.1111111111111114e-06, "loss": -0.003, "mask/has_final_conf_rate": 0.04296875, "mask/share_final_conf": 0.0015664431266486645, "mask/share_reasoning": 0.9517641067504883, "mask/share_step_conf": 0.04276318848133087, "num_tokens": 23399600.0, "reward": 0.0011718750465661287, "reward_std": 0.003314562840387225, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 125 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.044541239738464355, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.23372872173786163, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0546875, "calib/avg_num_step_conf": 0.73828125, "calib/ece": 0.6771428571428572, "calib/final_conf_rate": 0.0546875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2857142857142857, "calib/gap": -0.16606060606060602, "calib/mean_conf": 0.8371428571428571, "calib/mu_c": 0.7066666666666667, "calib/mu_w": 0.8727272727272727, "calib/nonempty_final_conf_rate": 0.0546875, "calib/nonempty_reasoning_rate": 0.79296875, "calib/nonempty_step_conf_rate": 0.73828125, "calib/pce": 0.65, "calib/std_conf": 0.10470561258176396, "calib/step_conf_rate": 0.73828125, "calib/step_q_w": 0.737037037037037, "calib/step_q_w_n": 189.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1928.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 332.74609375, "completions/mean_terminated_length": 332.74609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1344, "grad_norm": 0.05615081265568733, "kl": 0.1518402099609375, "learning_rate": 2.0833333333333334e-06, "loss": 0.0113, "mask/has_final_conf_rate": 0.0546875, "mask/share_final_conf": 0.002635817974805832, "mask/share_reasoning": 0.9557403922080994, "mask/share_step_conf": 0.041623782366514206, "num_tokens": 23590247.0, "reward": 0.0011718750465661287, "reward_std": 0.002551448065787554, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 126 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.07712167501449585, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.33043214678764343, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0390625, "calib/avg_num_step_conf": 0.84375, "calib/ece": 0.43666666666666665, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2222222222222222, "calib/gap": 0.12150000000000005, "calib/mean_conf": 0.7100000000000001, "calib/mu_c": 0.7775, "calib/mu_w": 0.6559999999999999, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.8828125, "calib/nonempty_step_conf_rate": 0.84375, "calib/pce": 0.35111111111111104, "calib/std_conf": 0.21974732965132682, "calib/step_conf_rate": 0.84375, "calib/step_q_w": 0.7301388888888888, "calib/step_q_w_n": 216.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 303.4453125, "completions/mean_terminated_length": 303.4453125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.13546666666666668, "grad_norm": 0.07867080718278885, "kl": 0.1750335693359375, "learning_rate": 2.0555555555555555e-06, "loss": 0.0151, "mask/has_final_conf_rate": 0.03515625, "mask/share_final_conf": 0.00164020957890898, "mask/share_reasoning": 0.9506195783615112, "mask/share_step_conf": 0.04774019867181778, "num_tokens": 23771601.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 127 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.06382165849208832, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.2862262725830078, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.05078125, "calib/avg_num_step_conf": 0.734375, "calib/ece": 0.6200000000000001, "calib/final_conf_rate": 0.0546875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.5714285714285714, "calib/gap": 0.02949999999999997, "calib/mean_conf": 0.8314285714285715, "calib/mu_c": 0.8525, "calib/mu_w": 0.8230000000000001, "calib/nonempty_final_conf_rate": 0.0546875, "calib/nonempty_reasoning_rate": 0.78515625, "calib/nonempty_step_conf_rate": 0.734375, "calib/pce": 0.582857142857143, "calib/std_conf": 0.21390442267708334, "calib/step_conf_rate": 0.734375, "calib/step_q_w": 0.7459042553191491, "calib/step_q_w_n": 188.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2892.0, "completions/max_terminated_length": 2892.0, "completions/mean_length": 342.48046875, "completions/mean_terminated_length": 343.82354736328125, "completions/min_length": 0.0, "completions/min_terminated_length": 39.0, "epoch": 0.13653333333333334, "grad_norm": 0.06135067716240883, "kl": 0.1589813232421875, "learning_rate": 2.027777777777778e-06, "loss": 0.0225, "mask/has_final_conf_rate": 0.0546875, "mask/share_final_conf": 0.001771321753039956, "mask/share_reasoning": 0.9519810676574707, "mask/share_step_conf": 0.04234137386083603, "num_tokens": 23965940.0, "reward": 0.0015625000232830644, "reward_std": 0.003656302345916629, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 128 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.13063138723373413, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.4048003852367401, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.04296875, "calib/avg_num_step_conf": 0.8203125, "calib/ece": 0.15909090909090903, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.36363636363636365, "calib/gap": -0.01666666666666672, "calib/mean_conf": 0.8663636363636363, "calib/mu_c": 0.8633333333333333, "calib/mu_w": 0.88, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.86328125, "calib/nonempty_step_conf_rate": 0.8203125, "calib/pce": 0.10363636363636357, "calib/std_conf": 0.09334907383995227, "calib/step_conf_rate": 0.8203125, "calib/step_q_w": 0.743, "calib/step_q_w_n": 210.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 301.328125, "completions/mean_terminated_length": 301.328125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.1376, "grad_norm": 0.24197635054588318, "kl": 0.2081451416015625, "learning_rate": 2.0000000000000003e-06, "loss": 0.0295, "mask/has_final_conf_rate": 0.04296875, "mask/share_final_conf": 0.002114505972713232, "mask/share_reasoning": 0.9501706957817078, "mask/share_step_conf": 0.04771481826901436, "num_tokens": 24145464.0, "reward": 0.0035156249068677425, "reward_std": 0.007483351975679398, "rewards/accuracy_reward_step": 0.03515625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 129 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.20476499199867249, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.5225287675857544, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.078125, "calib/avg_num_step_conf": 0.76171875, "calib/ece": 0.29700000000000004, "calib/final_conf_rate": 0.078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.25, "calib/gap": -0.053333333333333344, "calib/mean_conf": 0.8229999999999998, "calib/mu_c": 0.8016666666666666, "calib/mu_w": 0.855, "calib/nonempty_final_conf_rate": 0.078125, "calib/nonempty_reasoning_rate": 0.83984375, "calib/nonempty_step_conf_rate": 0.76171875, "calib/pce": 0.26000000000000006, "calib/std_conf": 0.13550276749941306, "calib/step_conf_rate": 0.76171875, "calib/step_q_w": 0.7591794871794871, "calib/step_q_w_n": 195.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1085.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 310.625, "completions/mean_terminated_length": 310.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.13866666666666666, "grad_norm": 0.1571102887392044, "kl": 0.165740966796875, "learning_rate": 1.9722222222222224e-06, "loss": -0.0206, "mask/has_final_conf_rate": 0.078125, "mask/share_final_conf": 0.0029551468323916197, "mask/share_reasoning": 0.9555857181549072, "mask/share_step_conf": 0.041459180414676666, "num_tokens": 24330272.0, "reward": 0.004687500186264515, "reward_std": 0.01173202134668827, "rewards/accuracy_reward_step": 0.046875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 130 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.13959984481334686, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.4048607349395752, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0703125, "calib/avg_num_step_conf": 0.8125, "calib/ece": 0.31166666666666665, "calib/final_conf_rate": 0.0703125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2777777777777778, "calib/gap": 0.027142857142857135, "calib/mean_conf": 0.7994444444444444, "calib/mu_c": 0.81, "calib/mu_w": 0.7828571428571429, "calib/nonempty_final_conf_rate": 0.0703125, "calib/nonempty_reasoning_rate": 0.8828125, "calib/nonempty_step_conf_rate": 0.8125, "calib/pce": 0.25, "calib/std_conf": 0.2205919365460483, "calib/step_conf_rate": 0.8125, "calib/step_q_w": 0.7614903846153847, "calib/step_q_w_n": 208.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 267.1875, "completions/mean_terminated_length": 267.1875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.13973333333333332, "grad_norm": 0.11208275705575943, "kl": 0.180572509765625, "learning_rate": 1.944444444444445e-06, "loss": 0.0063, "mask/has_final_conf_rate": 0.0703125, "mask/share_final_conf": 0.0038356492295861244, "mask/share_reasoning": 0.9527290463447571, "mask/share_step_conf": 0.04343531280755997, "num_tokens": 24504880.0, "reward": 0.0042968750931322575, "reward_std": 0.007995839230716228, "rewards/accuracy_reward_step": 0.04296875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 131 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.16620416939258575, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.46737951040267944, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.078125, "calib/avg_num_step_conf": 0.7578125, "calib/ece": 0.3385, "calib/final_conf_rate": 0.078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.45, "calib/gap": 0.14500000000000002, "calib/mean_conf": 0.8384999999999998, "calib/mu_c": 0.9109999999999999, "calib/mu_w": 0.7659999999999999, "calib/nonempty_final_conf_rate": 0.078125, "calib/nonempty_reasoning_rate": 0.8359375, "calib/nonempty_step_conf_rate": 0.7578125, "calib/pce": 0.3385, "calib/std_conf": 0.19799684340918167, "calib/step_conf_rate": 0.7578125, "calib/step_q_w": 0.7511340206185567, "calib/step_q_w_n": 194.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1256.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 329.43359375, "completions/mean_terminated_length": 329.43359375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.1408, "grad_norm": 0.1098121926188469, "kl": 0.14825439453125, "learning_rate": 1.916666666666667e-06, "loss": 0.0104, "mask/has_final_conf_rate": 0.078125, "mask/share_final_conf": 0.002711489563807845, "mask/share_reasoning": 0.948211669921875, "mask/share_step_conf": 0.049076832830905914, "num_tokens": 24694807.0, "reward": 0.00390625, "reward_std": 0.009522313252091408, "rewards/accuracy_reward_step": 0.0390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 132 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.09640209376811981, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.3694343566894531, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.125, "calib/avg_num_step_conf": 0.69140625, "calib/ece": 0.7000000000000001, "calib/final_conf_rate": 0.125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.375, "calib/gap": 0.040000000000000036, "calib/mean_conf": 0.85625, "calib/mu_c": 0.89, "calib/mu_w": 0.85, "calib/nonempty_final_conf_rate": 0.125, "calib/nonempty_reasoning_rate": 0.81640625, "calib/nonempty_step_conf_rate": 0.69140625, "calib/pce": 0.7000000000000001, "calib/std_conf": 0.10400570897792102, "calib/step_conf_rate": 0.69140625, "calib/step_q_w": 0.743728813559322, "calib/step_q_w_n": 177.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3002.0, "completions/max_terminated_length": 3002.0, "completions/mean_length": 392.83203125, "completions/mean_terminated_length": 392.83203125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.14186666666666667, "grad_norm": 0.07171526551246643, "kl": 0.135406494140625, "learning_rate": 1.888888888888889e-06, "loss": 0.0111, "mask/has_final_conf_rate": 0.125, "mask/share_final_conf": 0.004062256310135126, "mask/share_reasoning": 0.9640213251113892, "mask/share_step_conf": 0.03191642463207245, "num_tokens": 24901716.0, "reward": 0.001953125, "reward_std": 0.005524271167814732, "rewards/accuracy_reward_step": 0.01953125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 133 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.1396041214466095, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.4048749804496765, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.0859375, "calib/avg_num_step_conf": 0.671875, "calib/ece": 0.3680952380952381, "calib/final_conf_rate": 0.08203125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2857142857142857, "calib/gap": 0.06981818181818189, "calib/mean_conf": 0.7614285714285715, "calib/mu_c": 0.798, "calib/mu_w": 0.7281818181818182, "calib/nonempty_final_conf_rate": 0.08203125, "calib/nonempty_reasoning_rate": 0.7578125, "calib/nonempty_step_conf_rate": 0.671875, "calib/pce": 0.32666666666666666, "calib/std_conf": 0.26380522602653567, "calib/step_conf_rate": 0.671875, "calib/step_q_w": 0.7805232558139535, "calib/step_q_w_n": 172.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1758.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 369.51171875, "completions/mean_terminated_length": 369.51171875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.14293333333333333, "grad_norm": 0.09373670071363449, "kl": 0.136810302734375, "learning_rate": 1.8611111111111113e-06, "loss": 0.0003, "mask/has_final_conf_rate": 0.08203125, "mask/share_final_conf": 0.00392060587182641, "mask/share_reasoning": 0.9603133201599121, "mask/share_step_conf": 0.035766102373600006, "num_tokens": 25105263.0, "reward": 0.00390625, "reward_std": 0.007996084168553352, "rewards/accuracy_reward_step": 0.0390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 134 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.21806500852108002, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.5479929447174072, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.08984375, "calib/avg_num_step_conf": 0.75390625, "calib/ece": 0.3695238095238095, "calib/final_conf_rate": 0.08203125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": -0.019722222222222308, "calib/mean_conf": 0.8342857142857143, "calib/mu_c": 0.8258333333333333, "calib/mu_w": 0.8455555555555556, "calib/nonempty_final_conf_rate": 0.08203125, "calib/nonempty_reasoning_rate": 0.84375, "calib/nonempty_step_conf_rate": 0.75390625, "calib/pce": 0.31619047619047613, "calib/std_conf": 0.11692779328943745, "calib/step_conf_rate": 0.75390625, "calib/step_q_w": 0.7612435233160623, "calib/step_q_w_n": 193.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2106.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 349.265625, "completions/mean_terminated_length": 352.0157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 41.0, "epoch": 0.144, "grad_norm": 0.12835043668746948, "kl": 0.135101318359375, "learning_rate": 1.8333333333333333e-06, "loss": -0.0023, "mask/has_final_conf_rate": 0.08203125, "mask/share_final_conf": 0.0038112555630505085, "mask/share_reasoning": 0.9503839015960693, "mask/share_step_conf": 0.03799235075712204, "num_tokens": 25300555.0, "reward": 0.004687500186264515, "reward_std": 0.012495135888457298, "rewards/accuracy_reward_step": 0.046875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 135 }, { "adv/mean_abs_final_conf": 0.019303197041153908, "adv/mean_abs_reasoning": 0.19146069884300232, "adv/mean_abs_step_conf": 0.019317565485835075, "adv/ratio_final_to_reasoning": 0.10082067577212031, "adv/ratio_step_to_reasoning": 0.10089572221647154, "adv/std_final_conf": 0.16541126370429993, "adv/std_reasoning": 0.49574676156044006, "adv/std_step_conf": 0.1655343919992447, "calib/answer_extract_rate": 0.12890625, "calib/avg_num_step_conf": 0.72265625, "calib/ece": 0.4067741935483871, "calib/final_conf_rate": 0.12109375, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.2903225806451613, "calib/gap": 0.08273109243697474, "calib/mean_conf": 0.8067741935483872, "calib/mu_c": 0.8521428571428571, "calib/mu_w": 0.7694117647058824, "calib/nonempty_final_conf_rate": 0.12109375, "calib/nonempty_reasoning_rate": 0.84765625, "calib/nonempty_step_conf_rate": 0.72265625, "calib/pce": 0.3809677419354839, "calib/std_conf": 0.19141893081919178, "calib/step_conf_rate": 0.72265625, "calib/step_q_w": 0.7596216216216218, "calib/step_q_w_n": 185.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 302.9140625, "completions/mean_terminated_length": 304.10198974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 27.0, "epoch": 0.14506666666666668, "grad_norm": 0.7878153324127197, "kl": 0.1512451171875, "learning_rate": 1.8055555555555557e-06, "loss": -0.0754, "mask/has_final_conf_rate": 0.1171875, "mask/share_final_conf": 0.00546366348862648, "mask/share_reasoning": 0.9488362073898315, "mask/share_step_conf": 0.04179389774799347, "num_tokens": 25486589.0, "reward": 0.005303375422954559, "reward_std": 0.01033155806362629, "rewards/accuracy_reward_step": 0.0546875, "rewards/final_brier_reward_step": 0.001343359355814755, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.002455358626320958, "step": 136 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.33405715227127075, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.6400879621505737, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.13671875, "calib/avg_num_step_conf": 0.703125, "calib/ece": 0.26722222222222214, "calib/final_conf_rate": 0.140625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.4166666666666667, "calib/gap": 0.051839464882943, "calib/mean_conf": 0.8338888888888889, "calib/mu_c": 0.8526086956521738, "calib/mu_w": 0.8007692307692308, "calib/nonempty_final_conf_rate": 0.140625, "calib/nonempty_reasoning_rate": 0.83984375, "calib/nonempty_step_conf_rate": 0.703125, "calib/pce": 0.23111111111111104, "calib/std_conf": 0.19532420026682956, "calib/step_conf_rate": 0.703125, "calib/step_q_w": 0.7707222222222223, "calib/step_q_w_n": 180.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2962.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 346.734375, "completions/mean_terminated_length": 346.734375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.14613333333333334, "grad_norm": 0.1472981870174408, "kl": 0.14693450927734375, "learning_rate": 1.777777777777778e-06, "loss": 0.0257, "mask/has_final_conf_rate": 0.140625, "mask/share_final_conf": 0.007967180572450161, "mask/share_reasoning": 0.9528160095214844, "mask/share_step_conf": 0.03921680524945259, "num_tokens": 25682337.0, "reward": 0.008984374813735485, "reward_std": 0.019135739654302597, "rewards/accuracy_reward_step": 0.08984375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 137 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.3383844494819641, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.6609932780265808, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.125, "calib/avg_num_step_conf": 0.75, "calib/ece": 0.1984375000000001, "calib/final_conf_rate": 0.125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.28125, "calib/gap": -0.046454545454545526, "calib/mean_conf": 0.8340625, "calib/mu_c": 0.8195454545454545, "calib/mu_w": 0.866, "calib/nonempty_final_conf_rate": 0.125, "calib/nonempty_reasoning_rate": 0.875, "calib/nonempty_step_conf_rate": 0.75, "calib/pce": 0.17250000000000004, "calib/std_conf": 0.09512686841134843, "calib/step_conf_rate": 0.75, "calib/step_q_w": 0.7651562500000001, "calib/step_q_w_n": 192.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 314.4140625, "completions/mean_terminated_length": 314.4140625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.1472, "grad_norm": 0.1709873080253601, "kl": 0.15203857421875, "learning_rate": 1.75e-06, "loss": 0.0176, "mask/has_final_conf_rate": 0.125, "mask/share_final_conf": 0.006228435784578323, "mask/share_reasoning": 0.9524450898170471, "mask/share_step_conf": 0.041326455771923065, "num_tokens": 25867163.0, "reward": 0.008593750186264515, "reward_std": 0.01938612200319767, "rewards/accuracy_reward_step": 0.0859375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 138 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.27115947008132935, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.572519838809967, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.12109375, "calib/avg_num_step_conf": 0.81640625, "calib/ece": 0.21266666666666662, "calib/final_conf_rate": 0.1171875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3, "calib/gap": 0.05301587301587296, "calib/mean_conf": 0.8393333333333332, "calib/mu_c": 0.8552380952380951, "calib/mu_w": 0.8022222222222222, "calib/nonempty_final_conf_rate": 0.1171875, "calib/nonempty_reasoning_rate": 0.9375, "calib/nonempty_step_conf_rate": 0.81640625, "calib/pce": 0.176, "calib/std_conf": 0.11093341346150952, "calib/step_conf_rate": 0.81640625, "calib/step_q_w": 0.7753574162679426, "calib/step_q_w_n": 209.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 292.64453125, "completions/mean_terminated_length": 292.64453125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.14826666666666666, "grad_norm": 0.13704362511634827, "kl": 0.1703948974609375, "learning_rate": 1.7222222222222224e-06, "loss": -0.0029, "mask/has_final_conf_rate": 0.1171875, "mask/share_final_conf": 0.00667218491435051, "mask/share_reasoning": 0.9529383182525635, "mask/share_step_conf": 0.040389493107795715, "num_tokens": 26045176.0, "reward": 0.008203125558793545, "reward_std": 0.015532232820987701, "rewards/accuracy_reward_step": 0.08203125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 139 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.43746131658554077, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7013096809387207, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.2265625, "calib/avg_num_step_conf": 0.671875, "calib/ece": 0.14245614035087711, "calib/final_conf_rate": 0.22265625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2982456140350877, "calib/gap": 0.0723323170731709, "calib/mean_conf": 0.8414035087719298, "calib/mu_c": 0.8617073170731708, "calib/mu_w": 0.7893749999999999, "calib/nonempty_final_conf_rate": 0.22265625, "calib/nonempty_reasoning_rate": 0.8984375, "calib/nonempty_step_conf_rate": 0.671875, "calib/pce": 0.13228070175438586, "calib/std_conf": 0.15132674763270382, "calib/step_conf_rate": 0.671875, "calib/step_q_w": 0.7836627906976744, "calib/step_q_w_n": 172.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 321.37109375, "completions/mean_terminated_length": 321.37109375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.14933333333333335, "grad_norm": 0.3020021319389343, "kl": 0.177581787109375, "learning_rate": 1.6944444444444446e-06, "loss": 0.0181, "mask/has_final_conf_rate": 0.22265625, "mask/share_final_conf": 0.012400241568684578, "mask/share_reasoning": 0.951915979385376, "mask/share_step_conf": 0.03568378463387489, "num_tokens": 26232463.0, "reward": 0.01640624925494194, "reward_std": 0.02505391091108322, "rewards/accuracy_reward_step": 0.1640625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 140 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5413906574249268, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8096631765365601, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.3046875, "calib/avg_num_step_conf": 0.55078125, "calib/ece": 0.31581081081081086, "calib/final_conf_rate": 0.2890625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.40540540540540543, "calib/gap": 0.016208425720620867, "calib/mean_conf": 0.8471621621621621, "calib/mu_c": 0.8543902439024389, "calib/mu_w": 0.838181818181818, "calib/nonempty_final_conf_rate": 0.2890625, "calib/nonempty_reasoning_rate": 0.85546875, "calib/nonempty_step_conf_rate": 0.55078125, "calib/pce": 0.3044594594594595, "calib/std_conf": 0.16851914856526806, "calib/step_conf_rate": 0.55078125, "calib/step_q_w": 0.7974468085106383, "calib/step_q_w_n": 141.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 335.48046875, "completions/mean_terminated_length": 336.7961120605469, "completions/min_length": 0.0, "completions/min_terminated_length": 32.0, "epoch": 0.1504, "grad_norm": 0.17797711491584778, "kl": 0.150115966796875, "learning_rate": 1.6666666666666667e-06, "loss": -0.0117, "mask/has_final_conf_rate": 0.2890625, "mask/share_final_conf": 0.01352442055940628, "mask/share_reasoning": 0.9501509666442871, "mask/share_step_conf": 0.032418347895145416, "num_tokens": 26425442.0, "reward": 0.01679687574505806, "reward_std": 0.031011424958705902, "rewards/accuracy_reward_step": 0.16796875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 141 }, { "adv/mean_abs_final_conf": 0.01929006353020668, "adv/mean_abs_reasoning": 0.5003547668457031, "adv/mean_abs_step_conf": 0.019317567348480225, "adv/ratio_final_to_reasoning": 0.038552772569378266, "adv/ratio_step_to_reasoning": 0.03860774120382724, "adv/std_final_conf": 0.1652987152338028, "adv/std_reasoning": 0.7574677467346191, "adv/std_step_conf": 0.16553440690040588, "calib/answer_extract_rate": 0.296875, "calib/avg_num_step_conf": 0.578125, "calib/ece": 0.2581818181818183, "calib/final_conf_rate": 0.30078125, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.3246753246753247, "calib/gap": 0.019607293127629544, "calib/mean_conf": 0.8555844155844157, "calib/mu_c": 0.8634782608695651, "calib/mu_w": 0.8438709677419356, "calib/nonempty_final_conf_rate": 0.30078125, "calib/nonempty_reasoning_rate": 0.87109375, "calib/nonempty_step_conf_rate": 0.578125, "calib/pce": 0.2581818181818183, "calib/std_conf": 0.12876530050548884, "calib/step_conf_rate": 0.578125, "calib/step_q_w": 0.7908108108108107, "calib/step_q_w_n": 148.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2038.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 369.17578125, "completions/mean_terminated_length": 370.6235656738281, "completions/min_length": 0.0, "completions/min_terminated_length": 33.0, "epoch": 0.15146666666666667, "grad_norm": 0.26313072443008423, "kl": 0.1321258544921875, "learning_rate": 1.638888888888889e-06, "loss": 0.009, "mask/has_final_conf_rate": 0.30078125, "mask/share_final_conf": 0.013783589005470276, "mask/share_reasoning": 0.9493716955184937, "mask/share_step_conf": 0.03293842077255249, "num_tokens": 26625111.0, "reward": 0.017606385052204132, "reward_std": 0.028986668214201927, "rewards/accuracy_reward_step": 0.1796875, "rewards/final_brier_reward_step": 0.000949609384406358, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.002455588662996888, "step": 142 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.4870590269565582, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7392624616622925, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.390625, "calib/avg_num_step_conf": 0.4765625, "calib/ece": 0.39294736842105266, "calib/final_conf_rate": 0.37109375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3473684210526316, "calib/gap": -0.003908199643493981, "calib/mean_conf": 0.8350526315789472, "calib/mu_c": 0.8329545454545453, "calib/mu_w": 0.8368627450980393, "calib/nonempty_final_conf_rate": 0.37109375, "calib/nonempty_reasoning_rate": 0.8671875, "calib/nonempty_step_conf_rate": 0.4765625, "calib/pce": 0.382421052631579, "calib/std_conf": 0.16809066278091542, "calib/step_conf_rate": 0.4765625, "calib/step_q_w": 0.7695081967213114, "calib/step_q_w_n": 122.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2310.0, "completions/max_terminated_length": 2310.0, "completions/mean_length": 355.2265625, "completions/mean_terminated_length": 356.61962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 8.0, "epoch": 0.15253333333333333, "grad_norm": 0.14126311242580414, "kl": 0.14892578125, "learning_rate": 1.6111111111111113e-06, "loss": -0.0178, "mask/has_final_conf_rate": 0.37109375, "mask/share_final_conf": 0.02051900513470173, "mask/share_reasoning": 0.951433002948761, "mask/share_step_conf": 0.024141736328601837, "num_tokens": 26823385.0, "reward": 0.01796875149011612, "reward_std": 0.02789430133998394, "rewards/accuracy_reward_step": 0.1796875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 143 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5897383689880371, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7928624153137207, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.49609375, "calib/avg_num_step_conf": 0.390625, "calib/ece": 0.34152000000000005, "calib/final_conf_rate": 0.48828125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.344, "calib/gap": -0.001836945304437343, "calib/mean_conf": 0.8567199999999999, "calib/mu_c": 0.8558823529411765, "calib/mu_w": 0.8577192982456139, "calib/nonempty_final_conf_rate": 0.48828125, "calib/nonempty_reasoning_rate": 0.88671875, "calib/nonempty_step_conf_rate": 0.390625, "calib/pce": 0.32712, "calib/std_conf": 0.13055436262339148, "calib/step_conf_rate": 0.390625, "calib/step_q_w": 0.7783, "calib/step_q_w_n": 100.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 337.01171875, "completions/mean_terminated_length": 337.01171875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.1536, "grad_norm": 0.18569757044315338, "kl": 0.173431396484375, "learning_rate": 1.5833333333333333e-06, "loss": -0.0453, "mask/has_final_conf_rate": 0.48828125, "mask/share_final_conf": 0.029239937663078308, "mask/share_reasoning": 0.9489139318466187, "mask/share_step_conf": 0.0218461062759161, "num_tokens": 27013788.0, "reward": 0.02734375186264515, "reward_std": 0.033771052956581116, "rewards/accuracy_reward_step": 0.2734375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 144 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.7038739919662476, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8747616410255432, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.4921875, "calib/avg_num_step_conf": 0.421875, "calib/ece": 0.24959016393442623, "calib/final_conf_rate": 0.4765625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.319672131147541, "calib/gap": 0.015188787185354768, "calib/mean_conf": 0.845983606557377, "calib/mu_c": 0.8517105263157895, "calib/mu_w": 0.8365217391304347, "calib/nonempty_final_conf_rate": 0.4765625, "calib/nonempty_reasoning_rate": 0.9140625, "calib/nonempty_step_conf_rate": 0.421875, "calib/pce": 0.23631147540983605, "calib/std_conf": 0.1444189025141314, "calib/step_conf_rate": 0.421875, "calib/step_q_w": 0.7674074074074073, "calib/step_q_w_n": 108.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 311.83203125, "completions/mean_terminated_length": 311.83203125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.15466666666666667, "grad_norm": 0.17270921170711517, "kl": 0.1587677001953125, "learning_rate": 1.5555555555555558e-06, "loss": 0.0137, "mask/has_final_conf_rate": 0.4765625, "mask/share_final_conf": 0.028850942850112915, "mask/share_reasoning": 0.942456841468811, "mask/share_step_conf": 0.028692251071333885, "num_tokens": 27196321.0, "reward": 0.03085937537252903, "reward_std": 0.04030867666006088, "rewards/accuracy_reward_step": 0.30859375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 145 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.4999437630176544, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7393062710762024, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.50390625, "calib/avg_num_step_conf": 0.39453125, "calib/ece": 0.44346456692913383, "calib/final_conf_rate": 0.49609375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.4409448818897638, "calib/gap": 0.0002205128205128304, "calib/mean_conf": 0.8510236220472442, "calib/mu_c": 0.8511538461538461, "calib/mu_w": 0.8509333333333333, "calib/nonempty_final_conf_rate": 0.49609375, "calib/nonempty_reasoning_rate": 0.8984375, "calib/nonempty_step_conf_rate": 0.39453125, "calib/pce": 0.44251968503937, "calib/std_conf": 0.1380222203134937, "calib/step_conf_rate": 0.39453125, "calib/step_q_w": 0.7799009900990098, "calib/step_q_w_n": 101.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 333.91015625, "completions/mean_terminated_length": 333.91015625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.15573333333333333, "grad_norm": 0.18089455366134644, "kl": 0.1577911376953125, "learning_rate": 1.527777777777778e-06, "loss": -0.0297, "mask/has_final_conf_rate": 0.49609375, "mask/share_final_conf": 0.0274009071290493, "mask/share_reasoning": 0.9498142600059509, "mask/share_step_conf": 0.02278483659029007, "num_tokens": 27389018.0, "reward": 0.02031249925494194, "reward_std": 0.0286305770277977, "rewards/accuracy_reward_step": 0.203125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 146 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5508720874786377, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.792724072933197, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.59375, "calib/avg_num_step_conf": 0.328125, "calib/ece": 0.37421052631578944, "calib/final_conf_rate": 0.59375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.4144736842105263, "calib/gap": 0.003397402597402399, "calib/mean_conf": 0.850921052631579, "calib/mu_c": 0.8525974025974025, "calib/mu_w": 0.8492000000000001, "calib/nonempty_final_conf_rate": 0.59375, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.328125, "calib/pce": 0.35927631578947367, "calib/std_conf": 0.1498084534152911, "calib/step_conf_rate": 0.328125, "calib/step_q_w": 0.7676309523809524, "calib/step_q_w_n": 84.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2313.0, "completions/max_terminated_length": 2313.0, "completions/mean_length": 351.90234375, "completions/mean_terminated_length": 351.90234375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1568, "grad_norm": 0.2241380661725998, "kl": 0.1469268798828125, "learning_rate": 1.5e-06, "loss": -0.0022, "mask/has_final_conf_rate": 0.59375, "mask/share_final_conf": 0.03285910189151764, "mask/share_reasoning": 0.9477626085281372, "mask/share_step_conf": 0.01937827840447426, "num_tokens": 27582785.0, "reward": 0.03046875074505806, "reward_std": 0.03155011311173439, "rewards/accuracy_reward_step": 0.3046875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 147 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6207718849182129, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.809929609298706, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.59375, "calib/avg_num_step_conf": 0.29296875, "calib/ece": 0.2514093959731543, "calib/final_conf_rate": 0.58203125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3624161073825503, "calib/gap": 0.003634792626728389, "calib/mean_conf": 0.8363758389261745, "calib/mu_c": 0.8377419354838711, "calib/mu_w": 0.8341071428571427, "calib/nonempty_final_conf_rate": 0.58203125, "calib/nonempty_reasoning_rate": 0.88671875, "calib/nonempty_step_conf_rate": 0.29296875, "calib/pce": 0.23181208053691266, "calib/std_conf": 0.15188398418247342, "calib/step_conf_rate": 0.29296875, "calib/step_q_w": 0.7673333333333335, "calib/step_q_w_n": 75.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 327.73046875, "completions/mean_terminated_length": 329.0157165527344, "completions/min_length": 0.0, "completions/min_terminated_length": 12.0, "epoch": 0.15786666666666666, "grad_norm": 0.2297629415988922, "kl": 0.1496734619140625, "learning_rate": 1.4722222222222225e-06, "loss": 0.0047, "mask/has_final_conf_rate": 0.578125, "mask/share_final_conf": 0.03815712779760361, "mask/share_reasoning": 0.9385182857513428, "mask/share_step_conf": 0.019418369978666306, "num_tokens": 27771796.0, "reward": 0.03750000149011612, "reward_std": 0.03554752096533775, "rewards/accuracy_reward_step": 0.375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 148 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5284003019332886, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.757583498954773, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.48828125, "calib/avg_num_step_conf": 0.39453125, "calib/ece": 0.31999999999999995, "calib/final_conf_rate": 0.48046875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.36585365853658536, "calib/gap": 0.01525935828877012, "calib/mean_conf": 0.8486178861788619, "calib/mu_c": 0.8554411764705881, "calib/mu_w": 0.840181818181818, "calib/nonempty_final_conf_rate": 0.48046875, "calib/nonempty_reasoning_rate": 0.8828125, "calib/nonempty_step_conf_rate": 0.39453125, "calib/pce": 0.3078861788617886, "calib/std_conf": 0.12848286212155452, "calib/step_conf_rate": 0.39453125, "calib/step_q_w": 0.7900396039603959, "calib/step_q_w_n": 101.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2521.0, "completions/max_terminated_length": 2521.0, "completions/mean_length": 358.890625, "completions/mean_terminated_length": 358.890625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.15893333333333334, "grad_norm": 0.17723193764686584, "kl": 0.1447906494140625, "learning_rate": 1.4444444444444445e-06, "loss": -0.002, "mask/has_final_conf_rate": 0.48046875, "mask/share_final_conf": 0.027915118262171745, "mask/share_reasoning": 0.9477370977401733, "mask/share_step_conf": 0.024347800761461258, "num_tokens": 27968128.0, "reward": 0.02656250074505806, "reward_std": 0.030259788036346436, "rewards/accuracy_reward_step": 0.265625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 149 }, { "adv/mean_abs_final_conf": 0.019323909655213356, "adv/mean_abs_reasoning": 0.628254771232605, "adv/mean_abs_step_conf": 0.018246658146381378, "adv/ratio_final_to_reasoning": 0.030758078633132853, "adv/ratio_step_to_reasoning": 0.02904340560849595, "adv/std_final_conf": 0.16558875143527985, "adv/std_reasoning": 0.8266027569770813, "adv/std_step_conf": 0.15635766088962555, "calib/answer_extract_rate": 0.54296875, "calib/avg_num_step_conf": 0.36328125, "calib/ece": 0.28391304347826085, "calib/final_conf_rate": 0.5390625, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.35507246376811596, "calib/gap": 0.034206896551724264, "calib/mean_conf": 0.863623188405797, "calib/mu_c": 0.8780000000000001, "calib/mu_w": 0.8437931034482758, "calib/nonempty_final_conf_rate": 0.5390625, "calib/nonempty_reasoning_rate": 0.8984375, "calib/nonempty_step_conf_rate": 0.36328125, "calib/pce": 0.28391304347826085, "calib/std_conf": 0.10754804810271376, "calib/step_conf_rate": 0.36328125, "calib/step_q_c": 0.9, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.09152173913043482, "calib/step_q_w": 0.8084782608695652, "calib/step_q_w_n": 92.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2527.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 324.875, "completions/mean_terminated_length": 326.1490478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 31.0, "epoch": 0.16, "grad_norm": 0.3182465434074402, "kl": 0.1643829345703125, "learning_rate": 1.4166666666666667e-06, "loss": 0.0247, "mask/has_final_conf_rate": 0.5390625, "mask/share_final_conf": 0.03170134127140045, "mask/share_reasoning": 0.9385929703712463, "mask/share_step_conf": 0.025799430906772614, "num_tokens": 28156256.0, "reward": 0.03355569392442703, "reward_std": 0.04173934459686279, "rewards/accuracy_reward_step": 0.3125, "rewards/final_brier_reward_step": 0.003867187537252903, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -3.704856499098241e-05, "step": 150 }, { "adv/mean_abs_final_conf": 0.019322393462061882, "adv/mean_abs_reasoning": 0.544094979763031, "adv/mean_abs_step_conf": 0.01922667771577835, "adv/ratio_final_to_reasoning": 0.03551290524767815, "adv/ratio_step_to_reasoning": 0.035336987898973306, "adv/std_final_conf": 0.1655757576227188, "adv/std_reasoning": 0.7753925323486328, "adv/std_step_conf": 0.16475556790828705, "calib/answer_extract_rate": 0.55859375, "calib/avg_num_step_conf": 0.33984375, "calib/ece": 0.3914999999999999, "calib/final_conf_rate": 0.546875, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.34285714285714286, "calib/gap": 0.03502467105263163, "calib/mean_conf": 0.848642857142857, "calib/mu_c": 0.86765625, "calib/mu_w": 0.8326315789473684, "calib/nonempty_final_conf_rate": 0.546875, "calib/nonempty_reasoning_rate": 0.89453125, "calib/nonempty_step_conf_rate": 0.33984375, "calib/pce": 0.3914999999999999, "calib/std_conf": 0.1224640513683547, "calib/step_conf_rate": 0.33984375, "calib/step_q_c": 0.64, "calib/step_q_c_n": 1.0, "calib/step_q_gap": -0.1497674418604653, "calib/step_q_w": 0.7897674418604653, "calib/step_q_w_n": 86.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 369.09765625, "completions/mean_terminated_length": 369.09765625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.16106666666666666, "grad_norm": 0.3350175619125366, "kl": 0.1311187744140625, "learning_rate": 1.3888888888888892e-06, "loss": 0.0041, "mask/has_final_conf_rate": 0.546875, "mask/share_final_conf": 0.030342375859618187, "mask/share_reasoning": 0.9506062269210815, "mask/share_step_conf": 0.019051434472203255, "num_tokens": 28357769.0, "reward": 0.027675680816173553, "reward_std": 0.03598960116505623, "rewards/accuracy_reward_step": 0.2578125, "rewards/final_brier_reward_step": 0.0034000000450760126, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.000392390153137967, "step": 151 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5678851008415222, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7927908301353455, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.51171875, "calib/avg_num_step_conf": 0.36328125, "calib/ece": 0.403515625, "calib/final_conf_rate": 0.5, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.265625, "calib/gap": 0.024904867803311026, "calib/mean_conf": 0.843203125, "calib/mu_c": 0.857017543859649, "calib/mu_w": 0.832112676056338, "calib/nonempty_final_conf_rate": 0.5, "calib/nonempty_reasoning_rate": 0.875, "calib/nonempty_step_conf_rate": 0.36328125, "calib/pce": 0.400703125, "calib/std_conf": 0.10895765801555381, "calib/step_conf_rate": 0.36328125, "calib/step_q_w": 0.767956989247312, "calib/step_q_w_n": 93.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 362.5625, "completions/mean_terminated_length": 362.5625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.16213333333333332, "grad_norm": 0.17765770852565765, "kl": 0.1453704833984375, "learning_rate": 1.3611111111111112e-06, "loss": 0.0062, "mask/has_final_conf_rate": 0.5, "mask/share_final_conf": 0.030476320534944534, "mask/share_reasoning": 0.9500786066055298, "mask/share_step_conf": 0.019445102661848068, "num_tokens": 28555977.0, "reward": 0.02265625074505806, "reward_std": 0.0325222909450531, "rewards/accuracy_reward_step": 0.2265625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 152 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.615715503692627, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7929527163505554, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.53125, "calib/avg_num_step_conf": 0.359375, "calib/ece": 0.3260629921259843, "calib/final_conf_rate": 0.49609375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3779527559055118, "calib/gap": -0.025260651629072872, "calib/mean_conf": 0.8674803149606297, "calib/mu_c": 0.856142857142857, "calib/mu_w": 0.8814035087719299, "calib/nonempty_final_conf_rate": 0.49609375, "calib/nonempty_reasoning_rate": 0.890625, "calib/nonempty_step_conf_rate": 0.359375, "calib/pce": 0.32118110236220476, "calib/std_conf": 0.09797129986251685, "calib/step_conf_rate": 0.359375, "calib/step_q_w": 0.7318478260869565, "calib/step_q_w_n": 92.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2712.0, "completions/max_terminated_length": 2712.0, "completions/mean_length": 358.3671875, "completions/mean_terminated_length": 358.3671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.1632, "grad_norm": 0.15940669178962708, "kl": 0.13603973388671875, "learning_rate": 1.3333333333333334e-06, "loss": 0.0379, "mask/has_final_conf_rate": 0.49609375, "mask/share_final_conf": 0.025707479566335678, "mask/share_reasoning": 0.9523909687995911, "mask/share_step_conf": 0.021901525557041168, "num_tokens": 28755039.0, "reward": 0.03007812611758709, "reward_std": 0.03525547683238983, "rewards/accuracy_reward_step": 0.30078125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 153 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5089123249053955, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7393391728401184, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.5234375, "calib/avg_num_step_conf": 0.40625, "calib/ece": 0.4067938931297711, "calib/final_conf_rate": 0.51171875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3053435114503817, "calib/gap": 0.03688474256022689, "calib/mean_conf": 0.8396183206106871, "calib/mu_c": 0.8601724137931035, "calib/mu_w": 0.8232876712328766, "calib/nonempty_final_conf_rate": 0.51171875, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.40625, "calib/pce": 0.4018320610687024, "calib/std_conf": 0.13053854731883202, "calib/step_conf_rate": 0.40625, "calib/step_q_w": 0.7641346153846154, "calib/step_q_w_n": 104.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 314.8828125, "completions/mean_terminated_length": 316.11767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 50.0, "epoch": 0.16426666666666667, "grad_norm": 0.17236383259296417, "kl": 0.154296875, "learning_rate": 1.3055555555555556e-06, "loss": 0.0014, "mask/has_final_conf_rate": 0.51171875, "mask/share_final_conf": 0.027197513729333878, "mask/share_reasoning": 0.9469050765037537, "mask/share_step_conf": 0.02199118211865425, "num_tokens": 28940089.0, "reward": 0.0234375, "reward_std": 0.029143065214157104, "rewards/accuracy_reward_step": 0.234375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 154 }, { "adv/mean_abs_final_conf": 0.03859548270702362, "adv/mean_abs_reasoning": 0.540123462677002, "adv/mean_abs_step_conf": 0.036589257419109344, "adv/ratio_final_to_reasoning": 0.0714567786330438, "adv/ratio_step_to_reasoning": 0.06774239585476036, "adv/std_final_conf": 0.23386096954345703, "adv/std_reasoning": 0.7753738760948181, "adv/std_step_conf": 0.22205311059951782, "calib/answer_extract_rate": 0.5390625, "calib/avg_num_step_conf": 0.3984375, "calib/ece": 0.4024812030075189, "calib/final_conf_rate": 0.51953125, "calib/format_rate": 0.0078125, "calib/frac_conf_gt_0.9": 0.39097744360902253, "calib/gap": 0.022089041095890827, "calib/mean_conf": 0.840375939849624, "calib/mu_c": 0.8525000000000003, "calib/mu_w": 0.8304109589041094, "calib/nonempty_final_conf_rate": 0.51953125, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.3984375, "calib/pce": 0.39586466165413536, "calib/std_conf": 0.16421334928306924, "calib/step_conf_rate": 0.3984375, "calib/step_q_c": 0.93, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.15534653465346548, "calib/step_q_w": 0.7746534653465346, "calib/step_q_w_n": 101.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 301.796875, "completions/mean_terminated_length": 301.796875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.16533333333333333, "grad_norm": 0.7394980788230896, "kl": 0.1728668212890625, "learning_rate": 1.2777777777777779e-06, "loss": 0.0145, "mask/has_final_conf_rate": 0.51953125, "mask/share_final_conf": 0.029195178300142288, "mask/share_reasoning": 0.9469771385192871, "mask/share_step_conf": 0.02382766455411911, "num_tokens": 29124565.0, "reward": 0.026127008721232414, "reward_std": 0.037890445441007614, "rewards/accuracy_reward_step": 0.24609375, "rewards/final_brier_reward_step": 0.004552734550088644, "rewards/format_reward_step": 0.0078125, "rewards/step_l2_reward": -0.0030799706000834703, "step": 155 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.592730700969696, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7928775548934937, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.5078125, "calib/avg_num_step_conf": 0.41015625, "calib/ece": 0.4214876033057851, "calib/final_conf_rate": 0.47265625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.36363636363636365, "calib/gap": -0.0020784964068546463, "calib/mean_conf": 0.8396694214876034, "calib/mu_c": 0.8385185185185184, "calib/mu_w": 0.8405970149253731, "calib/nonempty_final_conf_rate": 0.47265625, "calib/nonempty_reasoning_rate": 0.91796875, "calib/nonempty_step_conf_rate": 0.41015625, "calib/pce": 0.40743801652892564, "calib/std_conf": 0.17125658042154718, "calib/step_conf_rate": 0.41015625, "calib/step_q_w": 0.767904761904762, "calib/step_q_w_n": 105.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 336.09765625, "completions/mean_terminated_length": 337.41571044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 34.0, "epoch": 0.1664, "grad_norm": 0.16044975817203522, "kl": 0.149627685546875, "learning_rate": 1.25e-06, "loss": -0.0352, "mask/has_final_conf_rate": 0.46875, "mask/share_final_conf": 0.026372965425252914, "mask/share_reasoning": 0.9464188814163208, "mask/share_step_conf": 0.023301880806684494, "num_tokens": 29315366.0, "reward": 0.02265625074505806, "reward_std": 0.033942047506570816, "rewards/accuracy_reward_step": 0.2265625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 156 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.646448016166687, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8429199457168579, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.5078125, "calib/avg_num_step_conf": 0.3828125, "calib/ece": 0.2988281249999999, "calib/final_conf_rate": 0.5, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3046875, "calib/gap": 0.022956349206349258, "calib/mean_conf": 0.848984375, "calib/mu_c": 0.8590277777777778, "calib/mu_w": 0.8360714285714286, "calib/nonempty_final_conf_rate": 0.5, "calib/nonempty_reasoning_rate": 0.890625, "calib/nonempty_step_conf_rate": 0.3828125, "calib/pce": 0.2926562499999999, "calib/std_conf": 0.1319969213878088, "calib/step_conf_rate": 0.3828125, "calib/step_q_w": 0.7775510204081632, "calib/step_q_w_n": 98.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 338.3359375, "completions/mean_terminated_length": 338.3359375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.16746666666666668, "grad_norm": 0.15877458453178406, "kl": 0.14886474609375, "learning_rate": 1.2222222222222223e-06, "loss": -0.017, "mask/has_final_conf_rate": 0.5, "mask/share_final_conf": 0.029453996568918228, "mask/share_reasoning": 0.9424052238464355, "mask/share_step_conf": 0.02814079448580742, "num_tokens": 29505708.0, "reward": 0.02812500298023224, "reward_std": 0.037020955234766006, "rewards/accuracy_reward_step": 0.28125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 157 }, { "adv/mean_abs_final_conf": 0.019062640145421028, "adv/mean_abs_reasoning": 0.635623037815094, "adv/mean_abs_step_conf": 0.019323568791151047, "adv/ratio_final_to_reasoning": 0.029990480223856277, "adv/ratio_step_to_reasoning": 0.030400988701690786, "adv/std_final_conf": 0.163349911570549, "adv/std_reasoning": 0.8428871035575867, "adv/std_step_conf": 0.16558583080768585, "calib/answer_extract_rate": 0.55859375, "calib/avg_num_step_conf": 0.31640625, "calib/ece": 0.24979020979020977, "calib/final_conf_rate": 0.55859375, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.27972027972027974, "calib/gap": 0.00042655014565140537, "calib/mean_conf": 0.8448951048951049, "calib/mu_c": 0.845056179775281, "calib/mu_w": 0.8446296296296296, "calib/nonempty_final_conf_rate": 0.55859375, "calib/nonempty_reasoning_rate": 0.87109375, "calib/nonempty_step_conf_rate": 0.31640625, "calib/pce": 0.23615384615384613, "calib/std_conf": 0.1255247287528856, "calib/step_conf_rate": 0.31640625, "calib/step_q_w": 0.7923456790123455, "calib/step_q_w_n": 81.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1992.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 342.07421875, "completions/mean_terminated_length": 342.07421875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.16853333333333334, "grad_norm": 0.4512387216091156, "kl": 0.1579742431640625, "learning_rate": 1.1944444444444446e-06, "loss": -0.0406, "mask/has_final_conf_rate": 0.55859375, "mask/share_final_conf": 0.03258911520242691, "mask/share_reasoning": 0.943278431892395, "mask/share_step_conf": 0.024132438004016876, "num_tokens": 29698519.0, "reward": 0.03335782513022423, "reward_std": 0.039448127150535583, "rewards/accuracy_reward_step": 0.34765625, "rewards/final_brier_reward_step": 0.00015468749916180968, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.003751535667106509, "step": 158 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.7048021554946899, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.874763548374176, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.53125, "calib/avg_num_step_conf": 0.375, "calib/ece": 0.248840579710145, "calib/final_conf_rate": 0.5390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2971014492753623, "calib/gap": -0.02636818181818157, "calib/mean_conf": 0.8589855072463768, "calib/mu_c": 0.8494318181818182, "calib/mu_w": 0.8757999999999998, "calib/nonempty_final_conf_rate": 0.5390625, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.375, "calib/pce": 0.235072463768116, "calib/std_conf": 0.10756386613173807, "calib/step_conf_rate": 0.375, "calib/step_q_w": 0.7956249999999999, "calib/step_q_w_n": 96.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2233.0, "completions/max_terminated_length": 2233.0, "completions/mean_length": 336.46484375, "completions/mean_terminated_length": 336.46484375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1696, "grad_norm": 0.17757733166217804, "kl": 0.1535491943359375, "learning_rate": 1.1666666666666668e-06, "loss": -0.0185, "mask/has_final_conf_rate": 0.5390625, "mask/share_final_conf": 0.030048977583646774, "mask/share_reasoning": 0.9474559426307678, "mask/share_step_conf": 0.02249506302177906, "num_tokens": 29889438.0, "reward": 0.03437500447034836, "reward_std": 0.04036171734333038, "rewards/accuracy_reward_step": 0.34375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 159 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6747011542320251, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8589940667152405, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.51953125, "calib/avg_num_step_conf": 0.375, "calib/ece": 0.2803007518796993, "calib/final_conf_rate": 0.51953125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.3308270676691729, "calib/gap": 0.011589743589743406, "calib/mean_conf": 0.8587969924812029, "calib/mu_c": 0.8635897435897434, "calib/mu_w": 0.852, "calib/nonempty_final_conf_rate": 0.51953125, "calib/nonempty_reasoning_rate": 0.89453125, "calib/nonempty_step_conf_rate": 0.375, "calib/pce": 0.2763157894736842, "calib/std_conf": 0.10119613646808662, "calib/step_conf_rate": 0.375, "calib/step_q_w": 0.7814583333333333, "calib/step_q_w_n": 96.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 368.30078125, "completions/mean_terminated_length": 368.30078125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.17066666666666666, "grad_norm": 0.17369940876960754, "kl": 0.14441680908203125, "learning_rate": 1.138888888888889e-06, "loss": 0.0014, "mask/has_final_conf_rate": 0.51953125, "mask/share_final_conf": 0.024254774674773216, "mask/share_reasoning": 0.9520937204360962, "mask/share_step_conf": 0.023651521652936935, "num_tokens": 30088563.0, "reward": 0.03046875074505806, "reward_std": 0.03863853961229324, "rewards/accuracy_reward_step": 0.3046875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 160 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6919175386428833, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8590571284294128, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.578125, "calib/avg_num_step_conf": 0.3046875, "calib/ece": 0.1623287671232877, "calib/final_conf_rate": 0.5703125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2876712328767123, "calib/gap": 0.016439393939394198, "calib/mean_conf": 0.8367123287671233, "calib/mu_c": 0.8416666666666668, "calib/mu_w": 0.8252272727272726, "calib/nonempty_final_conf_rate": 0.5703125, "calib/nonempty_reasoning_rate": 0.8828125, "calib/nonempty_step_conf_rate": 0.3046875, "calib/pce": 0.1502054794520548, "calib/std_conf": 0.12389848688939245, "calib/step_conf_rate": 0.3046875, "calib/step_q_w": 0.8043589743589742, "calib/step_q_w_n": 78.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 347.9609375, "completions/mean_terminated_length": 349.32550048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 38.0, "epoch": 0.17173333333333332, "grad_norm": 0.18009351193904877, "kl": 0.1587066650390625, "learning_rate": 1.111111111111111e-06, "loss": 0.0317, "mask/has_final_conf_rate": 0.5703125, "mask/share_final_conf": 0.032218098640441895, "mask/share_reasoning": 0.9410046935081482, "mask/share_step_conf": 0.022870970889925957, "num_tokens": 30281561.0, "reward": 0.03984374925494194, "reward_std": 0.039622340351343155, "rewards/accuracy_reward_step": 0.3984375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 161 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.7094434499740601, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8591045141220093, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.5625, "calib/avg_num_step_conf": 0.32421875, "calib/ece": 0.18833333333333327, "calib/final_conf_rate": 0.5625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2847222222222222, "calib/gap": 0.014152524167561742, "calib/mean_conf": 0.8474999999999999, "calib/mu_c": 0.8523157894736842, "calib/mu_w": 0.8381632653061225, "calib/nonempty_final_conf_rate": 0.5625, "calib/nonempty_reasoning_rate": 0.88671875, "calib/nonempty_step_conf_rate": 0.32421875, "calib/pce": 0.1880555555555555, "calib/std_conf": 0.09173103800422915, "calib/step_conf_rate": 0.32421875, "calib/step_q_w": 0.7798795180722892, "calib/step_q_w_n": 83.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 327.3828125, "completions/mean_terminated_length": 328.66668701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 5.0, "epoch": 0.1728, "grad_norm": 0.20924623310565948, "kl": 0.1875762939453125, "learning_rate": 1.0833333333333335e-06, "loss": -0.0228, "mask/has_final_conf_rate": 0.5625, "mask/share_final_conf": 0.029927421361207962, "mask/share_reasoning": 0.9480806589126587, "mask/share_step_conf": 0.018085699528455734, "num_tokens": 30469515.0, "reward": 0.037109375, "reward_std": 0.04062382131814957, "rewards/accuracy_reward_step": 0.37109375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 162 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6055176854133606, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8265026807785034, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.55859375, "calib/avg_num_step_conf": 0.26953125, "calib/ece": 0.3418115942028985, "calib/final_conf_rate": 0.5390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2971014492753623, "calib/gap": 0.013413916333823828, "calib/mean_conf": 0.8563043478260869, "calib/mu_c": 0.8628169014084507, "calib/mu_w": 0.8494029850746269, "calib/nonempty_final_conf_rate": 0.5390625, "calib/nonempty_reasoning_rate": 0.82421875, "calib/nonempty_step_conf_rate": 0.265625, "calib/pce": 0.3418115942028985, "calib/std_conf": 0.10213713491635887, "calib/step_conf_rate": 0.265625, "calib/step_q_w": 0.7879710144927536, "calib/step_q_w_n": 69.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1923.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 377.96875, "completions/mean_terminated_length": 377.96875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.17386666666666667, "grad_norm": 0.15629848837852478, "kl": 0.1473236083984375, "learning_rate": 1.0555555555555557e-06, "loss": 0.0299, "mask/has_final_conf_rate": 0.5390625, "mask/share_final_conf": 0.029228312894701958, "mask/share_reasoning": 0.953479528427124, "mask/share_step_conf": 0.017292149364948273, "num_tokens": 30671107.0, "reward": 0.02851562574505806, "reward_std": 0.03467895835638046, "rewards/accuracy_reward_step": 0.28515625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 163 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5345840454101562, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7753315567970276, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.515625, "calib/avg_num_step_conf": 0.3359375, "calib/ece": 0.31358778625954187, "calib/final_conf_rate": 0.51171875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2824427480916031, "calib/gap": 0.005791079812206279, "calib/mean_conf": 0.8403053435114504, "calib/mu_c": 0.842957746478873, "calib/mu_w": 0.8371666666666667, "calib/nonempty_final_conf_rate": 0.51171875, "calib/nonempty_reasoning_rate": 0.8515625, "calib/nonempty_step_conf_rate": 0.3359375, "calib/pce": 0.30595419847328237, "calib/std_conf": 0.12744068478859416, "calib/step_conf_rate": 0.3359375, "calib/step_q_w": 0.8046511627906977, "calib/step_q_w_n": 86.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2152.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 374.9609375, "completions/mean_terminated_length": 374.9609375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.17493333333333333, "grad_norm": 0.15789611637592316, "kl": 0.1409149169921875, "learning_rate": 1.0277777777777777e-06, "loss": 0.0294, "mask/has_final_conf_rate": 0.51171875, "mask/share_final_conf": 0.026131562888622284, "mask/share_reasoning": 0.9563469886779785, "mask/share_step_conf": 0.017521433532238007, "num_tokens": 30873233.0, "reward": 0.02773437649011612, "reward_std": 0.03061625175178051, "rewards/accuracy_reward_step": 0.27734375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 164 }, { "adv/mean_abs_final_conf": 0.019196441397070885, "adv/mean_abs_reasoning": 0.5598448514938354, "adv/mean_abs_step_conf": 0.019322002306580544, "adv/ratio_final_to_reasoning": 0.03428885939711506, "adv/ratio_step_to_reasoning": 0.03451313744338917, "adv/std_final_conf": 0.16449646651744843, "adv/std_reasoning": 0.7927621603012085, "adv/std_step_conf": 0.1655724197626114, "calib/answer_extract_rate": 0.59765625, "calib/avg_num_step_conf": 0.25390625, "calib/ece": 0.36934210526315775, "calib/final_conf_rate": 0.59375, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.2631578947368421, "calib/gap": 0.00613998613998612, "calib/mean_conf": 0.851578947368421, "calib/mu_c": 0.8547297297297297, "calib/mu_w": 0.8485897435897436, "calib/nonempty_final_conf_rate": 0.59375, "calib/nonempty_reasoning_rate": 0.84765625, "calib/nonempty_step_conf_rate": 0.25390625, "calib/pce": 0.3670394736842104, "calib/std_conf": 0.10792361616072617, "calib/step_conf_rate": 0.25390625, "calib/step_q_w": 0.7778461538461537, "calib/step_q_w_n": 65.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 389.09375, "completions/mean_terminated_length": 389.09375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.176, "grad_norm": 0.4201725721359253, "kl": 0.1468963623046875, "learning_rate": 1.0000000000000002e-06, "loss": 0.0039, "mask/has_final_conf_rate": 0.59375, "mask/share_final_conf": 0.02832464501261711, "mask/share_reasoning": 0.9569504857063293, "mask/share_step_conf": 0.014724886044859886, "num_tokens": 31078417.0, "reward": 0.02780143730342388, "reward_std": 0.03408287838101387, "rewards/accuracy_reward_step": 0.2890625, "rewards/final_brier_reward_step": 0.00030625000363215804, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.003297126619145274, "step": 165 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6967577934265137, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8590615391731262, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.55078125, "calib/avg_num_step_conf": 0.27734375, "calib/ece": 0.2533576642335767, "calib/final_conf_rate": 0.53515625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.23357664233576642, "calib/gap": -0.02477294007490627, "calib/mean_conf": 0.8403649635036496, "calib/mu_c": 0.831685393258427, "calib/mu_w": 0.8564583333333333, "calib/nonempty_final_conf_rate": 0.53515625, "calib/nonempty_reasoning_rate": 0.828125, "calib/nonempty_step_conf_rate": 0.27734375, "calib/pce": 0.22204379562043802, "calib/std_conf": 0.14374251492603518, "calib/step_conf_rate": 0.27734375, "calib/step_q_w": 0.7697183098591549, "calib/step_q_w_n": 71.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 419.19921875, "completions/mean_terminated_length": 420.8431701660156, "completions/min_length": 0.0, "completions/min_terminated_length": 31.0, "epoch": 0.17706666666666668, "grad_norm": 0.15491074323654175, "kl": 0.120269775390625, "learning_rate": 9.722222222222224e-07, "loss": 0.003, "mask/has_final_conf_rate": 0.53515625, "mask/share_final_conf": 0.02692285180091858, "mask/share_reasoning": 0.9514549374580383, "mask/share_step_conf": 0.017715971916913986, "num_tokens": 31291916.0, "reward": 0.03593749925494194, "reward_std": 0.03989892452955246, "rewards/accuracy_reward_step": 0.359375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 166 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6552045941352844, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8266619443893433, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.53515625, "calib/avg_num_step_conf": 0.31640625, "calib/ece": 0.2191851851851852, "calib/final_conf_rate": 0.52734375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2222222222222222, "calib/gap": 0.014061624649859716, "calib/mean_conf": 0.8399259259259261, "calib/mu_c": 0.8452380952380951, "calib/mu_w": 0.8311764705882354, "calib/nonempty_final_conf_rate": 0.52734375, "calib/nonempty_reasoning_rate": 0.8515625, "calib/nonempty_step_conf_rate": 0.31640625, "calib/pce": 0.21844444444444444, "calib/std_conf": 0.0989687290402699, "calib/step_conf_rate": 0.31640625, "calib/step_q_w": 0.7817283950617283, "calib/step_q_w_n": 81.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 362.19140625, "completions/mean_terminated_length": 362.19140625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.17813333333333334, "grad_norm": 0.15898919105529785, "kl": 0.14215087890625, "learning_rate": 9.444444444444445e-07, "loss": -0.0005, "mask/has_final_conf_rate": 0.52734375, "mask/share_final_conf": 0.024741780012845993, "mask/share_reasoning": 0.956134557723999, "mask/share_step_conf": 0.019123634323477745, "num_tokens": 31490245.0, "reward": 0.03281250223517418, "reward_std": 0.037518225610256195, "rewards/accuracy_reward_step": 0.328125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 167 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6289219856262207, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8588364720344543, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.52734375, "calib/avg_num_step_conf": 0.3359375, "calib/ece": 0.32492537313432834, "calib/final_conf_rate": 0.5234375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.26865671641791045, "calib/gap": 0.060958751393534016, "calib/mean_conf": 0.8398507462686566, "calib/mu_c": 0.8694202898550726, "calib/mu_w": 0.8084615384615386, "calib/nonempty_final_conf_rate": 0.5234375, "calib/nonempty_reasoning_rate": 0.86328125, "calib/nonempty_step_conf_rate": 0.3359375, "calib/pce": 0.32492537313432834, "calib/std_conf": 0.12406657501823976, "calib/step_conf_rate": 0.3359375, "calib/step_q_w": 0.7769767441860465, "calib/step_q_w_n": 86.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2660.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 382.296875, "completions/mean_terminated_length": 382.296875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.1792, "grad_norm": 0.20057442784309387, "kl": 0.15277099609375, "learning_rate": 9.166666666666666e-07, "loss": -0.0156, "mask/has_final_conf_rate": 0.5234375, "mask/share_final_conf": 0.02479960396885872, "mask/share_reasoning": 0.9573352932929993, "mask/share_step_conf": 0.01786513812839985, "num_tokens": 31692785.0, "reward": 0.02695312723517418, "reward_std": 0.036022573709487915, "rewards/accuracy_reward_step": 0.26953125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 168 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6322240233421326, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8265968561172485, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.55859375, "calib/avg_num_step_conf": 0.29296875, "calib/ece": 0.3102222222222222, "calib/final_conf_rate": 0.52734375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.25925925925925924, "calib/gap": 0.005400974745237153, "calib/mean_conf": 0.8583703703703703, "calib/mu_c": 0.8608108108108109, "calib/mu_w": 0.8554098360655737, "calib/nonempty_final_conf_rate": 0.52734375, "calib/nonempty_reasoning_rate": 0.8515625, "calib/nonempty_step_conf_rate": 0.29296875, "calib/pce": 0.3102222222222222, "calib/std_conf": 0.08169688699275676, "calib/step_conf_rate": 0.29296875, "calib/step_q_w": 0.7885333333333333, "calib/step_q_w_n": 75.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1808.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 377.17578125, "completions/mean_terminated_length": 378.6549377441406, "completions/min_length": 0.0, "completions/min_terminated_length": 39.0, "epoch": 0.18026666666666666, "grad_norm": 0.15046626329421997, "kl": 0.141265869140625, "learning_rate": 8.88888888888889e-07, "loss": 0.016, "mask/has_final_conf_rate": 0.52734375, "mask/share_final_conf": 0.024082526564598083, "mask/share_reasoning": 0.9533636569976807, "mask/share_step_conf": 0.018647566437721252, "num_tokens": 31893526.0, "reward": 0.02890625223517418, "reward_std": 0.03620504215359688, "rewards/accuracy_reward_step": 0.2890625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 169 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6629437208175659, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8589531183242798, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.515625, "calib/avg_num_step_conf": 0.30859375, "calib/ece": 0.23143939393939383, "calib/final_conf_rate": 0.515625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.22727272727272727, "calib/gap": 0.010275962236746317, "calib/mean_conf": 0.8335606060606061, "calib/mu_c": 0.8375308641975308, "calib/mu_w": 0.8272549019607844, "calib/nonempty_final_conf_rate": 0.515625, "calib/nonempty_reasoning_rate": 0.8203125, "calib/nonempty_step_conf_rate": 0.30859375, "calib/pce": 0.22568181818181807, "calib/std_conf": 0.10313372149168089, "calib/step_conf_rate": 0.30859375, "calib/step_q_c": 0.8, "calib/step_q_c_n": 1.0, "calib/step_q_gap": 0.010512820512820653, "calib/step_q_w": 0.7894871794871794, "calib/step_q_w_n": 78.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2919.0, "completions/max_terminated_length": 2919.0, "completions/mean_length": 401.484375, "completions/mean_terminated_length": 403.058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 68.0, "epoch": 0.18133333333333335, "grad_norm": 0.1561432182788849, "kl": 0.1366424560546875, "learning_rate": 8.611111111111112e-07, "loss": 0.0187, "mask/has_final_conf_rate": 0.515625, "mask/share_final_conf": 0.02365146577358246, "mask/share_reasoning": 0.9560713171958923, "mask/share_step_conf": 0.016370952129364014, "num_tokens": 32100458.0, "reward": 0.03203124925494194, "reward_std": 0.03796668350696564, "rewards/accuracy_reward_step": 0.3203125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 170 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5713903903961182, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7754671573638916, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.484375, "calib/avg_num_step_conf": 0.37109375, "calib/ece": 0.2712096774193547, "calib/final_conf_rate": 0.484375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.29838709677419356, "calib/gap": 0.04846930640446456, "calib/mean_conf": 0.843790322580645, "calib/mu_c": 0.864507042253521, "calib/mu_w": 0.8160377358490565, "calib/nonempty_final_conf_rate": 0.484375, "calib/nonempty_reasoning_rate": 0.85546875, "calib/nonempty_step_conf_rate": 0.37109375, "calib/pce": 0.2712096774193547, "calib/std_conf": 0.10617691218217407, "calib/step_conf_rate": 0.37109375, "calib/step_q_w": 0.7707368421052632, "calib/step_q_w_n": 95.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 326.390625, "completions/mean_terminated_length": 327.67059326171875, "completions/min_length": 0.0, "completions/min_terminated_length": 43.0, "epoch": 0.1824, "grad_norm": 0.1624995768070221, "kl": 0.14697265625, "learning_rate": 8.333333333333333e-07, "loss": 0.031, "mask/has_final_conf_rate": 0.48046875, "mask/share_final_conf": 0.025319751352071762, "mask/share_reasoning": 0.9486113786697388, "mask/share_step_conf": 0.02216263860464096, "num_tokens": 32290910.0, "reward": 0.02773437649011612, "reward_std": 0.032719485461711884, "rewards/accuracy_reward_step": 0.27734375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 171 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6793380975723267, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8430353999137878, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.45703125, "calib/avg_num_step_conf": 0.39453125, "calib/ece": 0.28189655172413786, "calib/final_conf_rate": 0.453125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.28448275862068967, "calib/gap": -0.029559050262103037, "calib/mean_conf": 0.8560344827586207, "calib/mu_c": 0.8440579710144928, "calib/mu_w": 0.8736170212765958, "calib/nonempty_final_conf_rate": 0.453125, "calib/nonempty_reasoning_rate": 0.8515625, "calib/nonempty_step_conf_rate": 0.39453125, "calib/pce": 0.27155172413793094, "calib/std_conf": 0.09696141527596773, "calib/step_conf_rate": 0.39453125, "calib/step_q_w": 0.7874257425742576, "calib/step_q_w_n": 101.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 348.8828125, "completions/mean_terminated_length": 348.8828125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.18346666666666667, "grad_norm": 0.1543116569519043, "kl": 0.143310546875, "learning_rate": 8.055555555555557e-07, "loss": 0.0325, "mask/has_final_conf_rate": 0.453125, "mask/share_final_conf": 0.022162608802318573, "mask/share_reasoning": 0.9561766386032104, "mask/share_step_conf": 0.021660756319761276, "num_tokens": 32483576.0, "reward": 0.02734375, "reward_std": 0.03890039771795273, "rewards/accuracy_reward_step": 0.2734375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 172 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.669437050819397, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8429975509643555, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.5234375, "calib/avg_num_step_conf": 0.33203125, "calib/ece": 0.2572727272727272, "calib/final_conf_rate": 0.515625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2803030303030303, "calib/gap": 0.02381179842369241, "calib/mean_conf": 0.8465151515151514, "calib/mu_c": 0.8560759493670885, "calib/mu_w": 0.8322641509433961, "calib/nonempty_final_conf_rate": 0.515625, "calib/nonempty_reasoning_rate": 0.85546875, "calib/nonempty_step_conf_rate": 0.33203125, "calib/pce": 0.2526515151515151, "calib/std_conf": 0.10169759724586959, "calib/step_conf_rate": 0.33203125, "calib/step_q_w": 0.7796470588235294, "calib/step_q_w_n": 85.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2597.0, "completions/max_terminated_length": 2597.0, "completions/mean_length": 412.375, "completions/mean_terminated_length": 412.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.18453333333333333, "grad_norm": 0.14386945962905884, "kl": 0.133056640625, "learning_rate": 7.777777777777779e-07, "loss": -0.0083, "mask/has_final_conf_rate": 0.515625, "mask/share_final_conf": 0.024806944653391838, "mask/share_reasoning": 0.9530544281005859, "mask/share_step_conf": 0.022138644009828568, "num_tokens": 32692304.0, "reward": 0.03125, "reward_std": 0.038334622979164124, "rewards/accuracy_reward_step": 0.3125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 173 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5568525195121765, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7927470207214355, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.453125, "calib/avg_num_step_conf": 0.38671875, "calib/ece": 0.368157894736842, "calib/final_conf_rate": 0.4453125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2894736842105263, "calib/gap": 0.021827426810477557, "calib/mean_conf": 0.8481578947368422, "calib/mu_c": 0.8594545454545454, "calib/mu_w": 0.8376271186440678, "calib/nonempty_final_conf_rate": 0.4453125, "calib/nonempty_reasoning_rate": 0.83984375, "calib/nonempty_step_conf_rate": 0.38671875, "calib/pce": 0.3669298245614034, "calib/std_conf": 0.12635626179107026, "calib/step_conf_rate": 0.38671875, "calib/step_q_w": 0.7844444444444444, "calib/step_q_w_n": 99.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 379.87109375, "completions/mean_terminated_length": 381.3608093261719, "completions/min_length": 0.0, "completions/min_terminated_length": 29.0, "epoch": 0.1856, "grad_norm": 0.1287498027086258, "kl": 0.12885284423828125, "learning_rate": 7.5e-07, "loss": 0.0281, "mask/has_final_conf_rate": 0.4453125, "mask/share_final_conf": 0.02070237696170807, "mask/share_reasoning": 0.9495202302932739, "mask/share_step_conf": 0.02587110549211502, "num_tokens": 32893783.0, "reward": 0.021484375, "reward_std": 0.031891852617263794, "rewards/accuracy_reward_step": 0.21484375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 174 }, { "adv/mean_abs_final_conf": 0.019277554005384445, "adv/mean_abs_reasoning": 0.4558134973049164, "adv/mean_abs_step_conf": 0.019320236518979073, "adv/ratio_final_to_reasoning": 0.042292635297916, "adv/ratio_step_to_reasoning": 0.04238627560002858, "adv/std_final_conf": 0.16519151628017426, "adv/std_reasoning": 0.7205056548118591, "adv/std_step_conf": 0.16555728018283844, "calib/answer_extract_rate": 0.4296875, "calib/avg_num_step_conf": 0.3515625, "calib/ece": 0.43385321100917423, "calib/final_conf_rate": 0.42578125, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.22935779816513763, "calib/gap": 0.030764622973925237, "calib/mean_conf": 0.8283486238532111, "calib/mu_c": 0.8469767441860464, "calib/mu_w": 0.8162121212121212, "calib/nonempty_final_conf_rate": 0.42578125, "calib/nonempty_reasoning_rate": 0.77734375, "calib/nonempty_step_conf_rate": 0.3515625, "calib/pce": 0.43385321100917423, "calib/std_conf": 0.11379616221474019, "calib/step_conf_rate": 0.3515625, "calib/step_q_w": 0.7782222222222223, "calib/step_q_w_n": 90.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2090.0, "completions/max_terminated_length": 2090.0, "completions/mean_length": 394.86328125, "completions/mean_terminated_length": 394.86328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.18666666666666668, "grad_norm": 0.5348300933837891, "kl": 0.145294189453125, "learning_rate": 7.222222222222222e-07, "loss": 0.0214, "mask/has_final_conf_rate": 0.42578125, "mask/share_final_conf": 0.020652402192354202, "mask/share_reasoning": 0.9563436508178711, "mask/share_step_conf": 0.023003987967967987, "num_tokens": 33100692.0, "reward": 0.016107818111777306, "reward_std": 0.026949819177389145, "rewards/accuracy_reward_step": 0.16796875, "rewards/final_brier_reward_step": 0.0007421874906867743, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.0029015520121902227, "step": 175 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6810925006866455, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.87469482421875, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.46484375, "calib/avg_num_step_conf": 0.34765625, "calib/ece": 0.25808333333333344, "calib/final_conf_rate": 0.46875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.25, "calib/gap": 0.023779821787870192, "calib/mean_conf": 0.8414166666666667, "calib/mu_c": 0.8511267605633803, "calib/mu_w": 0.8273469387755101, "calib/nonempty_final_conf_rate": 0.46875, "calib/nonempty_reasoning_rate": 0.8125, "calib/nonempty_step_conf_rate": 0.34765625, "calib/pce": 0.2539166666666668, "calib/std_conf": 0.10912756933465019, "calib/step_conf_rate": 0.34765625, "calib/step_q_w": 0.7676404494382022, "calib/step_q_w_n": 89.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1901.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 363.68359375, "completions/mean_terminated_length": 366.5472412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 48.0, "epoch": 0.18773333333333334, "grad_norm": 0.17209787666797638, "kl": 0.140167236328125, "learning_rate": 6.944444444444446e-07, "loss": -0.005, "mask/has_final_conf_rate": 0.46875, "mask/share_final_conf": 0.024472713470458984, "mask/share_reasoning": 0.9470512270927429, "mask/share_step_conf": 0.020663540810346603, "num_tokens": 33297859.0, "reward": 0.02812499925494194, "reward_std": 0.03900687023997307, "rewards/accuracy_reward_step": 0.28125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 176 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5837579369544983, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7928394675254822, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.4921875, "calib/avg_num_step_conf": 0.3359375, "calib/ece": 0.30134920634920626, "calib/final_conf_rate": 0.4921875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.20634920634920634, "calib/gap": 0.012134888438133928, "calib/mean_conf": 0.8410317460317461, "calib/mu_c": 0.8466176470588235, "calib/mu_w": 0.8344827586206895, "calib/nonempty_final_conf_rate": 0.4921875, "calib/nonempty_reasoning_rate": 0.828125, "calib/nonempty_step_conf_rate": 0.3359375, "calib/pce": 0.30134920634920626, "calib/std_conf": 0.09712395422475838, "calib/step_conf_rate": 0.3359375, "calib/step_q_w": 0.7973255813953486, "calib/step_q_w_n": 86.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 357.1328125, "completions/mean_terminated_length": 358.5333557128906, "completions/min_length": 0.0, "completions/min_terminated_length": 28.0, "epoch": 0.1888, "grad_norm": 0.14639951288700104, "kl": 0.14056396484375, "learning_rate": 6.666666666666667e-07, "loss": -0.0141, "mask/has_final_conf_rate": 0.4921875, "mask/share_final_conf": 0.026274647563695908, "mask/share_reasoning": 0.9458457827568054, "mask/share_step_conf": 0.02397332713007927, "num_tokens": 33493117.0, "reward": 0.02695312723517418, "reward_std": 0.03342931345105171, "rewards/accuracy_reward_step": 0.26953125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 177 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6184110045433044, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8265557289123535, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.45703125, "calib/avg_num_step_conf": 0.41796875, "calib/ece": 0.263728813559322, "calib/final_conf_rate": 0.4609375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2796610169491525, "calib/gap": 0.01724303266406968, "calib/mean_conf": 0.8518644067796611, "calib/mu_c": 0.8587323943661971, "calib/mu_w": 0.8414893617021274, "calib/nonempty_final_conf_rate": 0.4609375, "calib/nonempty_reasoning_rate": 0.875, "calib/nonempty_step_conf_rate": 0.41796875, "calib/pce": 0.2569491525423728, "calib/std_conf": 0.12919535301858673, "calib/step_conf_rate": 0.41796875, "calib/step_q_w": 0.7772616822429906, "calib/step_q_w_n": 107.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 349.08984375, "completions/mean_terminated_length": 349.08984375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.18986666666666666, "grad_norm": 0.15279927849769592, "kl": 0.1522064208984375, "learning_rate": 6.388888888888889e-07, "loss": 0.0331, "mask/has_final_conf_rate": 0.4609375, "mask/share_final_conf": 0.025390885770320892, "mask/share_reasoning": 0.9491012096405029, "mask/share_step_conf": 0.02550787664949894, "num_tokens": 33688556.0, "reward": 0.02773437649011612, "reward_std": 0.03541572391986847, "rewards/accuracy_reward_step": 0.27734375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 178 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.585411012172699, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7928422093391418, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.453125, "calib/avg_num_step_conf": 0.39453125, "calib/ece": 0.23249999999999998, "calib/final_conf_rate": 0.453125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.22413793103448276, "calib/gap": 0.010006259780907745, "calib/mean_conf": 0.8445689655172414, "calib/mu_c": 0.8484507042253523, "calib/mu_w": 0.8384444444444445, "calib/nonempty_final_conf_rate": 0.453125, "calib/nonempty_reasoning_rate": 0.84765625, "calib/nonempty_step_conf_rate": 0.39453125, "calib/pce": 0.23249999999999998, "calib/std_conf": 0.09401386766176015, "calib/step_conf_rate": 0.39453125, "calib/step_q_w": 0.7893069306930695, "calib/step_q_w_n": 101.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2767.0, "completions/max_terminated_length": 2767.0, "completions/mean_length": 352.37890625, "completions/mean_terminated_length": 352.37890625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.19093333333333334, "grad_norm": 0.15809383988380432, "kl": 0.157135009765625, "learning_rate": 6.111111111111112e-07, "loss": 0.0264, "mask/has_final_conf_rate": 0.453125, "mask/share_final_conf": 0.024328168481588364, "mask/share_reasoning": 0.9527326226234436, "mask/share_step_conf": 0.02293919399380684, "num_tokens": 33885029.0, "reward": 0.02773437649011612, "reward_std": 0.03352377563714981, "rewards/accuracy_reward_step": 0.27734375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 179 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5974774360656738, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8264751434326172, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.4921875, "calib/avg_num_step_conf": 0.25, "calib/ece": 0.22779527559055118, "calib/final_conf_rate": 0.49609375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2204724409448819, "calib/gap": 0.01615770042194098, "calib/mean_conf": 0.8498425196850394, "calib/mu_c": 0.8559493670886077, "calib/mu_w": 0.8397916666666667, "calib/nonempty_final_conf_rate": 0.49609375, "calib/nonempty_reasoning_rate": 0.7421875, "calib/nonempty_step_conf_rate": 0.25, "calib/pce": 0.22779527559055118, "calib/std_conf": 0.0795259998287937, "calib/step_conf_rate": 0.25, "calib/step_q_w": 0.770625, "calib/step_q_w_n": 64.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 444.04296875, "completions/mean_terminated_length": 445.7843322753906, "completions/min_length": 0.0, "completions/min_terminated_length": 64.0, "epoch": 0.192, "grad_norm": 0.1322283148765564, "kl": 0.1263427734375, "learning_rate": 5.833333333333334e-07, "loss": 0.01, "mask/has_final_conf_rate": 0.49609375, "mask/share_final_conf": 0.021843096241354942, "mask/share_reasoning": 0.9585437178611755, "mask/share_step_conf": 0.015706941485404968, "num_tokens": 34102560.0, "reward": 0.03125, "reward_std": 0.03421951085329056, "rewards/accuracy_reward_step": 0.3125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 180 }, { "adv/mean_abs_final_conf": 0.019196441397070885, "adv/mean_abs_reasoning": 0.5584971308708191, "adv/mean_abs_step_conf": 0.01929621398448944, "adv/ratio_final_to_reasoning": 0.03437160253113465, "adv/ratio_step_to_reasoning": 0.03455024729384451, "adv/std_final_conf": 0.16449646651744843, "adv/std_reasoning": 0.7754106521606445, "adv/std_step_conf": 0.16535142064094543, "calib/answer_extract_rate": 0.48828125, "calib/avg_num_step_conf": 0.33984375, "calib/ece": 0.37024193548387097, "calib/final_conf_rate": 0.484375, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.24193548387096775, "calib/gap": 0.013681877444589263, "calib/mean_conf": 0.8460483870967744, "calib/mu_c": 0.8532203389830508, "calib/mu_w": 0.8395384615384616, "calib/nonempty_final_conf_rate": 0.484375, "calib/nonempty_reasoning_rate": 0.82421875, "calib/nonempty_step_conf_rate": 0.33984375, "calib/pce": 0.37024193548387097, "calib/std_conf": 0.10215278364820002, "calib/step_conf_rate": 0.33984375, "calib/step_q_w": 0.7693103448275863, "calib/step_q_w_n": 87.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 343.0234375, "completions/mean_terminated_length": 343.0234375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.19306666666666666, "grad_norm": 0.39187780022621155, "kl": 0.1638336181640625, "learning_rate": 5.555555555555555e-07, "loss": 0.0494, "mask/has_final_conf_rate": 0.484375, "mask/share_final_conf": 0.025939784944057465, "mask/share_reasoning": 0.9525427222251892, "mask/share_step_conf": 0.021517515182495117, "num_tokens": 34296638.0, "reward": 0.023430868983268738, "reward_std": 0.030896620824933052, "rewards/accuracy_reward_step": 0.234375, "rewards/final_brier_reward_step": 0.00030625000363215804, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.0011007614666596055, "step": 181 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6490250825881958, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8266454935073853, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.4765625, "calib/avg_num_step_conf": 0.35546875, "calib/ece": 0.1580327868852458, "calib/final_conf_rate": 0.4765625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.22131147540983606, "calib/gap": 0.014944356120826785, "calib/mean_conf": 0.8531147540983607, "calib/mu_c": 0.8576470588235294, "calib/mu_w": 0.8427027027027026, "calib/nonempty_final_conf_rate": 0.4765625, "calib/nonempty_reasoning_rate": 0.83203125, "calib/nonempty_step_conf_rate": 0.35546875, "calib/pce": 0.15721311475409824, "calib/std_conf": 0.07827109224149895, "calib/step_conf_rate": 0.35546875, "calib/step_q_w": 0.7895604395604395, "calib/step_q_w_n": 91.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 372.8203125, "completions/mean_terminated_length": 372.8203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.19413333333333332, "grad_norm": 0.16000139713287354, "kl": 0.1412506103515625, "learning_rate": 5.277777777777779e-07, "loss": 0.0138, "mask/has_final_conf_rate": 0.4765625, "mask/share_final_conf": 0.02340656891465187, "mask/share_reasoning": 0.9566706418991089, "mask/share_step_conf": 0.01992282271385193, "num_tokens": 34498240.0, "reward": 0.03359375149011612, "reward_std": 0.03716510534286499, "rewards/accuracy_reward_step": 0.3359375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 182 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5607688426971436, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7927569150924683, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.5, "calib/avg_num_step_conf": 0.2578125, "calib/ece": 0.317109375, "calib/final_conf_rate": 0.5, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2890625, "calib/gap": 0.04363836554930267, "calib/mean_conf": 0.840546875, "calib/mu_c": 0.8613432835820896, "calib/mu_w": 0.8177049180327869, "calib/nonempty_final_conf_rate": 0.5, "calib/nonempty_reasoning_rate": 0.7578125, "calib/nonempty_step_conf_rate": 0.2578125, "calib/pce": 0.317109375, "calib/std_conf": 0.12856363085155295, "calib/step_conf_rate": 0.2578125, "calib/step_q_w": 0.793939393939394, "calib/step_q_w_n": 66.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 401.2890625, "completions/mean_terminated_length": 401.2890625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.1952, "grad_norm": 0.14064468443393707, "kl": 0.134674072265625, "learning_rate": 5.000000000000001e-07, "loss": -0.014, "mask/has_final_conf_rate": 0.5, "mask/share_final_conf": 0.022872116416692734, "mask/share_reasoning": 0.9605451226234436, "mask/share_step_conf": 0.016582757234573364, "num_tokens": 34707650.0, "reward": 0.02617187425494194, "reward_std": 0.032115641981363297, "rewards/accuracy_reward_step": 0.26171875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 183 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.617270827293396, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8265382647514343, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.56640625, "calib/avg_num_step_conf": 0.30078125, "calib/ece": 0.18647887323943663, "calib/final_conf_rate": 0.5546875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.21830985915492956, "calib/gap": 0.01684210526315766, "calib/mean_conf": 0.8412676056338028, "calib/mu_c": 0.8468421052631577, "calib/mu_w": 0.8300000000000001, "calib/nonempty_final_conf_rate": 0.5546875, "calib/nonempty_reasoning_rate": 0.8671875, "calib/nonempty_step_conf_rate": 0.30078125, "calib/pce": 0.1793661971830986, "calib/std_conf": 0.09578334674663085, "calib/step_conf_rate": 0.30078125, "calib/step_q_w": 0.7942857142857142, "calib/step_q_w_n": 77.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2078.0, "completions/max_terminated_length": 2078.0, "completions/mean_length": 386.53515625, "completions/mean_terminated_length": 386.53515625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.19626666666666667, "grad_norm": 0.14544470608234406, "kl": 0.1382904052734375, "learning_rate": 4.7222222222222226e-07, "loss": 0.0319, "mask/has_final_conf_rate": 0.5546875, "mask/share_final_conf": 0.0245208777487278, "mask/share_reasoning": 0.9588320255279541, "mask/share_step_conf": 0.016647107899188995, "num_tokens": 34911883.0, "reward": 0.03750000149011612, "reward_std": 0.03535057231783867, "rewards/accuracy_reward_step": 0.375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 184 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5579884052276611, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7927578687667847, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.5078125, "calib/avg_num_step_conf": 0.265625, "calib/ece": 0.2853076923076923, "calib/final_conf_rate": 0.5078125, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.27692307692307694, "calib/gap": 0.018460606060606066, "calib/mean_conf": 0.8459230769230769, "calib/mu_c": 0.8537333333333333, "calib/mu_w": 0.8352727272727273, "calib/nonempty_final_conf_rate": 0.5078125, "calib/nonempty_reasoning_rate": 0.7734375, "calib/nonempty_step_conf_rate": 0.265625, "calib/pce": 0.2771538461538462, "calib/std_conf": 0.12126066109416558, "calib/step_conf_rate": 0.265625, "calib/step_q_w": 0.768970588235294, "calib/step_q_w_n": 68.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2631.0, "completions/max_terminated_length": 2631.0, "completions/mean_length": 382.76953125, "completions/mean_terminated_length": 385.7834777832031, "completions/min_length": 0.0, "completions/min_terminated_length": 68.0, "epoch": 0.19733333333333333, "grad_norm": 0.14634037017822266, "kl": 0.1418609619140625, "learning_rate": 4.444444444444445e-07, "loss": 0.0199, "mask/has_final_conf_rate": 0.5078125, "mask/share_final_conf": 0.024744169786572456, "mask/share_reasoning": 0.9503279328346252, "mask/share_step_conf": 0.017115432769060135, "num_tokens": 35116792.0, "reward": 0.02929687686264515, "reward_std": 0.031956762075424194, "rewards/accuracy_reward_step": 0.29296875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 185 }, { "adv/mean_abs_final_conf": 0.019286589697003365, "adv/mean_abs_reasoning": 0.6044795513153076, "adv/mean_abs_step_conf": 0.019319185987114906, "adv/ratio_final_to_reasoning": 0.03190610775010837, "adv/ratio_step_to_reasoning": 0.03196003230395078, "adv/std_final_conf": 0.16526895761489868, "adv/std_reasoning": 0.7929074168205261, "adv/std_step_conf": 0.16554827988147736, "calib/answer_extract_rate": 0.5546875, "calib/avg_num_step_conf": 0.2578125, "calib/ece": 0.25496453900709215, "calib/final_conf_rate": 0.55078125, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.23404255319148937, "calib/gap": -0.01642367066895356, "calib/mean_conf": 0.8439007092198583, "calib/mu_c": 0.8377272727272728, "calib/mu_w": 0.8541509433962263, "calib/nonempty_final_conf_rate": 0.55078125, "calib/nonempty_reasoning_rate": 0.80859375, "calib/nonempty_step_conf_rate": 0.2578125, "calib/pce": 0.23737588652482264, "calib/std_conf": 0.11490219738558534, "calib/step_conf_rate": 0.2578125, "calib/step_q_w": 0.7868787878787877, "calib/step_q_w_n": 66.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2897.0, "completions/max_terminated_length": 2897.0, "completions/mean_length": 391.9296875, "completions/mean_terminated_length": 393.4666748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 59.0, "epoch": 0.1984, "grad_norm": 0.22775553166866302, "kl": 0.134429931640625, "learning_rate": 4.1666666666666667e-07, "loss": 0.0012, "mask/has_final_conf_rate": 0.55078125, "mask/share_final_conf": 0.02644348330795765, "mask/share_reasoning": 0.9552706480026245, "mask/share_step_conf": 0.014379597268998623, "num_tokens": 35322166.0, "reward": 0.03424294292926788, "reward_std": 0.035134799778461456, "rewards/accuracy_reward_step": 0.34765625, "rewards/final_brier_reward_step": 0.0008812500163912773, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.0027078634593635798, "step": 186 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5996350646018982, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7928953170776367, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.51171875, "calib/avg_num_step_conf": 0.3046875, "calib/ece": 0.3649242424242423, "calib/final_conf_rate": 0.515625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2727272727272727, "calib/gap": -0.003750000000000031, "calib/mean_conf": 0.8643181818181819, "calib/mu_c": 0.8624999999999999, "calib/mu_w": 0.86625, "calib/nonempty_final_conf_rate": 0.515625, "calib/nonempty_reasoning_rate": 0.81640625, "calib/nonempty_step_conf_rate": 0.3046875, "calib/pce": 0.35704545454545444, "calib/std_conf": 0.08028541116030398, "calib/step_conf_rate": 0.3046875, "calib/step_q_w": 0.7730769230769231, "calib/step_q_w_n": 78.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2431.0, "completions/max_terminated_length": 2431.0, "completions/mean_length": 410.00390625, "completions/mean_terminated_length": 410.00390625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.19946666666666665, "grad_norm": 0.14223752915859222, "kl": 0.1377716064453125, "learning_rate": 3.8888888888888895e-07, "loss": 0.0046, "mask/has_final_conf_rate": 0.515625, "mask/share_final_conf": 0.022872790694236755, "mask/share_reasoning": 0.9567241072654724, "mask/share_step_conf": 0.020403096452355385, "num_tokens": 35528671.0, "reward": 0.02695312537252903, "reward_std": 0.03433658182621002, "rewards/accuracy_reward_step": 0.26953125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 187 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6717088222503662, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8589801788330078, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.51171875, "calib/avg_num_step_conf": 0.33203125, "calib/ece": 0.2512878787878786, "calib/final_conf_rate": 0.515625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.29545454545454547, "calib/gap": 0.005778846153846162, "calib/mean_conf": 0.8473484848484849, "calib/mu_c": 0.849625, "calib/mu_w": 0.8438461538461538, "calib/nonempty_final_conf_rate": 0.515625, "calib/nonempty_reasoning_rate": 0.84375, "calib/nonempty_step_conf_rate": 0.33203125, "calib/pce": 0.2462878787878786, "calib/std_conf": 0.09837975161590594, "calib/step_conf_rate": 0.33203125, "calib/step_q_w": 0.7765882352941176, "calib/step_q_w_n": 85.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 437.89453125, "completions/mean_terminated_length": 437.89453125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.20053333333333334, "grad_norm": 0.17902015149593353, "kl": 0.115936279296875, "learning_rate": 3.611111111111111e-07, "loss": 0.0032, "mask/has_final_conf_rate": 0.515625, "mask/share_final_conf": 0.02250046841800213, "mask/share_reasoning": 0.96071457862854, "mask/share_step_conf": 0.016784997656941414, "num_tokens": 35744844.0, "reward": 0.03125, "reward_std": 0.03846754878759384, "rewards/accuracy_reward_step": 0.3125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 188 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5805579423904419, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7754928469657898, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.55859375, "calib/avg_num_step_conf": 0.2578125, "calib/ece": 0.2944366197183097, "calib/final_conf_rate": 0.5546875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.22535211267605634, "calib/gap": 0.007846774193548622, "calib/mean_conf": 0.8523239436619718, "calib/mu_c": 0.8557500000000001, "calib/mu_w": 0.8479032258064515, "calib/nonempty_final_conf_rate": 0.5546875, "calib/nonempty_reasoning_rate": 0.81640625, "calib/nonempty_step_conf_rate": 0.2578125, "calib/pce": 0.2916901408450702, "calib/std_conf": 0.08090740223589067, "calib/step_conf_rate": 0.2578125, "calib/step_q_w": 0.7725757575757575, "calib/step_q_w_n": 66.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2571.0, "completions/max_terminated_length": 2571.0, "completions/mean_length": 361.56640625, "completions/mean_terminated_length": 361.56640625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.2016, "grad_norm": 0.15060287714004517, "kl": 0.1468963623046875, "learning_rate": 3.3333333333333335e-07, "loss": 0.0419, "mask/has_final_conf_rate": 0.5546875, "mask/share_final_conf": 0.028109043836593628, "mask/share_reasoning": 0.9566246271133423, "mask/share_step_conf": 0.015266265720129013, "num_tokens": 35945173.0, "reward": 0.03125, "reward_std": 0.03324335068464279, "rewards/accuracy_reward_step": 0.3125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 189 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5268406271934509, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7393897771835327, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.46484375, "calib/avg_num_step_conf": 0.328125, "calib/ece": 0.25932773109243684, "calib/final_conf_rate": 0.46484375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.18487394957983194, "calib/gap": 0.022919600938966944, "calib/mean_conf": 0.855966386554622, "calib/mu_c": 0.8652112676056337, "calib/mu_w": 0.8422916666666668, "calib/nonempty_final_conf_rate": 0.46484375, "calib/nonempty_reasoning_rate": 0.79296875, "calib/nonempty_step_conf_rate": 0.328125, "calib/pce": 0.25932773109243684, "calib/std_conf": 0.0751393028748793, "calib/step_conf_rate": 0.328125, "calib/step_q_w": 0.7607142857142858, "calib/step_q_w_n": 84.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 418.125, "completions/mean_terminated_length": 419.7647399902344, "completions/min_length": 0.0, "completions/min_terminated_length": 29.0, "epoch": 0.20266666666666666, "grad_norm": 0.1258547306060791, "kl": 0.12890625, "learning_rate": 3.055555555555556e-07, "loss": 0.0258, "mask/has_final_conf_rate": 0.46484375, "mask/share_final_conf": 0.018940530717372894, "mask/share_reasoning": 0.9564769268035889, "mask/share_step_conf": 0.02067631483078003, "num_tokens": 36157821.0, "reward": 0.02812499925494194, "reward_std": 0.03016754984855652, "rewards/accuracy_reward_step": 0.28125, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 190 }, { "adv/mean_abs_final_conf": 0.01930162124335766, "adv/mean_abs_reasoning": 0.5160157084465027, "adv/mean_abs_step_conf": 0.01927414909005165, "adv/ratio_final_to_reasoning": 0.03740510400636133, "adv/ratio_step_to_reasoning": 0.03735186501991901, "adv/std_final_conf": 0.1653977632522583, "adv/std_reasoning": 0.7393521666526794, "adv/std_step_conf": 0.1651623547077179, "calib/answer_extract_rate": 0.48828125, "calib/avg_num_step_conf": 0.359375, "calib/ece": 0.37543307086614175, "calib/final_conf_rate": 0.49609375, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.2440944881889764, "calib/gap": 0.00689518132141087, "calib/mean_conf": 0.8554330708661417, "calib/mu_c": 0.859016393442623, "calib/mu_w": 0.8521212121212122, "calib/nonempty_final_conf_rate": 0.49609375, "calib/nonempty_reasoning_rate": 0.8203125, "calib/nonempty_step_conf_rate": 0.3359375, "calib/pce": 0.3752755905511811, "calib/std_conf": 0.09370548522597767, "calib/step_conf_rate": 0.3359375, "calib/step_q_w": 0.7830434782608695, "calib/step_q_w_n": 92.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1628.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 359.9296875, "completions/mean_terminated_length": 361.3411865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 51.0, "epoch": 0.20373333333333332, "grad_norm": 0.845258891582489, "kl": 0.1506805419921875, "learning_rate": 2.7777777777777776e-07, "loss": 0.0766, "mask/has_final_conf_rate": 0.49609375, "mask/share_final_conf": 0.024587150663137436, "mask/share_reasoning": 0.9499251246452332, "mask/share_step_conf": 0.021581534296274185, "num_tokens": 36354131.0, "reward": 0.024508347734808922, "reward_std": 0.030368084087967873, "rewards/accuracy_reward_step": 0.23828125, "rewards/final_brier_reward_step": 0.0012796875089406967, "rewards/format_reward_step": 0.00390625, "rewards/step_l2_reward": -0.0007004928193055093, "step": 191 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5777860879898071, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7928310632705688, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.51171875, "calib/avg_num_step_conf": 0.3203125, "calib/ece": 0.2725757575757576, "calib/final_conf_rate": 0.515625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.24242424242424243, "calib/gap": 0.019642857142857184, "calib/mean_conf": 0.8466666666666667, "calib/mu_c": 0.8550000000000001, "calib/mu_w": 0.8353571428571429, "calib/nonempty_final_conf_rate": 0.515625, "calib/nonempty_reasoning_rate": 0.83203125, "calib/nonempty_step_conf_rate": 0.3203125, "calib/pce": 0.27174242424242423, "calib/std_conf": 0.08751334674542996, "calib/step_conf_rate": 0.3203125, "calib/step_q_w": 0.7858536585365853, "calib/step_q_w_n": 82.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 395.37890625, "completions/mean_terminated_length": 395.37890625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.2048, "grad_norm": 0.1448807716369629, "kl": 0.1321868896484375, "learning_rate": 2.5000000000000004e-07, "loss": 0.0108, "mask/has_final_conf_rate": 0.515625, "mask/share_final_conf": 0.02366378903388977, "mask/share_reasoning": 0.9502225518226624, "mask/share_step_conf": 0.026113644242286682, "num_tokens": 36560324.0, "reward": 0.02968749962747097, "reward_std": 0.033088065683841705, "rewards/accuracy_reward_step": 0.296875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 192 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6848012208938599, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8590258955955505, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.546875, "calib/avg_num_step_conf": 0.24609375, "calib/ece": 0.30107913669064745, "calib/final_conf_rate": 0.54296875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.26618705035971224, "calib/gap": 0.046375000000000055, "calib/mean_conf": 0.8406474820143884, "calib/mu_c": 0.8620000000000001, "calib/mu_w": 0.815625, "calib/nonempty_final_conf_rate": 0.54296875, "calib/nonempty_reasoning_rate": 0.79296875, "calib/nonempty_step_conf_rate": 0.24609375, "calib/pce": 0.30107913669064745, "calib/std_conf": 0.12079597302052868, "calib/step_conf_rate": 0.24609375, "calib/step_q_w": 0.7687301587301588, "calib/step_q_w_n": 63.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 383.0859375, "completions/mean_terminated_length": 383.0859375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.20586666666666667, "grad_norm": 0.15478582680225372, "kl": 0.134765625, "learning_rate": 2.2222222222222224e-07, "loss": 0.0317, "mask/has_final_conf_rate": 0.54296875, "mask/share_final_conf": 0.02465350739657879, "mask/share_reasoning": 0.9587289690971375, "mask/share_step_conf": 0.016617517918348312, "num_tokens": 36764106.0, "reward": 0.029296875, "reward_std": 0.03921569138765335, "rewards/accuracy_reward_step": 0.29296875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 193 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6990294456481934, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8747507333755493, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.55078125, "calib/avg_num_step_conf": 0.28515625, "calib/ece": 0.23746376811594205, "calib/final_conf_rate": 0.5390625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.18115942028985507, "calib/gap": 0.012724867724867583, "calib/mean_conf": 0.8331159420289856, "calib/mu_c": 0.8380952380952381, "calib/mu_w": 0.8253703703703705, "calib/nonempty_final_conf_rate": 0.5390625, "calib/nonempty_reasoning_rate": 0.8359375, "calib/nonempty_step_conf_rate": 0.28515625, "calib/pce": 0.23094202898550725, "calib/std_conf": 0.11233877803412114, "calib/step_conf_rate": 0.28515625, "calib/step_q_w": 0.7839726027397261, "calib/step_q_w_n": 73.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 353.5234375, "completions/mean_terminated_length": 353.5234375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.20693333333333333, "grad_norm": 0.1833369880914688, "kl": 0.1504364013671875, "learning_rate": 1.9444444444444447e-07, "loss": 0.0135, "mask/has_final_conf_rate": 0.5390625, "mask/share_final_conf": 0.028572622686624527, "mask/share_reasoning": 0.9558173418045044, "mask/share_step_conf": 0.01561001781374216, "num_tokens": 36960552.0, "reward": 0.03359375149011612, "reward_std": 0.04003185033798218, "rewards/accuracy_reward_step": 0.3359375, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 194 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5826220512390137, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7928285002708435, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.48046875, "calib/avg_num_step_conf": 0.31640625, "calib/ece": 0.33798387096774196, "calib/final_conf_rate": 0.484375, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2661290322580645, "calib/gap": 0.0128839634941329, "calib/mean_conf": 0.8621774193548388, "calib/mu_c": 0.8683076923076922, "calib/mu_w": 0.8554237288135593, "calib/nonempty_final_conf_rate": 0.484375, "calib/nonempty_reasoning_rate": 0.796875, "calib/nonempty_step_conf_rate": 0.31640625, "calib/pce": 0.33798387096774196, "calib/std_conf": 0.08323655063181201, "calib/step_conf_rate": 0.31640625, "calib/step_q_w": 0.7564197530864198, "calib/step_q_w_n": 81.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 396.2890625, "completions/mean_terminated_length": 397.8431701660156, "completions/min_length": 0.0, "completions/min_terminated_length": 38.0, "epoch": 0.208, "grad_norm": 0.14063632488250732, "kl": 0.1334228515625, "learning_rate": 1.6666666666666668e-07, "loss": 0.003, "mask/has_final_conf_rate": 0.484375, "mask/share_final_conf": 0.02396385371685028, "mask/share_reasoning": 0.9530671834945679, "mask/share_step_conf": 0.019062696024775505, "num_tokens": 37167986.0, "reward": 0.025390625, "reward_std": 0.03336440399289131, "rewards/accuracy_reward_step": 0.25390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 195 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5906708240509033, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.7928717732429504, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.51171875, "calib/avg_num_step_conf": 0.3515625, "calib/ece": 0.3270229007633587, "calib/final_conf_rate": 0.51171875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.21374045801526717, "calib/gap": 0.022651727357609674, "calib/mean_conf": 0.8396946564885497, "calib/mu_c": 0.8505882352941176, "calib/mu_w": 0.827936507936508, "calib/nonempty_final_conf_rate": 0.51171875, "calib/nonempty_reasoning_rate": 0.86328125, "calib/nonempty_step_conf_rate": 0.3515625, "calib/pce": 0.32381679389312973, "calib/std_conf": 0.10209215214059696, "calib/step_conf_rate": 0.3515625, "calib/step_q_w": 0.7533333333333333, "calib/step_q_w_n": 90.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 319.04296875, "completions/mean_terminated_length": 319.04296875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.20906666666666668, "grad_norm": 0.15355850756168365, "kl": 0.1542510986328125, "learning_rate": 1.3888888888888888e-07, "loss": -0.0033, "mask/has_final_conf_rate": 0.51171875, "mask/share_final_conf": 0.026806414127349854, "mask/share_reasoning": 0.9513841271400452, "mask/share_step_conf": 0.021809469908475876, "num_tokens": 37352205.0, "reward": 0.02656250074505806, "reward_std": 0.03382434323430061, "rewards/accuracy_reward_step": 0.265625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 196 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6188219785690308, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8428393006324768, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.5703125, "calib/avg_num_step_conf": 0.2578125, "calib/ece": 0.3242758620689655, "calib/final_conf_rate": 0.56640625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.25517241379310346, "calib/gap": 0.03402555301296706, "calib/mean_conf": 0.8484137931034483, "calib/mu_c": 0.8646052631578947, "calib/mu_w": 0.8305797101449276, "calib/nonempty_final_conf_rate": 0.56640625, "calib/nonempty_reasoning_rate": 0.828125, "calib/nonempty_step_conf_rate": 0.2578125, "calib/pce": 0.3242758620689655, "calib/std_conf": 0.08987866044203009, "calib/step_conf_rate": 0.2578125, "calib/step_q_w": 0.7756060606060606, "calib/step_q_w_n": 66.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 393.34765625, "completions/mean_terminated_length": 393.34765625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.21013333333333334, "grad_norm": 0.15462549030780792, "kl": 0.1384735107421875, "learning_rate": 1.1111111111111112e-07, "loss": 0.0193, "mask/has_final_conf_rate": 0.56640625, "mask/share_final_conf": 0.026654046028852463, "mask/share_reasoning": 0.9595667719841003, "mask/share_step_conf": 0.01377915684133768, "num_tokens": 37557958.0, "reward": 0.02968750149011612, "reward_std": 0.03544231504201889, "rewards/accuracy_reward_step": 0.296875, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 197 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.5998426675796509, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8098545670509338, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.47265625, "calib/avg_num_step_conf": 0.3671875, "calib/ece": 0.3226446280991735, "calib/final_conf_rate": 0.47265625, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.24793388429752067, "calib/gap": 0.014246162280701746, "calib/mean_conf": 0.8515702479338844, "calib/mu_c": 0.85828125, "calib/mu_w": 0.8440350877192982, "calib/nonempty_final_conf_rate": 0.47265625, "calib/nonempty_reasoning_rate": 0.83984375, "calib/nonempty_step_conf_rate": 0.3671875, "calib/pce": 0.3226446280991735, "calib/std_conf": 0.08843925182618231, "calib/step_conf_rate": 0.3671875, "calib/step_q_w": 0.7740425531914893, "calib/step_q_w_n": 94.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2778.0, "completions/max_terminated_length": 2778.0, "completions/mean_length": 333.3046875, "completions/mean_terminated_length": 334.6117858886719, "completions/min_length": 0.0, "completions/min_terminated_length": 5.0, "epoch": 0.2112, "grad_norm": 0.19545160233974457, "kl": 0.2003021240234375, "learning_rate": 8.333333333333334e-08, "loss": 0.0158, "mask/has_final_conf_rate": 0.47265625, "mask/share_final_conf": 0.02741253189742565, "mask/share_reasoning": 0.9429019093513489, "mask/share_step_conf": 0.02577931620180607, "num_tokens": 37748668.0, "reward": 0.02499999850988388, "reward_std": 0.03435155376791954, "rewards/accuracy_reward_step": 0.25, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 198 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6346948146820068, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8428850173950195, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.4609375, "calib/avg_num_step_conf": 0.34765625, "calib/ece": 0.3185217391304347, "calib/final_conf_rate": 0.44921875, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.20869565217391303, "calib/gap": -0.03951899509803902, "calib/mean_conf": 0.8348695652173913, "calib/mu_c": 0.8173437499999999, "calib/mu_w": 0.856862745098039, "calib/nonempty_final_conf_rate": 0.44921875, "calib/nonempty_reasoning_rate": 0.80859375, "calib/nonempty_step_conf_rate": 0.34765625, "calib/pce": 0.2984347826086956, "calib/std_conf": 0.10425191244114926, "calib/step_conf_rate": 0.34765625, "calib/step_q_w": 0.7644943820224718, "calib/step_q_w_n": 89.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 410.59765625, "completions/mean_terminated_length": 412.2078552246094, "completions/min_length": 0.0, "completions/min_terminated_length": 30.0, "epoch": 0.21226666666666666, "grad_norm": 0.14168593287467957, "kl": 0.1448822021484375, "learning_rate": 5.555555555555556e-08, "loss": 0.0453, "mask/has_final_conf_rate": 0.44921875, "mask/share_final_conf": 0.023079898208379745, "mask/share_reasoning": 0.9513913989067078, "mask/share_step_conf": 0.021622436121106148, "num_tokens": 37957981.0, "reward": 0.025390625, "reward_std": 0.0363493412733078, "rewards/accuracy_reward_step": 0.25390625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 199 }, { "adv/mean_abs_final_conf": 0.0, "adv/mean_abs_reasoning": 0.6308720111846924, "adv/mean_abs_step_conf": 0.0, "adv/ratio_final_to_reasoning": 0.0, "adv/ratio_step_to_reasoning": 0.0, "adv/std_final_conf": 0.0, "adv/std_reasoning": 0.8099633455276489, "adv/std_step_conf": 0.0, "calib/answer_extract_rate": 0.5078125, "calib/avg_num_step_conf": 0.28125, "calib/ece": 0.23124999999999987, "calib/final_conf_rate": 0.5, "calib/format_rate": 0.0, "calib/frac_conf_gt_0.9": 0.2109375, "calib/gap": 0.02359080340997155, "calib/mean_conf": 0.8484375000000001, "calib/mu_c": 0.8574683544303797, "calib/mu_w": 0.8338775510204082, "calib/nonempty_final_conf_rate": 0.5, "calib/nonempty_reasoning_rate": 0.7890625, "calib/nonempty_step_conf_rate": 0.28125, "calib/pce": 0.23124999999999987, "calib/std_conf": 0.0858636045932734, "calib/step_conf_rate": 0.28125, "calib/step_q_w": 0.7734722222222222, "calib/step_q_w_n": 72.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2057.0, "completions/max_terminated_length": 2057.0, "completions/mean_length": 389.375, "completions/mean_terminated_length": 390.9019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 54.0, "epoch": 0.21333333333333335, "grad_norm": 0.15176671743392944, "kl": 0.129180908203125, "learning_rate": 2.777777777777778e-08, "loss": -0.0024, "mask/has_final_conf_rate": 0.5, "mask/share_final_conf": 0.02337362989783287, "mask/share_reasoning": 0.9549142122268677, "mask/share_step_conf": 0.017805900424718857, "num_tokens": 38165709.0, "reward": 0.03164062649011612, "reward_std": 0.03612466901540756, "rewards/accuracy_reward_step": 0.31640625, "rewards/final_brier_reward_step": 0.0, "rewards/format_reward_step": 0.0, "rewards/step_l2_reward": 0.0, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.0440832228428917, "train_runtime": 5329.4705, "train_samples_per_second": 9.607, "train_steps_per_second": 0.038 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 38165709, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }