{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.7654397487640381, "adv/mean_abs_reasoning": 0.424932599067688, "adv/mean_abs_step_conf": 0.7846847772598267, "adv/ratio_final_to_reasoning": 1.801320375145213, "adv/ratio_step_to_reasoning": 1.84660997763279, "adv/std_final_conf": 0.9287529587745667, "adv/std_reasoning": 0.7013161778450012, "adv/std_step_conf": 0.9368540644645691, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.33984375, "calib/ece": 0.21378906250000007, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.3046875, "calib/gap": 0.00016580667354659795, "calib/mean_conf": 0.8817578125000001, "calib/mu_c": 0.8818128654970762, "calib/mu_w": 0.8816470588235296, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21378906250000007, "calib/std_conf": 0.048946278921025696, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8082866741321388, "calib/step_q_c_n": 893.0, "calib/step_q_gap": 0.019531399870535426, "calib/step_q_w": 0.7887552742616034, "calib/step_q_w_n": 474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 451.0703125, "completions/mean_terminated_length": 452.8392333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.0010666666666666667, "grad_norm": 1.9545003175735474, "kl": 0.00033098459243774414, "learning_rate": 0.0, "loss": 0.0363, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03566828370094299, "mask/share_reasoning": 0.8323470950126648, "mask/share_step_conf": 0.1280784010887146, "num_tokens": 223058.0, "reward": 0.560789942741394, "reward_std": 0.3364860415458679, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7295855283737183, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.0591818243265152, "step": 1 }, { "adv/mean_abs_final_conf": 0.7925335168838501, "adv/mean_abs_reasoning": 0.4465929865837097, "adv/mean_abs_step_conf": 0.772151529788971, "adv/ratio_final_to_reasoning": 1.7746215025597967, "adv/ratio_step_to_reasoning": 1.728982659794274, "adv/std_final_conf": 0.9316501617431641, "adv/std_reasoning": 0.7013890147209167, "adv/std_step_conf": 0.9368599653244019, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 4.7890625, "calib/ece": 0.2887058823529413, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2549019607843137, "calib/gap": -0.004035076611371591, "calib/mean_conf": 0.8730196078431373, "calib/mu_c": 0.8713422818791945, "calib/mu_w": 0.8753773584905661, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2887058823529413, "calib/std_conf": 0.04379877793804094, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7949142857142858, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.006359152634437981, "calib/step_q_w": 0.7885551330798478, "calib/step_q_w_n": 526.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 490.6640625, "completions/mean_terminated_length": 492.5882568359375, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0021333333333333334, "grad_norm": 1.1789993047714233, "kl": 0.0003865659236907959, "learning_rate": 2.5000000000000004e-07, "loss": -0.0241, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033253151923418045, "mask/share_reasoning": 0.8548593521118164, "mask/share_step_conf": 0.10798123478889465, "num_tokens": 451956.0, "reward": 0.5552769899368286, "reward_std": 0.33204489946365356, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6672624945640564, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.12766647338867188, "step": 2 }, { "adv/mean_abs_final_conf": 0.7901794910430908, "adv/mean_abs_reasoning": 0.5001744031906128, "adv/mean_abs_step_conf": 0.7801445126533508, "adv/ratio_final_to_reasoning": 1.5798079349973437, "adv/ratio_step_to_reasoning": 1.5597449763058817, "adv/std_final_conf": 0.929809033870697, "adv/std_reasoning": 0.739359974861145, "adv/std_step_conf": 0.9367504715919495, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.375, "calib/ece": 0.21453124999999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.30859375, "calib/gap": -0.009246954595792056, "calib/mean_conf": 0.8864062500000001, "calib/mu_c": 0.8833720930232558, "calib/mu_w": 0.8926190476190479, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21453124999999995, "calib/std_conf": 0.04392348416209147, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7930548926014319, "calib/step_q_c_n": 838.0, "calib/step_q_gap": 0.03363110077987064, "calib/step_q_w": 0.7594237918215613, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1411.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 498.06640625, "completions/mean_terminated_length": 500.0196228027344, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.0032, "grad_norm": 1.7985597848892212, "kl": 0.0006094872951507568, "learning_rate": 5.000000000000001e-07, "loss": 0.0081, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0330534353852272, "mask/share_reasoning": 0.8490853309631348, "mask/share_step_conf": 0.11395496875047684, "num_tokens": 684717.0, "reward": 0.570434033870697, "reward_std": 0.34667086601257324, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7267687916755676, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.08050548285245895, "step": 3 }, { "adv/mean_abs_final_conf": 0.7865801453590393, "adv/mean_abs_reasoning": 0.38204020261764526, "adv/mean_abs_step_conf": 0.7897936105728149, "adv/ratio_final_to_reasoning": 2.0588936451441135, "adv/ratio_step_to_reasoning": 2.0673049725168813, "adv/std_final_conf": 0.9282608032226562, "adv/std_reasoning": 0.6403441429138184, "adv/std_step_conf": 0.9368535280227661, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.21484375, "calib/ece": 0.2534387351778655, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2766798418972332, "calib/gap": -0.007984076006958163, "calib/mean_conf": 0.8818972332015811, "calib/mu_c": 0.8789308176100629, "calib/mu_w": 0.886914893617021, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2534387351778655, "calib/std_conf": 0.039726800626239726, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.805129151291513, "calib/step_q_c_n": 813.0, "calib/step_q_gap": 0.008845626387298466, "calib/step_q_w": 0.7962835249042145, "calib/step_q_w_n": 522.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2430.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 512.40234375, "completions/mean_terminated_length": 514.4118041992188, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.004266666666666667, "grad_norm": 1.8774765729904175, "kl": 0.003939583897590637, "learning_rate": 7.5e-07, "loss": 0.0083, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03307841718196869, "mask/share_reasoning": 0.8460214138031006, "mask/share_step_conf": 0.11699393391609192, "num_tokens": 922060.0, "reward": 0.5607033371925354, "reward_std": 0.32289978861808777, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6887961030006409, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.11073563247919083, "step": 4 }, { "adv/mean_abs_final_conf": 0.7866071462631226, "adv/mean_abs_reasoning": 0.391832560300827, "adv/mean_abs_step_conf": 0.8027805089950562, "adv/ratio_final_to_reasoning": 2.007508374646583, "adv/ratio_step_to_reasoning": 2.0487845838506282, "adv/std_final_conf": 0.9314478039741516, "adv/std_reasoning": 0.6612939834594727, "adv/std_step_conf": 0.9368767142295837, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 4.54296875, "calib/ece": 0.34543650793650793, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.27380952380952384, "calib/gap": -0.010557804199342269, "calib/mean_conf": 0.8771825396825397, "calib/mu_c": 0.8722388059701492, "calib/mu_w": 0.8827966101694915, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.34543650793650793, "calib/std_conf": 0.04505286150798803, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.8036704730831974, "calib/step_q_c_n": 613.0, "calib/step_q_gap": 0.011561382174106516, "calib/step_q_w": 0.7921090909090909, "calib/step_q_w_n": 550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 493.96875, "completions/mean_terminated_length": 495.9059143066406, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.005333333333333333, "grad_norm": 1.215853214263916, "kl": 0.0003140568733215332, "learning_rate": 1.0000000000000002e-06, "loss": -0.0032, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034689657390117645, "mask/share_reasoning": 0.854096531867981, "mask/share_step_conf": 0.1073075607419014, "num_tokens": 1155204.0, "reward": 0.4441468417644501, "reward_std": 0.33386993408203125, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6089304685592651, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": -0.01985558122396469, "step": 5 }, { "adv/mean_abs_final_conf": 0.7760541439056396, "adv/mean_abs_reasoning": 0.4202449917793274, "adv/mean_abs_step_conf": 0.7856012582778931, "adv/ratio_final_to_reasoning": 1.8466707732073326, "adv/ratio_step_to_reasoning": 1.869388746196923, "adv/std_final_conf": 0.9323500394821167, "adv/std_reasoning": 0.6815860867500305, "adv/std_step_conf": 0.9368250370025635, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 4.984375, "calib/ece": 0.33696850393700795, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.27165354330708663, "calib/gap": 0.009691808596917872, "calib/mean_conf": 0.8763385826771654, "calib/mu_c": 0.8808029197080292, "calib/mu_w": 0.8711111111111113, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.33696850393700795, "calib/std_conf": 0.07018240806028818, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8048847926267282, "calib/step_q_c_n": 651.0, "calib/step_q_gap": -0.00010720737327196694, "calib/step_q_w": 0.8049920000000002, "calib/step_q_w_n": 625.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1165.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 433.4140625, "completions/mean_terminated_length": 435.1137390136719, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.0064, "grad_norm": 1.1629302501678467, "kl": 0.0004588663578033447, "learning_rate": 1.25e-06, "loss": 0.005, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03715100884437561, "mask/share_reasoning": 0.8348917961120605, "mask/share_step_conf": 0.12405093014240265, "num_tokens": 1372110.0, "reward": 0.504515528678894, "reward_std": 0.3196646273136139, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6290793418884277, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.07526424527168274, "step": 6 }, { "adv/mean_abs_final_conf": 0.776243269443512, "adv/mean_abs_reasoning": 0.3850036561489105, "adv/mean_abs_step_conf": 0.7583280801773071, "adv/ratio_final_to_reasoning": 2.016197137471544, "adv/ratio_step_to_reasoning": 1.9696646202341603, "adv/std_final_conf": 0.9296020269393921, "adv/std_reasoning": 0.6613085865974426, "adv/std_step_conf": 0.9368376731872559, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.27442687747035577, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3359683794466403, "calib/gap": -0.0037748518762346084, "calib/mean_conf": 0.8870750988142293, "calib/mu_c": 0.8856129032258063, "calib/mu_w": 0.8893877551020409, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.27442687747035577, "calib/std_conf": 0.044722931536599765, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8003634232121923, "calib/step_q_c_n": 853.0, "calib/step_q_gap": 0.019843942692711836, "calib/step_q_w": 0.7805194805194805, "calib/step_q_w_n": 539.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2237.0, "completions/max_terminated_length": 2237.0, "completions/mean_length": 552.13671875, "completions/mean_terminated_length": 552.13671875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.007466666666666667, "grad_norm": 1.8058161735534668, "kl": 0.00024586915969848633, "learning_rate": 1.5e-06, "loss": 0.0096, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030009038746356964, "mask/share_reasoning": 0.8615716695785522, "mask/share_step_conf": 0.10841932892799377, "num_tokens": 1620881.0, "reward": 0.5174537301063538, "reward_std": 0.3296523988246918, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6755769848823547, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.04058045893907547, "step": 7 }, { "adv/mean_abs_final_conf": 0.7708383798599243, "adv/mean_abs_reasoning": 0.4731570780277252, "adv/mean_abs_step_conf": 0.780807614326477, "adv/ratio_final_to_reasoning": 1.6291384313070683, "adv/ratio_step_to_reasoning": 1.6502080399624175, "adv/std_final_conf": 0.9300093650817871, "adv/std_reasoning": 0.7392821311950684, "adv/std_step_conf": 0.9368556141853333, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 4.8984375, "calib/ece": 0.32984126984126977, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2896825396825397, "calib/gap": -0.006260302856047617, "calib/mean_conf": 0.8832539682539682, "calib/mu_c": 0.8804964539007093, "calib/mu_w": 0.8867567567567569, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.32678571428571423, "calib/std_conf": 0.04346736351105467, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7954205607476635, "calib/step_q_c_n": 642.0, "calib/step_q_gap": 0.01764278296988575, "calib/step_q_w": 0.7777777777777778, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 537.5234375, "completions/mean_terminated_length": 539.6314086914062, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.008533333333333334, "grad_norm": 1.3534213304519653, "kl": 0.00039950013160705566, "learning_rate": 1.75e-06, "loss": 0.076, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032188210636377335, "mask/share_reasoning": 0.8591476678848267, "mask/share_step_conf": 0.1047578901052475, "num_tokens": 1864999.0, "reward": 0.49813294410705566, "reward_std": 0.3371485471725464, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6337077617645264, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.055526793003082275, "step": 8 }, { "adv/mean_abs_final_conf": 0.7789124846458435, "adv/mean_abs_reasoning": 0.4699265956878662, "adv/mean_abs_step_conf": 0.7847044467926025, "adv/ratio_final_to_reasoning": 1.6575194760060172, "adv/ratio_step_to_reasoning": 1.6698447246723136, "adv/std_final_conf": 0.9298046231269836, "adv/std_reasoning": 0.7393259406089783, "adv/std_step_conf": 0.9367402791976929, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 4.859375, "calib/ece": 0.2735059760956176, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3107569721115538, "calib/gap": 0.005534880618914162, "calib/mean_conf": 0.8830677290836654, "calib/mu_c": 0.8852287581699346, "calib/mu_w": 0.8796938775510205, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2735059760956176, "calib/std_conf": 0.06964159380765228, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7853571428571429, "calib/step_q_c_n": 756.0, "calib/step_q_gap": 0.017713700234192054, "calib/step_q_w": 0.7676434426229508, "calib/step_q_w_n": 488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2733.0, "completions/max_terminated_length": 2733.0, "completions/mean_length": 508.93359375, "completions/mean_terminated_length": 510.929443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.0096, "grad_norm": 1.1945762634277344, "kl": 0.0003765225410461426, "learning_rate": 2.0000000000000003e-06, "loss": -0.0441, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03323180973529816, "mask/share_reasoning": 0.8569766283035278, "mask/share_step_conf": 0.1058853268623352, "num_tokens": 2102822.0, "reward": 0.5631452798843384, "reward_std": 0.3465169668197632, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6716042757034302, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.13906130194664001, "step": 9 }, { "adv/mean_abs_final_conf": 0.7726255059242249, "adv/mean_abs_reasoning": 0.4702494144439697, "adv/mean_abs_step_conf": 0.7942739725112915, "adv/ratio_final_to_reasoning": 1.6430121594894262, "adv/ratio_step_to_reasoning": 1.689048296743662, "adv/std_final_conf": 0.9302437901496887, "adv/std_reasoning": 0.7391924262046814, "adv/std_step_conf": 0.9368513822555542, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.16796875, "calib/ece": 0.32607843137254894, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.4392156862745098, "calib/gap": -0.002115987460815094, "calib/mean_conf": 0.8947058823529412, "calib/mu_c": 0.8937931034482758, "calib/mu_w": 0.8959090909090909, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32607843137254894, "calib/std_conf": 0.043438166157463584, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8040287769784172, "calib/step_q_c_n": 695.0, "calib/step_q_gap": 0.019188012647207042, "calib/step_q_w": 0.7848407643312102, "calib/step_q_w_n": 628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1912.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 518.27734375, "completions/mean_terminated_length": 520.309814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.010666666666666666, "grad_norm": 1.2181464433670044, "kl": 0.0018355250358581543, "learning_rate": 2.25e-06, "loss": 0.0271, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03234288841485977, "mask/share_reasoning": 0.8521287441253662, "mask/share_step_conf": 0.11162212491035461, "num_tokens": 2342301.0, "reward": 0.5642529129981995, "reward_std": 0.3629717230796814, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6429363489151001, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.17306941747665405, "step": 10 }, { "adv/mean_abs_final_conf": 0.760771632194519, "adv/mean_abs_reasoning": 0.40336328744888306, "adv/mean_abs_step_conf": 0.7808301448822021, "adv/ratio_final_to_reasoning": 1.886070586656772, "adv/ratio_step_to_reasoning": 1.9357987431643844, "adv/std_final_conf": 0.9269143342971802, "adv/std_reasoning": 0.6816110014915466, "adv/std_step_conf": 0.9368925094604492, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.5390625, "calib/ece": 0.3357539682539683, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4880952380952381, "calib/gap": -0.022485138278625083, "calib/mean_conf": 0.8982936507936509, "calib/mu_c": 0.8888356164383562, "calib/mu_w": 0.9113207547169813, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3273412698412699, "calib/std_conf": 0.0826583681952617, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8008383961117861, "calib/step_q_c_n": 823.0, "calib/step_q_gap": 0.010233354094979386, "calib/step_q_w": 0.7906050420168067, "calib/step_q_w_n": 595.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 526.703125, "completions/mean_terminated_length": 528.7686767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.011733333333333333, "grad_norm": 1.2501106262207031, "kl": 0.0008699893951416016, "learning_rate": 2.5e-06, "loss": -0.0689, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03238905593752861, "mask/share_reasoning": 0.8449083566665649, "mask/share_step_conf": 0.11879631131887436, "num_tokens": 2581617.0, "reward": 0.503549337387085, "reward_std": 0.36613786220550537, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6268417835235596, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.06931937485933304, "step": 11 }, { "adv/mean_abs_final_conf": 0.7328664660453796, "adv/mean_abs_reasoning": 0.4146099090576172, "adv/mean_abs_step_conf": 0.7816406488418579, "adv/ratio_final_to_reasoning": 1.7676048016101207, "adv/ratio_step_to_reasoning": 1.8852435307648074, "adv/std_final_conf": 0.9236744046211243, "adv/std_reasoning": 0.7013520002365112, "adv/std_step_conf": 0.9367771148681641, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.2206324110671937, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.541501976284585, "calib/gap": 0.0014240597186335746, "calib/mean_conf": 0.9004743083003952, "calib/mu_c": 0.9009302325581398, "calib/mu_w": 0.8995061728395062, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2206324110671937, "calib/std_conf": 0.0464444537550029, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.805070575461455, "calib/step_q_c_n": 921.0, "calib/step_q_gap": 0.01968009927097869, "calib/step_q_w": 0.7853904761904763, "calib/step_q_w_n": 525.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2247.0, "completions/max_terminated_length": 2247.0, "completions/mean_length": 483.02734375, "completions/mean_terminated_length": 484.9216003417969, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.0128, "grad_norm": 1.860509991645813, "kl": 0.001871347427368164, "learning_rate": 2.7500000000000004e-06, "loss": -0.0402, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0358530730009079, "mask/share_reasoning": 0.8329986333847046, "mask/share_step_conf": 0.1272420436143875, "num_tokens": 2809448.0, "reward": 0.6194204092025757, "reward_std": 0.30002284049987793, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7226663827896118, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.1849244236946106, "step": 12 }, { "adv/mean_abs_final_conf": 0.7214281558990479, "adv/mean_abs_reasoning": 0.5423422455787659, "adv/mean_abs_step_conf": 0.7932885885238647, "adv/ratio_final_to_reasoning": 1.330208298874392, "adv/ratio_step_to_reasoning": 1.4627084557598846, "adv/std_final_conf": 0.9307587742805481, "adv/std_reasoning": 0.82643061876297, "adv/std_step_conf": 0.9368108510971069, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 4.98828125, "calib/ece": 0.33345238095238094, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5873015873015873, "calib/gap": -0.001161798914448231, "calib/mean_conf": 0.9128174603174604, "calib/mu_c": 0.9123287671232876, "calib/mu_w": 0.9134905660377358, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.33345238095238094, "calib/std_conf": 0.038775892689498104, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8002991452991454, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.007533927907840909, "calib/step_q_w": 0.7927652173913045, "calib/step_q_w_n": 575.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2383.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 499.08984375, "completions/mean_terminated_length": 501.0470886230469, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.013866666666666666, "grad_norm": 1.0362032651901245, "kl": 0.0023255348205566406, "learning_rate": 3e-06, "loss": 0.0097, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03379807621240616, "mask/share_reasoning": 0.8479717969894409, "mask/share_step_conf": 0.11432385444641113, "num_tokens": 3041807.0, "reward": 0.5706254839897156, "reward_std": 0.35822921991348267, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6329132914543152, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.1981814205646515, "step": 13 }, { "adv/mean_abs_final_conf": 0.748199462890625, "adv/mean_abs_reasoning": 0.5265183448791504, "adv/mean_abs_step_conf": 0.7572071552276611, "adv/ratio_final_to_reasoning": 1.421032087803809, "adv/ratio_step_to_reasoning": 1.4381401191281564, "adv/std_final_conf": 0.9257729053497314, "adv/std_reasoning": 0.7753845453262329, "adv/std_step_conf": 0.9367712140083313, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.37859999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.836, "calib/gap": -0.0031185300207038358, "calib/mean_conf": 0.9305999999999999, "calib/mu_c": 0.9292028985507246, "calib/mu_w": 0.9323214285714284, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.37859999999999994, "calib/std_conf": 0.02790053762922856, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.808342175066313, "calib/step_q_c_n": 754.0, "calib/step_q_gap": 0.009035816684810238, "calib/step_q_w": 0.7993063583815028, "calib/step_q_w_n": 692.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 547.8125, "completions/mean_terminated_length": 552.1259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.014933333333333333, "grad_norm": 1.0979946851730347, "kl": 0.0044422149658203125, "learning_rate": 3.2500000000000002e-06, "loss": 0.011, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03174138069152832, "mask/share_reasoning": 0.8420265913009644, "mask/share_step_conf": 0.11841952055692673, "num_tokens": 3287447.0, "reward": 0.4753739833831787, "reward_std": 0.3641613721847534, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5850558876991272, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.06412962079048157, "step": 14 }, { "adv/mean_abs_final_conf": 0.7671672105789185, "adv/mean_abs_reasoning": 0.4791414737701416, "adv/mean_abs_step_conf": 0.7795957922935486, "adv/ratio_final_to_reasoning": 1.6011287950976905, "adv/ratio_step_to_reasoning": 1.6270680685586902, "adv/std_final_conf": 0.919727623462677, "adv/std_reasoning": 0.7393106818199158, "adv/std_step_conf": 0.936776876449585, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 4.94140625, "calib/ece": 0.343503937007874, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9173228346456693, "calib/gap": -0.0007082043343652478, "calib/mean_conf": 0.9419291338582677, "calib/mu_c": 0.9416447368421053, "calib/mu_w": 0.9423529411764705, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.343503937007874, "calib/std_conf": 0.027533424526907862, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8025877763328999, "calib/step_q_c_n": 769.0, "calib/step_q_gap": -0.01410577205419683, "calib/step_q_w": 0.8166935483870967, "calib/step_q_w_n": 496.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 474.87109375, "completions/mean_terminated_length": 478.6102294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.016, "grad_norm": 1.1696498394012451, "kl": 0.0074939727783203125, "learning_rate": 3.5e-06, "loss": -0.0059, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034345079213380814, "mask/share_reasoning": 0.8435898423194885, "mask/share_step_conf": 0.11425255984067917, "num_tokens": 3516894.0, "reward": 0.5808800458908081, "reward_std": 0.3675926923751831, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6316972970962524, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.21365654468536377, "step": 15 }, { "adv/mean_abs_final_conf": 0.7474151849746704, "adv/mean_abs_reasoning": 0.4137473702430725, "adv/mean_abs_step_conf": 0.7812510132789612, "adv/ratio_final_to_reasoning": 1.806453016331128, "adv/ratio_step_to_reasoning": 1.8882319730998747, "adv/std_final_conf": 0.9071982502937317, "adv/std_reasoning": 0.6817223429679871, "adv/std_step_conf": 0.936766505241394, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 6.64453125, "calib/ece": 0.3914859437751004, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9718875502008032, "calib/gap": 0.0011249182472206076, "calib/mean_conf": 0.9497188755020078, "calib/mu_c": 0.9502158273381296, "calib/mu_w": 0.949090909090909, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3914859437751004, "calib/std_conf": 0.021721153726665426, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7934504792332269, "calib/step_q_c_n": 939.0, "calib/step_q_gap": 0.019631581595431435, "calib/step_q_w": 0.7738188976377954, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2761.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 635.703125, "completions/mean_terminated_length": 645.793701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.017066666666666667, "grad_norm": 0.8430995345115662, "kl": 0.008104801177978516, "learning_rate": 3.7500000000000005e-06, "loss": -0.0103, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.025683488696813583, "mask/share_reasoning": 0.8469931483268738, "mask/share_step_conf": 0.11169835925102234, "num_tokens": 3788482.0, "reward": 0.5129640102386475, "reward_std": 0.3399650454521179, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5838007926940918, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.13900226354599, "step": 16 }, { "adv/mean_abs_final_conf": 0.734099805355072, "adv/mean_abs_reasoning": 0.4848531484603882, "adv/mean_abs_step_conf": 0.7250394821166992, "adv/ratio_final_to_reasoning": 1.514066285196139, "adv/ratio_step_to_reasoning": 1.4953795482591032, "adv/std_final_conf": 0.899758517742157, "adv/std_reasoning": 0.757487416267395, "adv/std_step_conf": 0.9219540357589722, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.84375, "calib/ece": 0.1947637795275589, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9606299212598425, "calib/gap": -0.00428013250658299, "calib/mean_conf": 0.9503543307086614, "calib/mu_c": 0.9493264248704661, "calib/mu_w": 0.9536065573770491, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1926377952755904, "calib/std_conf": 0.029057918692247678, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7891545687446627, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": -0.011122354332260276, "calib/step_q_w": 0.800276923076923, "calib/step_q_w_n": 325.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 520.890625, "completions/mean_terminated_length": 522.933349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.018133333333333335, "grad_norm": 0.9362510442733765, "kl": 0.012551307678222656, "learning_rate": 4.000000000000001e-06, "loss": 0.0486, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03210851177573204, "mask/share_reasoning": 0.8395532369613647, "mask/share_step_conf": 0.12443193793296814, "num_tokens": 4025358.0, "reward": 0.7252713441848755, "reward_std": 0.34299027919769287, "rewards/accuracy_reward_step": 0.75390625, "rewards/final_brier_reward_step": 0.7639504075050354, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.34049856662750244, "step": 17 }, { "adv/mean_abs_final_conf": 0.7473310828208923, "adv/mean_abs_reasoning": 0.31928735971450806, "adv/mean_abs_step_conf": 0.7825541496276855, "adv/ratio_final_to_reasoning": 2.3406222015463474, "adv/ratio_step_to_reasoning": 2.4509399630709123, "adv/std_final_conf": 0.9004414081573486, "adv/std_reasoning": 0.5961259603500366, "adv/std_step_conf": 0.9365625977516174, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.5703125, "calib/ece": 0.38932806324110675, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9683794466403162, "calib/gap": 0.0016083916083913241, "calib/mean_conf": 0.9545454545454546, "calib/mu_c": 0.955244755244755, "calib/mu_w": 0.9536363636363637, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.38932806324110675, "calib/std_conf": 0.024061246560770914, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7776543209876544, "calib/step_q_c_n": 729.0, "calib/step_q_gap": 0.032991480241599924, "calib/step_q_w": 0.7446628407460545, "calib/step_q_w_n": 697.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 521.7265625, "completions/mean_terminated_length": 523.7725830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.0192, "grad_norm": 1.0062179565429688, "kl": 0.01352691650390625, "learning_rate": 4.25e-06, "loss": 0.0347, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.031320102512836456, "mask/share_reasoning": 0.8556277751922607, "mask/share_step_conf": 0.109145887196064, "num_tokens": 4269640.0, "reward": 0.5129168033599854, "reward_std": 0.286937952041626, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5880390405654907, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.12998196482658386, "step": 18 }, { "adv/mean_abs_final_conf": 0.7074457406997681, "adv/mean_abs_reasoning": 0.3469691276550293, "adv/mean_abs_step_conf": 0.7895352840423584, "adv/ratio_final_to_reasoning": 2.0389299344324927, "adv/ratio_step_to_reasoning": 2.2755202728795694, "adv/std_final_conf": 0.887826144695282, "adv/std_reasoning": 0.6401987075805664, "adv/std_step_conf": 0.9365120530128479, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 4.95703125, "calib/ece": 0.35666666666666663, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9882352941176471, "calib/gap": -0.001958064516128899, "calib/mean_conf": 0.9645098039215687, "calib/mu_c": 0.963741935483871, "calib/mu_w": 0.9656999999999999, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35666666666666663, "calib/std_conf": 0.018269628229600838, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8057424441524311, "calib/step_q_c_n": 761.0, "calib/step_q_gap": 0.023222759113061042, "calib/step_q_w": 0.7825196850393701, "calib/step_q_w_n": 508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2046.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 496.859375, "completions/mean_terminated_length": 496.859375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.020266666666666665, "grad_norm": 0.7913922667503357, "kl": 0.01778411865234375, "learning_rate": 4.5e-06, "loss": 0.0212, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03168884664773941, "mask/share_reasoning": 0.8563750386238098, "mask/share_step_conf": 0.11193612962961197, "num_tokens": 4501596.0, "reward": 0.5909330248832703, "reward_std": 0.302628755569458, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6258324384689331, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.2372836172580719, "step": 19 }, { "adv/mean_abs_final_conf": 0.6862360239028931, "adv/mean_abs_reasoning": 0.3572538495063782, "adv/mean_abs_step_conf": 0.7743955254554749, "adv/ratio_final_to_reasoning": 1.9208639034990762, "adv/ratio_step_to_reasoning": 2.1676338170336478, "adv/std_final_conf": 0.8702552914619446, "adv/std_reasoning": 0.6610784530639648, "adv/std_step_conf": 0.93644118309021, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.8125, "calib/ece": 0.3850988142292489, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9881422924901185, "calib/gap": -0.0016332948273650727, "calib/mean_conf": 0.9661264822134388, "calib/mu_c": 0.9654421768707482, "calib/mu_w": 0.9670754716981133, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3850988142292489, "calib/std_conf": 0.01746865658040114, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8005250305250305, "calib/step_q_c_n": 819.0, "calib/step_q_gap": 0.001705897490650976, "calib/step_q_w": 0.7988191330343796, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2435.0, "completions/max_terminated_length": 2435.0, "completions/mean_length": 499.59375, "completions/mean_terminated_length": 499.59375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.021333333333333333, "grad_norm": 0.941595733165741, "kl": 0.02475738525390625, "learning_rate": 4.75e-06, "loss": 0.0041, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03427626192569733, "mask/share_reasoning": 0.8326125741004944, "mask/share_step_conf": 0.1331111490726471, "num_tokens": 4734364.0, "reward": 0.5311501622200012, "reward_std": 0.2769666314125061, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5998179316520691, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.14998236298561096, "step": 20 }, { "adv/mean_abs_final_conf": 0.7747447490692139, "adv/mean_abs_reasoning": 0.5820626616477966, "adv/mean_abs_step_conf": 0.803031325340271, "adv/ratio_final_to_reasoning": 1.331033237686028, "adv/ratio_step_to_reasoning": 1.3796303701510775, "adv/std_final_conf": 0.8923851847648621, "adv/std_reasoning": 0.792894721031189, "adv/std_step_conf": 0.9368118047714233, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.93359375, "calib/ece": 0.38933070866141734, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.0010267658465255014, "calib/mean_conf": 0.9680708661417322, "calib/mu_c": 0.9685034013605442, "calib/mu_w": 0.9674766355140187, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38933070866141734, "calib/std_conf": 0.014520515544936533, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8080324074074076, "calib/step_q_c_n": 864.0, "calib/step_q_gap": -0.008288203279615214, "calib/step_q_w": 0.8163206106870228, "calib/step_q_w_n": 655.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 509.171875, "completions/mean_terminated_length": 511.1686706542969, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.0224, "grad_norm": 0.6484833359718323, "kl": 0.029376983642578125, "learning_rate": 5e-06, "loss": -0.0111, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.032089974731206894, "mask/share_reasoning": 0.8372774124145508, "mask/share_step_conf": 0.1267264038324356, "num_tokens": 4967672.0, "reward": 0.5235748291015625, "reward_std": 0.4534868896007538, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6001855134963989, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.1336827576160431, "step": 21 }, { "adv/mean_abs_final_conf": 0.712620735168457, "adv/mean_abs_reasoning": 0.4498119056224823, "adv/mean_abs_step_conf": 0.7620086669921875, "adv/ratio_final_to_reasoning": 1.5842638361967785, "adv/ratio_step_to_reasoning": 1.6940606895179102, "adv/std_final_conf": 0.8673737645149231, "adv/std_reasoning": 0.7205768823623657, "adv/std_step_conf": 0.9362566471099854, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.83984375, "calib/ece": 0.31248031496063, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0005230010952901321, "calib/mean_conf": 0.9660236220472441, "calib/mu_c": 0.9662048192771083, "calib/mu_w": 0.9656818181818182, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31248031496063, "calib/std_conf": 0.009408460396805323, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8323794549266248, "calib/step_q_c_n": 954.0, "calib/step_q_gap": 0.01707446416876901, "calib/step_q_w": 0.8153049907578558, "calib/step_q_w_n": 541.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2001.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 496.84375, "completions/mean_terminated_length": 498.79217529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.023466666666666667, "grad_norm": 0.772916853427887, "kl": 0.03337860107421875, "learning_rate": 4.9722222222222224e-06, "loss": 0.0299, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03176339715719223, "mask/share_reasoning": 0.8361862897872925, "mask/share_step_conf": 0.1281440258026123, "num_tokens": 5196680.0, "reward": 0.6530857682228088, "reward_std": 0.3449816107749939, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6707980632781982, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.30724847316741943, "step": 22 }, { "adv/mean_abs_final_conf": 0.751395583152771, "adv/mean_abs_reasoning": 0.5195844769477844, "adv/mean_abs_step_conf": 0.789644718170166, "adv/ratio_final_to_reasoning": 1.4461470973242383, "adv/ratio_step_to_reasoning": 1.5197619505663587, "adv/std_final_conf": 0.8837504982948303, "adv/std_reasoning": 0.7575984001159668, "adv/std_step_conf": 0.936715841293335, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.515625, "calib/ece": 0.41062745098039216, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": -0.008761863136863335, "calib/mean_conf": 0.9635686274509804, "calib/mu_c": 0.9597202797202796, "calib/mu_w": 0.968482142857143, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4067058823529412, "calib/std_conf": 0.06117966306401101, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8286507936507937, "calib/step_q_c_n": 756.0, "calib/step_q_gap": 0.009565427797135317, "calib/step_q_w": 0.8190853658536584, "calib/step_q_w_n": 656.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1755.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 527.41796875, "completions/mean_terminated_length": 529.486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.024533333333333334, "grad_norm": 0.6506996154785156, "kl": 0.033489227294921875, "learning_rate": 4.944444444444445e-06, "loss": 0.0358, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033001385629177094, "mask/share_reasoning": 0.8390417098999023, "mask/share_step_conf": 0.12405063211917877, "num_tokens": 5435635.0, "reward": 0.5348714590072632, "reward_std": 0.40363800525665283, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5811214447021484, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.17768384516239166, "step": 23 }, { "adv/mean_abs_final_conf": 0.7067166566848755, "adv/mean_abs_reasoning": 0.5085533857345581, "adv/mean_abs_step_conf": 0.7722582817077637, "adv/ratio_final_to_reasoning": 1.3896607052651688, "adv/ratio_step_to_reasoning": 1.5185392593391318, "adv/std_final_conf": 0.8816006779670715, "adv/std_reasoning": 0.7753627300262451, "adv/std_step_conf": 0.9366243481636047, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 6.7265625, "calib/ece": 0.46663999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.9666399999999999, "calib/mu_c": 0.96664, "calib/mu_w": 0.9666399999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.46663999999999994, "calib/std_conf": 0.008620348020816801, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8478888888888888, "calib/step_q_c_n": 810.0, "calib/step_q_gap": 0.005180555555555411, "calib/step_q_w": 0.8427083333333334, "calib/step_q_w_n": 912.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2373.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 578.53515625, "completions/mean_terminated_length": 580.803955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.0256, "grad_norm": 0.8711221814155579, "kl": 0.03333091735839844, "learning_rate": 4.9166666666666665e-06, "loss": 0.0222, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.029958393424749374, "mask/share_reasoning": 0.837173581123352, "mask/share_step_conf": 0.12896181643009186, "num_tokens": 5688252.0, "reward": 0.5235338807106018, "reward_std": 0.37239861488342285, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5196999907493591, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.23439905047416687, "step": 24 }, { "adv/mean_abs_final_conf": 0.7122939229011536, "adv/mean_abs_reasoning": 0.40949639678001404, "adv/mean_abs_step_conf": 0.7618334293365479, "adv/ratio_final_to_reasoning": 1.7394388045954057, "adv/ratio_step_to_reasoning": 1.860415464768578, "adv/std_final_conf": 0.8639068007469177, "adv/std_reasoning": 0.6816980838775635, "adv/std_step_conf": 0.9365185499191284, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 5.90234375, "calib/ece": 0.34545816733067736, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.9811066126937966e-05, "calib/mean_conf": 0.9669721115537849, "calib/mu_c": 0.9669871794871793, "calib/mu_w": 0.9669473684210523, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34545816733067736, "calib/std_conf": 0.009551455278216978, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8326247288503255, "calib/step_q_c_n": 922.0, "calib/step_q_gap": -0.0033345241208121434, "calib/step_q_w": 0.8359592529711376, "calib/step_q_w_n": 589.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2409.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 515.70703125, "completions/mean_terminated_length": 517.7294311523438, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.02666666666666667, "grad_norm": 0.8421515226364136, "kl": 0.035717010498046875, "learning_rate": 4.888888888888889e-06, "loss": 0.046, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030966393649578094, "mask/share_reasoning": 0.8380874395370483, "mask/share_step_conf": 0.12703987956047058, "num_tokens": 5923497.0, "reward": 0.5961206555366516, "reward_std": 0.32889270782470703, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6327472925186157, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2415253072977066, "step": 25 }, { "adv/mean_abs_final_conf": 0.7019491195678711, "adv/mean_abs_reasoning": 0.4142940640449524, "adv/mean_abs_step_conf": 0.7539768815040588, "adv/ratio_final_to_reasoning": 1.6943257953406425, "adv/ratio_step_to_reasoning": 1.819907517241786, "adv/std_final_conf": 0.8584496974945068, "adv/std_reasoning": 0.7012566328048706, "adv/std_step_conf": 0.9366661310195923, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.75, "calib/ece": 0.3612204724409448, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00019740259740264054, "calib/mean_conf": 0.96751968503937, "calib/mu_c": 0.9675974025974026, "calib/mu_w": 0.9673999999999999, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3612204724409448, "calib/std_conf": 0.00777068019975997, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8425837320574162, "calib/step_q_c_n": 836.0, "calib/step_q_gap": -0.008265324546357311, "calib/step_q_w": 0.8508490566037735, "calib/step_q_w_n": 636.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 525.74609375, "completions/mean_terminated_length": 525.74609375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.027733333333333332, "grad_norm": 0.6899396181106567, "kl": 0.0352935791015625, "learning_rate": 4.861111111111111e-06, "loss": 0.0589, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.029591944068670273, "mask/share_reasoning": 0.8523272275924683, "mask/share_step_conf": 0.11808083951473236, "num_tokens": 6163328.0, "reward": 0.625487208366394, "reward_std": 0.35189133882522583, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6259245872497559, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.30629974603652954, "step": 26 }, { "adv/mean_abs_final_conf": 0.7187884449958801, "adv/mean_abs_reasoning": 0.5025259256362915, "adv/mean_abs_step_conf": 0.7646505832672119, "adv/ratio_final_to_reasoning": 1.4303509696256167, "adv/ratio_step_to_reasoning": 1.52161419791232, "adv/std_final_conf": 0.8720357418060303, "adv/std_reasoning": 0.7393778562545776, "adv/std_step_conf": 0.9367018938064575, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.62890625, "calib/ece": 0.4615810276679842, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": 0.0050400000000001555, "calib/mean_conf": 0.9675098814229248, "calib/mu_c": 0.9700000000000002, "calib/mu_w": 0.96496, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4615810276679842, "calib/std_conf": 0.03497775833650305, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8392947103274561, "calib/step_q_c_n": 794.0, "calib/step_q_gap": 0.001382196484709719, "calib/step_q_w": 0.8379125138427463, "calib/step_q_w_n": 903.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 533.6875, "completions/mean_terminated_length": 535.7803955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.0288, "grad_norm": 0.6312984228134155, "kl": 0.03857421875, "learning_rate": 4.833333333333333e-06, "loss": 0.0249, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03089246340095997, "mask/share_reasoning": 0.8343897461891174, "mask/share_step_conf": 0.13081152737140656, "num_tokens": 6405168.0, "reward": 0.5096137523651123, "reward_std": 0.3753834366798401, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5318887233734131, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.1904638111591339, "step": 27 }, { "adv/mean_abs_final_conf": 0.6784707903862, "adv/mean_abs_reasoning": 0.5248863101005554, "adv/mean_abs_step_conf": 0.7611353993415833, "adv/ratio_final_to_reasoning": 1.292605231514271, "adv/ratio_step_to_reasoning": 1.4500957344377456, "adv/std_final_conf": 0.8848522305488586, "adv/std_reasoning": 0.7928767204284668, "adv/std_step_conf": 0.9367220401763916, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 5.37109375, "calib/ece": 0.35388663967611356, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": -0.0020449172576831876, "calib/mean_conf": 0.9652226720647774, "calib/mu_c": 0.9644444444444444, "calib/mu_w": 0.9664893617021276, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.34983805668016216, "calib/std_conf": 0.06310978651519646, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.8318903150525089, "calib/step_q_c_n": 857.0, "calib/step_q_gap": 0.0004810486432423877, "calib/step_q_w": 0.8314092664092665, "calib/step_q_w_n": 518.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 581.92578125, "completions/mean_terminated_length": 584.2078857421875, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.029866666666666666, "grad_norm": 0.9041455388069153, "kl": 0.035736083984375, "learning_rate": 4.805555555555556e-06, "loss": 0.0401, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.028866175562143326, "mask/share_reasoning": 0.8538004159927368, "mask/share_step_conf": 0.11342713236808777, "num_tokens": 6661085.0, "reward": 0.5961412191390991, "reward_std": 0.3869975209236145, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6092222929000854, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.2729039788246155, "step": 28 }, { "adv/mean_abs_final_conf": 0.7007180452346802, "adv/mean_abs_reasoning": 0.529513955116272, "adv/mean_abs_step_conf": 0.8074958324432373, "adv/ratio_final_to_reasoning": 1.3233230936865767, "adv/ratio_step_to_reasoning": 1.5249755452921452, "adv/std_final_conf": 0.877121090888977, "adv/std_reasoning": 0.7576119303703308, "adv/std_step_conf": 0.9367261528968811, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 6.71484375, "calib/ece": 0.4946456692913386, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.0024644255266269566, "calib/mean_conf": 0.971023622047244, "calib/mu_c": 0.9723140495867771, "calib/mu_w": 0.9698496240601502, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4946456692913386, "calib/std_conf": 0.014997881512830394, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8127094240837698, "calib/step_q_c_n": 764.0, "calib/step_q_gap": -0.008505235602094041, "calib/step_q_w": 0.8212146596858638, "calib/step_q_w_n": 955.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2351.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 588.08203125, "completions/mean_terminated_length": 588.08203125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.030933333333333334, "grad_norm": 0.971588134765625, "kl": 0.051906585693359375, "learning_rate": 4.777777777777778e-06, "loss": -0.0259, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.028301922604441643, "mask/share_reasoning": 0.847674548625946, "mask/share_step_conf": 0.12402355670928955, "num_tokens": 6918762.0, "reward": 0.44611260294914246, "reward_std": 0.37981656193733215, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.5029281377792358, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.09632834792137146, "step": 29 }, { "adv/mean_abs_final_conf": 0.7375785112380981, "adv/mean_abs_reasoning": 0.6047927737236023, "adv/mean_abs_step_conf": 0.784739077091217, "adv/ratio_final_to_reasoning": 1.2195557607227308, "adv/ratio_step_to_reasoning": 1.2975338185007026, "adv/std_final_conf": 0.8972796201705933, "adv/std_reasoning": 0.8266693949699402, "adv/std_step_conf": 0.9365823268890381, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 6.77734375, "calib/ece": 0.4080645161290324, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003492063492066544, "calib/mean_conf": 0.9725806451612905, "calib/mu_c": 0.9724285714285713, "calib/mu_w": 0.972777777777778, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.4080645161290324, "calib/std_conf": 0.009702923382383896, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8048057259713702, "calib/step_q_c_n": 978.0, "calib/step_q_gap": -0.00458661220564438, "calib/step_q_w": 0.8093923381770146, "calib/step_q_w_n": 757.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2957.0, "completions/max_terminated_length": 2957.0, "completions/mean_length": 641.234375, "completions/mean_terminated_length": 643.7490844726562, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.032, "grad_norm": 0.6186464428901672, "kl": 0.030355453491210938, "learning_rate": 4.75e-06, "loss": 0.0293, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.026330148801207542, "mask/share_reasoning": 0.8494928479194641, "mask/share_step_conf": 0.1202707588672638, "num_tokens": 7189902.0, "reward": 0.5468332171440125, "reward_std": 0.3901427984237671, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5685625076293945, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.2235414832830429, "step": 30 }, { "adv/mean_abs_final_conf": 0.7140194177627563, "adv/mean_abs_reasoning": 0.48760420083999634, "adv/mean_abs_step_conf": 0.7621148824691772, "adv/ratio_final_to_reasoning": 1.464342219637801, "adv/ratio_step_to_reasoning": 1.5629785000955305, "adv/std_final_conf": 0.8921587467193604, "adv/std_reasoning": 0.7576415538787842, "adv/std_step_conf": 0.9366370439529419, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 6.85546875, "calib/ece": 0.5123577235772357, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.991869918699187, "calib/gap": 0.004207864794730454, "calib/mean_conf": 0.9717073170731708, "calib/mu_c": 0.9739823008849557, "calib/mu_w": 0.9697744360902253, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.5123577235772357, "calib/std_conf": 0.01788433102058716, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8073484848484851, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.017366749688667804, "calib/step_q_w": 0.7899817351598173, "calib/step_q_w_n": 1095.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 637.828125, "completions/mean_terminated_length": 640.3294677734375, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.03306666666666667, "grad_norm": 0.6501867175102234, "kl": 0.030551910400390625, "learning_rate": 4.722222222222222e-06, "loss": 0.0195, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.027726035565137863, "mask/share_reasoning": 0.8484981060028076, "mask/share_step_conf": 0.11986958980560303, "num_tokens": 7459098.0, "reward": 0.4433118999004364, "reward_std": 0.3674618899822235, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.4717359244823456, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.13520032167434692, "step": 31 }, { "adv/mean_abs_final_conf": 0.6820220947265625, "adv/mean_abs_reasoning": 0.4612715542316437, "adv/mean_abs_step_conf": 0.797433078289032, "adv/ratio_final_to_reasoning": 1.4785695941355648, "adv/ratio_step_to_reasoning": 1.7287714166925912, "adv/std_final_conf": 0.8834781050682068, "adv/std_reasoning": 0.7392356991767883, "adv/std_step_conf": 0.9367560744285583, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.92578125, "calib/ece": 0.41539370078740145, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9921259842519685, "calib/gap": 0.007139270696039479, "calib/mean_conf": 0.970511811023622, "calib/mu_c": 0.9736879432624113, "calib/mu_w": 0.9665486725663718, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41539370078740145, "calib/std_conf": 0.028810640570907483, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7861488250652742, "calib/step_q_c_n": 766.0, "calib/step_q_gap": 0.002327120671132943, "calib/step_q_w": 0.7838217043941412, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1919.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 522.51953125, "completions/mean_terminated_length": 524.5686645507812, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.034133333333333335, "grad_norm": 0.7369862198829651, "kl": 0.03662872314453125, "learning_rate": 4.694444444444445e-06, "loss": -0.0202, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031127819791436195, "mask/share_reasoning": 0.8445783853530884, "mask/share_step_conf": 0.12038751691579819, "num_tokens": 7699567.0, "reward": 0.5680812001228333, "reward_std": 0.372994601726532, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5786261558532715, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.24894246459007263, "step": 32 }, { "adv/mean_abs_final_conf": 0.7227578163146973, "adv/mean_abs_reasoning": 0.4513196349143982, "adv/mean_abs_step_conf": 0.7907464504241943, "adv/ratio_final_to_reasoning": 1.6014322453570689, "adv/ratio_step_to_reasoning": 1.7520763318311539, "adv/std_final_conf": 0.8949360847473145, "adv/std_reasoning": 0.7205970287322998, "adv/std_step_conf": 0.9367126822471619, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.30078125, "calib/ece": 0.4673122529644269, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": 0.011389201349831235, "calib/mean_conf": 0.9692885375494071, "calib/mu_c": 0.9749606299212599, "calib/mu_w": 0.9635714285714286, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4673122529644269, "calib/std_conf": 0.06026528759057799, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7979490445859873, "calib/step_q_c_n": 785.0, "calib/step_q_gap": 0.01914469675990016, "calib/step_q_w": 0.7788043478260871, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 544.7265625, "completions/mean_terminated_length": 546.86279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.0352, "grad_norm": 0.5968253016471863, "kl": 0.036468505859375, "learning_rate": 4.666666666666667e-06, "loss": 0.0087, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030165106058120728, "mask/share_reasoning": 0.8429036736488342, "mask/share_step_conf": 0.12302500009536743, "num_tokens": 7945889.0, "reward": 0.4956803321838379, "reward_std": 0.3600698709487915, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5272008180618286, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.16728487610816956, "step": 33 }, { "adv/mean_abs_final_conf": 0.7376139163970947, "adv/mean_abs_reasoning": 0.5954176187515259, "adv/mean_abs_step_conf": 0.7713388204574585, "adv/ratio_final_to_reasoning": 1.2388177527291293, "adv/ratio_step_to_reasoning": 1.2954585087267068, "adv/std_final_conf": 0.9104684591293335, "adv/std_reasoning": 0.8264696598052979, "adv/std_step_conf": 0.9366449117660522, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.33203125, "calib/ece": 0.4223046875, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.99609375, "calib/gap": 0.0011822386678996821, "calib/mean_conf": 0.9730859375, "calib/mu_c": 0.9736170212765956, "calib/mu_w": 0.9724347826086959, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4223046875, "calib/std_conf": 0.01894297666012641, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7656926658905705, "calib/step_q_c_n": 859.0, "calib/step_q_gap": -0.00820497190470515, "calib/step_q_w": 0.7738976377952757, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 478.390625, "completions/mean_terminated_length": 480.2666931152344, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.03626666666666667, "grad_norm": 0.5449492335319519, "kl": 0.039295196533203125, "learning_rate": 4.638888888888889e-06, "loss": 0.013, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03340613842010498, "mask/share_reasoning": 0.8226373195648193, "mask/share_step_conf": 0.14005030691623688, "num_tokens": 8173469.0, "reward": 0.591486930847168, "reward_std": 0.38685351610183716, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5705609321594238, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.30303794145584106, "step": 34 }, { "adv/mean_abs_final_conf": 0.7280762195587158, "adv/mean_abs_reasoning": 0.5616724491119385, "adv/mean_abs_step_conf": 0.7755687236785889, "adv/ratio_final_to_reasoning": 1.2962647904661848, "adv/ratio_step_to_reasoning": 1.380820307111097, "adv/std_final_conf": 0.9154699444770813, "adv/std_reasoning": 0.826493501663208, "adv/std_step_conf": 0.9367932677268982, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 6.33203125, "calib/ece": 0.4332539682539684, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9642857142857143, "calib/gap": -0.0011181541582151455, "calib/mean_conf": 0.9650000000000001, "calib/mu_c": 0.9644852941176469, "calib/mu_w": 0.9656034482758621, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.42928571428571444, "calib/std_conf": 0.065921429904356, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7941714947856315, "calib/step_q_c_n": 863.0, "calib/step_q_gap": 0.001638381329167049, "calib/step_q_w": 0.7925331134564645, "calib/step_q_w_n": 758.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2391.0, "completions/max_terminated_length": 2391.0, "completions/mean_length": 577.84765625, "completions/mean_terminated_length": 580.11376953125, "completions/min_length": 0.0, "completions/min_terminated_length": 213.0, "epoch": 0.037333333333333336, "grad_norm": 0.5556899309158325, "kl": 0.037166595458984375, "learning_rate": 4.611111111111112e-06, "loss": 0.0578, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02877074107527733, "mask/share_reasoning": 0.8530111908912659, "mask/share_step_conf": 0.1143118292093277, "num_tokens": 8430654.0, "reward": 0.5062803030014038, "reward_std": 0.40529507398605347, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5568605661392212, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.15335622429847717, "step": 35 }, { "adv/mean_abs_final_conf": 0.6988523006439209, "adv/mean_abs_reasoning": 0.5278260707855225, "adv/mean_abs_step_conf": 0.790012001991272, "adv/ratio_final_to_reasoning": 1.324020050021162, "adv/ratio_step_to_reasoning": 1.4967278914729591, "adv/std_final_conf": 0.8969197273254395, "adv/std_reasoning": 0.7927491664886475, "adv/std_step_conf": 0.9367321133613586, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 6.26953125, "calib/ece": 0.27325396825396825, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9801587301587301, "calib/gap": -0.007154570300637597, "calib/mean_conf": 0.9637301587301587, "calib/mu_c": 0.9616292134831461, "calib/mu_w": 0.9687837837837837, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.26531746031746034, "calib/std_conf": 0.08946697650028075, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.79090026478376, "calib/step_q_c_n": 1133.0, "calib/step_q_gap": -0.01758045555522314, "calib/step_q_w": 0.8084807203389831, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 510.828125, "completions/mean_terminated_length": 510.828125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.0384, "grad_norm": 0.6275749802589417, "kl": 0.04204559326171875, "learning_rate": 4.583333333333333e-06, "loss": 0.0566, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034487880766391754, "mask/share_reasoning": 0.8271164298057556, "mask/share_step_conf": 0.13839569687843323, "num_tokens": 8664138.0, "reward": 0.6869564652442932, "reward_std": 0.3707220256328583, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.704030454158783, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3347262442111969, "step": 36 }, { "adv/mean_abs_final_conf": 0.7420522570610046, "adv/mean_abs_reasoning": 0.5068639516830444, "adv/mean_abs_step_conf": 0.7792178392410278, "adv/ratio_final_to_reasoning": 1.464006770647264, "adv/ratio_step_to_reasoning": 1.537331342372309, "adv/std_final_conf": 0.9036746621131897, "adv/std_reasoning": 0.7575888633728027, "adv/std_step_conf": 0.936730146408081, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 6.3203125, "calib/ece": 0.484349593495935, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9390243902439024, "calib/gap": 0.0308308487378256, "calib/mean_conf": 0.9544308943089431, "calib/mu_c": 0.9705982905982906, "calib/mu_w": 0.939767441860465, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.4815853658536586, "calib/std_conf": 0.10935640109374403, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.799828125, "calib/step_q_c_n": 640.0, "calib/step_q_gap": 0.03154274667689161, "calib/step_q_w": 0.7682853783231084, "calib/step_q_w_n": 978.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2687.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 563.078125, "completions/mean_terminated_length": 565.2863159179688, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.039466666666666664, "grad_norm": 0.6503015160560608, "kl": 0.04059600830078125, "learning_rate": 4.555555555555556e-06, "loss": -0.0207, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.030944695696234703, "mask/share_reasoning": 0.8414304256439209, "mask/share_step_conf": 0.12371863424777985, "num_tokens": 8915382.0, "reward": 0.4583185315132141, "reward_std": 0.3779911994934082, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.500342607498169, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.13348199427127838, "step": 37 }, { "adv/mean_abs_final_conf": 0.7206848859786987, "adv/mean_abs_reasoning": 0.4816995859146118, "adv/mean_abs_step_conf": 0.7730225324630737, "adv/ratio_final_to_reasoning": 1.4961293450363282, "adv/ratio_step_to_reasoning": 1.604781392940834, "adv/std_final_conf": 0.8922587633132935, "adv/std_reasoning": 0.7393845319747925, "adv/std_step_conf": 0.9367142915725708, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 6.45703125, "calib/ece": 0.4535633199464525, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9357429718875502, "calib/gap": 0.019676190168732788, "calib/mean_conf": 0.9538262382864793, "calib/mu_c": 0.9633877604166667, "calib/mu_w": 0.943711570247934, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.44666666666666666, "calib/std_conf": 0.12677508929259118, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.78607458703939, "calib/step_q_c_n": 787.0, "calib/step_q_gap": -0.0025397316673074766, "calib/step_q_w": 0.7886143187066975, "calib/step_q_w_n": 866.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2998.0, "completions/max_terminated_length": 2998.0, "completions/mean_length": 554.9453125, "completions/mean_terminated_length": 554.9453125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.04053333333333333, "grad_norm": 0.6848846673965454, "kl": 0.04344940185546875, "learning_rate": 4.527777777777778e-06, "loss": 0.0688, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.031157787889242172, "mask/share_reasoning": 0.8417280912399292, "mask/share_step_conf": 0.12711411714553833, "num_tokens": 9164336.0, "reward": 0.48191243410110474, "reward_std": 0.3690693974494934, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5315992832183838, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.13847549259662628, "step": 38 }, { "adv/mean_abs_final_conf": 0.724373459815979, "adv/mean_abs_reasoning": 0.44981586933135986, "adv/mean_abs_step_conf": 0.7805863618850708, "adv/ratio_final_to_reasoning": 1.6103777327659474, "adv/ratio_step_to_reasoning": 1.7353464275179822, "adv/std_final_conf": 0.8995541334152222, "adv/std_reasoning": 0.7391477823257446, "adv/std_step_conf": 0.9367110133171082, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 6.55078125, "calib/ece": 0.5144, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.944, "calib/gap": 0.029500678250759083, "calib/mean_conf": 0.9663999999999999, "calib/mu_c": 0.9825663716814159, "calib/mu_w": 0.9530656934306568, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5144, "calib/std_conf": 0.07681041596033704, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7727114285714286, "calib/step_q_c_n": 665.0, "calib/step_q_gap": 0.02679542066629026, "calib/step_q_w": 0.7459160079051383, "calib/step_q_w_n": 1012.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 563.28125, "completions/mean_terminated_length": 563.28125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.0416, "grad_norm": 0.7684149146080017, "kl": 0.036548614501953125, "learning_rate": 4.5e-06, "loss": 0.0385, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03252136707305908, "mask/share_reasoning": 0.8341522812843323, "mask/share_step_conf": 0.13332638144493103, "num_tokens": 9414624.0, "reward": 0.46233272552490234, "reward_std": 0.3484842777252197, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.4847765564918518, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.15551385283470154, "step": 39 }, { "adv/mean_abs_final_conf": 0.7425650954246521, "adv/mean_abs_reasoning": 0.4722183644771576, "adv/mean_abs_step_conf": 0.7667721509933472, "adv/ratio_final_to_reasoning": 1.5725036366318021, "adv/ratio_step_to_reasoning": 1.623766055439883, "adv/std_final_conf": 0.8895200490951538, "adv/std_reasoning": 0.7206775546073914, "adv/std_step_conf": 0.9367943406105042, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 6.29296875, "calib/ece": 0.5190908730158731, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8968253968253969, "calib/gap": 0.03966586899339186, "calib/mean_conf": 0.9475829365079365, "calib/mu_c": 0.970091743119266, "calib/mu_w": 0.9304258741258742, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.5170670634920636, "calib/std_conf": 0.1392772810560324, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7678723404255319, "calib/step_q_c_n": 658.0, "calib/step_q_gap": -0.003339936594405124, "calib/step_q_w": 0.771212277019937, "calib/step_q_w_n": 953.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2773.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 569.99609375, "completions/mean_terminated_length": 569.99609375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.042666666666666665, "grad_norm": 0.8380210399627686, "kl": 0.041248321533203125, "learning_rate": 4.472222222222223e-06, "loss": -0.0326, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031462833285331726, "mask/share_reasoning": 0.8427754044532776, "mask/share_step_conf": 0.12576177716255188, "num_tokens": 9667303.0, "reward": 0.4228946566581726, "reward_std": 0.36979275941848755, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.47787922620773315, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.08666008710861206, "step": 40 }, { "adv/mean_abs_final_conf": 0.7061961889266968, "adv/mean_abs_reasoning": 0.5230212211608887, "adv/mean_abs_step_conf": 0.7796332836151123, "adv/ratio_final_to_reasoning": 1.350224733442433, "adv/ratio_step_to_reasoning": 1.4906341312206262, "adv/std_final_conf": 0.8942816257476807, "adv/std_reasoning": 0.7927087545394897, "adv/std_step_conf": 0.9367564916610718, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.94140625, "calib/ece": 0.2352011857707511, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8537549407114624, "calib/gap": 0.039890763922324046, "calib/mean_conf": 0.9275656126482213, "calib/mu_c": 0.9381295698924731, "calib/mu_w": 0.898238805970149, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2137944664031622, "calib/std_conf": 0.17885447051677, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7756797468354432, "calib/step_q_c_n": 1106.0, "calib/step_q_gap": -0.012296156779014633, "calib/step_q_w": 0.7879759036144578, "calib/step_q_w_n": 415.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2495.0, "completions/max_terminated_length": 2495.0, "completions/mean_length": 507.85546875, "completions/mean_terminated_length": 507.85546875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.04373333333333333, "grad_norm": 0.5942373871803284, "kl": 0.041805267333984375, "learning_rate": 4.444444444444444e-06, "loss": 0.1124, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0342470183968544, "mask/share_reasoning": 0.8328951597213745, "mask/share_step_conf": 0.1328577995300293, "num_tokens": 9904562.0, "reward": 0.7286665439605713, "reward_std": 0.37414735555648804, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7430287599563599, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.37133562564849854, "step": 41 }, { "adv/mean_abs_final_conf": 0.6909000277519226, "adv/mean_abs_reasoning": 0.33909207582473755, "adv/mean_abs_step_conf": 0.7567236423492432, "adv/ratio_final_to_reasoning": 2.0374997736868963, "adv/ratio_step_to_reasoning": 2.2316170040504333, "adv/std_final_conf": 0.9244943857192993, "adv/std_reasoning": 0.6402119398117065, "adv/std_step_conf": 0.9366698265075684, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.54296875, "calib/ece": 0.36003952569169967, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8616600790513834, "calib/gap": 0.047049190938511454, "calib/mean_conf": 0.9473122529644267, "calib/mu_c": 0.9664666666666668, "calib/mu_w": 0.9194174757281554, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3572332015810277, "calib/std_conf": 0.1102991321158831, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7566079787234041, "calib/step_q_c_n": 940.0, "calib/step_q_gap": 0.021559679403676313, "calib/step_q_w": 0.7350482993197278, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2693.0, "completions/max_terminated_length": 2693.0, "completions/mean_length": 456.64453125, "completions/mean_terminated_length": 458.4353332519531, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.0448, "grad_norm": 0.7689395546913147, "kl": 0.04754638671875, "learning_rate": 4.416666666666667e-06, "loss": 0.0639, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03618975728750229, "mask/share_reasoning": 0.8117806911468506, "mask/share_step_conf": 0.14812332391738892, "num_tokens": 10125831.0, "reward": 0.5788991451263428, "reward_std": 0.30940932035446167, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.636014461517334, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.20694011449813843, "step": 42 }, { "adv/mean_abs_final_conf": 0.7358661890029907, "adv/mean_abs_reasoning": 0.4630781412124634, "adv/mean_abs_step_conf": 0.7779009342193604, "adv/ratio_final_to_reasoning": 1.5890756300357747, "adv/ratio_step_to_reasoning": 1.6798480968732534, "adv/std_final_conf": 0.8974462151527405, "adv/std_reasoning": 0.7205843329429626, "adv/std_step_conf": 0.9367272257804871, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 6.12890625, "calib/ece": 0.33098078431372546, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6941176470588235, "calib/gap": 0.09033996454349746, "calib/mean_conf": 0.9048623529411765, "calib/mu_c": 0.9424154362416106, "calib/mu_w": 0.8520754716981132, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3257647058823529, "calib/std_conf": 0.15720034898165416, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7649237472766885, "calib/step_q_c_n": 918.0, "calib/step_q_gap": 0.023065682760559425, "calib/step_q_w": 0.7418580645161291, "calib/step_q_w_n": 651.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2322.0, "completions/max_terminated_length": 2322.0, "completions/mean_length": 514.421875, "completions/mean_terminated_length": 514.421875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.04586666666666667, "grad_norm": 0.8469089269638062, "kl": 0.040431976318359375, "learning_rate": 4.388888888888889e-06, "loss": 0.0478, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03337173908948898, "mask/share_reasoning": 0.8350906372070312, "mask/share_step_conf": 0.13153764605522156, "num_tokens": 10362747.0, "reward": 0.6127454042434692, "reward_std": 0.36014682054519653, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6708999872207642, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.23896577954292297, "step": 43 }, { "adv/mean_abs_final_conf": 0.7279866933822632, "adv/mean_abs_reasoning": 0.45917659997940063, "adv/mean_abs_step_conf": 0.7688028216362, "adv/ratio_final_to_reasoning": 1.5854176659152965, "adv/ratio_step_to_reasoning": 1.6743074923040278, "adv/std_final_conf": 0.906668484210968, "adv/std_reasoning": 0.7391861081123352, "adv/std_step_conf": 0.9367181658744812, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.640625, "calib/ece": 0.4168624505928854, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6403162055335968, "calib/gap": 0.10725087719298254, "calib/mean_conf": 0.8757857707509882, "calib/mu_c": 0.9321666666666667, "calib/mu_w": 0.8249157894736842, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40916996047430837, "calib/std_conf": 0.20421991971354106, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7767240277777779, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.030673007369614536, "calib/step_q_w": 0.7460510204081634, "calib/step_q_w_n": 980.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 559.30859375, "completions/mean_terminated_length": 559.30859375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.046933333333333334, "grad_norm": 1.0588027238845825, "kl": 0.0382080078125, "learning_rate": 4.361111111111112e-06, "loss": 0.0823, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030518846586346626, "mask/share_reasoning": 0.8403370380401611, "mask/share_step_conf": 0.1291441023349762, "num_tokens": 10612250.0, "reward": 0.5138987302780151, "reward_std": 0.35603389143943787, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.5942080616950989, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.1421830654144287, "step": 44 }, { "adv/mean_abs_final_conf": 0.7192021012306213, "adv/mean_abs_reasoning": 0.5190655589103699, "adv/mean_abs_step_conf": 0.7783068418502808, "adv/ratio_final_to_reasoning": 1.385570837603599, "adv/ratio_step_to_reasoning": 1.4994384206189948, "adv/std_final_conf": 0.9113670587539673, "adv/std_reasoning": 0.7927655577659607, "adv/std_step_conf": 0.9367387890815735, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 6.859375, "calib/ece": 0.3155495967741935, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6532258064516129, "calib/gap": 0.10522688316036155, "calib/mean_conf": 0.8776762096774193, "calib/mu_c": 0.9213793103448276, "calib/mu_w": 0.816152427184466, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30427419354838703, "calib/std_conf": 0.21078095809850514, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7532707509881422, "calib/step_q_c_n": 1012.0, "calib/step_q_gap": 0.03838929937523916, "calib/step_q_w": 0.7148814516129031, "calib/step_q_w_n": 744.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2606.0, "completions/max_terminated_length": 2606.0, "completions/mean_length": 535.22265625, "completions/mean_terminated_length": 539.43701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.048, "grad_norm": 1.0135393142700195, "kl": 0.0483856201171875, "learning_rate": 4.333333333333334e-06, "loss": -0.0059, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03224221616983414, "mask/share_reasoning": 0.8216654062271118, "mask/share_step_conf": 0.13827982544898987, "num_tokens": 10854315.0, "reward": 0.584678590297699, "reward_std": 0.3501487970352173, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6568104028701782, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.2055155485868454, "step": 45 }, { "adv/mean_abs_final_conf": 0.6938655376434326, "adv/mean_abs_reasoning": 0.36680153012275696, "adv/mean_abs_step_conf": 0.7613024711608887, "adv/ratio_final_to_reasoning": 1.8916647850711463, "adv/ratio_step_to_reasoning": 2.0755160724277912, "adv/std_final_conf": 0.9029045104980469, "adv/std_reasoning": 0.6612504720687866, "adv/std_step_conf": 0.9366840124130249, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 7.73046875, "calib/ece": 0.3072144000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.52, "calib/gap": 0.06562446946169775, "calib/mean_conf": 0.8002256000000001, "calib/mu_c": 0.8296253623188407, "calib/mu_w": 0.7640008928571429, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2777200000000001, "calib/std_conf": 0.2577455962468418, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6838827098078868, "calib/step_q_c_n": 989.0, "calib/step_q_gap": -0.01696577504059804, "calib/step_q_w": 0.7008484848484848, "calib/step_q_w_n": 990.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 596.8671875, "completions/mean_terminated_length": 596.8671875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.04906666666666667, "grad_norm": 0.9470670819282532, "kl": 0.042751312255859375, "learning_rate": 4.305555555555556e-06, "loss": 0.0787, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03482796251773834, "mask/share_reasoning": 0.8166794180870056, "mask/share_step_conf": 0.14849263429641724, "num_tokens": 11111881.0, "reward": 0.5584321022033691, "reward_std": 0.30958062410354614, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6378525495529175, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.17588670551776886, "step": 46 }, { "adv/mean_abs_final_conf": 0.6724117994308472, "adv/mean_abs_reasoning": 0.4280226528644562, "adv/mean_abs_step_conf": 0.7653312683105469, "adv/ratio_final_to_reasoning": 1.5709724588894196, "adv/ratio_step_to_reasoning": 1.788062531711161, "adv/std_final_conf": 0.8915644288063049, "adv/std_reasoning": 0.7206438779830933, "adv/std_step_conf": 0.9366405606269836, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 7.59765625, "calib/ece": 0.18711578947368424, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4291497975708502, "calib/gap": 0.17088503958503964, "calib/mean_conf": 0.7295643724696356, "calib/mu_c": 0.7980567567567568, "calib/mu_w": 0.6271717171717172, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15874493927125508, "calib/std_conf": 0.29011530589488854, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6997773654916513, "calib/step_q_c_n": 1078.0, "calib/step_q_gap": 0.03377967229672618, "calib/step_q_w": 0.6659976931949251, "calib/step_q_w_n": 867.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 575.1328125, "completions/mean_terminated_length": 586.5896606445312, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.050133333333333335, "grad_norm": 0.9541944265365601, "kl": 0.043483734130859375, "learning_rate": 4.277777777777778e-06, "loss": -0.0626, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02935297042131424, "mask/share_reasoning": 0.8150503039360046, "mask/share_step_conf": 0.1360654979944229, "num_tokens": 11365091.0, "reward": 0.6299548745155334, "reward_std": 0.27023494243621826, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7124563455581665, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.24042212963104248, "step": 47 }, { "adv/mean_abs_final_conf": 0.7541272640228271, "adv/mean_abs_reasoning": 0.6161321997642517, "adv/mean_abs_step_conf": 0.7687797546386719, "adv/ratio_final_to_reasoning": 1.223969895277955, "adv/ratio_step_to_reasoning": 1.2477513022900395, "adv/std_final_conf": 0.9209102988243103, "adv/std_reasoning": 0.8266001343727112, "adv/std_step_conf": 0.9366723895072937, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 6.33203125, "calib/ece": 0.18569200000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.252, "calib/gap": 0.1656924756034926, "calib/mean_conf": 0.565028, "calib/mu_c": 0.6432348484848485, "calib/mu_w": 0.47754237288135587, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11136000000000003, "calib/std_conf": 0.313467885461972, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6871144343302991, "calib/step_q_c_n": 769.0, "calib/step_q_gap": 0.06170363620823327, "calib/step_q_w": 0.6254107981220658, "calib/step_q_w_n": 852.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 528.40625, "completions/mean_terminated_length": 530.4784545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.0512, "grad_norm": 1.563501000404358, "kl": 0.053302764892578125, "learning_rate": 4.25e-06, "loss": 0.015, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.034390293061733246, "mask/share_reasoning": 0.8203791379928589, "mask/share_step_conf": 0.14132428169250488, "num_tokens": 11604051.0, "reward": 0.6105234622955322, "reward_std": 0.30511993169784546, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7165402770042419, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.20606909692287445, "step": 48 }, { "adv/mean_abs_final_conf": 0.7155464887619019, "adv/mean_abs_reasoning": 0.37291908264160156, "adv/mean_abs_step_conf": 0.7828823328018188, "adv/ratio_final_to_reasoning": 1.918771449541472, "adv/ratio_step_to_reasoning": 2.0993356715784306, "adv/std_final_conf": 0.9060553908348083, "adv/std_reasoning": 0.6402502655982971, "adv/std_step_conf": 0.9362311959266663, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.71875, "calib/ece": 0.17695256916996044, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3715415019762846, "calib/gap": 0.19286003861003853, "calib/mean_conf": 0.6325335968379446, "calib/mu_c": 0.7125743243243242, "calib/mu_w": 0.5197142857142857, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11225296442687747, "calib/std_conf": 0.32490195836150604, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.673720405982906, "calib/step_q_c_n": 936.0, "calib/step_q_gap": 0.03133520190127326, "calib/step_q_w": 0.6423852040816327, "calib/step_q_w_n": 784.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2553.0, "completions/max_terminated_length": 2553.0, "completions/mean_length": 504.94140625, "completions/mean_terminated_length": 506.9216003417969, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.05226666666666667, "grad_norm": 1.518766164779663, "kl": 0.052059173583984375, "learning_rate": 4.222222222222223e-06, "loss": 0.0191, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03357596695423126, "mask/share_reasoning": 0.8161840438842773, "mask/share_step_conf": 0.1463337540626526, "num_tokens": 11837852.0, "reward": 0.6512937545776367, "reward_std": 0.2607116401195526, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7307144999504089, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.25937309861183167, "step": 49 }, { "adv/mean_abs_final_conf": 0.637176513671875, "adv/mean_abs_reasoning": 0.3501752018928528, "adv/mean_abs_step_conf": 0.7724059820175171, "adv/ratio_final_to_reasoning": 1.8195934784292331, "adv/ratio_step_to_reasoning": 2.205770076928118, "adv/std_final_conf": 0.8594935536384583, "adv/std_reasoning": 0.6186580061912537, "adv/std_step_conf": 0.936378538608551, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.640625, "calib/ece": 0.1882738095238095, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5714285714285714, "calib/gap": 0.2087020285999337, "calib/mean_conf": 0.7574404761904762, "calib/mu_c": 0.8377741935483872, "calib/mu_w": 0.6290721649484535, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1653174603174603, "calib/std_conf": 0.3049627488775438, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7132121212121213, "calib/step_q_c_n": 990.0, "calib/step_q_gap": 0.015367050789586112, "calib/step_q_w": 0.6978450704225352, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2064.0, "completions/max_terminated_length": 2064.0, "completions/mean_length": 517.640625, "completions/mean_terminated_length": 517.640625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.05333333333333334, "grad_norm": 1.370573878288269, "kl": 0.06217193603515625, "learning_rate": 4.194444444444445e-06, "loss": -0.0093, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033754441887140274, "mask/share_reasoning": 0.8261618614196777, "mask/share_step_conf": 0.1400837004184723, "num_tokens": 12075728.0, "reward": 0.6809222102165222, "reward_std": 0.2406473159790039, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7345975637435913, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.3100593686103821, "step": 50 }, { "adv/mean_abs_final_conf": 0.5833197832107544, "adv/mean_abs_reasoning": 0.452068954706192, "adv/mean_abs_step_conf": 0.7631381750106812, "adv/ratio_final_to_reasoning": 1.2903336474185996, "adv/ratio_step_to_reasoning": 1.6881012665571313, "adv/std_final_conf": 0.8108258247375488, "adv/std_reasoning": 0.7205539345741272, "adv/std_step_conf": 0.9365473389625549, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 6.890625, "calib/ece": 0.26356862745098036, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6941176470588235, "calib/gap": 0.21229671986929743, "calib/mean_conf": 0.8235686274509804, "calib/mu_c": 0.9143150684931507, "calib/mu_w": 0.7020183486238533, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2572941176470588, "calib/std_conf": 0.2831631730030848, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7226273148148147, "calib/step_q_c_n": 864.0, "calib/step_q_gap": 0.03537175925925917, "calib/step_q_w": 0.6872555555555555, "calib/step_q_w_n": 900.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2115.0, "completions/max_terminated_length": 2115.0, "completions/mean_length": 516.8828125, "completions/mean_terminated_length": 518.9098510742188, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.0544, "grad_norm": 0.8607607483863831, "kl": 0.0615997314453125, "learning_rate": 4.166666666666667e-06, "loss": 0.0829, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033120427280664444, "mask/share_reasoning": 0.8250494003295898, "mask/share_step_conf": 0.1379239410161972, "num_tokens": 12317346.0, "reward": 0.6647790670394897, "reward_std": 0.32644182443618774, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.709381639957428, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.3076763451099396, "step": 51 }, { "adv/mean_abs_final_conf": 0.6057255864143372, "adv/mean_abs_reasoning": 0.44159531593322754, "adv/mean_abs_step_conf": 0.7482933402061462, "adv/ratio_final_to_reasoning": 1.3716757505324793, "adv/ratio_step_to_reasoning": 1.69452282034462, "adv/std_final_conf": 0.843150794506073, "adv/std_reasoning": 0.7205482125282288, "adv/std_step_conf": 0.936403751373291, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.1328125, "calib/ece": 0.2066666666666667, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7380952380952381, "calib/gap": 0.16514173998044956, "calib/mean_conf": 0.8415873015873016, "calib/mu_c": 0.8848387096774193, "calib/mu_w": 0.7196969696969697, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15507936507936512, "calib/std_conf": 0.2855040452230937, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7506818181818182, "calib/step_q_c_n": 1100.0, "calib/step_q_gap": 0.03425628626692456, "calib/step_q_w": 0.7164255319148937, "calib/step_q_w_n": 470.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 535.71484375, "completions/mean_terminated_length": 535.71484375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.055466666666666664, "grad_norm": 1.321242332458496, "kl": 0.0544891357421875, "learning_rate": 4.138888888888889e-06, "loss": 0.0318, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034158483147621155, "mask/share_reasoning": 0.8389657735824585, "mask/share_step_conf": 0.12687571346759796, "num_tokens": 12562441.0, "reward": 0.7765899896621704, "reward_std": 0.3064219355583191, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7642152309417725, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.4475584328174591, "step": 52 }, { "adv/mean_abs_final_conf": 0.4043058156967163, "adv/mean_abs_reasoning": 0.37307173013687134, "adv/mean_abs_step_conf": 0.7907155156135559, "adv/ratio_final_to_reasoning": 1.0837213946722415, "adv/ratio_step_to_reasoning": 2.119473151512876, "adv/std_final_conf": 0.6624153852462769, "adv/std_reasoning": 0.6403059363365173, "adv/std_step_conf": 0.9361966848373413, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 6.9921875, "calib/ece": 0.2799607843137255, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8666666666666667, "calib/gap": 0.1773737373737373, "calib/mean_conf": 0.9198823529411765, "calib/mu_c": 0.9824848484848485, "calib/mu_w": 0.8051111111111112, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2763921568627451, "calib/std_conf": 0.21595294032191215, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7545780051150895, "calib/step_q_c_n": 1173.0, "calib/step_q_gap": 0.017300857627245203, "calib/step_q_w": 0.7372771474878443, "calib/step_q_w_n": 617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 556.91796875, "completions/mean_terminated_length": 556.91796875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.05653333333333333, "grad_norm": 1.0264867544174194, "kl": 0.05510711669921875, "learning_rate": 4.111111111111111e-06, "loss": 0.0098, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03013557940721512, "mask/share_reasoning": 0.8389697074890137, "mask/share_step_conf": 0.130894735455513, "num_tokens": 12810836.0, "reward": 0.6848220229148865, "reward_std": 0.30382293462753296, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7287152409553528, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.3128037452697754, "step": 53 }, { "adv/mean_abs_final_conf": 0.41899287700653076, "adv/mean_abs_reasoning": 0.3465266525745392, "adv/mean_abs_step_conf": 0.7725863456726074, "adv/ratio_final_to_reasoning": 1.2091216473353483, "adv/ratio_step_to_reasoning": 2.2295149303311415, "adv/std_final_conf": 0.6827141046524048, "adv/std_reasoning": 0.6185320615768433, "adv/std_step_conf": 0.9359965324401855, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 6.359375, "calib/ece": 0.21948818897637792, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8385826771653543, "calib/gap": 0.19312104283054, "calib/mean_conf": 0.914763779527559, "calib/mu_c": 0.9717877094972066, "calib/mu_w": 0.7786666666666666, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21476377952755904, "calib/std_conf": 0.20738278946248823, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7775675675675675, "calib/step_q_c_n": 1073.0, "calib/step_q_gap": 0.03423423423423422, "calib/step_q_w": 0.7433333333333333, "calib/step_q_w_n": 555.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 477.9375, "completions/mean_terminated_length": 479.8117980957031, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.0576, "grad_norm": 1.097700834274292, "kl": 0.06143951416015625, "learning_rate": 4.083333333333334e-06, "loss": 0.0147, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03568841516971588, "mask/share_reasoning": 0.8217997550964355, "mask/share_step_conf": 0.13860559463500977, "num_tokens": 13039420.0, "reward": 0.7917571067810059, "reward_std": 0.27344125509262085, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7790261507034302, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.46620669960975647, "step": 54 }, { "adv/mean_abs_final_conf": 0.5309213399887085, "adv/mean_abs_reasoning": 0.4365116357803345, "adv/mean_abs_step_conf": 0.7545424103736877, "adv/ratio_final_to_reasoning": 1.2162822167147997, "adv/ratio_step_to_reasoning": 1.728573418265981, "adv/std_final_conf": 0.7940976023674011, "adv/std_reasoning": 0.720463752746582, "adv/std_step_conf": 0.9364340901374817, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.69921875, "calib/ece": 0.30098814229249005, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7312252964426877, "calib/gap": 0.1792237762237764, "calib/mean_conf": 0.8583003952569169, "calib/mu_c": 0.9362237762237762, "calib/mu_w": 0.7569999999999998, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2970355731225296, "calib/std_conf": 0.248207559849901, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7686855036855037, "calib/step_q_c_n": 814.0, "calib/step_q_gap": -0.005950155229224929, "calib/step_q_w": 0.7746356589147286, "calib/step_q_w_n": 645.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 481.125, "completions/mean_terminated_length": 483.01177978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.058666666666666666, "grad_norm": 1.0645291805267334, "kl": 0.06328582763671875, "learning_rate": 4.055555555555556e-06, "loss": 0.0133, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033719368278980255, "mask/share_reasoning": 0.8342823386192322, "mask/share_step_conf": 0.12809205055236816, "num_tokens": 13270412.0, "reward": 0.6241215467453003, "reward_std": 0.3561294674873352, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6866933107376099, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.25139355659484863, "step": 55 }, { "adv/mean_abs_final_conf": 0.5191867351531982, "adv/mean_abs_reasoning": 0.47845232486724854, "adv/mean_abs_step_conf": 0.764076292514801, "adv/ratio_final_to_reasoning": 1.0851378667608145, "adv/ratio_step_to_reasoning": 1.5969747722028558, "adv/std_final_conf": 0.7764801383018494, "adv/std_reasoning": 0.7575269341468811, "adv/std_step_conf": 0.9364977478981018, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 6.60546875, "calib/ece": 0.297, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.724, "calib/gap": 0.16607607345925124, "calib/mean_conf": 0.8683600000000001, "calib/mu_c": 0.9394405594405595, "calib/mu_w": 0.7733644859813082, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.29667999999999994, "calib/std_conf": 0.2304554412462418, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7734939759036145, "calib/step_q_c_n": 913.0, "calib/step_q_gap": 0.015049245826493696, "calib/step_q_w": 0.7584447300771208, "calib/step_q_w_n": 778.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2288.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 536.734375, "completions/mean_terminated_length": 536.734375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.05973333333333333, "grad_norm": 1.4025778770446777, "kl": 0.05811309814453125, "learning_rate": 4.027777777777779e-06, "loss": 0.0594, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032585591077804565, "mask/share_reasoning": 0.8330211639404297, "mask/share_step_conf": 0.13439321517944336, "num_tokens": 13514656.0, "reward": 0.6440907716751099, "reward_std": 0.3055835962295532, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6761792898178101, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.304970920085907, "step": 56 }, { "adv/mean_abs_final_conf": 0.6217701435089111, "adv/mean_abs_reasoning": 0.5029103755950928, "adv/mean_abs_step_conf": 0.7950608730316162, "adv/ratio_final_to_reasoning": 1.2363438371562165, "adv/ratio_step_to_reasoning": 1.5809196063827922, "adv/std_final_conf": 0.8430944681167603, "adv/std_reasoning": 0.7575104236602783, "adv/std_step_conf": 0.9365062713623047, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.94921875, "calib/ece": 0.17940711462450593, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6126482213438735, "calib/gap": 0.29207111537407493, "calib/mean_conf": 0.8257312252964427, "calib/mu_c": 0.9284756097560974, "calib/mu_w": 0.6364044943820225, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17845849802371544, "calib/std_conf": 0.2527023098517849, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.775268703898841, "calib/step_q_c_n": 949.0, "calib/step_q_gap": 0.02319553316713363, "calib/step_q_w": 0.7520731707317073, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 512.73828125, "completions/mean_terminated_length": 512.73828125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.0608, "grad_norm": 1.1734751462936401, "kl": 0.05725860595703125, "learning_rate": 4.000000000000001e-06, "loss": 0.027, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0328928679227829, "mask/share_reasoning": 0.8418260812759399, "mask/share_step_conf": 0.12528111040592194, "num_tokens": 13752709.0, "reward": 0.7211673259735107, "reward_std": 0.361972451210022, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.8003137111663818, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3146771192550659, "step": 57 }, { "adv/mean_abs_final_conf": 0.6758686304092407, "adv/mean_abs_reasoning": 0.5550025701522827, "adv/mean_abs_step_conf": 0.7941849231719971, "adv/ratio_final_to_reasoning": 1.2177756766491992, "adv/ratio_step_to_reasoning": 1.4309571989082628, "adv/std_final_conf": 0.8757584095001221, "adv/std_reasoning": 0.7755085825920105, "adv/std_step_conf": 0.9366655945777893, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 7.2265625, "calib/ece": 0.19848000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.388, "calib/gap": 0.11988315356959445, "calib/mean_conf": 0.7143999999999999, "calib/mu_c": 0.7709848484848485, "calib/mu_w": 0.651101694915254, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19244000000000006, "calib/std_conf": 0.27004710700172296, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7455698711595639, "calib/step_q_c_n": 1009.0, "calib/step_q_gap": -0.0009699623719461314, "calib/step_q_w": 0.74653983353151, "calib/step_q_w_n": 841.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 604.65234375, "completions/mean_terminated_length": 604.65234375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.06186666666666667, "grad_norm": 1.2279466390609741, "kl": 0.05637359619140625, "learning_rate": 3.972222222222223e-06, "loss": -0.0186, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.029751218855381012, "mask/share_reasoning": 0.8399531841278076, "mask/share_step_conf": 0.13029563426971436, "num_tokens": 14013820.0, "reward": 0.5667500495910645, "reward_std": 0.35580188035964966, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6863937377929688, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.14710628986358643, "step": 58 }, { "adv/mean_abs_final_conf": 0.6799726486206055, "adv/mean_abs_reasoning": 0.49774420261383057, "adv/mean_abs_step_conf": 0.7958185076713562, "adv/ratio_final_to_reasoning": 1.3661086257757076, "adv/ratio_step_to_reasoning": 1.5988503803604988, "adv/std_final_conf": 0.88865065574646, "adv/std_reasoning": 0.7395114898681641, "adv/std_step_conf": 0.9360986948013306, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 5.890625, "calib/ece": 0.18123505976095614, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3745019920318725, "calib/gap": 0.13348110661268564, "calib/mean_conf": 0.7456972111553785, "calib/mu_c": 0.7962179487179488, "calib/mu_w": 0.6627368421052632, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1527091633466135, "calib/std_conf": 0.2387797080079582, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.759892703862661, "calib/step_q_c_n": 932.0, "calib/step_q_gap": 0.0012121483071053651, "calib/step_q_w": 0.7586805555555556, "calib/step_q_w_n": 576.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2669.0, "completions/max_terminated_length": 2669.0, "completions/mean_length": 552.23046875, "completions/mean_terminated_length": 552.23046875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.06293333333333333, "grad_norm": 1.2705607414245605, "kl": 0.05764007568359375, "learning_rate": 3.944444444444445e-06, "loss": 0.0598, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033475276082754135, "mask/share_reasoning": 0.8477275371551514, "mask/share_step_conf": 0.1187971979379654, "num_tokens": 14261439.0, "reward": 0.6229273676872253, "reward_std": 0.35119789838790894, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7364722490310669, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.192976176738739, "step": 59 }, { "adv/mean_abs_final_conf": 0.7194099426269531, "adv/mean_abs_reasoning": 0.5117816925048828, "adv/mean_abs_step_conf": 0.7612699270248413, "adv/ratio_final_to_reasoning": 1.405696907808185, "adv/ratio_step_to_reasoning": 1.4874895647377582, "adv/std_final_conf": 0.920421838760376, "adv/std_reasoning": 0.7753655910491943, "adv/std_step_conf": 0.9363244771957397, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 5.68359375, "calib/ece": 0.16939759036144586, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.20481927710843373, "calib/gap": 0.2127325656617931, "calib/mean_conf": 0.6636144578313253, "calib/mu_c": 0.7644274809160304, "calib/mu_w": 0.5516949152542373, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15345381526104424, "calib/std_conf": 0.24049436189695617, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7572103658536585, "calib/step_q_c_n": 656.0, "calib/step_q_gap": 0.015746035440642125, "calib/step_q_w": 0.7414643304130164, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2641.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 516.1640625, "completions/mean_terminated_length": 520.2283325195312, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.064, "grad_norm": 2.8753914833068848, "kl": 0.0553741455078125, "learning_rate": 3.916666666666667e-06, "loss": -0.0193, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03429453447461128, "mask/share_reasoning": 0.8375608921051025, "mask/share_step_conf": 0.12033206224441528, "num_tokens": 14502433.0, "reward": 0.6331133842468262, "reward_std": 0.30463024973869324, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7553706765174866, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.21476224064826965, "step": 60 }, { "adv/mean_abs_final_conf": 0.6823130249977112, "adv/mean_abs_reasoning": 0.33459240198135376, "adv/mean_abs_step_conf": 0.8027042746543884, "adv/ratio_final_to_reasoning": 2.0392364589191576, "adv/ratio_step_to_reasoning": 2.3990511138358777, "adv/std_final_conf": 0.8760694861412048, "adv/std_reasoning": 0.6185065507888794, "adv/std_step_conf": 0.935340166091919, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 6.20703125, "calib/ece": 0.13654901960784316, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.29411764705882354, "calib/gap": 0.16559063690800224, "calib/mean_conf": 0.7366274509803923, "calib/mu_c": 0.7937724550898204, "calib/mu_w": 0.6281818181818182, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10913725490196081, "calib/std_conf": 0.23828060248571112, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7659772492244054, "calib/step_q_c_n": 967.0, "calib/step_q_gap": 0.04361390517295838, "calib/step_q_w": 0.722363344051447, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1496.0, "completions/max_terminated_length": 1496.0, "completions/mean_length": 442.81640625, "completions/mean_terminated_length": 444.552978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.06506666666666666, "grad_norm": 1.8816365003585815, "kl": 0.06717681884765625, "learning_rate": 3.88888888888889e-06, "loss": 0.0202, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.038990095257759094, "mask/share_reasoning": 0.8130738735198975, "mask/share_step_conf": 0.14402973651885986, "num_tokens": 14719858.0, "reward": 0.711406409740448, "reward_std": 0.26123905181884766, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7797722816467285, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.31413429975509644, "step": 61 }, { "adv/mean_abs_final_conf": 0.6974444389343262, "adv/mean_abs_reasoning": 0.564460277557373, "adv/mean_abs_step_conf": 0.7973159551620483, "adv/ratio_final_to_reasoning": 1.2355952520741131, "adv/ratio_step_to_reasoning": 1.4125280145705335, "adv/std_final_conf": 0.8790695667266846, "adv/std_reasoning": 0.7754994034767151, "adv/std_step_conf": 0.9366229772567749, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.765625, "calib/ece": 0.21040000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.38, "calib/gap": 0.11602112676056364, "calib/mean_conf": 0.7084000000000001, "calib/mu_c": 0.7585211267605635, "calib/mu_w": 0.6424999999999998, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17540000000000003, "calib/std_conf": 0.2708531705555613, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7588387978142077, "calib/step_q_c_n": 732.0, "calib/step_q_gap": 0.03311299136259471, "calib/step_q_w": 0.725725806451613, "calib/step_q_w_n": 744.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 534.20703125, "completions/mean_terminated_length": 534.20703125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.06613333333333334, "grad_norm": 1.1402149200439453, "kl": 0.05263519287109375, "learning_rate": 3.861111111111112e-06, "loss": 0.0333, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03286831080913544, "mask/share_reasoning": 0.8496203422546387, "mask/share_step_conf": 0.11751138418912888, "num_tokens": 14963695.0, "reward": 0.6288319230079651, "reward_std": 0.34103110432624817, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.701648473739624, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.24976545572280884, "step": 62 }, { "adv/mean_abs_final_conf": 0.6985242366790771, "adv/mean_abs_reasoning": 0.47608500719070435, "adv/mean_abs_step_conf": 0.7647526264190674, "adv/ratio_final_to_reasoning": 1.4672258653994343, "adv/ratio_step_to_reasoning": 1.6063362947128728, "adv/std_final_conf": 0.8931014537811279, "adv/std_reasoning": 0.7575433254241943, "adv/std_step_conf": 0.9361869692802429, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.16624, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.48, "calib/gap": 0.3310333333333335, "calib/mean_conf": 0.7463200000000001, "calib/mu_c": 0.8787333333333334, "calib/mu_w": 0.5476999999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.15628, "calib/std_conf": 0.27805765157607154, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7747361299052774, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.036880212238038945, "calib/step_q_w": 0.7378559176672385, "calib/step_q_w_n": 583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 527.42578125, "completions/mean_terminated_length": 529.494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.0672, "grad_norm": 2.1986336708068848, "kl": 0.0567779541015625, "learning_rate": 3.833333333333334e-06, "loss": 0.0487, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03372789919376373, "mask/share_reasoning": 0.8515186905860901, "mask/share_step_conf": 0.11084714531898499, "num_tokens": 15207356.0, "reward": 0.7147345542907715, "reward_std": 0.32877272367477417, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7955179810523987, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.32301366329193115, "step": 63 }, { "adv/mean_abs_final_conf": 0.7458844184875488, "adv/mean_abs_reasoning": 0.4996477961540222, "adv/mean_abs_step_conf": 0.7594409584999084, "adv/ratio_final_to_reasoning": 1.4928203911413258, "adv/ratio_step_to_reasoning": 1.5199525832908947, "adv/std_final_conf": 0.9173795580863953, "adv/std_reasoning": 0.7752702236175537, "adv/std_step_conf": 0.9364438652992249, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.98828125, "calib/ece": 0.16024000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.452, "calib/gap": 0.12790931611740852, "calib/mean_conf": 0.75176, "calib/mu_c": 0.7911560693641618, "calib/mu_w": 0.6632467532467533, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11000000000000004, "calib/std_conf": 0.26784193547687785, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7549272550921435, "calib/step_q_c_n": 1031.0, "calib/step_q_gap": 0.005903350709673405, "calib/step_q_w": 0.7490239043824701, "calib/step_q_w_n": 502.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 480.77734375, "completions/mean_terminated_length": 486.478271484375, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.06826666666666667, "grad_norm": 3.391042470932007, "kl": 0.06589508056640625, "learning_rate": 3.8055555555555556e-06, "loss": 0.0052, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03431929647922516, "mask/share_reasoning": 0.8302887678146362, "mask/share_step_conf": 0.1236732229590416, "num_tokens": 15434211.0, "reward": 0.6802728176116943, "reward_std": 0.34100550413131714, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7481226921081543, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2819541394710541, "step": 64 }, { "adv/mean_abs_final_conf": 0.5557708740234375, "adv/mean_abs_reasoning": 0.2861083149909973, "adv/mean_abs_step_conf": 0.7856212258338928, "adv/ratio_final_to_reasoning": 1.9425191261600536, "adv/ratio_step_to_reasoning": 2.745887430285321, "adv/std_final_conf": 0.7884252071380615, "adv/std_reasoning": 0.5725942254066467, "adv/std_step_conf": 0.9360816478729248, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 4.8828125, "calib/ece": 0.3342578125000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.83984375, "calib/gap": 0.06841227400050931, "calib/mean_conf": 0.9306640625, "calib/mu_c": 0.9579220779220778, "calib/mu_w": 0.8895098039215685, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33167968750000004, "calib/std_conf": 0.15547290646924977, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7886223862238624, "calib/step_q_c_n": 813.0, "calib/step_q_gap": 0.010956482333702167, "calib/step_q_w": 0.7776659038901602, "calib/step_q_w_n": 437.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 401.25390625, "completions/mean_terminated_length": 402.8274841308594, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.06933333333333333, "grad_norm": 1.4421082735061646, "kl": 0.06890869140625, "learning_rate": 3.777777777777778e-06, "loss": -0.0499, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04204752668738365, "mask/share_reasoning": 0.8294590711593628, "mask/share_step_conf": 0.12458708882331848, "num_tokens": 15641956.0, "reward": 0.6563754081726074, "reward_std": 0.2625671625137329, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6606301069259644, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.33180826902389526, "step": 65 }, { "adv/mean_abs_final_conf": 0.6745815277099609, "adv/mean_abs_reasoning": 0.5538687109947205, "adv/mean_abs_step_conf": 0.7744074463844299, "adv/ratio_final_to_reasoning": 1.217944820350018, "adv/ratio_step_to_reasoning": 1.398178721801477, "adv/std_final_conf": 0.8730720281600952, "adv/std_reasoning": 0.8098120093345642, "adv/std_step_conf": 0.936484694480896, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 6.8125, "calib/ece": 0.34854838709677416, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6653225806451613, "calib/gap": 0.176860139404599, "calib/mean_conf": 0.8275, "calib/mu_c": 0.9194957983193277, "calib/mu_w": 0.7426356589147287, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3481048387096774, "calib/std_conf": 0.27978858666108314, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7754331864904551, "calib/step_q_c_n": 681.0, "calib/step_q_gap": 0.07594166106672628, "calib/step_q_w": 0.6994915254237288, "calib/step_q_w_n": 1062.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3028.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 579.8359375, "completions/mean_terminated_length": 582.10986328125, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.0704, "grad_norm": 1.84421968460083, "kl": 0.058441162109375, "learning_rate": 3.7500000000000005e-06, "loss": -0.0156, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034632038325071335, "mask/share_reasoning": 0.8389459848403931, "mask/share_step_conf": 0.1225157380104065, "num_tokens": 15896746.0, "reward": 0.5241647958755493, "reward_std": 0.3743875026702881, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6195570230484009, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.14205381274223328, "step": 66 }, { "adv/mean_abs_final_conf": 0.5234067440032959, "adv/mean_abs_reasoning": 0.3804228603839874, "adv/mean_abs_step_conf": 0.7819041609764099, "adv/ratio_final_to_reasoning": 1.3758551299335293, "adv/ratio_step_to_reasoning": 2.055355349011306, "adv/std_final_conf": 0.7471958994865417, "adv/std_reasoning": 0.661289632320404, "adv/std_step_conf": 0.9361180067062378, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.68359375, "calib/ece": 0.24458498023715428, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7905138339920948, "calib/gap": 0.19568907563025206, "calib/mean_conf": 0.9078260869565218, "calib/mu_c": 0.9735714285714285, "calib/mu_w": 0.7778823529411765, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24418972332015823, "calib/std_conf": 0.18875611497385733, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7749007314524555, "calib/step_q_c_n": 957.0, "calib/step_q_gap": 0.015121614986592036, "calib/step_q_w": 0.7597791164658635, "calib/step_q_w_n": 498.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 504.49609375, "completions/mean_terminated_length": 506.47454833984375, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.07146666666666666, "grad_norm": 1.4049971103668213, "kl": 0.06395721435546875, "learning_rate": 3.7222222222222225e-06, "loss": 0.0566, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03399176895618439, "mask/share_reasoning": 0.842154324054718, "mask/share_step_conf": 0.11994767189025879, "num_tokens": 16130905.0, "reward": 0.7375431060791016, "reward_std": 0.28779536485671997, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7601422071456909, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.38603776693344116, "step": 67 }, { "adv/mean_abs_final_conf": 0.6347408890724182, "adv/mean_abs_reasoning": 0.5049393177032471, "adv/mean_abs_step_conf": 0.732918381690979, "adv/ratio_final_to_reasoning": 1.257063704129009, "adv/ratio_step_to_reasoning": 1.4514979443959943, "adv/std_final_conf": 0.8453660607337952, "adv/std_reasoning": 0.77528977394104, "adv/std_step_conf": 0.9214169383049011, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.55859375, "calib/ece": 0.3634000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.836, "calib/gap": 0.13891697452848528, "calib/mean_conf": 0.9193999999999999, "calib/mu_c": 0.9810791366906475, "calib/mu_w": 0.8421621621621622, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3634000000000001, "calib/std_conf": 0.1951123778749057, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7657393850658858, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.02913127695777762, "calib/step_q_w": 0.7366081081081082, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2430.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 495.12109375, "completions/mean_terminated_length": 495.12109375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.07253333333333334, "grad_norm": 1.468184471130371, "kl": 0.07402801513671875, "learning_rate": 3.694444444444445e-06, "loss": -0.0321, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.037854477763175964, "mask/share_reasoning": 0.8348889350891113, "mask/share_step_conf": 0.12725655734539032, "num_tokens": 16361744.0, "reward": 0.5645387172698975, "reward_std": 0.3623424172401428, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6343308687210083, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.19084039330482483, "step": 68 }, { "adv/mean_abs_final_conf": 0.6927582621574402, "adv/mean_abs_reasoning": 0.4724434018135071, "adv/mean_abs_step_conf": 0.7636895179748535, "adv/ratio_final_to_reasoning": 1.4663306959060898, "adv/ratio_step_to_reasoning": 1.6164677399311278, "adv/std_final_conf": 0.8652121424674988, "adv/std_reasoning": 0.7207993268966675, "adv/std_step_conf": 0.9366608262062073, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 5.515625, "calib/ece": 0.38693877551020406, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7183673469387755, "calib/gap": 0.16247395833333333, "calib/mean_conf": 0.8584489795918367, "calib/mu_c": 0.9433333333333334, "calib/mu_w": 0.780859375, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.38391836734693874, "calib/std_conf": 0.23909532393489044, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.782883895131086, "calib/step_q_c_n": 534.0, "calib/step_q_gap": 0.027872505609445986, "calib/step_q_w": 0.75501138952164, "calib/step_q_w_n": 878.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 575.27734375, "completions/mean_terminated_length": 584.4087524414062, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.0736, "grad_norm": 1.6111221313476562, "kl": 0.045558929443359375, "learning_rate": 3.6666666666666666e-06, "loss": -0.0662, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.032633282244205475, "mask/share_reasoning": 0.8495101928710938, "mask/share_step_conf": 0.10223151743412018, "num_tokens": 16613511.0, "reward": 0.5018537640571594, "reward_std": 0.37982651591300964, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.5945999622344971, "rewards/format_reward_step": 0.94921875, "rewards/step_correlation_reward": 0.12785755097866058, "step": 69 }, { "adv/mean_abs_final_conf": 0.5887276530265808, "adv/mean_abs_reasoning": 0.41501444578170776, "adv/mean_abs_step_conf": 0.7523496150970459, "adv/ratio_final_to_reasoning": 1.4185714714524515, "adv/ratio_step_to_reasoning": 1.8128275358703347, "adv/std_final_conf": 0.818272054195404, "adv/std_reasoning": 0.720436155796051, "adv/std_step_conf": 0.9363917708396912, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.7578125, "calib/ece": 0.2592741935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6411290322580645, "calib/gap": 0.33134273772204825, "calib/mean_conf": 0.7915322580645162, "calib/mu_c": 0.9465151515151515, "calib/mu_w": 0.6151724137931033, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2592741935483871, "calib/std_conf": 0.308895128761074, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7758091286307054, "calib/step_q_c_n": 723.0, "calib/step_q_gap": 0.04212071318463351, "calib/step_q_w": 0.7336884154460719, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2605.0, "completions/max_terminated_length": 2605.0, "completions/mean_length": 548.21875, "completions/mean_terminated_length": 550.36865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.07466666666666667, "grad_norm": 1.3070794343948364, "kl": 0.0540618896484375, "learning_rate": 3.638888888888889e-06, "loss": -0.0255, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034946221858263016, "mask/share_reasoning": 0.8387826085090637, "mask/share_step_conf": 0.12236493825912476, "num_tokens": 16860847.0, "reward": 0.6069908142089844, "reward_std": 0.30986493825912476, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7269101142883301, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.19097766280174255, "step": 70 }, { "adv/mean_abs_final_conf": 0.6929099559783936, "adv/mean_abs_reasoning": 0.6004456281661987, "adv/mean_abs_step_conf": 0.7805822491645813, "adv/ratio_final_to_reasoning": 1.153992840441835, "adv/ratio_step_to_reasoning": 1.3000048839534928, "adv/std_final_conf": 0.8895405530929565, "adv/std_reasoning": 0.8099554777145386, "adv/std_step_conf": 0.9366074204444885, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 6.27734375, "calib/ece": 0.2645381526104418, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6385542168674698, "calib/gap": 0.1272188875550222, "calib/mean_conf": 0.8086345381526104, "calib/mu_c": 0.8607482993197278, "calib/mu_w": 0.7335294117647057, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.24140562248995984, "calib/std_conf": 0.28106892738196115, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7678726114649681, "calib/step_q_c_n": 785.0, "calib/step_q_gap": 0.018712027523362318, "calib/step_q_w": 0.7491605839416058, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2448.0, "completions/max_terminated_length": 2448.0, "completions/mean_length": 509.8984375, "completions/mean_terminated_length": 511.8980712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.07573333333333333, "grad_norm": 1.1254944801330566, "kl": 0.052642822265625, "learning_rate": 3.6111111111111115e-06, "loss": -0.0241, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03587288036942482, "mask/share_reasoning": 0.8306654691696167, "mask/share_step_conf": 0.1295553594827652, "num_tokens": 17095789.0, "reward": 0.6199588179588318, "reward_std": 0.32760918140411377, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6662914156913757, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.2658137381076813, "step": 71 }, { "adv/mean_abs_final_conf": 0.6310667991638184, "adv/mean_abs_reasoning": 0.35013920068740845, "adv/mean_abs_step_conf": 0.7473705410957336, "adv/ratio_final_to_reasoning": 1.802331181212731, "adv/ratio_step_to_reasoning": 2.13449547959344, "adv/std_final_conf": 0.8359543085098267, "adv/std_reasoning": 0.6402967572212219, "adv/std_step_conf": 0.9362786412239075, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.390625, "calib/ece": 0.26565737051792826, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6135458167330677, "calib/gap": 0.3100805626598465, "calib/mean_conf": 0.8074900398406375, "calib/mu_c": 0.9495588235294117, "calib/mu_w": 0.6394782608695652, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.26565737051792826, "calib/std_conf": 0.28020282014769854, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7877134986225895, "calib/step_q_c_n": 726.0, "calib/step_q_gap": 0.035618697399347865, "calib/step_q_w": 0.7520948012232417, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2934.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 480.8125, "completions/mean_terminated_length": 480.8125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.0768, "grad_norm": 2.718691349029541, "kl": 0.05579376220703125, "learning_rate": 3.5833333333333335e-06, "loss": 0.0156, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033517055213451385, "mask/share_reasoning": 0.8466134071350098, "mask/share_step_conf": 0.11986956000328064, "num_tokens": 17323285.0, "reward": 0.6529368162155151, "reward_std": 0.2858799695968628, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7379339933395386, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2663770914077759, "step": 72 }, { "adv/mean_abs_final_conf": 0.7095045447349548, "adv/mean_abs_reasoning": 0.6011914610862732, "adv/mean_abs_step_conf": 0.7971264123916626, "adv/ratio_final_to_reasoning": 1.1801640420058102, "adv/ratio_step_to_reasoning": 1.325911068249973, "adv/std_final_conf": 0.8819000720977783, "adv/std_reasoning": 0.8265854120254517, "adv/std_step_conf": 0.9366573095321655, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.2890625, "calib/ece": 0.1800787401574803, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5196850393700787, "calib/gap": 0.26234734441186025, "calib/mean_conf": 0.7318110236220472, "calib/mu_c": 0.8340645161290321, "calib/mu_w": 0.5717171717171718, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1508267716535433, "calib/std_conf": 0.3114265306183273, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7930548926014319, "calib/step_q_c_n": 838.0, "calib/step_q_gap": 0.02328745074096683, "calib/step_q_w": 0.7697674418604651, "calib/step_q_w_n": 516.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2681.0, "completions/max_terminated_length": 2681.0, "completions/mean_length": 485.62109375, "completions/mean_terminated_length": 485.62109375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.07786666666666667, "grad_norm": 2.3058297634124756, "kl": 0.05544281005859375, "learning_rate": 3.555555555555556e-06, "loss": 0.0662, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03415922820568085, "mask/share_reasoning": 0.8481609225273132, "mask/share_step_conf": 0.11767985671758652, "num_tokens": 17554636.0, "reward": 0.6871067881584167, "reward_std": 0.34462395310401917, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7655718922615051, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.28989166021347046, "step": 73 }, { "adv/mean_abs_final_conf": 0.6680521965026855, "adv/mean_abs_reasoning": 0.532854437828064, "adv/mean_abs_step_conf": 0.7628600597381592, "adv/ratio_final_to_reasoning": 1.2537236233326554, "adv/ratio_step_to_reasoning": 1.4316481304868311, "adv/std_final_conf": 0.8794238567352295, "adv/std_reasoning": 0.7927346229553223, "adv/std_step_conf": 0.936627984046936, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 5.578125, "calib/ece": 0.15714859437751, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.40963855421686746, "calib/gap": 0.33644774193548393, "calib/mean_conf": 0.6229317269076305, "calib/mu_c": 0.7904800000000002, "calib/mu_w": 0.45403225806451625, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1390361445783132, "calib/std_conf": 0.3521680403899794, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7759126365054603, "calib/step_q_c_n": 641.0, "calib/step_q_gap": 0.03545520321448181, "calib/step_q_w": 0.7404574332909785, "calib/step_q_w_n": 787.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2428.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 492.03515625, "completions/mean_terminated_length": 492.03515625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.07893333333333333, "grad_norm": 2.130279541015625, "kl": 0.06341552734375, "learning_rate": 3.5277777777777784e-06, "loss": 0.0535, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03629348799586296, "mask/share_reasoning": 0.8359097242355347, "mask/share_step_conf": 0.12779682874679565, "num_tokens": 17784525.0, "reward": 0.6383036375045776, "reward_std": 0.3283129930496216, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7582637071609497, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.226156085729599, "step": 74 }, { "adv/mean_abs_final_conf": 0.6502747535705566, "adv/mean_abs_reasoning": 0.39697492122650146, "adv/mean_abs_step_conf": 0.7453141212463379, "adv/ratio_final_to_reasoning": 1.6380751498393276, "adv/ratio_step_to_reasoning": 1.8774841467153665, "adv/std_final_conf": 0.855506181716919, "adv/std_reasoning": 0.7013623118400574, "adv/std_step_conf": 0.9365238547325134, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.875, "calib/ece": 0.06317269076305225, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5341365461847389, "calib/gap": 0.43237439613526585, "calib/mean_conf": 0.7302409638554217, "calib/mu_c": 0.8500555555555557, "calib/mu_w": 0.4176811594202898, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.035261044176706856, "calib/std_conf": 0.32481434636897294, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7744477028347996, "calib/step_q_c_n": 1023.0, "calib/step_q_gap": 0.025341673728770497, "calib/step_q_w": 0.7491060291060291, "calib/step_q_w_n": 481.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 471.84765625, "completions/mean_terminated_length": 473.69805908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.08, "grad_norm": 2.2590935230255127, "kl": 0.08824920654296875, "learning_rate": 3.5e-06, "loss": 0.0566, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.036440733820199966, "mask/share_reasoning": 0.8240508437156677, "mask/share_step_conf": 0.13560213148593903, "num_tokens": 18010070.0, "reward": 0.7897834777832031, "reward_std": 0.2697499990463257, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.8383581638336182, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.4068337380886078, "step": 75 }, { "adv/mean_abs_final_conf": 0.6710430383682251, "adv/mean_abs_reasoning": 0.48223674297332764, "adv/mean_abs_step_conf": 0.7656504511833191, "adv/ratio_final_to_reasoning": 1.3915220027216804, "adv/ratio_step_to_reasoning": 1.587706582585448, "adv/std_final_conf": 0.8715046048164368, "adv/std_reasoning": 0.7392695546150208, "adv/std_step_conf": 0.9366810917854309, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.1953125, "calib/ece": 0.17326771653543302, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.38976377952755903, "calib/gap": 0.34802981029810304, "calib/mean_conf": 0.5732677165354331, "calib/mu_c": 0.6965853658536586, "calib/mu_w": 0.34855555555555556, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05043307086614171, "calib/std_conf": 0.3677404285852259, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7817391304347825, "calib/step_q_c_n": 782.0, "calib/step_q_gap": 0.039421612186607224, "calib/step_q_w": 0.7423175182481753, "calib/step_q_w_n": 548.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 475.54296875, "completions/mean_terminated_length": 477.4078674316406, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.08106666666666666, "grad_norm": 4.335661888122559, "kl": 0.5023956298828125, "learning_rate": 3.4722222222222224e-06, "loss": 0.0026, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03703823685646057, "mask/share_reasoning": 0.8404496908187866, "mask/share_step_conf": 0.11860580742359161, "num_tokens": 18234865.0, "reward": 0.7124652862548828, "reward_std": 0.28656065464019775, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7838175892829895, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.314550518989563, "step": 76 }, { "adv/mean_abs_final_conf": 0.7689943313598633, "adv/mean_abs_reasoning": 0.45446068048477173, "adv/mean_abs_step_conf": 0.7774375677108765, "adv/ratio_final_to_reasoning": 1.6921031111857237, "adv/ratio_step_to_reasoning": 1.710681696118543, "adv/std_final_conf": 0.9220561981201172, "adv/std_reasoning": 0.7014573812484741, "adv/std_step_conf": 0.9365091919898987, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.953125, "calib/ece": 0.30090909090909096, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.18972332015810275, "calib/gap": 0.1746666666666667, "calib/mean_conf": 0.43482213438735173, "calib/mu_c": 0.4955757575757576, "calib/mu_w": 0.3209090909090909, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.041778656126482225, "calib/std_conf": 0.33027546890417286, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7638888888888888, "calib/step_q_c_n": 900.0, "calib/step_q_gap": 0.009722222222222188, "calib/step_q_w": 0.7541666666666667, "calib/step_q_w_n": 624.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2553.0, "completions/max_terminated_length": 2553.0, "completions/mean_length": 494.52734375, "completions/mean_terminated_length": 494.52734375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.08213333333333334, "grad_norm": 2.8443989753723145, "kl": 0.06838226318359375, "learning_rate": 3.444444444444445e-06, "loss": 0.1167, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0373874232172966, "mask/share_reasoning": 0.8274275660514832, "mask/share_step_conf": 0.13518501818180084, "num_tokens": 18466128.0, "reward": 0.6740167140960693, "reward_std": 0.2678437829017639, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6879199743270874, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3335510194301605, "step": 77 }, { "adv/mean_abs_final_conf": 0.7825330495834351, "adv/mean_abs_reasoning": 0.55494225025177, "adv/mean_abs_step_conf": 0.7956753373146057, "adv/ratio_final_to_reasoning": 1.4101161863750868, "adv/ratio_step_to_reasoning": 1.4337984483135287, "adv/std_final_conf": 0.9336245656013489, "adv/std_reasoning": 0.7928445339202881, "adv/std_step_conf": 0.9366925358772278, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.515625, "calib/ece": 0.20328000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.196, "calib/gap": 0.23339532376524486, "calib/mean_conf": 0.48624, "calib/mu_c": 0.576797385620915, "calib/mu_w": 0.3434020618556701, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.03876000000000006, "calib/std_conf": 0.3240152193956327, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8028456221198157, "calib/step_q_c_n": 868.0, "calib/step_q_gap": 0.026834592708050886, "calib/step_q_w": 0.7760110294117648, "calib/step_q_w_n": 544.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2547.0, "completions/max_terminated_length": 2547.0, "completions/mean_length": 519.40625, "completions/mean_terminated_length": 519.40625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.0832, "grad_norm": 1.9317562580108643, "kl": 0.065826416015625, "learning_rate": 3.416666666666667e-06, "loss": 0.0532, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032831624150276184, "mask/share_reasoning": 0.8527945280075073, "mask/share_step_conf": 0.1143738180398941, "num_tokens": 18707120.0, "reward": 0.6522242426872253, "reward_std": 0.2948106527328491, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.729086697101593, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.260517954826355, "step": 78 }, { "adv/mean_abs_final_conf": 0.7491778135299683, "adv/mean_abs_reasoning": 0.4690690040588379, "adv/mean_abs_step_conf": 0.801034688949585, "adv/ratio_final_to_reasoning": 1.597159068383028, "adv/ratio_step_to_reasoning": 1.7077118334792099, "adv/std_final_conf": 0.9215131998062134, "adv/std_reasoning": 0.7392135858535767, "adv/std_step_conf": 0.9365547895431519, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.76953125, "calib/ece": 0.2438735177865613, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2134387351778656, "calib/gap": 0.20133881975110407, "calib/mean_conf": 0.5038735177865613, "calib/mu_c": 0.578679245283019, "calib/mu_w": 0.3773404255319149, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05964426877470358, "calib/std_conf": 0.3364814682395642, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7924321389793703, "calib/step_q_c_n": 921.0, "calib/step_q_gap": 0.01029185120958609, "calib/step_q_w": 0.7821402877697842, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2201.0, "completions/max_terminated_length": 2201.0, "completions/mean_length": 518.52734375, "completions/mean_terminated_length": 520.560791015625, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.08426666666666667, "grad_norm": 3.054436206817627, "kl": 0.06148529052734375, "learning_rate": 3.3888888888888893e-06, "loss": -0.0115, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03253207355737686, "mask/share_reasoning": 0.8500226736068726, "mask/share_step_conf": 0.11353901028633118, "num_tokens": 18946239.0, "reward": 0.629299521446228, "reward_std": 0.276027649641037, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7232093811035156, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.21273338794708252, "step": 79 }, { "adv/mean_abs_final_conf": 0.6969304084777832, "adv/mean_abs_reasoning": 0.49512577056884766, "adv/mean_abs_step_conf": 0.7704160213470459, "adv/ratio_final_to_reasoning": 1.4075825778106503, "adv/ratio_step_to_reasoning": 1.5560006510303808, "adv/std_final_conf": 0.8863787651062012, "adv/std_reasoning": 0.7393211722373962, "adv/std_step_conf": 0.9365900158882141, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 6.21875, "calib/ece": 0.19215686274509802, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.4549019607843137, "calib/gap": 0.2739028275587415, "calib/mean_conf": 0.6703529411764706, "calib/mu_c": 0.770246913580247, "calib/mu_w": 0.4963440860215055, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11360784313725489, "calib/std_conf": 0.3431541598076734, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8013428280773144, "calib/step_q_c_n": 983.0, "calib/step_q_gap": -0.007212180132866286, "calib/step_q_w": 0.8085550082101807, "calib/step_q_w_n": 609.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2488.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 483.17578125, "completions/mean_terminated_length": 483.17578125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.08533333333333333, "grad_norm": 1.5569194555282593, "kl": 0.06230926513671875, "learning_rate": 3.3611111111111117e-06, "loss": 0.0417, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03493821993470192, "mask/share_reasoning": 0.830466091632843, "mask/share_step_conf": 0.13459573686122894, "num_tokens": 19172092.0, "reward": 0.6752669215202332, "reward_std": 0.279674768447876, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7732125520706177, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.25154006481170654, "step": 80 }, { "adv/mean_abs_final_conf": 0.6821689009666443, "adv/mean_abs_reasoning": 0.4455212950706482, "adv/mean_abs_step_conf": 0.7899991273880005, "adv/ratio_final_to_reasoning": 1.5311701337608787, "adv/ratio_step_to_reasoning": 1.773201721508569, "adv/std_final_conf": 0.8754222989082336, "adv/std_reasoning": 0.7014862895011902, "adv/std_step_conf": 0.9367194175720215, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 5.35546875, "calib/ece": 0.14544354838709686, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4596774193548387, "calib/gap": 0.36070340136054424, "calib/mean_conf": 0.6563306451612904, "calib/mu_c": 0.7988666666666667, "calib/mu_w": 0.4381632653061225, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.09846774193548394, "calib/std_conf": 0.3605638555090953, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.8175808720112517, "calib/step_q_c_n": 711.0, "calib/step_q_gap": 0.04571723564761543, "calib/step_q_w": 0.7718636363636363, "calib/step_q_w_n": 660.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2503.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 517.40625, "completions/mean_terminated_length": 521.4802856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.0864, "grad_norm": 1.4852168560028076, "kl": 0.06805419921875, "learning_rate": 3.3333333333333333e-06, "loss": -0.0203, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03399985283613205, "mask/share_reasoning": 0.8429831862449646, "mask/share_step_conf": 0.11520448327064514, "num_tokens": 19410796.0, "reward": 0.6309086084365845, "reward_std": 0.31020477414131165, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7757323980331421, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.1743660271167755, "step": 81 }, { "adv/mean_abs_final_conf": 0.6997003555297852, "adv/mean_abs_reasoning": 0.5604740381240845, "adv/mean_abs_step_conf": 0.7764509320259094, "adv/ratio_final_to_reasoning": 1.2484081472742135, "adv/ratio_step_to_reasoning": 1.385346830023926, "adv/std_final_conf": 0.8651977777481079, "adv/std_reasoning": 0.809860110282898, "adv/std_step_conf": 0.9366908669471741, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 5.76171875, "calib/ece": 0.17492000000000008, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.584, "calib/gap": 0.35077249852950787, "calib/mean_conf": 0.73924, "calib/mu_c": 0.8893706293706293, "calib/mu_w": 0.5385981308411214, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17108000000000007, "calib/std_conf": 0.3327074126015229, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8289730423620025, "calib/step_q_c_n": 779.0, "calib/step_q_gap": 0.028326490637864632, "calib/step_q_w": 0.8006465517241379, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2646.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 513.19921875, "completions/mean_terminated_length": 513.19921875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.08746666666666666, "grad_norm": 1.382436752319336, "kl": 0.054737091064453125, "learning_rate": 3.3055555555555558e-06, "loss": -0.0047, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035021133720874786, "mask/share_reasoning": 0.8393304347991943, "mask/share_step_conf": 0.1256483793258667, "num_tokens": 19647727.0, "reward": 0.6758458614349365, "reward_std": 0.3302364945411682, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7658421993255615, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.27959945797920227, "step": 82 }, { "adv/mean_abs_final_conf": 0.6789208650588989, "adv/mean_abs_reasoning": 0.4090383052825928, "adv/mean_abs_step_conf": 0.7418469190597534, "adv/ratio_final_to_reasoning": 1.6597977653702922, "adv/ratio_step_to_reasoning": 1.8136367901955608, "adv/std_final_conf": 0.8657461404800415, "adv/std_reasoning": 0.7204188108444214, "adv/std_step_conf": 0.9365704655647278, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 5.47265625, "calib/ece": 0.23223107569721108, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5896414342629482, "calib/gap": 0.307012312427857, "calib/mean_conf": 0.7480876494023906, "calib/mu_c": 0.886304347826087, "calib/mu_w": 0.57929203539823, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2152589641434262, "calib/std_conf": 0.3327483170758125, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.830485294117647, "calib/step_q_c_n": 680.0, "calib/step_q_gap": 0.04520096679448482, "calib/step_q_w": 0.7852843273231622, "calib/step_q_w_n": 721.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 582.22265625, "completions/mean_terminated_length": 582.22265625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.08853333333333334, "grad_norm": 1.319812297821045, "kl": 0.056301116943359375, "learning_rate": 3.277777777777778e-06, "loss": 0.0761, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032107748091220856, "mask/share_reasoning": 0.8575824499130249, "mask/share_step_conf": 0.11030974984169006, "num_tokens": 19904040.0, "reward": 0.6097712516784668, "reward_std": 0.3136539161205292, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7396894693374634, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.17594672739505768, "step": 83 }, { "adv/mean_abs_final_conf": 0.6824579834938049, "adv/mean_abs_reasoning": 0.514862596988678, "adv/mean_abs_step_conf": 0.7856142520904541, "adv/ratio_final_to_reasoning": 1.3255147829447251, "adv/ratio_step_to_reasoning": 1.5258716727246164, "adv/std_final_conf": 0.858221173286438, "adv/std_reasoning": 0.7575433254241943, "adv/std_step_conf": 0.9367170929908752, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 4.828125, "calib/ece": 0.29371999999999987, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.696, "calib/gap": 0.2985152979066026, "calib/mean_conf": 0.83372, "calib/mu_c": 0.9710370370370373, "calib/mu_w": 0.6725217391304347, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.29371999999999987, "calib/std_conf": 0.2792521469926418, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8312995896032831, "calib/step_q_c_n": 731.0, "calib/step_q_gap": 0.0063490945537781895, "calib/step_q_w": 0.8249504950495049, "calib/step_q_w_n": 505.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 460.80078125, "completions/mean_terminated_length": 466.26483154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.0896, "grad_norm": 1.2343428134918213, "kl": 0.06479644775390625, "learning_rate": 3.2500000000000002e-06, "loss": 0.0058, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0363972969353199, "mask/share_reasoning": 0.8376258015632629, "mask/share_step_conf": 0.11425817757844925, "num_tokens": 20127925.0, "reward": 0.6092539429664612, "reward_std": 0.3542028069496155, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.718407392501831, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.19931918382644653, "step": 84 }, { "adv/mean_abs_final_conf": 0.7023517489433289, "adv/mean_abs_reasoning": 0.5211501717567444, "adv/mean_abs_step_conf": 0.7787065505981445, "adv/ratio_final_to_reasoning": 1.3476955146647507, "adv/ratio_step_to_reasoning": 1.4942076061746343, "adv/std_final_conf": 0.8711166977882385, "adv/std_reasoning": 0.7753753662109375, "adv/std_step_conf": 0.9367332458496094, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 5.34765625, "calib/ece": 0.26643724696356275, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7165991902834008, "calib/gap": 0.28331478537360877, "calib/mean_conf": 0.8135627530364373, "calib/mu_c": 0.9408823529411764, "calib/mu_w": 0.6575675675675676, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.26469635627530363, "calib/std_conf": 0.308824759195269, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8370168067226891, "calib/step_q_c_n": 714.0, "calib/step_q_gap": 0.021322150234139525, "calib/step_q_w": 0.8156946564885496, "calib/step_q_w_n": 655.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2864.0, "completions/max_terminated_length": 2864.0, "completions/mean_length": 561.671875, "completions/mean_terminated_length": 566.094482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.09066666666666667, "grad_norm": 1.2337369918823242, "kl": 0.052219390869140625, "learning_rate": 3.2222222222222227e-06, "loss": 0.0538, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03350468724966049, "mask/share_reasoning": 0.8466283082962036, "mask/share_step_conf": 0.1120544970035553, "num_tokens": 20379537.0, "reward": 0.6182559728622437, "reward_std": 0.38047146797180176, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6987718343734741, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.23930248618125916, "step": 85 }, { "adv/mean_abs_final_conf": 0.6275197863578796, "adv/mean_abs_reasoning": 0.42547503113746643, "adv/mean_abs_step_conf": 0.760039746761322, "adv/ratio_final_to_reasoning": 1.474868653702818, "adv/ratio_step_to_reasoning": 1.7863321961089682, "adv/std_final_conf": 0.8319432139396667, "adv/std_reasoning": 0.7013487815856934, "adv/std_step_conf": 0.9365622997283936, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.46875, "calib/ece": 0.34464566929133866, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7519685039370079, "calib/gap": 0.18054614286600867, "calib/mean_conf": 0.8541732283464567, "calib/mu_c": 0.9416030534351144, "calib/mu_w": 0.7610569105691057, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3415354330708662, "calib/std_conf": 0.27537645226921803, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8309948320413437, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.005339879964666339, "calib/step_q_w": 0.8256549520766774, "calib/step_q_w_n": 626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 510.06640625, "completions/mean_terminated_length": 510.06640625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.09173333333333333, "grad_norm": 1.5324760675430298, "kl": 0.06156158447265625, "learning_rate": 3.1944444444444443e-06, "loss": 0.08, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03464901074767113, "mask/share_reasoning": 0.8442748785018921, "mask/share_step_conf": 0.12107609957456589, "num_tokens": 20615626.0, "reward": 0.5600494146347046, "reward_std": 0.3438895344734192, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6441769599914551, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.1759217530488968, "step": 86 }, { "adv/mean_abs_final_conf": 0.5247563123703003, "adv/mean_abs_reasoning": 0.391268789768219, "adv/mean_abs_step_conf": 0.7580701112747192, "adv/ratio_final_to_reasoning": 1.3411657819197822, "adv/ratio_step_to_reasoning": 1.9374663430829409, "adv/std_final_conf": 0.7688370943069458, "adv/std_reasoning": 0.7013257145881653, "adv/std_step_conf": 0.9364421963691711, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 4.9453125, "calib/ece": 0.2661904761904762, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9047619047619048, "calib/gap": 0.05209400046688972, "calib/mean_conf": 0.9476984126984128, "calib/mu_c": 0.9623756906077348, "calib/mu_w": 0.910281690140845, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24781746031746033, "calib/std_conf": 0.171772564185915, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8463071512309495, "calib/step_q_c_n": 853.0, "calib/step_q_gap": 0.019551703289060973, "calib/step_q_w": 0.8267554479418885, "calib/step_q_w_n": 413.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1905.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 473.49609375, "completions/mean_terminated_length": 475.35296630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.0928, "grad_norm": 0.6942335367202759, "kl": 0.06700897216796875, "learning_rate": 3.1666666666666667e-06, "loss": -0.0102, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03588268905878067, "mask/share_reasoning": 0.8471142649650574, "mask/share_step_conf": 0.11309675872325897, "num_tokens": 20842337.0, "reward": 0.7081435918807983, "reward_std": 0.3356953263282776, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7250593900680542, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3529464900493622, "step": 87 }, { "adv/mean_abs_final_conf": 0.5471068024635315, "adv/mean_abs_reasoning": 0.4318411946296692, "adv/mean_abs_step_conf": 0.7614994645118713, "adv/ratio_final_to_reasoning": 1.2669166565563754, "adv/ratio_step_to_reasoning": 1.7633784687097875, "adv/std_final_conf": 0.7545063495635986, "adv/std_reasoning": 0.7013445496559143, "adv/std_step_conf": 0.9367469549179077, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.48046875, "calib/ece": 0.24741176470588236, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8470588235294118, "calib/gap": 0.1924749373433584, "calib/mean_conf": 0.9180000000000001, "calib/mu_c": 0.9814035087719298, "calib/mu_w": 0.7889285714285714, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24741176470588236, "calib/std_conf": 0.20403075508267823, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.84220696937698, "calib/step_q_c_n": 947.0, "calib/step_q_gap": 0.012930653587506447, "calib/step_q_w": 0.8292763157894736, "calib/step_q_w_n": 456.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 510.6015625, "completions/mean_terminated_length": 512.6039428710938, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.09386666666666667, "grad_norm": 0.9162920117378235, "kl": 0.06510162353515625, "learning_rate": 3.138888888888889e-06, "loss": 0.0351, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03183043748140335, "mask/share_reasoning": 0.8538945913314819, "mask/share_step_conf": 0.1103687658905983, "num_tokens": 21082899.0, "reward": 0.7086239457130432, "reward_std": 0.3570191562175751, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.750508189201355, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.33548974990844727, "step": 88 }, { "adv/mean_abs_final_conf": 0.6150949597358704, "adv/mean_abs_reasoning": 0.4049031436443329, "adv/mean_abs_step_conf": 0.7896219491958618, "adv/ratio_final_to_reasoning": 1.5191162859337295, "adv/ratio_step_to_reasoning": 1.9501502065132548, "adv/std_final_conf": 0.8125619292259216, "adv/std_reasoning": 0.6612932085990906, "adv/std_step_conf": 0.9367706775665283, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.36328125, "calib/ece": 0.38310483870967743, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8024193548387096, "calib/gap": 0.14812259787636006, "calib/mean_conf": 0.9032661290322581, "calib/mu_c": 0.9743410852713179, "calib/mu_w": 0.8262184873949578, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.38310483870967743, "calib/std_conf": 0.21250668334351452, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.8440178571428573, "calib/step_q_c_n": 672.0, "calib/step_q_gap": 0.038383049724882934, "calib/step_q_w": 0.8056348074179743, "calib/step_q_w_n": 701.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2851.0, "completions/max_terminated_length": 2851.0, "completions/mean_length": 547.0390625, "completions/mean_terminated_length": 551.346435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.09493333333333333, "grad_norm": 1.050075650215149, "kl": 0.060760498046875, "learning_rate": 3.1111111111111116e-06, "loss": -0.0545, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03269446641206741, "mask/share_reasoning": 0.8474637269973755, "mask/share_step_conf": 0.11202934384346008, "num_tokens": 21331829.0, "reward": 0.5130444765090942, "reward_std": 0.356639564037323, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6114406585693359, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.1208982989192009, "step": 89 }, { "adv/mean_abs_final_conf": 0.6150104403495789, "adv/mean_abs_reasoning": 0.4428023099899292, "adv/mean_abs_step_conf": 0.7560378909111023, "adv/ratio_final_to_reasoning": 1.3889052213019535, "adv/ratio_step_to_reasoning": 1.7073937372374979, "adv/std_final_conf": 0.8170931339263916, "adv/std_reasoning": 0.7206278443336487, "adv/std_step_conf": 0.9366317987442017, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 6.02734375, "calib/ece": 0.31274900398406374, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.8406374501992032, "calib/gap": 0.1404973009446694, "calib/mean_conf": 0.9234262948207171, "calib/mu_c": 0.9766025641025641, "calib/mu_w": 0.8361052631578947, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3073306772908367, "calib/std_conf": 0.19756942787408874, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8255796316359697, "calib/step_q_c_n": 923.0, "calib/step_q_gap": 0.01099898647467934, "calib/step_q_w": 0.8145806451612904, "calib/step_q_w_n": 620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2530.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 530.96484375, "completions/mean_terminated_length": 533.047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.096, "grad_norm": 0.9591420292854309, "kl": 0.0709228515625, "learning_rate": 3.0833333333333336e-06, "loss": 0.0511, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03326074779033661, "mask/share_reasoning": 0.8390543460845947, "mask/share_step_conf": 0.12377868592739105, "num_tokens": 21571076.0, "reward": 0.6548340320587158, "reward_std": 0.34971415996551514, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6834644675254822, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.3090159595012665, "step": 90 }, { "adv/mean_abs_final_conf": 0.6077797412872314, "adv/mean_abs_reasoning": 0.5076763033866882, "adv/mean_abs_step_conf": 0.7608697414398193, "adv/ratio_final_to_reasoning": 1.197179654107858, "adv/ratio_step_to_reasoning": 1.4987300694637269, "adv/std_final_conf": 0.8183364272117615, "adv/std_reasoning": 0.7576113343238831, "adv/std_step_conf": 0.9366979002952576, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 6.22265625, "calib/ece": 0.27914285714285714, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8612244897959184, "calib/gap": 0.14485507246376816, "calib/mean_conf": 0.9254285714285715, "calib/mu_c": 0.9750931677018634, "calib/mu_w": 0.8302380952380952, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.2737142857142857, "calib/std_conf": 0.20326598639341947, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.8328809523809524, "calib/step_q_c_n": 840.0, "calib/step_q_gap": 0.04329263896793789, "calib/step_q_w": 0.7895883134130145, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2958.0, "completions/max_terminated_length": 2958.0, "completions/mean_length": 577.6953125, "completions/mean_terminated_length": 582.2440795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 239.0, "epoch": 0.09706666666666666, "grad_norm": 1.0348371267318726, "kl": 0.06566619873046875, "learning_rate": 3.055555555555556e-06, "loss": 0.0446, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.029645482078194618, "mask/share_reasoning": 0.8496586084365845, "mask/share_step_conf": 0.11288341879844666, "num_tokens": 21826678.0, "reward": 0.6462156772613525, "reward_std": 0.379304975271225, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6915441751480103, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.2844810485839844, "step": 91 }, { "adv/mean_abs_final_conf": 0.6056889295578003, "adv/mean_abs_reasoning": 0.38807982206344604, "adv/mean_abs_step_conf": 0.76021409034729, "adv/ratio_final_to_reasoning": 1.5607328573212393, "adv/ratio_step_to_reasoning": 1.9589116648868306, "adv/std_final_conf": 0.8189154863357544, "adv/std_reasoning": 0.6612099409103394, "adv/std_step_conf": 0.9368066787719727, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.1484375, "calib/ece": 0.3374509803921567, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8196078431372549, "calib/gap": 0.1550869236583522, "calib/mean_conf": 0.9112549019607843, "calib/mu_c": 0.9769387755102041, "calib/mu_w": 0.8218518518518519, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.33611764705882335, "calib/std_conf": 0.20149829397781704, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8382225237449118, "calib/step_q_c_n": 737.0, "calib/step_q_gap": 0.014952299992760376, "calib/step_q_w": 0.8232702237521514, "calib/step_q_w_n": 581.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 483.61328125, "completions/mean_terminated_length": 485.50982666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.09813333333333334, "grad_norm": 56.60649108886719, "kl": 4.22686767578125, "learning_rate": 3.0277777777777776e-06, "loss": 0.0419, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03396128863096237, "mask/share_reasoning": 0.8479499816894531, "mask/share_step_conf": 0.11418244242668152, "num_tokens": 22057203.0, "reward": 0.6557959914207458, "reward_std": 0.3515685796737671, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6726886630058289, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.325622022151947, "step": 92 }, { "adv/mean_abs_final_conf": 0.6063660979270935, "adv/mean_abs_reasoning": 0.45978569984436035, "adv/mean_abs_step_conf": 0.7917192578315735, "adv/ratio_final_to_reasoning": 1.3188015593620057, "adv/ratio_step_to_reasoning": 1.721931017209918, "adv/std_final_conf": 0.7995532751083374, "adv/std_reasoning": 0.7206925749778748, "adv/std_step_conf": 0.9366562962532043, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 6.40625, "calib/ece": 0.34852589641434256, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8247011952191236, "calib/gap": 0.10706126687435102, "calib/mean_conf": 0.9188047808764941, "calib/mu_c": 0.9644444444444444, "calib/mu_w": 0.8573831775700934, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34681274900398396, "calib/std_conf": 0.19090726388337884, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8218408551068884, "calib/step_q_c_n": 842.0, "calib/step_q_gap": 0.019271932801123937, "calib/step_q_w": 0.8025689223057645, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3018.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 558.96875, "completions/mean_terminated_length": 558.96875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.0992, "grad_norm": 0.8443036079406738, "kl": 0.0611724853515625, "learning_rate": 3e-06, "loss": 0.0288, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03264810889959335, "mask/share_reasoning": 0.837859034538269, "mask/share_step_conf": 0.1294928789138794, "num_tokens": 22306075.0, "reward": 0.5780031681060791, "reward_std": 0.3760235905647278, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6395211219787598, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.20789138972759247, "step": 93 }, { "adv/mean_abs_final_conf": 0.661847710609436, "adv/mean_abs_reasoning": 0.4413014054298401, "adv/mean_abs_step_conf": 0.7869787812232971, "adv/ratio_final_to_reasoning": 1.4997634325790954, "adv/ratio_step_to_reasoning": 1.7833135619787057, "adv/std_final_conf": 0.8583675026893616, "adv/std_reasoning": 0.7205637097358704, "adv/std_step_conf": 0.9367392659187317, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.71484375, "calib/ece": 0.33620000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.776, "calib/gap": 0.14432716815894397, "calib/mean_conf": 0.8962, "calib/mu_c": 0.9579720279720281, "calib/mu_w": 0.8136448598130841, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.33020000000000005, "calib/std_conf": 0.21095108437739782, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8321834625322998, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.03166096616074676, "calib/step_q_w": 0.800522496371553, "calib/step_q_w_n": 689.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2650.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 530.57421875, "completions/mean_terminated_length": 530.57421875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.10026666666666667, "grad_norm": 0.7442113161087036, "kl": 0.06824493408203125, "learning_rate": 2.9722222222222225e-06, "loss": 0.0738, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0326605960726738, "mask/share_reasoning": 0.8487422466278076, "mask/share_step_conf": 0.11859709024429321, "num_tokens": 22550582.0, "reward": 0.5864527225494385, "reward_std": 0.3541906476020813, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6603956818580627, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2054784595966339, "step": 94 }, { "adv/mean_abs_final_conf": 0.6467571258544922, "adv/mean_abs_reasoning": 0.49849826097488403, "adv/mean_abs_step_conf": 0.7880775928497314, "adv/ratio_final_to_reasoning": 1.2974109971610872, "adv/ratio_step_to_reasoning": 1.580903394343912, "adv/std_final_conf": 0.8548763990402222, "adv/std_reasoning": 0.7574633359909058, "adv/std_step_conf": 0.9366299510002136, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 6.05859375, "calib/ece": 0.2860629921259841, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7598425196850394, "calib/gap": 0.18047702834799606, "calib/mean_conf": 0.8795275590551181, "calib/mu_c": 0.9498709677419355, "calib/mu_w": 0.7693939393939394, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2776771653543305, "calib/std_conf": 0.24073332279737553, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8096130346232179, "calib/step_q_c_n": 982.0, "calib/step_q_gap": 0.018857322848173874, "calib/step_q_w": 0.790755711775044, "calib/step_q_w_n": 569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 514.2109375, "completions/mean_terminated_length": 516.2274780273438, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.10133333333333333, "grad_norm": 0.9038649797439575, "kl": 0.06461334228515625, "learning_rate": 2.944444444444445e-06, "loss": 0.0013, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.031866710633039474, "mask/share_reasoning": 0.8381522297859192, "mask/share_step_conf": 0.12607482075691223, "num_tokens": 22788348.0, "reward": 0.656410813331604, "reward_std": 0.37393712997436523, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.711928129196167, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.28136223554611206, "step": 95 }, { "adv/mean_abs_final_conf": 0.5371603965759277, "adv/mean_abs_reasoning": 0.36885321140289307, "adv/mean_abs_step_conf": 0.7852956056594849, "adv/ratio_final_to_reasoning": 1.4562985490431182, "adv/ratio_step_to_reasoning": 2.129019299229356, "adv/std_final_conf": 0.7626716494560242, "adv/std_reasoning": 0.6403284668922424, "adv/std_step_conf": 0.9367201328277588, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.18372549019607845, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": 0.226942084942085, "calib/mean_conf": 0.9092156862745098, "calib/mu_c": 0.9715135135135136, "calib/mu_w": 0.7445714285714286, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18372549019607845, "calib/std_conf": 0.19535631736479103, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.819140127388535, "calib/step_q_c_n": 942.0, "calib/step_q_gap": -0.006423975175567587, "calib/step_q_w": 0.8255641025641026, "calib/step_q_w_n": 390.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 472.27734375, "completions/mean_terminated_length": 472.27734375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.1024, "grad_norm": 0.8621717691421509, "kl": 0.07381439208984375, "learning_rate": 2.916666666666667e-06, "loss": 0.0203, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03368357568979263, "mask/share_reasoning": 0.8493810892105103, "mask/share_step_conf": 0.1169353723526001, "num_tokens": 23015067.0, "reward": 0.7415355443954468, "reward_std": 0.3293728530406952, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.8160414099693298, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.32406085729599, "step": 96 }, { "adv/mean_abs_final_conf": 0.7130811214447021, "adv/mean_abs_reasoning": 0.5386027693748474, "adv/mean_abs_step_conf": 0.778464674949646, "adv/ratio_final_to_reasoning": 1.323946258710052, "adv/ratio_step_to_reasoning": 1.4453410179327608, "adv/std_final_conf": 0.9063007235527039, "adv/std_reasoning": 0.8098030686378479, "adv/std_step_conf": 0.9367033839225769, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 5.953125, "calib/ece": 0.2570517928286853, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6374501992031872, "calib/gap": 0.19428375196232361, "calib/mean_conf": 0.8254183266932271, "calib/mu_c": 0.9059183673469389, "calib/mu_w": 0.7116346153846153, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24840637450199207, "calib/std_conf": 0.2610462482547438, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7994057142857143, "calib/step_q_c_n": 875.0, "calib/step_q_gap": 0.017371815980629535, "calib/step_q_w": 0.7820338983050847, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 502.62109375, "completions/mean_terminated_length": 504.5921936035156, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.10346666666666667, "grad_norm": 1.1886435747146606, "kl": 0.07137298583984375, "learning_rate": 2.888888888888889e-06, "loss": 0.0212, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032748110592365265, "mask/share_reasoning": 0.8374589681625366, "mask/share_step_conf": 0.12588664889335632, "num_tokens": 23248810.0, "reward": 0.6359817385673523, "reward_std": 0.3654692769050598, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7090933322906494, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2527138888835907, "step": 97 }, { "adv/mean_abs_final_conf": 0.6551649570465088, "adv/mean_abs_reasoning": 0.45474952459335327, "adv/mean_abs_step_conf": 0.7897055149078369, "adv/ratio_final_to_reasoning": 1.4407160900989864, "adv/ratio_step_to_reasoning": 1.7365724914479206, "adv/std_final_conf": 0.8604001402854919, "adv/std_reasoning": 0.7393489480018616, "adv/std_step_conf": 0.9367654323577881, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.3131983805668016, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6963562753036437, "calib/gap": 0.1515800692779109, "calib/mean_conf": 0.8378947368421052, "calib/mu_c": 0.9041726618705035, "calib/mu_w": 0.7525925925925926, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.2941700404858299, "calib/std_conf": 0.27627441371343825, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.7969787234042554, "calib/step_q_c_n": 705.0, "calib/step_q_gap": -8.110565557362115e-05, "calib/step_q_w": 0.797059829059829, "calib/step_q_w_n": 585.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2646.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 548.609375, "completions/mean_terminated_length": 548.609375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.10453333333333334, "grad_norm": 0.8640843033790588, "kl": 0.06402587890625, "learning_rate": 2.861111111111111e-06, "loss": 0.0267, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03189465403556824, "mask/share_reasoning": 0.8586874008178711, "mask/share_step_conf": 0.10941793024539948, "num_tokens": 23495438.0, "reward": 0.5579974055290222, "reward_std": 0.3751477599143982, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6481816172599792, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.16781318187713623, "step": 98 }, { "adv/mean_abs_final_conf": 0.7477514147758484, "adv/mean_abs_reasoning": 0.5264423489570618, "adv/mean_abs_step_conf": 0.7878356575965881, "adv/ratio_final_to_reasoning": 1.4203861377361138, "adv/ratio_step_to_reasoning": 1.4965278898199856, "adv/std_final_conf": 0.9127273559570312, "adv/std_reasoning": 0.739500880241394, "adv/std_step_conf": 0.9368241429328918, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.75390625, "calib/ece": 0.26943775100401607, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.41767068273092367, "calib/gap": 0.2981454545454546, "calib/mean_conf": 0.658273092369478, "calib/mu_c": 0.8378787878787879, "calib/mu_w": 0.5397333333333333, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.26506024096385544, "calib/std_conf": 0.3410458603094492, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7854545454545456, "calib/step_q_c_n": 583.0, "calib/step_q_gap": 0.021364657814096244, "calib/step_q_w": 0.7640898876404494, "calib/step_q_w_n": 890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 610.56640625, "completions/mean_terminated_length": 610.56640625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1056, "grad_norm": 1.3795263767242432, "kl": 0.06223297119140625, "learning_rate": 2.8333333333333335e-06, "loss": 0.0808, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02904464118182659, "mask/share_reasoning": 0.8638001680374146, "mask/share_step_conf": 0.10715514421463013, "num_tokens": 23757543.0, "reward": 0.47928860783576965, "reward_std": 0.37459295988082886, "rewards/accuracy_reward_step": 0.38671875, "rewards/final_brier_reward_step": 0.6956437826156616, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": -0.00816030241549015, "step": 99 }, { "adv/mean_abs_final_conf": 0.7018420696258545, "adv/mean_abs_reasoning": 0.47234195470809937, "adv/mean_abs_step_conf": 0.7685372829437256, "adv/ratio_final_to_reasoning": 1.4858770486724664, "adv/ratio_step_to_reasoning": 1.6270781692866363, "adv/std_final_conf": 0.8932068347930908, "adv/std_reasoning": 0.7393435835838318, "adv/std_step_conf": 0.9367656707763672, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.74609375, "calib/ece": 0.12095999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.468, "calib/gap": 0.43056493506493515, "calib/mean_conf": 0.64648, "calib/mu_c": 0.8359285714285715, "calib/mu_w": 0.4053636363636363, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10371999999999995, "calib/std_conf": 0.36804511897320413, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7726681870011404, "calib/step_q_c_n": 877.0, "calib/step_q_gap": 0.011287715620668992, "calib/step_q_w": 0.7613804713804714, "calib/step_q_w_n": 594.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 575.53515625, "completions/mean_terminated_length": 577.7921752929688, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.10666666666666667, "grad_norm": 0.8686370849609375, "kl": 0.0657806396484375, "learning_rate": 2.805555555555556e-06, "loss": 0.0596, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.029684949666261673, "mask/share_reasoning": 0.8518041372299194, "mask/share_step_conf": 0.1146046593785286, "num_tokens": 24012288.0, "reward": 0.6547666788101196, "reward_std": 0.36246195435523987, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7961000204086304, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.21030843257904053, "step": 100 }, { "adv/mean_abs_final_conf": 0.7467896342277527, "adv/mean_abs_reasoning": 0.44826531410217285, "adv/mean_abs_step_conf": 0.7710946798324585, "adv/ratio_final_to_reasoning": 1.665954537935847, "adv/ratio_step_to_reasoning": 1.7201747616294563, "adv/std_final_conf": 0.9199966788291931, "adv/std_reasoning": 0.72060626745224, "adv/std_step_conf": 0.9366609454154968, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.98828125, "calib/ece": 0.1777510040160642, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.2518793548387096, "calib/mean_conf": 0.5784738955823294, "calib/mu_c": 0.7049193548387096, "calib/mu_w": 0.45304, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.1291164658634538, "calib/std_conf": 0.3520497641913054, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7452801992528021, "calib/step_q_c_n": 803.0, "calib/step_q_gap": 0.018828144458281626, "calib/step_q_w": 0.7264520547945205, "calib/step_q_w_n": 730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2833.0, "completions/max_terminated_length": 2833.0, "completions/mean_length": 601.86328125, "completions/mean_terminated_length": 601.86328125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.10773333333333333, "grad_norm": 0.766709566116333, "kl": 0.06708526611328125, "learning_rate": 2.7777777777777783e-06, "loss": 0.0134, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02983863279223442, "mask/share_reasoning": 0.8579814434051514, "mask/share_step_conf": 0.11217986047267914, "num_tokens": 24273357.0, "reward": 0.5754482746124268, "reward_std": 0.3189234733581543, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7175558805465698, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.14271578192710876, "step": 101 }, { "adv/mean_abs_final_conf": 0.651282548904419, "adv/mean_abs_reasoning": 0.31044554710388184, "adv/mean_abs_step_conf": 0.7756525278091431, "adv/ratio_final_to_reasoning": 2.0978962493750495, "adv/ratio_step_to_reasoning": 2.4985139424454132, "adv/std_final_conf": 0.8459413647651672, "adv/std_reasoning": 0.5960164070129395, "adv/std_step_conf": 0.9365149140357971, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.7421875, "calib/ece": 0.13889328063241108, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.38735177865612647, "calib/gap": 0.363922290388548, "calib/mean_conf": 0.6191304347826088, "calib/mu_c": 0.7485889570552147, "calib/mu_w": 0.38466666666666666, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05687747035573125, "calib/std_conf": 0.357715681342693, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7567685076380728, "calib/step_q_c_n": 851.0, "calib/step_q_gap": 0.023392094067798075, "calib/step_q_w": 0.7333764135702747, "calib/step_q_w_n": 619.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2247.0, "completions/max_terminated_length": 2247.0, "completions/mean_length": 476.82421875, "completions/mean_terminated_length": 476.82421875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.1088, "grad_norm": 1.4313592910766602, "kl": 0.08031463623046875, "learning_rate": 2.7500000000000004e-06, "loss": 0.0763, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03701265901327133, "mask/share_reasoning": 0.8306695818901062, "mask/share_step_conf": 0.13231775164604187, "num_tokens": 24502120.0, "reward": 0.6905773878097534, "reward_std": 0.24921801686286926, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.799552321434021, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.25660234689712524, "step": 102 }, { "adv/mean_abs_final_conf": 0.6482324600219727, "adv/mean_abs_reasoning": 0.4307445287704468, "adv/mean_abs_step_conf": 0.795738935470581, "adv/ratio_final_to_reasoning": 1.504911651164419, "adv/ratio_step_to_reasoning": 1.8473570349042037, "adv/std_final_conf": 0.8590838313102722, "adv/std_reasoning": 0.7206352353096008, "adv/std_step_conf": 0.9366804361343384, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 5.1875, "calib/ece": 0.09653061224489798, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.3673469387755102, "calib/gap": 0.4500049316612654, "calib/mean_conf": 0.5727755102040817, "calib/mu_c": 0.7454304635761589, "calib/mu_w": 0.29542553191489357, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.026489795918367386, "calib/std_conf": 0.37462641338316166, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7727855153203341, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.046851089090825915, "calib/step_q_w": 0.7259344262295082, "calib/step_q_w_n": 610.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2754.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 625.03515625, "completions/mean_terminated_length": 625.03515625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.10986666666666667, "grad_norm": 1.1235955953598022, "kl": 0.061309814453125, "learning_rate": 2.7222222222222224e-06, "loss": 0.0605, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03038378804922104, "mask/share_reasoning": 0.8688158988952637, "mask/share_step_conf": 0.10080033540725708, "num_tokens": 24766681.0, "reward": 0.6661745309829712, "reward_std": 0.28348755836486816, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7982730269432068, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.22470103204250336, "step": 103 }, { "adv/mean_abs_final_conf": 0.70388263463974, "adv/mean_abs_reasoning": 0.5676292181015015, "adv/mean_abs_step_conf": 0.7807614803314209, "adv/ratio_final_to_reasoning": 1.2400394697685808, "adv/ratio_step_to_reasoning": 1.3754779624325255, "adv/std_final_conf": 0.9006258845329285, "adv/std_reasoning": 0.8098313212394714, "adv/std_step_conf": 0.936539351940155, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.60546875, "calib/ece": 0.1660869565217391, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.25296442687747034, "calib/gap": 0.34760217472815885, "calib/mean_conf": 0.44901185770750984, "calib/mu_c": 0.6221259842519684, "calib/mu_w": 0.27452380952380956, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.056561264822134374, "calib/std_conf": 0.3723372019477189, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7584905660377359, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.044066839496180954, "calib/step_q_w": 0.714423726541555, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2453.0, "completions/max_terminated_length": 2453.0, "completions/mean_length": 535.02734375, "completions/mean_terminated_length": 539.2401733398438, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.11093333333333333, "grad_norm": 1.6565605401992798, "kl": 0.076202392578125, "learning_rate": 2.6944444444444444e-06, "loss": 0.0113, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03172758221626282, "mask/share_reasoning": 0.8427647352218628, "mask/share_step_conf": 0.11769521236419678, "num_tokens": 25010328.0, "reward": 0.6450490951538086, "reward_std": 0.2692990303039551, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7635113596916199, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.23205558955669403, "step": 104 }, { "adv/mean_abs_final_conf": 0.7665470838546753, "adv/mean_abs_reasoning": 0.6247068047523499, "adv/mean_abs_step_conf": 0.7995249629020691, "adv/ratio_final_to_reasoning": 1.227050958983158, "adv/ratio_step_to_reasoning": 1.2798403296071375, "adv/std_final_conf": 0.9220882058143616, "adv/std_reasoning": 0.8430242538452148, "adv/std_step_conf": 0.9368162155151367, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 5.265625, "calib/ece": 0.22653225806451616, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.35080645161290325, "calib/gap": 0.27427505827505827, "calib/mean_conf": 0.545483870967742, "calib/mu_c": 0.6616083916083916, "calib/mu_w": 0.3873333333333333, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.09770161290322579, "calib/std_conf": 0.39339989266178754, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7520144927536232, "calib/step_q_c_n": 690.0, "calib/step_q_gap": 0.043093520109246275, "calib/step_q_w": 0.7089209726443769, "calib/step_q_w_n": 658.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2456.0, "completions/max_terminated_length": 2456.0, "completions/mean_length": 505.15234375, "completions/mean_terminated_length": 507.13336181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.112, "grad_norm": 0.9889106154441833, "kl": 0.0729522705078125, "learning_rate": 2.666666666666667e-06, "loss": 0.0325, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03318943455815315, "mask/share_reasoning": 0.8480772972106934, "mask/share_step_conf": 0.1148269772529602, "num_tokens": 25245407.0, "reward": 0.6198134422302246, "reward_std": 0.36246803402900696, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7072093486785889, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.2277298867702484, "step": 105 }, { "adv/mean_abs_final_conf": 0.6540548801422119, "adv/mean_abs_reasoning": 0.4116325378417969, "adv/mean_abs_step_conf": 0.766207218170166, "adv/ratio_final_to_reasoning": 1.5889290083127137, "adv/ratio_step_to_reasoning": 1.8613864253477532, "adv/std_final_conf": 0.8604157567024231, "adv/std_reasoning": 0.6816043257713318, "adv/std_step_conf": 0.936654269695282, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.41796875, "calib/ece": 0.1938888888888889, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.42857142857142855, "calib/gap": 0.3894924242424243, "calib/mean_conf": 0.6096031746031746, "calib/mu_c": 0.7950757575757577, "calib/mu_w": 0.40558333333333335, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13984126984126982, "calib/std_conf": 0.3962470785905042, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.732231520223152, "calib/step_q_c_n": 717.0, "calib/step_q_gap": 0.04639569932762955, "calib/step_q_w": 0.6858358208955224, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 509.609375, "completions/mean_terminated_length": 511.6078796386719, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.11306666666666666, "grad_norm": 1.0946513414382935, "kl": 0.07303619384765625, "learning_rate": 2.6388888888888893e-06, "loss": -0.021, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03281043469905853, "mask/share_reasoning": 0.849319338798523, "mask/share_step_conf": 0.11396399140357971, "num_tokens": 25480451.0, "reward": 0.6373354196548462, "reward_std": 0.28787219524383545, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7683038711547852, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.2063669115304947, "step": 106 }, { "adv/mean_abs_final_conf": 0.668190598487854, "adv/mean_abs_reasoning": 0.5299859046936035, "adv/mean_abs_step_conf": 0.7711976766586304, "adv/ratio_final_to_reasoning": 1.2607705083669152, "adv/ratio_step_to_reasoning": 1.4551286549865448, "adv/std_final_conf": 0.8769477605819702, "adv/std_reasoning": 0.7927711009979248, "adv/std_step_conf": 0.9366758465766907, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 6.08984375, "calib/ece": 0.29555555555555557, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5555555555555556, "calib/gap": 0.1993287037037036, "calib/mean_conf": 0.709920634920635, "calib/mu_c": 0.7953472222222222, "calib/mu_w": 0.5960185185185186, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2170238095238095, "calib/std_conf": 0.37554615473481395, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7316727272727273, "calib/step_q_c_n": 825.0, "calib/step_q_gap": 0.07807599702749568, "calib/step_q_w": 0.6535967302452316, "calib/step_q_w_n": 734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2469.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 511.6484375, "completions/mean_terminated_length": 515.6771850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.11413333333333334, "grad_norm": 0.8395653963088989, "kl": 0.07398223876953125, "learning_rate": 2.6111111111111113e-06, "loss": -0.0325, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03259489685297012, "mask/share_reasoning": 0.8318241834640503, "mask/share_step_conf": 0.12776845693588257, "num_tokens": 25716049.0, "reward": 0.61835116147995, "reward_std": 0.33907413482666016, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6779172420501709, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2501913905143738, "step": 107 }, { "adv/mean_abs_final_conf": 0.5778203010559082, "adv/mean_abs_reasoning": 0.4173446297645569, "adv/mean_abs_step_conf": 0.7508407831192017, "adv/ratio_final_to_reasoning": 1.3845159607825384, "adv/ratio_step_to_reasoning": 1.7990905586655928, "adv/std_final_conf": 0.8275368809700012, "adv/std_reasoning": 0.7203985452651978, "adv/std_step_conf": 0.9366967678070068, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 6.94140625, "calib/ece": 0.19338582677165367, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7401574803149606, "calib/gap": 0.3346145251396647, "calib/mean_conf": 0.8018110236220474, "calib/mu_c": 0.9006145251396647, "calib/mu_w": 0.566, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14523622047244106, "calib/std_conf": 0.35392938777674715, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7407552320291173, "calib/step_q_c_n": 1099.0, "calib/step_q_gap": 0.05793812288457445, "calib/step_q_w": 0.6828171091445429, "calib/step_q_w_n": 678.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 551.515625, "completions/mean_terminated_length": 553.678466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.1152, "grad_norm": 1.02792489528656, "kl": 0.06510162353515625, "learning_rate": 2.5833333333333337e-06, "loss": -0.005, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031672731041908264, "mask/share_reasoning": 0.8348386883735657, "mask/share_step_conf": 0.12958231568336487, "num_tokens": 25960469.0, "reward": 0.7275914549827576, "reward_std": 0.3252040445804596, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.789444088935852, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.32823869585990906, "step": 108 }, { "adv/mean_abs_final_conf": 0.5223947763442993, "adv/mean_abs_reasoning": 0.28969240188598633, "adv/mean_abs_step_conf": 0.7875484824180603, "adv/ratio_final_to_reasoning": 1.8032740000888847, "adv/ratio_step_to_reasoning": 2.7185679613648075, "adv/std_final_conf": 0.7764906883239746, "adv/std_reasoning": 0.5961942672729492, "adv/std_step_conf": 0.9366160035133362, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 6.29296875, "calib/ece": 0.14680161943319836, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5748987854251012, "calib/gap": 0.531466165413534, "calib/mean_conf": 0.6585425101214575, "calib/mu_c": 0.9038345864661654, "calib/mu_w": 0.3723684210526315, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13344129554655867, "calib/std_conf": 0.418115471835271, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7339974457215837, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.057439474707090876, "calib/step_q_w": 0.6765579710144928, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2522.0, "completions/max_terminated_length": 2522.0, "completions/mean_length": 584.14453125, "completions/mean_terminated_length": 584.14453125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.11626666666666667, "grad_norm": 0.8495370745658875, "kl": 0.06689453125, "learning_rate": 2.5555555555555557e-06, "loss": 0.0332, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.029666254296898842, "mask/share_reasoning": 0.8502966165542603, "mask/share_step_conf": 0.12003707885742188, "num_tokens": 26214610.0, "reward": 0.6510058641433716, "reward_std": 0.28166115283966064, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7934441566467285, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.21247389912605286, "step": 109 }, { "adv/mean_abs_final_conf": 0.6491693258285522, "adv/mean_abs_reasoning": 0.5007895231246948, "adv/mean_abs_step_conf": 0.7815115451812744, "adv/ratio_final_to_reasoning": 1.2962917470358328, "adv/ratio_step_to_reasoning": 1.5605588956913556, "adv/std_final_conf": 0.875318706035614, "adv/std_reasoning": 0.7576372027397156, "adv/std_step_conf": 0.9364836812019348, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.06640625, "calib/ece": 0.29084, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.652, "calib/gap": 0.23132689210950086, "calib/mean_conf": 0.7539600000000001, "calib/mu_c": 0.8603703703703705, "calib/mu_w": 0.6290434782608696, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.2524, "calib/std_conf": 0.3668753444972829, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7674918566775244, "calib/step_q_c_n": 614.0, "calib/step_q_gap": 0.03588131494985236, "calib/step_q_w": 0.731610541727672, "calib/step_q_w_n": 683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 507.0234375, "completions/mean_terminated_length": 507.0234375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.11733333333333333, "grad_norm": 1.3282954692840576, "kl": 0.07544708251953125, "learning_rate": 2.5277777777777778e-06, "loss": 0.0755, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033980920910835266, "mask/share_reasoning": 0.849612832069397, "mask/share_step_conf": 0.11640624701976776, "num_tokens": 26449328.0, "reward": 0.569250226020813, "reward_std": 0.3724881410598755, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6700652241706848, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.1684352606534958, "step": 110 }, { "adv/mean_abs_final_conf": 0.5762059092521667, "adv/mean_abs_reasoning": 0.4507063627243042, "adv/mean_abs_step_conf": 0.7776529788970947, "adv/ratio_final_to_reasoning": 1.2784507983629916, "adv/ratio_step_to_reasoning": 1.7254093645285031, "adv/std_final_conf": 0.8093517422676086, "adv/std_reasoning": 0.7206481099128723, "adv/std_step_conf": 0.9365012049674988, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.4765625, "calib/ece": 0.2808661417322834, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6456692913385826, "calib/gap": 0.2535009935419772, "calib/mean_conf": 0.7295275590551181, "calib/mu_c": 0.8512878787878787, "calib/mu_w": 0.5977868852459015, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2453543307086614, "calib/std_conf": 0.38614052002995286, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7360538116591929, "calib/step_q_c_n": 669.0, "calib/step_q_gap": 0.025330755724677134, "calib/step_q_w": 0.7107230559345158, "calib/step_q_w_n": 733.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1707.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 496.47265625, "completions/mean_terminated_length": 498.4196472167969, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.1184, "grad_norm": 0.9494348764419556, "kl": 0.07242584228515625, "learning_rate": 2.5e-06, "loss": 0.0115, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03668651729822159, "mask/share_reasoning": 0.8385127782821655, "mask/share_step_conf": 0.12089445441961288, "num_tokens": 26683833.0, "reward": 0.5836626291275024, "reward_std": 0.31816649436950684, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6746547222137451, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.19188937544822693, "step": 111 }, { "adv/mean_abs_final_conf": 0.632652759552002, "adv/mean_abs_reasoning": 0.5271589159965515, "adv/mean_abs_step_conf": 0.7774137258529663, "adv/ratio_final_to_reasoning": 1.200117726086493, "adv/ratio_step_to_reasoning": 1.4747236597209556, "adv/std_final_conf": 0.8433780074119568, "adv/std_reasoning": 0.792841374874115, "adv/std_step_conf": 0.936805009841919, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 5.12109375, "calib/ece": 0.21611336032388656, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5991902834008097, "calib/gap": 0.41698323272971166, "calib/mean_conf": 0.6772469635627529, "calib/mu_c": 0.8545070422535211, "calib/mu_w": 0.4375238095238095, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.15923076923076912, "calib/std_conf": 0.41697060516024764, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7186170212765959, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.03604099265405558, "calib/step_q_w": 0.6825760286225403, "calib/step_q_w_n": 559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 569.046875, "completions/mean_terminated_length": 569.046875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.11946666666666667, "grad_norm": 0.8138297200202942, "kl": 0.064910888671875, "learning_rate": 2.4722222222222226e-06, "loss": 0.1368, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03094610944390297, "mask/share_reasoning": 0.8668504953384399, "mask/share_step_conf": 0.10220340639352798, "num_tokens": 26937429.0, "reward": 0.6470686197280884, "reward_std": 0.3719423711299896, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7478343844413757, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.24239663779735565, "step": 112 }, { "adv/mean_abs_final_conf": 0.5964900255203247, "adv/mean_abs_reasoning": 0.43386268615722656, "adv/mean_abs_step_conf": 0.7635525465011597, "adv/ratio_final_to_reasoning": 1.3748359666591932, "adv/ratio_step_to_reasoning": 1.7598944801269623, "adv/std_final_conf": 0.8118496537208557, "adv/std_reasoning": 0.7013782262802124, "adv/std_step_conf": 0.9364071488380432, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.375, "calib/ece": 0.2612698412698413, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6746031746031746, "calib/gap": 0.2881071428571429, "calib/mean_conf": 0.7652380952380953, "calib/mu_c": 0.8932857142857143, "calib/mu_w": 0.6051785714285715, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23547619047619053, "calib/std_conf": 0.36802284756048864, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7440573770491803, "calib/step_q_c_n": 732.0, "calib/step_q_gap": 0.03815675593116796, "calib/step_q_w": 0.7059006211180123, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2554.0, "completions/max_terminated_length": 2554.0, "completions/mean_length": 471.53515625, "completions/mean_terminated_length": 475.2480163574219, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.12053333333333334, "grad_norm": 1.1503803730010986, "kl": 0.08345794677734375, "learning_rate": 2.4444444444444447e-06, "loss": 0.0661, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.036294907331466675, "mask/share_reasoning": 0.8331960439682007, "mask/share_step_conf": 0.12269649654626846, "num_tokens": 27163342.0, "reward": 0.6439971327781677, "reward_std": 0.3417161703109741, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7047671675682068, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.2769771218299866, "step": 113 }, { "adv/mean_abs_final_conf": 0.5136664509773254, "adv/mean_abs_reasoning": 0.4266672134399414, "adv/mean_abs_step_conf": 0.7686975002288818, "adv/ratio_final_to_reasoning": 1.2039042016750374, "adv/ratio_step_to_reasoning": 1.8016324573696951, "adv/std_final_conf": 0.741142213344574, "adv/std_reasoning": 0.7013077139854431, "adv/std_step_conf": 0.9365781545639038, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.9375, "calib/ece": 0.25838582677165356, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8228346456692913, "calib/gap": 0.2925883438818565, "calib/mean_conf": 0.8581496062992126, "calib/mu_c": 0.9687341772151898, "calib/mu_w": 0.6761458333333333, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.247244094488189, "calib/std_conf": 0.3166248576037594, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7601415094339623, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.12415936657681947, "calib/step_q_w": 0.6359821428571428, "calib/step_q_w_n": 672.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 501.9921875, "completions/mean_terminated_length": 503.9608154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.1216, "grad_norm": 1.0907721519470215, "kl": 0.070404052734375, "learning_rate": 2.4166666666666667e-06, "loss": -0.0512, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03666098043322563, "mask/share_reasoning": 0.832012951374054, "mask/share_step_conf": 0.1274198293685913, "num_tokens": 27396876.0, "reward": 0.6833349466323853, "reward_std": 0.3609473705291748, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7367398142814636, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3088362216949463, "step": 114 }, { "adv/mean_abs_final_conf": 0.6670311093330383, "adv/mean_abs_reasoning": 0.4771580100059509, "adv/mean_abs_step_conf": 0.7725792527198792, "adv/ratio_final_to_reasoning": 1.3979249962181697, "adv/ratio_step_to_reasoning": 1.6191266551519146, "adv/std_final_conf": 0.8599995970726013, "adv/std_reasoning": 0.7392193675041199, "adv/std_step_conf": 0.9367433786392212, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.1484375, "calib/ece": 0.36890625, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.240178266178266, "calib/mean_conf": 0.798828125, "calib/mu_c": 0.9207936507936507, "calib/mu_w": 0.6806153846153847, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3377734375, "calib/std_conf": 0.36609130754360225, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7379122541603631, "calib/step_q_c_n": 661.0, "calib/step_q_gap": 0.00969307607817127, "calib/step_q_w": 0.7282191780821918, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 446.7734375, "completions/mean_terminated_length": 448.5255126953125, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.12266666666666666, "grad_norm": 1.2250382900238037, "kl": 0.07220458984375, "learning_rate": 2.388888888888889e-06, "loss": -0.0392, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03574581444263458, "mask/share_reasoning": 0.8383454084396362, "mask/share_step_conf": 0.12200252711772919, "num_tokens": 27616514.0, "reward": 0.5333312749862671, "reward_std": 0.39866939187049866, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6420695185661316, "rewards/format_reward_step": 1.0, "rewards/step_correlation_reward": 0.12615548074245453, "step": 115 }, { "adv/mean_abs_final_conf": 0.545147180557251, "adv/mean_abs_reasoning": 0.44270390272140503, "adv/mean_abs_step_conf": 0.7896296977996826, "adv/ratio_final_to_reasoning": 1.2314036022860946, "adv/ratio_step_to_reasoning": 1.7836519916486915, "adv/std_final_conf": 0.777513861656189, "adv/std_reasoning": 0.7014328241348267, "adv/std_step_conf": 0.9364913702011108, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.6640625, "calib/ece": 0.29494117647058826, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7725490196078432, "calib/gap": 0.30950143337903535, "calib/mean_conf": 0.8234117647058824, "calib/mu_c": 0.9605633802816902, "calib/mu_w": 0.6510619469026548, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2807450980392157, "calib/std_conf": 0.34806992628936195, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.720025974025974, "calib/step_q_c_n": 770.0, "calib/step_q_gap": -0.0011063789152024883, "calib/step_q_w": 0.7211323529411765, "calib/step_q_w_n": 680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 512.8515625, "completions/mean_terminated_length": 514.86279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 95.0, "epoch": 0.12373333333333333, "grad_norm": 1.5596277713775635, "kl": 0.0673370361328125, "learning_rate": 2.361111111111111e-06, "loss": 0.01, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03566065430641174, "mask/share_reasoning": 0.8397046327590942, "mask/share_step_conf": 0.12072847038507462, "num_tokens": 27852324.0, "reward": 0.6405564546585083, "reward_std": 0.34321245551109314, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7033078074455261, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.2692112922668457, "step": 116 }, { "adv/mean_abs_final_conf": 0.5895428657531738, "adv/mean_abs_reasoning": 0.5014504194259644, "adv/mean_abs_step_conf": 0.7911335825920105, "adv/ratio_final_to_reasoning": 1.1756752869566913, "adv/ratio_step_to_reasoning": 1.5776905391716713, "adv/std_final_conf": 0.8263259530067444, "adv/std_reasoning": 0.775360107421875, "adv/std_step_conf": 0.9366973638534546, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 5.48046875, "calib/ece": 0.32960159362549807, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7250996015936255, "calib/gap": 0.2751060452120905, "calib/mean_conf": 0.803625498007968, "calib/mu_c": 0.9428225806451612, "calib/mu_w": 0.6677165354330707, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.31960159362549806, "calib/std_conf": 0.3499265762302744, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7149256198347107, "calib/step_q_c_n": 605.0, "calib/step_q_gap": 0.014712587253257081, "calib/step_q_w": 0.7002130325814536, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2505.0, "completions/max_terminated_length": 2505.0, "completions/mean_length": 523.91015625, "completions/mean_terminated_length": 523.91015625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.1248, "grad_norm": 1.2674314975738525, "kl": 0.06781005859375, "learning_rate": 2.3333333333333336e-06, "loss": 0.0752, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034410979598760605, "mask/share_reasoning": 0.8443933725357056, "mask/share_step_conf": 0.12119565159082413, "num_tokens": 28093045.0, "reward": 0.5557481050491333, "reward_std": 0.35638684034347534, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6561957001686096, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.16233177483081818, "step": 117 }, { "adv/mean_abs_final_conf": 0.5556694865226746, "adv/mean_abs_reasoning": 0.39469343423843384, "adv/mean_abs_step_conf": 0.7684829235076904, "adv/ratio_final_to_reasoning": 1.4078508490896133, "adv/ratio_step_to_reasoning": 1.947037515307262, "adv/std_final_conf": 0.8207305073738098, "adv/std_reasoning": 0.6815216541290283, "adv/std_step_conf": 0.9362694025039673, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 6.4140625, "calib/ece": 0.2635365853658536, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7317073170731707, "calib/gap": 0.35051127214170696, "calib/mean_conf": 0.7939430894308943, "calib/mu_c": 0.9478260869565218, "calib/mu_w": 0.5973148148148149, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.24825203252032516, "calib/std_conf": 0.36621373653404626, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.6907354838709676, "calib/step_q_c_n": 775.0, "calib/step_q_gap": 0.04270780221006809, "calib/step_q_w": 0.6480276816608995, "calib/step_q_w_n": 867.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 555.86328125, "completions/mean_terminated_length": 562.45458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.12586666666666665, "grad_norm": 1.0534653663635254, "kl": 0.06500625610351562, "learning_rate": 2.305555555555556e-06, "loss": -0.0019, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03326330706477165, "mask/share_reasoning": 0.827534019947052, "mask/share_step_conf": 0.12748391926288605, "num_tokens": 28339354.0, "reward": 0.5999181270599365, "reward_std": 0.2976059317588806, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7013800740242004, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.20001864433288574, "step": 118 }, { "adv/mean_abs_final_conf": 0.5927442312240601, "adv/mean_abs_reasoning": 0.5033291578292847, "adv/mean_abs_step_conf": 0.7902476787567139, "adv/ratio_final_to_reasoning": 1.1776473148911084, "adv/ratio_step_to_reasoning": 1.5700415254399867, "adv/std_final_conf": 0.8087579607963562, "adv/std_reasoning": 0.7575428485870361, "adv/std_step_conf": 0.9365953803062439, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.60546875, "calib/ece": 0.21289682539682528, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5912698412698413, "calib/gap": 0.4155880628135793, "calib/mean_conf": 0.6889285714285714, "calib/mu_c": 0.8587919463087249, "calib/mu_w": 0.4432038834951456, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15527777777777768, "calib/std_conf": 0.42262037223132454, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.724692513368984, "calib/step_q_c_n": 748.0, "calib/step_q_gap": 0.07850619604729547, "calib/step_q_w": 0.6461863173216885, "calib/step_q_w_n": 687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 569.24609375, "completions/mean_terminated_length": 571.4784545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.12693333333333334, "grad_norm": 1.2062466144561768, "kl": 0.0637054443359375, "learning_rate": 2.277777777777778e-06, "loss": 0.0076, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031620241701602936, "mask/share_reasoning": 0.8546348810195923, "mask/share_step_conf": 0.10983864217996597, "num_tokens": 28590145.0, "reward": 0.6657212972640991, "reward_std": 0.32895010709762573, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7551402449607849, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.26380234956741333, "step": 119 }, { "adv/mean_abs_final_conf": 0.6236361861228943, "adv/mean_abs_reasoning": 0.42792919278144836, "adv/mean_abs_step_conf": 0.7829275131225586, "adv/ratio_final_to_reasoning": 1.4573349905609203, "adv/ratio_step_to_reasoning": 1.8295725702509262, "adv/std_final_conf": 0.841181218624115, "adv/std_reasoning": 0.701341450214386, "adv/std_step_conf": 0.9366744160652161, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.18359375, "calib/ece": 0.2569019607843137, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5254901960784314, "calib/gap": 0.41574193548387106, "calib/mean_conf": 0.5867058823529412, "calib/mu_c": 0.749741935483871, "calib/mu_w": 0.33399999999999996, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11788235294117651, "calib/std_conf": 0.4627192980511266, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.690958904109589, "calib/step_q_c_n": 803.0, "calib/step_q_gap": 0.014966537697375215, "calib/step_q_w": 0.6759923664122138, "calib/step_q_w_n": 524.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 461.953125, "completions/mean_terminated_length": 461.953125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.128, "grad_norm": 1.1702357530593872, "kl": 0.07584381103515625, "learning_rate": 2.25e-06, "loss": 0.0508, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03543039411306381, "mask/share_reasoning": 0.8455789685249329, "mask/share_step_conf": 0.11899064481258392, "num_tokens": 28815093.0, "reward": 0.6928844451904297, "reward_std": 0.34249556064605713, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.738457441329956, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.3277801275253296, "step": 120 }, { "adv/mean_abs_final_conf": 0.7098880410194397, "adv/mean_abs_reasoning": 0.48638004064559937, "adv/mean_abs_step_conf": 0.7969322204589844, "adv/ratio_final_to_reasoning": 1.4595336602981606, "adv/ratio_step_to_reasoning": 1.6384969650505636, "adv/std_final_conf": 0.8877682685852051, "adv/std_reasoning": 0.7393484115600586, "adv/std_step_conf": 0.9366835355758667, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.2248412698412698, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5317460317460317, "calib/gap": 0.4255606407322655, "calib/mean_conf": 0.6085714285714287, "calib/mu_c": 0.8010869565217392, "calib/mu_w": 0.3755263157894737, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14289682539682533, "calib/std_conf": 0.45119455959956445, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7103973509933773, "calib/step_q_c_n": 755.0, "calib/step_q_gap": 0.03656475187443453, "calib/step_q_w": 0.6738325991189428, "calib/step_q_w_n": 681.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 553.79296875, "completions/mean_terminated_length": 555.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.12906666666666666, "grad_norm": 1.2838155031204224, "kl": 0.06760406494140625, "learning_rate": 2.222222222222222e-06, "loss": 0.0016, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032008204609155655, "mask/share_reasoning": 0.853866457939148, "mask/share_step_conf": 0.11021904647350311, "num_tokens": 29061920.0, "reward": 0.6536507606506348, "reward_std": 0.33676546812057495, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7440164089202881, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.25859755277633667, "step": 121 }, { "adv/mean_abs_final_conf": 0.5955484509468079, "adv/mean_abs_reasoning": 0.4420047402381897, "adv/mean_abs_step_conf": 0.7843633890151978, "adv/ratio_final_to_reasoning": 1.3473802353926696, "adv/ratio_step_to_reasoning": 1.7745587719093603, "adv/std_final_conf": 0.7974062561988831, "adv/std_reasoning": 0.7204664349555969, "adv/std_step_conf": 0.9366613626480103, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.74609375, "calib/ece": 0.20418972332015814, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6047430830039525, "calib/gap": 0.45039682539682546, "calib/mean_conf": 0.687588932806324, "calib/mu_c": 0.8638311688311688, "calib/mu_w": 0.4134343434343434, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14154150197628462, "calib/std_conf": 0.4286020536019695, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.722987012987013, "calib/step_q_c_n": 847.0, "calib/step_q_gap": 0.0757434232434232, "calib/step_q_w": 0.6472435897435898, "calib/step_q_w_n": 624.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 495.70703125, "completions/mean_terminated_length": 497.6510009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.13013333333333332, "grad_norm": 1.0565286874771118, "kl": 0.07469940185546875, "learning_rate": 2.1944444444444445e-06, "loss": 0.0271, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034954264760017395, "mask/share_reasoning": 0.8376418352127075, "mask/share_step_conf": 0.1234976127743721, "num_tokens": 29296165.0, "reward": 0.7111918330192566, "reward_std": 0.31621798872947693, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7733242511749268, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3318719267845154, "step": 122 }, { "adv/mean_abs_final_conf": 0.7253831624984741, "adv/mean_abs_reasoning": 0.560309648513794, "adv/mean_abs_step_conf": 0.7853028774261475, "adv/ratio_final_to_reasoning": 1.2946112286706701, "adv/ratio_step_to_reasoning": 1.401551587607213, "adv/std_final_conf": 0.8891106843948364, "adv/std_reasoning": 0.7928450703620911, "adv/std_step_conf": 0.936756432056427, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.6953125, "calib/ece": 0.2343650793650795, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5277777777777778, "calib/gap": 0.32718367346938787, "calib/mean_conf": 0.6490476190476191, "calib/mu_c": 0.7853741496598641, "calib/mu_w": 0.4581904761904762, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15003968253968264, "calib/std_conf": 0.4174258390010849, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7152200488997554, "calib/step_q_c_n": 818.0, "calib/step_q_gap": 0.040079423899755495, "calib/step_q_w": 0.6751406249999999, "calib/step_q_w_n": 640.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2621.0, "completions/max_terminated_length": 2621.0, "completions/mean_length": 559.21875, "completions/mean_terminated_length": 563.6220703125, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.1312, "grad_norm": 1.1301532983779907, "kl": 0.06143951416015625, "learning_rate": 2.166666666666667e-06, "loss": -0.0218, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032075509428977966, "mask/share_reasoning": 0.8524587154388428, "mask/share_step_conf": 0.10765329003334045, "num_tokens": 29544613.0, "reward": 0.6427807211875916, "reward_std": 0.35771605372428894, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7221007943153381, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2525230944156647, "step": 123 }, { "adv/mean_abs_final_conf": 0.6043316125869751, "adv/mean_abs_reasoning": 0.4103482961654663, "adv/mean_abs_step_conf": 0.7751355767250061, "adv/ratio_final_to_reasoning": 1.4727284558854075, "adv/ratio_step_to_reasoning": 1.88896989208515, "adv/std_final_conf": 0.8031203746795654, "adv/std_reasoning": 0.6815477609634399, "adv/std_step_conf": 0.936602771282196, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.390625, "calib/ece": 0.19318897637795282, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5551181102362205, "calib/gap": 0.4291692202928158, "calib/mean_conf": 0.689015748031496, "calib/mu_c": 0.8393939393939394, "calib/mu_w": 0.41022471910112357, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11629921259842527, "calib/std_conf": 0.4091466726639721, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7401255707762556, "calib/step_q_c_n": 876.0, "calib/step_q_gap": 0.026077951728636584, "calib/step_q_w": 0.714047619047619, "calib/step_q_w_n": 504.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2375.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 504.796875, "completions/mean_terminated_length": 504.796875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.13226666666666667, "grad_norm": 1.4001808166503906, "kl": 0.06858062744140625, "learning_rate": 2.138888888888889e-06, "loss": -0.0127, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03315574675798416, "mask/share_reasoning": 0.8505468368530273, "mask/share_step_conf": 0.1162973940372467, "num_tokens": 29780657.0, "reward": 0.7478765249252319, "reward_std": 0.32260704040527344, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7925605177879333, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.3758487403392792, "step": 124 }, { "adv/mean_abs_final_conf": 0.7216381430625916, "adv/mean_abs_reasoning": 0.4820384383201599, "adv/mean_abs_step_conf": 0.7660722732543945, "adv/ratio_final_to_reasoning": 1.4970551841828317, "adv/ratio_step_to_reasoning": 1.5892348251812758, "adv/std_final_conf": 0.8866348266601562, "adv/std_reasoning": 0.7394478917121887, "adv/std_step_conf": 0.9367514252662659, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.3047808764940239, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5338645418326693, "calib/gap": 0.23268122496446564, "calib/mean_conf": 0.6358565737051792, "calib/mu_c": 0.7369014084507042, "calib/mu_w": 0.5042201834862385, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18745019920318726, "calib/std_conf": 0.42958745537938625, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7440947075208914, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.047042075941943984, "calib/step_q_w": 0.6970526315789474, "calib/step_q_w_n": 665.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3051.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 553.06640625, "completions/mean_terminated_length": 553.06640625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.13333333333333333, "grad_norm": 1.118679165840149, "kl": 0.065460205078125, "learning_rate": 2.1111111111111114e-06, "loss": 0.0447, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03368306905031204, "mask/share_reasoning": 0.8552190065383911, "mask/share_step_conf": 0.11109784990549088, "num_tokens": 30027050.0, "reward": 0.5980710387229919, "reward_std": 0.3668225407600403, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6582159996032715, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.2324572205543518, "step": 125 }, { "adv/mean_abs_final_conf": 0.6120800971984863, "adv/mean_abs_reasoning": 0.3681120276451111, "adv/mean_abs_step_conf": 0.7925801873207092, "adv/ratio_final_to_reasoning": 1.6627549529258512, "adv/ratio_step_to_reasoning": 2.153095057477499, "adv/std_final_conf": 0.8127871751785278, "adv/std_reasoning": 0.6404399871826172, "adv/std_step_conf": 0.936386227607727, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 5.76953125, "calib/ece": 0.23934959349593504, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.573170731707317, "calib/gap": 0.4280940988835727, "calib/mean_conf": 0.6547967479674797, "calib/mu_c": 0.8531818181818183, "calib/mu_w": 0.42508771929824557, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17878048780487815, "calib/std_conf": 0.43788265420225647, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7490751445086705, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.07107514450867047, "calib/step_q_w": 0.678, "calib/step_q_w_n": 785.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2533.0, "completions/max_terminated_length": 2533.0, "completions/mean_length": 486.50390625, "completions/mean_terminated_length": 498.1800231933594, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.1344, "grad_norm": 0.8561756610870361, "kl": 0.068450927734375, "learning_rate": 2.0833333333333334e-06, "loss": -0.0653, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03521300107240677, "mask/share_reasoning": 0.8178188800811768, "mask/share_step_conf": 0.12353064864873886, "num_tokens": 30257059.0, "reward": 0.5955885648727417, "reward_std": 0.302139550447464, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7288945317268372, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.16696998476982117, "step": 126 }, { "adv/mean_abs_final_conf": 0.6318913698196411, "adv/mean_abs_reasoning": 0.38751620054244995, "adv/mean_abs_step_conf": 0.7821398377418518, "adv/ratio_final_to_reasoning": 1.6306192332994383, "adv/ratio_step_to_reasoning": 2.018341005219918, "adv/std_final_conf": 0.8311188817024231, "adv/std_reasoning": 0.6613225340843201, "adv/std_step_conf": 0.9359875321388245, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 5.91796875, "calib/ece": 0.2321285140562248, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5301204819277109, "calib/gap": 0.4300709952239577, "calib/mean_conf": 0.6475502008032129, "calib/mu_c": 0.8582677165354331, "calib/mu_w": 0.4281967213114754, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18481927710843365, "calib/std_conf": 0.431401031395983, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7445374449339207, "calib/step_q_c_n": 681.0, "calib/step_q_gap": 0.0685302506893164, "calib/step_q_w": 0.6760071942446043, "calib/step_q_w_n": 834.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 527.3671875, "completions/mean_terminated_length": 529.435302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.13546666666666668, "grad_norm": 1.0631660223007202, "kl": 0.06509017944335938, "learning_rate": 2.0555555555555555e-06, "loss": 0.0746, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03511449694633484, "mask/share_reasoning": 0.8345162868499756, "mask/share_step_conf": 0.12646296620368958, "num_tokens": 30495737.0, "reward": 0.6311262845993042, "reward_std": 0.29402485489845276, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7392515540122986, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.22925099730491638, "step": 127 }, { "adv/mean_abs_final_conf": 0.6419897675514221, "adv/mean_abs_reasoning": 0.5253900289535522, "adv/mean_abs_step_conf": 0.7852475643157959, "adv/ratio_final_to_reasoning": 1.2219298657610764, "adv/ratio_step_to_reasoning": 1.494599290130831, "adv/std_final_conf": 0.8184292316436768, "adv/std_reasoning": 0.7576269507408142, "adv/std_step_conf": 0.9365777373313904, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 4.77734375, "calib/ece": 0.24467999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.608, "calib/gap": 0.37397822282714366, "calib/mean_conf": 0.70604, "calib/mu_c": 0.8720863309352518, "calib/mu_w": 0.4981081081081081, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19735999999999998, "calib/std_conf": 0.4069308521112648, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7571746031746032, "calib/step_q_c_n": 630.0, "calib/step_q_gap": 0.049636660510185004, "calib/step_q_w": 0.7075379426644182, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2577.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 506.5859375, "completions/mean_terminated_length": 506.5859375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.13653333333333334, "grad_norm": 1.0540531873703003, "kl": 0.06600570678710938, "learning_rate": 2.027777777777778e-06, "loss": -0.0066, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03855003044009209, "mask/share_reasoning": 0.8464581966400146, "mask/share_step_conf": 0.11499175429344177, "num_tokens": 30732087.0, "reward": 0.6308173537254333, "reward_std": 0.37330326437950134, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7321043014526367, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2256241887807846, "step": 128 }, { "adv/mean_abs_final_conf": 0.6031314134597778, "adv/mean_abs_reasoning": 0.3617158532142639, "adv/mean_abs_step_conf": 0.7796334028244019, "adv/ratio_final_to_reasoning": 1.667417692921826, "adv/ratio_step_to_reasoning": 2.155375264579799, "adv/std_final_conf": 0.8147671818733215, "adv/std_reasoning": 0.6611849665641785, "adv/std_step_conf": 0.9361515045166016, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.69921875, "calib/ece": 0.2651968503937008, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.610236220472441, "calib/gap": 0.26665004793863867, "calib/mean_conf": 0.752992125984252, "calib/mu_c": 0.8632214765100672, "calib/mu_w": 0.5965714285714285, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2157874015748031, "calib/std_conf": 0.36551078478790144, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7692422360248448, "calib/step_q_c_n": 805.0, "calib/step_q_gap": 0.0254348965753034, "calib/step_q_w": 0.7438073394495414, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 464.59375, "completions/mean_terminated_length": 464.59375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1376, "grad_norm": 1.2516224384307861, "kl": 0.0685882568359375, "learning_rate": 2.0000000000000003e-06, "loss": 0.0744, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.038642313331365585, "mask/share_reasoning": 0.8288546204566956, "mask/share_step_conf": 0.13250306248664856, "num_tokens": 30953407.0, "reward": 0.6465780735015869, "reward_std": 0.3090284466743469, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.719878077507019, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.25843435525894165, "step": 129 }, { "adv/mean_abs_final_conf": 0.5859402418136597, "adv/mean_abs_reasoning": 0.29773178696632385, "adv/mean_abs_step_conf": 0.7566789984703064, "adv/ratio_final_to_reasoning": 1.9680137206173918, "adv/ratio_step_to_reasoning": 2.5414787120324966, "adv/std_final_conf": 0.807863712310791, "adv/std_reasoning": 0.5960418581962585, "adv/std_step_conf": 0.9366109371185303, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 4.6875, "calib/ece": 0.22381889763779533, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6614173228346457, "calib/gap": 0.36159141981613907, "calib/mean_conf": 0.7689370078740158, "calib/mu_c": 0.8956363636363638, "calib/mu_w": 0.5340449438202247, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17157480314960635, "calib/std_conf": 0.37061458317278684, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7777064220183487, "calib/step_q_c_n": 763.0, "calib/step_q_gap": -0.0006459807276467666, "calib/step_q_w": 0.7783524027459955, "calib/step_q_w_n": 437.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1322.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 425.9921875, "completions/mean_terminated_length": 429.3464660644531, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.13866666666666666, "grad_norm": 1.1606765985488892, "kl": 0.07639312744140625, "learning_rate": 1.9722222222222224e-06, "loss": -0.1065, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03955155238509178, "mask/share_reasoning": 0.8354448676109314, "mask/share_step_conf": 0.11719108372926712, "num_tokens": 31167749.0, "reward": 0.7203888893127441, "reward_std": 0.31650310754776, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.779260516166687, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.33417338132858276, "step": 130 }, { "adv/mean_abs_final_conf": 0.666415810585022, "adv/mean_abs_reasoning": 0.36885005235671997, "adv/mean_abs_step_conf": 0.7741599082946777, "adv/ratio_final_to_reasoning": 1.8067390971671113, "adv/ratio_step_to_reasoning": 2.098847223548655, "adv/std_final_conf": 0.8580986261367798, "adv/std_reasoning": 0.681431770324707, "adv/std_step_conf": 0.9363067746162415, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 4.59765625, "calib/ece": 0.2230830039525692, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4426877470355731, "calib/gap": 0.43748826291079795, "calib/mean_conf": 0.6177865612648222, "calib/mu_c": 0.8633333333333332, "calib/mu_w": 0.4258450704225352, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20106719367588932, "calib/std_conf": 0.42161537871196775, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7795635673624289, "calib/step_q_c_n": 527.0, "calib/step_q_gap": 0.027471259670121206, "calib/step_q_w": 0.7520923076923077, "calib/step_q_w_n": 650.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2860.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 455.6328125, "completions/mean_terminated_length": 455.6328125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.13973333333333332, "grad_norm": 1.2342342138290405, "kl": 0.0707244873046875, "learning_rate": 1.944444444444445e-06, "loss": 0.0519, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037672512233257294, "mask/share_reasoning": 0.8480336666107178, "mask/share_step_conf": 0.11429374665021896, "num_tokens": 31390599.0, "reward": 0.5820455551147461, "reward_std": 0.29385387897491455, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.7504953145980835, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.12922075390815735, "step": 131 }, { "adv/mean_abs_final_conf": 0.5761880278587341, "adv/mean_abs_reasoning": 0.4815966486930847, "adv/mean_abs_step_conf": 0.7841116189956665, "adv/ratio_final_to_reasoning": 1.1964120377962415, "adv/ratio_step_to_reasoning": 1.628150073559525, "adv/std_final_conf": 0.7813647985458374, "adv/std_reasoning": 0.739295482635498, "adv/std_step_conf": 0.9361831545829773, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.1968503937007874, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6220472440944882, "calib/gap": 0.4039695274051101, "calib/mean_conf": 0.7380314960629921, "calib/mu_c": 0.8827607361963189, "calib/mu_w": 0.4787912087912088, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1465748031496063, "calib/std_conf": 0.38993950544364675, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7746878680800942, "calib/step_q_c_n": 849.0, "calib/step_q_gap": 0.03248751029834107, "calib/step_q_w": 0.7422003577817531, "calib/step_q_w_n": 559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2070.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 497.36328125, "completions/mean_terminated_length": 497.36328125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.1408, "grad_norm": 0.6892319917678833, "kl": 0.0630340576171875, "learning_rate": 1.916666666666667e-06, "loss": 0.0274, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03631316497921944, "mask/share_reasoning": 0.8372159600257874, "mask/share_step_conf": 0.1264708936214447, "num_tokens": 31623516.0, "reward": 0.7428784966468811, "reward_std": 0.3167954683303833, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7844421863555908, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.37631478905677795, "step": 132 }, { "adv/mean_abs_final_conf": 0.7061909437179565, "adv/mean_abs_reasoning": 0.5574727058410645, "adv/mean_abs_step_conf": 0.7696073055267334, "adv/ratio_final_to_reasoning": 1.2667722317499284, "adv/ratio_step_to_reasoning": 1.3805291227049752, "adv/std_final_conf": 0.8849393129348755, "adv/std_reasoning": 0.8100109696388245, "adv/std_step_conf": 0.9368394017219543, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 6.55078125, "calib/ece": 0.30447154471544724, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5121951219512195, "calib/gap": 0.29895895895895885, "calib/mean_conf": 0.6434146341463416, "calib/mu_c": 0.8074774774774774, "calib/mu_w": 0.5085185185185186, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2483333333333334, "calib/std_conf": 0.42032958277978183, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7722741433021806, "calib/step_q_c_n": 642.0, "calib/step_q_gap": 0.02947221093503083, "calib/step_q_w": 0.7428019323671498, "calib/step_q_w_n": 1035.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 597.9296875, "completions/mean_terminated_length": 600.2745361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.14186666666666667, "grad_norm": 1.599558711051941, "kl": 0.06075286865234375, "learning_rate": 1.888888888888889e-06, "loss": -0.0051, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03061661683022976, "mask/share_reasoning": 0.8480206727981567, "mask/share_step_conf": 0.11745644360780716, "num_tokens": 31882930.0, "reward": 0.5435722470283508, "reward_std": 0.4058837592601776, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.6560851335525513, "rewards/format_reward_step": 0.95703125, "rewards/step_correlation_reward": 0.15293434262275696, "step": 133 }, { "adv/mean_abs_final_conf": 0.7444385290145874, "adv/mean_abs_reasoning": 0.589407205581665, "adv/mean_abs_step_conf": 0.7931458950042725, "adv/ratio_final_to_reasoning": 1.263029229986979, "adv/ratio_step_to_reasoning": 1.34566711688152, "adv/std_final_conf": 0.9073399901390076, "adv/std_reasoning": 0.809898853302002, "adv/std_step_conf": 0.9366623759269714, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.55078125, "calib/ece": 0.29043307086614173, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.2618579234972677, "calib/mean_conf": 0.6325590551181103, "calib/mu_c": 0.7583333333333333, "calib/mu_w": 0.4964754098360656, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20165354330708662, "calib/std_conf": 0.42547720598371147, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7759807956104252, "calib/step_q_c_n": 729.0, "calib/step_q_gap": 0.04129871468556978, "calib/step_q_w": 0.7346820809248554, "calib/step_q_w_n": 692.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2101.0, "completions/max_terminated_length": 2101.0, "completions/mean_length": 555.3203125, "completions/mean_terminated_length": 555.3203125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.14293333333333333, "grad_norm": 1.160878300666809, "kl": 0.057708740234375, "learning_rate": 1.8611111111111113e-06, "loss": 0.0277, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03189656883478165, "mask/share_reasoning": 0.8589299321174622, "mask/share_step_conf": 0.1091734915971756, "num_tokens": 32134044.0, "reward": 0.5736523866653442, "reward_std": 0.3868893086910248, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.678691029548645, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.16783255338668823, "step": 134 }, { "adv/mean_abs_final_conf": 0.6356910467147827, "adv/mean_abs_reasoning": 0.5048134922981262, "adv/mean_abs_step_conf": 0.7687832117080688, "adv/ratio_final_to_reasoning": 1.2592592242747833, "adv/ratio_step_to_reasoning": 1.522905436239907, "adv/std_final_conf": 0.8391205668449402, "adv/std_reasoning": 0.757508397102356, "adv/std_step_conf": 0.9362074136734009, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.88671875, "calib/ece": 0.3074803149606299, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5787401574803149, "calib/gap": 0.25218670725520054, "calib/mean_conf": 0.6886614173228347, "calib/mu_c": 0.7958904109589042, "calib/mu_w": 0.5437037037037037, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21066929133858264, "calib/std_conf": 0.41595892855384226, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.755761421319797, "calib/step_q_c_n": 788.0, "calib/step_q_gap": 0.0126598914171544, "calib/step_q_w": 0.7431015299026426, "calib/step_q_w_n": 719.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 532.046875, "completions/mean_terminated_length": 532.046875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.144, "grad_norm": 1.0681238174438477, "kl": 0.05681610107421875, "learning_rate": 1.8333333333333333e-06, "loss": 0.0755, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03461921960115433, "mask/share_reasoning": 0.8411166071891785, "mask/share_step_conf": 0.12426416575908661, "num_tokens": 32376128.0, "reward": 0.6416856050491333, "reward_std": 0.33301347494125366, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.687390148639679, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.2842622399330139, "step": 135 }, { "adv/mean_abs_final_conf": 0.642751157283783, "adv/mean_abs_reasoning": 0.47995132207870483, "adv/mean_abs_step_conf": 0.7643371820449829, "adv/ratio_final_to_reasoning": 1.3392007224815632, "adv/ratio_step_to_reasoning": 1.5925306315119245, "adv/std_final_conf": 0.8588317632675171, "adv/std_reasoning": 0.720670759677887, "adv/std_step_conf": 0.936138391494751, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.26171875, "calib/ece": 0.2332806324110672, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.466403162055336, "calib/gap": 0.4076762940735185, "calib/mean_conf": 0.5818181818181818, "calib/mu_c": 0.7816279069767443, "calib/mu_w": 0.3739516129032258, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15260869565217391, "calib/std_conf": 0.44426500849401623, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7722076023391814, "calib/step_q_c_n": 684.0, "calib/step_q_gap": 0.021559035220026, "calib/step_q_w": 0.7506485671191554, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 490.33203125, "completions/mean_terminated_length": 492.25494384765625, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.14506666666666668, "grad_norm": 1.1984667778015137, "kl": 0.06342315673828125, "learning_rate": 1.8055555555555557e-06, "loss": -0.0034, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.035547420382499695, "mask/share_reasoning": 0.8394378423690796, "mask/share_step_conf": 0.12110848724842072, "num_tokens": 32610141.0, "reward": 0.6734086871147156, "reward_std": 0.28505226969718933, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7425054311752319, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.30509310960769653, "step": 136 }, { "adv/mean_abs_final_conf": 0.6236674785614014, "adv/mean_abs_reasoning": 0.44112664461135864, "adv/mean_abs_step_conf": 0.787781834602356, "adv/ratio_final_to_reasoning": 1.4138059584019569, "adv/ratio_step_to_reasoning": 1.7858405159280448, "adv/std_final_conf": 0.8301323056221008, "adv/std_reasoning": 0.7206512093544006, "adv/std_step_conf": 0.9363427758216858, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 6.37109375, "calib/ece": 0.25276000000000015, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.552, "calib/gap": 0.3686814182194618, "calib/mean_conf": 0.66628, "calib/mu_c": 0.8314492753623189, "calib/mu_w": 0.4627678571428571, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18352000000000016, "calib/std_conf": 0.4228656543158832, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7584940554821665, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.05876865502450068, "calib/step_q_w": 0.6997254004576658, "calib/step_q_w_n": 874.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2995.0, "completions/max_terminated_length": 2995.0, "completions/mean_length": 545.9296875, "completions/mean_terminated_length": 550.2283325195312, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.14613333333333334, "grad_norm": 0.9670639634132385, "kl": 0.05837249755859375, "learning_rate": 1.777777777777778e-06, "loss": -0.0709, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.031904879957437515, "mask/share_reasoning": 0.8407139778137207, "mask/share_step_conf": 0.11956866085529327, "num_tokens": 32856883.0, "reward": 0.673137903213501, "reward_std": 0.32800668478012085, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7256796956062317, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.31825241446495056, "step": 137 }, { "adv/mean_abs_final_conf": 0.6485915184020996, "adv/mean_abs_reasoning": 0.5356817841529846, "adv/mean_abs_step_conf": 0.799678385257721, "adv/ratio_final_to_reasoning": 1.2107776250552311, "adv/ratio_step_to_reasoning": 1.492823554794131, "adv/std_final_conf": 0.8604959845542908, "adv/std_reasoning": 0.7754703760147095, "adv/std_step_conf": 0.9364621639251709, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.45703125, "calib/ece": 0.1934, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.56, "calib/gap": 0.4300981996726677, "calib/mean_conf": 0.6663600000000001, "calib/mu_c": 0.8280769230769232, "calib/mu_w": 0.39797872340425544, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11788, "calib/std_conf": 0.42195728504198143, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7794161490683229, "calib/step_q_c_n": 805.0, "calib/step_q_gap": 0.06171344636562026, "calib/step_q_w": 0.7177027027027026, "calib/step_q_w_n": 592.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2924.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 523.69921875, "completions/mean_terminated_length": 523.69921875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.1472, "grad_norm": 0.9509099721908569, "kl": 0.06456756591796875, "learning_rate": 1.75e-06, "loss": 0.0622, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03561119735240936, "mask/share_reasoning": 0.8460170030593872, "mask/share_step_conf": 0.11837181448936462, "num_tokens": 33095286.0, "reward": 0.7132434844970703, "reward_std": 0.3461204767227173, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7687472105026245, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.34211456775665283, "step": 138 }, { "adv/mean_abs_final_conf": 0.6053095459938049, "adv/mean_abs_reasoning": 0.373541921377182, "adv/mean_abs_step_conf": 0.7681447863578796, "adv/ratio_final_to_reasoning": 1.6204594755044823, "adv/ratio_step_to_reasoning": 2.056381740303385, "adv/std_final_conf": 0.8284891247749329, "adv/std_reasoning": 0.6813934445381165, "adv/std_step_conf": 0.9363941550254822, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.26953125, "calib/ece": 0.14152941176470588, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5019607843137255, "calib/gap": 0.5119793621013133, "calib/mean_conf": 0.6561960784313726, "calib/mu_c": 0.8389024390243903, "calib/mu_w": 0.3269230769230769, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0772941176470588, "calib/std_conf": 0.4065156109492053, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7863399503722084, "calib/step_q_c_n": 806.0, "calib/step_q_gap": 0.01720551206649934, "calib/step_q_w": 0.769134438305709, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 463.91796875, "completions/mean_terminated_length": 463.91796875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.14826666666666666, "grad_norm": 1.222753882408142, "kl": 0.06946563720703125, "learning_rate": 1.7222222222222224e-06, "loss": 0.06, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.036902882158756256, "mask/share_reasoning": 0.8469626903533936, "mask/share_step_conf": 0.11613443493843079, "num_tokens": 33317145.0, "reward": 0.7846008539199829, "reward_std": 0.28596794605255127, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.8367917537689209, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.4050661027431488, "step": 139 }, { "adv/mean_abs_final_conf": 0.5658727884292603, "adv/mean_abs_reasoning": 0.36575832962989807, "adv/mean_abs_step_conf": 0.7847108840942383, "adv/ratio_final_to_reasoning": 1.5471220819546423, "adv/ratio_step_to_reasoning": 2.1454354433657548, "adv/std_final_conf": 0.8223967552185059, "adv/std_reasoning": 0.6402462124824524, "adv/std_step_conf": 0.9362854957580566, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.16248031496062992, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6496062992125984, "calib/gap": 0.45311206896551726, "calib/mean_conf": 0.7481496062992127, "calib/mu_c": 0.8908620689655172, "calib/mu_w": 0.43775, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11279527559055116, "calib/std_conf": 0.38605225011995414, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7944318181818182, "calib/step_q_c_n": 880.0, "calib/step_q_gap": 0.04142276840806269, "calib/step_q_w": 0.7530090497737555, "calib/step_q_w_n": 442.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 476.54296875, "completions/mean_terminated_length": 476.54296875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.14933333333333335, "grad_norm": 1.129404902458191, "kl": 0.06432342529296875, "learning_rate": 1.6944444444444446e-06, "loss": 0.0477, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03652922064065933, "mask/share_reasoning": 0.8423087000846863, "mask/share_step_conf": 0.12116207182407379, "num_tokens": 33544156.0, "reward": 0.7760652899742126, "reward_std": 0.25756433606147766, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.8202886581420898, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.39746689796447754, "step": 140 }, { "adv/mean_abs_final_conf": 0.5698114633560181, "adv/mean_abs_reasoning": 0.4262760877609253, "adv/mean_abs_step_conf": 0.7613615989685059, "adv/ratio_final_to_reasoning": 1.3367192758782984, "adv/ratio_step_to_reasoning": 1.786076256277156, "adv/std_final_conf": 0.8269156813621521, "adv/std_reasoning": 0.7204464077949524, "adv/std_step_conf": 0.936262309551239, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.6328125, "calib/ece": 0.13992156862745117, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5725490196078431, "calib/gap": 0.5900812739526893, "calib/mean_conf": 0.6641568627450981, "calib/mu_c": 0.8816770186335403, "calib/mu_w": 0.29159574468085103, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08635294117647077, "calib/std_conf": 0.4382934540518116, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7781044267877412, "calib/step_q_c_n": 881.0, "calib/step_q_gap": 0.02945915049540604, "calib/step_q_w": 0.7486452762923351, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 543.65625, "completions/mean_terminated_length": 543.65625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.1504, "grad_norm": 1.2356594800949097, "kl": 0.05931854248046875, "learning_rate": 1.6666666666666667e-06, "loss": 0.0461, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03289218991994858, "mask/share_reasoning": 0.8534107208251953, "mask/share_step_conf": 0.1136970967054367, "num_tokens": 33790428.0, "reward": 0.7739890217781067, "reward_std": 0.29194456338882446, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.8415335416793823, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.3822256922721863, "step": 141 }, { "adv/mean_abs_final_conf": 0.6223675012588501, "adv/mean_abs_reasoning": 0.4059096574783325, "adv/mean_abs_step_conf": 0.7754439115524292, "adv/ratio_final_to_reasoning": 1.5332660600519763, "adv/ratio_step_to_reasoning": 1.9103854694411218, "adv/std_final_conf": 0.8296803832054138, "adv/std_reasoning": 0.6816391944885254, "adv/std_step_conf": 0.9365256428718567, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.73046875, "calib/ece": 0.21577689243027892, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5378486055776892, "calib/gap": 0.4868471743844878, "calib/mean_conf": 0.623585657370518, "calib/mu_c": 0.8505223880597015, "calib/mu_w": 0.36367521367521366, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.15274900398406377, "calib/std_conf": 0.44704952580154755, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7987088274044796, "calib/step_q_c_n": 759.0, "calib/step_q_gap": 0.03267775395815198, "calib/step_q_w": 0.7660310734463276, "calib/step_q_w_n": 708.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2488.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 538.14453125, "completions/mean_terminated_length": 540.2549438476562, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.15146666666666667, "grad_norm": 0.9943637847900391, "kl": 0.06036376953125, "learning_rate": 1.638888888888889e-06, "loss": 0.0574, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03388165682554245, "mask/share_reasoning": 0.8415210247039795, "mask/share_step_conf": 0.12069105356931686, "num_tokens": 34033353.0, "reward": 0.6756160259246826, "reward_std": 0.3330475091934204, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7663015127182007, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.28493040800094604, "step": 142 }, { "adv/mean_abs_final_conf": 0.5717636346817017, "adv/mean_abs_reasoning": 0.36487457156181335, "adv/mean_abs_step_conf": 0.7797194719314575, "adv/ratio_final_to_reasoning": 1.567014199521545, "adv/ratio_step_to_reasoning": 2.136952072581921, "adv/std_final_conf": 0.7936625480651855, "adv/std_reasoning": 0.6402057409286499, "adv/std_step_conf": 0.936142086982727, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.34375, "calib/ece": 0.15418972332015812, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.49407114624505927, "calib/gap": 0.5077342540010324, "calib/mean_conf": 0.6300790513833993, "calib/mu_c": 0.8387919463087248, "calib/mu_w": 0.33105769230769233, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09766798418972335, "calib/std_conf": 0.4263485854093139, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7874529485570891, "calib/step_q_c_n": 797.0, "calib/step_q_gap": 0.08412767648937469, "calib/step_q_w": 0.7033252720677144, "calib/step_q_w_n": 827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2522.0, "completions/max_terminated_length": 2522.0, "completions/mean_length": 520.98828125, "completions/mean_terminated_length": 520.98828125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.15253333333333333, "grad_norm": 1.2129403352737427, "kl": 0.06390380859375, "learning_rate": 1.6111111111111113e-06, "loss": -0.0244, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03274935856461525, "mask/share_reasoning": 0.8389418125152588, "mask/share_step_conf": 0.12830880284309387, "num_tokens": 34274062.0, "reward": 0.7248586416244507, "reward_std": 0.2750049829483032, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.810666024684906, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.3249887228012085, "step": 143 }, { "adv/mean_abs_final_conf": 0.5650076270103455, "adv/mean_abs_reasoning": 0.37447911500930786, "adv/mean_abs_step_conf": 0.7701300382614136, "adv/ratio_final_to_reasoning": 1.5087827447901918, "adv/ratio_step_to_reasoning": 2.056536686277609, "adv/std_final_conf": 0.777367353439331, "adv/std_reasoning": 0.6612008213996887, "adv/std_step_conf": 0.9364955425262451, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.3359375, "calib/ece": 0.25279527559055104, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.515748031496063, "calib/gap": 0.4561072261072261, "calib/mean_conf": 0.5712992125984252, "calib/mu_c": 0.7113636363636364, "calib/mu_w": 0.2552564102564103, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0655905511811023, "calib/std_conf": 0.46625443047373594, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.782332990750257, "calib/step_q_c_n": 973.0, "calib/step_q_gap": 0.011696858434735447, "calib/step_q_w": 0.7706361323155215, "calib/step_q_w_n": 393.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 497.33984375, "completions/mean_terminated_length": 497.33984375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.1536, "grad_norm": 1.0443801879882812, "kl": 0.07248687744140625, "learning_rate": 1.5833333333333333e-06, "loss": 0.0468, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.035232387483119965, "mask/share_reasoning": 0.8452960848808289, "mask/share_step_conf": 0.11947152018547058, "num_tokens": 34505509.0, "reward": 0.7161462306976318, "reward_std": 0.26303207874298096, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7432847619056702, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.35307005047798157, "step": 144 }, { "adv/mean_abs_final_conf": 0.5799102783203125, "adv/mean_abs_reasoning": 0.38810279965400696, "adv/mean_abs_step_conf": 0.7869246006011963, "adv/ratio_final_to_reasoning": 1.4942182299053282, "adv/ratio_step_to_reasoning": 2.027618974412806, "adv/std_final_conf": 0.8257190585136414, "adv/std_reasoning": 0.6612595319747925, "adv/std_step_conf": 0.9363152384757996, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.859375, "calib/ece": 0.20509803921568617, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5529411764705883, "calib/gap": 0.41492227284910205, "calib/mean_conf": 0.6705882352941176, "calib/mu_c": 0.8186585365853658, "calib/mu_w": 0.40373626373626376, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11627450980392148, "calib/std_conf": 0.41495481334296347, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7913289036544852, "calib/step_q_c_n": 903.0, "calib/step_q_gap": 0.04030712811009651, "calib/step_q_w": 0.7510217755443886, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1745.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 483.30859375, "completions/mean_terminated_length": 483.30859375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.15466666666666667, "grad_norm": 1.0792020559310913, "kl": 0.06833648681640625, "learning_rate": 1.5555555555555558e-06, "loss": 0.0689, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.036209069192409515, "mask/share_reasoning": 0.8316611051559448, "mask/share_step_conf": 0.13212978839874268, "num_tokens": 34731940.0, "reward": 0.7233527898788452, "reward_std": 0.2534133493900299, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7849280834197998, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.3344336152076721, "step": 145 }, { "adv/mean_abs_final_conf": 0.5825879573822021, "adv/mean_abs_reasoning": 0.4273000657558441, "adv/mean_abs_step_conf": 0.7795478105545044, "adv/ratio_final_to_reasoning": 1.363416493633512, "adv/ratio_step_to_reasoning": 1.824356870096837, "adv/std_final_conf": 0.8095422387123108, "adv/std_reasoning": 0.6817421317100525, "adv/std_step_conf": 0.9366424679756165, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.515625, "calib/ece": 0.24007936507936495, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4246031746031746, "calib/gap": 0.40491050234169496, "calib/mean_conf": 0.5349999999999999, "calib/mu_c": 0.7647706422018349, "calib/mu_w": 0.3598601398601399, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17126984126984116, "calib/std_conf": 0.4514329038369564, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7802083333333334, "calib/step_q_c_n": 624.0, "calib/step_q_gap": 0.017543358714044, "calib/step_q_w": 0.7626649746192894, "calib/step_q_w_n": 788.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 518.2265625, "completions/mean_terminated_length": 522.3070678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.15573333333333333, "grad_norm": 0.8863820433616638, "kl": 0.06182098388671875, "learning_rate": 1.527777777777778e-06, "loss": -0.0396, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034002456814050674, "mask/share_reasoning": 0.8431822657585144, "mask/share_step_conf": 0.11500281095504761, "num_tokens": 34971822.0, "reward": 0.5688153505325317, "reward_std": 0.3346751928329468, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.7274835705757141, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.12811581790447235, "step": 146 }, { "adv/mean_abs_final_conf": 0.5607386231422424, "adv/mean_abs_reasoning": 0.4408961236476898, "adv/mean_abs_step_conf": 0.7701042890548706, "adv/ratio_final_to_reasoning": 1.2718157249899436, "adv/ratio_step_to_reasoning": 1.7466796547983343, "adv/std_final_conf": 0.7955544590950012, "adv/std_reasoning": 0.7205079793930054, "adv/std_step_conf": 0.9366425275802612, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.39453125, "calib/ece": 0.28837944664031623, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5533596837944664, "calib/gap": 0.40376332622601274, "calib/mean_conf": 0.6104347826086957, "calib/mu_c": 0.8242857142857143, "calib/mu_w": 0.42052238805970155, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21422924901185775, "calib/std_conf": 0.464865368643528, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.789968895800933, "calib/step_q_c_n": 643.0, "calib/step_q_gap": 0.032272418836163386, "calib/step_q_w": 0.7576964769647696, "calib/step_q_w_n": 738.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2461.0, "completions/max_terminated_length": 2461.0, "completions/mean_length": 509.8359375, "completions/mean_terminated_length": 509.8359375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.1568, "grad_norm": 0.824609100818634, "kl": 0.0667724609375, "learning_rate": 1.5e-06, "loss": 0.0311, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0330023393034935, "mask/share_reasoning": 0.8518515825271606, "mask/share_step_conf": 0.11514610797166824, "num_tokens": 35206020.0, "reward": 0.5513256788253784, "reward_std": 0.31428655982017517, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.704379677772522, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.10842788219451904, "step": 147 }, { "adv/mean_abs_final_conf": 0.5014116764068604, "adv/mean_abs_reasoning": 0.4464901089668274, "adv/mean_abs_step_conf": 0.7467024326324463, "adv/ratio_final_to_reasoning": 1.1230073552292588, "adv/ratio_step_to_reasoning": 1.672382921002005, "adv/std_final_conf": 0.7539214491844177, "adv/std_reasoning": 0.7205377221107483, "adv/std_step_conf": 0.9365345239639282, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.04296875, "calib/ece": 0.24138888888888896, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6507936507936508, "calib/gap": 0.39733328145669616, "calib/mean_conf": 0.7018650793650794, "calib/mu_c": 0.8138121546961327, "calib/mu_w": 0.4164788732394365, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11250000000000004, "calib/std_conf": 0.43841898227454124, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8063097949886106, "calib/step_q_c_n": 878.0, "calib/step_q_gap": 0.04364635673195194, "calib/step_q_w": 0.7626634382566586, "calib/step_q_w_n": 413.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 460.0703125, "completions/mean_terminated_length": 460.0703125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.15786666666666666, "grad_norm": 0.8574560880661011, "kl": 0.0750579833984375, "learning_rate": 1.4722222222222225e-06, "loss": 0.0691, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03910716623067856, "mask/share_reasoning": 0.8357589244842529, "mask/share_step_conf": 0.1251339316368103, "num_tokens": 35428910.0, "reward": 0.7605050802230835, "reward_std": 0.31956398487091064, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7500936985015869, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.4334164261817932, "step": 148 }, { "adv/mean_abs_final_conf": 0.559969961643219, "adv/mean_abs_reasoning": 0.5347915887832642, "adv/mean_abs_step_conf": 0.779393196105957, "adv/ratio_final_to_reasoning": 1.0470807196448988, "adv/ratio_step_to_reasoning": 1.4573774390865055, "adv/std_final_conf": 0.7960095405578613, "adv/std_reasoning": 0.7926666140556335, "adv/std_step_conf": 0.9364703893661499, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.55859375, "calib/ece": 0.15338645418326702, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5537848605577689, "calib/gap": 0.6183800500065797, "calib/mean_conf": 0.6292430278884462, "calib/mu_c": 0.8805369127516778, "calib/mu_w": 0.26215686274509803, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09450199203187258, "calib/std_conf": 0.45343380535403827, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8122222222222223, "calib/step_q_c_n": 738.0, "calib/step_q_gap": 0.058528791565287897, "calib/step_q_w": 0.7536934306569344, "calib/step_q_w_n": 685.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 540.37890625, "completions/mean_terminated_length": 540.37890625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.15893333333333334, "grad_norm": 0.8028203845024109, "kl": 0.06542205810546875, "learning_rate": 1.4444444444444445e-06, "loss": -0.0181, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035116180777549744, "mask/share_reasoning": 0.8498522043228149, "mask/share_step_conf": 0.11503161489963531, "num_tokens": 35671703.0, "reward": 0.7139464616775513, "reward_std": 0.28929561376571655, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.8336374759674072, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2801927924156189, "step": 149 }, { "adv/mean_abs_final_conf": 0.5604312419891357, "adv/mean_abs_reasoning": 0.42686715722084045, "adv/mean_abs_step_conf": 0.7721836566925049, "adv/ratio_final_to_reasoning": 1.3128937949639345, "adv/ratio_step_to_reasoning": 1.808955417699222, "adv/std_final_conf": 0.8109138011932373, "adv/std_reasoning": 0.7013243436813354, "adv/std_step_conf": 0.9364561438560486, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.37109375, "calib/ece": 0.21272727272727263, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5928853754940712, "calib/gap": 0.46234474522292995, "calib/mean_conf": 0.6762845849802371, "calib/mu_c": 0.8517197452229299, "calib/mu_w": 0.38937499999999997, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1342292490118576, "calib/std_conf": 0.4354993840153992, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7802990430622009, "calib/step_q_c_n": 836.0, "calib/step_q_gap": 0.0202248315594179, "calib/step_q_w": 0.760074211502783, "calib/step_q_w_n": 539.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2370.0, "completions/max_terminated_length": 2370.0, "completions/mean_length": 449.515625, "completions/mean_terminated_length": 449.515625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.16, "grad_norm": 1.2784284353256226, "kl": 0.07811737060546875, "learning_rate": 1.4166666666666667e-06, "loss": -0.0072, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04103165864944458, "mask/share_reasoning": 0.8245840072631836, "mask/share_step_conf": 0.13438434898853302, "num_tokens": 35891739.0, "reward": 0.7148318290710449, "reward_std": 0.3131522536277771, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7763429880142212, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.33378952741622925, "step": 150 }, { "adv/mean_abs_final_conf": 0.6466965675354004, "adv/mean_abs_reasoning": 0.4281116724014282, "adv/mean_abs_step_conf": 0.7777887582778931, "adv/ratio_final_to_reasoning": 1.5105791531164123, "adv/ratio_step_to_reasoning": 1.8167894229909773, "adv/std_final_conf": 0.8248745203018188, "adv/std_reasoning": 0.6818034648895264, "adv/std_step_conf": 0.9367329478263855, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 4.73828125, "calib/ece": 0.26645418326693227, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4103585657370518, "calib/gap": 0.3799674329501915, "calib/mean_conf": 0.503824701195219, "calib/mu_c": 0.7081896551724137, "calib/mu_w": 0.32822222222222225, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1540637450199203, "calib/std_conf": 0.4633024030594787, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7923764258555133, "calib/step_q_c_n": 526.0, "calib/step_q_gap": 0.019756338519268724, "calib/step_q_w": 0.7726200873362445, "calib/step_q_w_n": 687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 511.9140625, "completions/mean_terminated_length": 511.9140625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.16106666666666666, "grad_norm": 0.9701210856437683, "kl": 0.06146240234375, "learning_rate": 1.3888888888888892e-06, "loss": 0.0083, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034223850816488266, "mask/share_reasoning": 0.8605751991271973, "mask/share_step_conf": 0.10520097613334656, "num_tokens": 36129813.0, "reward": 0.5742936134338379, "reward_std": 0.34442201256752014, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.7058960795402527, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.1567535698413849, "step": 151 }, { "adv/mean_abs_final_conf": 0.704607367515564, "adv/mean_abs_reasoning": 0.5353569984436035, "adv/mean_abs_step_conf": 0.7802865505218506, "adv/ratio_final_to_reasoning": 1.3161448707386048, "adv/ratio_step_to_reasoning": 1.4575069585161105, "adv/std_final_conf": 0.8944112062454224, "adv/std_reasoning": 0.7928844690322876, "adv/std_step_conf": 0.9365404844284058, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 5.25390625, "calib/ece": 0.21467741935483878, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3870967741935484, "calib/gap": 0.4737096774193549, "calib/mean_conf": 0.4905645161290323, "calib/mu_c": 0.7274193548387097, "calib/mu_w": 0.2537096774193548, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.10262096774193558, "calib/std_conf": 0.45762148469780556, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7988473053892216, "calib/step_q_c_n": 668.0, "calib/step_q_gap": 0.03602603507902946, "calib/step_q_w": 0.7628212703101921, "calib/step_q_w_n": 677.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 498.03125, "completions/mean_terminated_length": 499.9843444824219, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.16213333333333332, "grad_norm": 1.3831433057785034, "kl": 0.06764984130859375, "learning_rate": 1.3611111111111112e-06, "loss": -0.0439, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03510105982422829, "mask/share_reasoning": 0.8406826257705688, "mask/share_step_conf": 0.12031005322933197, "num_tokens": 36362701.0, "reward": 0.6193888187408447, "reward_std": 0.3312366008758545, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7491500377655029, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.19978387653827667, "step": 152 }, { "adv/mean_abs_final_conf": 0.687443196773529, "adv/mean_abs_reasoning": 0.42590320110321045, "adv/mean_abs_step_conf": 0.7630718350410461, "adv/ratio_final_to_reasoning": 1.6140831883696944, "adv/ratio_step_to_reasoning": 1.7916555523989324, "adv/std_final_conf": 0.8911882638931274, "adv/std_reasoning": 0.7014328837394714, "adv/std_step_conf": 0.9365268349647522, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 5.00390625, "calib/ece": 0.3052610441767068, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3815261044176707, "calib/gap": 0.3449684791174153, "calib/mean_conf": 0.4744176706827309, "calib/mu_c": 0.6240425531914894, "calib/mu_w": 0.2790740740740741, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10670682730923693, "calib/std_conf": 0.46422514169065293, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8049849397590362, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.06106273554509789, "calib/step_q_w": 0.7439222042139383, "calib/step_q_w_n": 617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2649.0, "completions/max_terminated_length": 2649.0, "completions/mean_length": 527.59375, "completions/mean_terminated_length": 527.59375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.1632, "grad_norm": 1.2195909023284912, "kl": 0.0627899169921875, "learning_rate": 1.3333333333333334e-06, "loss": 0.0899, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03333558142185211, "mask/share_reasoning": 0.8590217232704163, "mask/share_step_conf": 0.10764269530773163, "num_tokens": 36605085.0, "reward": 0.6315809488296509, "reward_std": 0.29708153009414673, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6768605709075928, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.2823951244354248, "step": 153 }, { "adv/mean_abs_final_conf": 0.5603320598602295, "adv/mean_abs_reasoning": 0.3724971115589142, "adv/mean_abs_step_conf": 0.7861723303794861, "adv/ratio_final_to_reasoning": 1.5042588048944034, "adv/ratio_step_to_reasoning": 2.1105461115908413, "adv/std_final_conf": 0.7921335697174072, "adv/std_reasoning": 0.6815275549888611, "adv/std_step_conf": 0.936652421951294, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 4.6171875, "calib/ece": 0.2451778656126483, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4505928853754941, "calib/gap": 0.4696871480415467, "calib/mean_conf": 0.5033596837944664, "calib/mu_c": 0.7465573770491803, "calib/mu_w": 0.27687022900763364, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13316205533596845, "calib/std_conf": 0.47590165834353765, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.825287569573284, "calib/step_q_c_n": 539.0, "calib/step_q_gap": 0.04208383706939589, "calib/step_q_w": 0.7832037325038881, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2635.0, "completions/max_terminated_length": 2635.0, "completions/mean_length": 461.99609375, "completions/mean_terminated_length": 461.99609375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.16426666666666667, "grad_norm": 0.8897591233253479, "kl": 0.0737762451171875, "learning_rate": 1.3055555555555556e-06, "loss": 0.0144, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03781069815158844, "mask/share_reasoning": 0.8530268669128418, "mask/share_step_conf": 0.10916240513324738, "num_tokens": 36827796.0, "reward": 0.5996102094650269, "reward_std": 0.31764936447143555, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7490511536598206, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.15720053017139435, "step": 154 }, { "adv/mean_abs_final_conf": 0.7118638753890991, "adv/mean_abs_reasoning": 0.4428885281085968, "adv/mean_abs_step_conf": 0.7825101613998413, "adv/ratio_final_to_reasoning": 1.607320646640342, "adv/ratio_step_to_reasoning": 1.7668332136342193, "adv/std_final_conf": 0.8903141617774963, "adv/std_reasoning": 0.7205024361610413, "adv/std_step_conf": 0.9366476535797119, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 4.57421875, "calib/ece": 0.278515625, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.35546875, "calib/gap": 0.3834916559691913, "calib/mean_conf": 0.416796875, "calib/mu_c": 0.6010526315789474, "calib/mu_w": 0.2175609756097561, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08789062500000003, "calib/std_conf": 0.46407575081039776, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8091276252019386, "calib/step_q_c_n": 619.0, "calib/step_q_gap": 0.04829429186860523, "calib/step_q_w": 0.7608333333333334, "calib/step_q_w_n": 552.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 424.3828125, "completions/mean_terminated_length": 426.0470886230469, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.16533333333333333, "grad_norm": 3.718019962310791, "kl": 0.12290191650390625, "learning_rate": 1.2777777777777779e-06, "loss": -0.001, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03905310481786728, "mask/share_reasoning": 0.8445406556129456, "mask/share_step_conf": 0.11249996721744537, "num_tokens": 37043654.0, "reward": 0.6309127807617188, "reward_std": 0.28129005432128906, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7120078206062317, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.24669276177883148, "step": 155 }, { "adv/mean_abs_final_conf": 0.6478233337402344, "adv/mean_abs_reasoning": 0.42158418893814087, "adv/mean_abs_step_conf": 0.7578214406967163, "adv/ratio_final_to_reasoning": 1.5366404878037057, "adv/ratio_step_to_reasoning": 1.7975565986131221, "adv/std_final_conf": 0.8765128254890442, "adv/std_reasoning": 0.7012465000152588, "adv/std_step_conf": 0.9365244507789612, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.453125, "calib/ece": 0.3049200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.46, "calib/gap": 0.33457616066771, "calib/mean_conf": 0.51828, "calib/mu_c": 0.6628169014084507, "calib/mu_w": 0.3282407407407407, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12760000000000007, "calib/std_conf": 0.4733377669275926, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7888337468982631, "calib/step_q_c_n": 806.0, "calib/step_q_gap": 0.005935441813517461, "calib/step_q_w": 0.7828983050847457, "calib/step_q_w_n": 590.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 505.36328125, "completions/mean_terminated_length": 505.36328125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.1664, "grad_norm": 1.0986273288726807, "kl": 0.06716537475585938, "learning_rate": 1.25e-06, "loss": 0.0407, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0364220067858696, "mask/share_reasoning": 0.8422032594680786, "mask/share_step_conf": 0.12137471139431, "num_tokens": 37277787.0, "reward": 0.6499598622322083, "reward_std": 0.27834880352020264, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6760715246200562, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.31759825348854065, "step": 156 }, { "adv/mean_abs_final_conf": 0.5612874031066895, "adv/mean_abs_reasoning": 0.497196763753891, "adv/mean_abs_step_conf": 0.7866624593734741, "adv/ratio_final_to_reasoning": 1.1289039752972385, "adv/ratio_step_to_reasoning": 1.5821954540373209, "adv/std_final_conf": 0.816787600517273, "adv/std_reasoning": 0.775337278842926, "adv/std_step_conf": 0.9363912343978882, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.40625, "calib/ece": 0.17897233201581028, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5138339920948617, "calib/gap": 0.5861313772138514, "calib/mean_conf": 0.580790513833992, "calib/mu_c": 0.8055128205128205, "calib/mu_w": 0.21938144329896908, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.0715810276679842, "calib/std_conf": 0.4663229380995103, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8149248554913294, "calib/step_q_c_n": 865.0, "calib/step_q_gap": 0.04328709055876678, "calib/step_q_w": 0.7716377649325626, "calib/step_q_w_n": 519.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2007.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 491.4765625, "completions/mean_terminated_length": 491.4765625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.16746666666666668, "grad_norm": 0.81620192527771, "kl": 0.0667572021484375, "learning_rate": 1.2222222222222223e-06, "loss": 0.0572, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.035581715404987335, "mask/share_reasoning": 0.8397380113601685, "mask/share_step_conf": 0.12468031048774719, "num_tokens": 37507333.0, "reward": 0.7193992137908936, "reward_std": 0.30614110827445984, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.8084452748298645, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3116031289100647, "step": 157 }, { "adv/mean_abs_final_conf": 0.674676775932312, "adv/mean_abs_reasoning": 0.5534849166870117, "adv/mean_abs_step_conf": 0.7770692110061646, "adv/ratio_final_to_reasoning": 1.2189614487974072, "adv/ratio_step_to_reasoning": 1.4039573393570663, "adv/std_final_conf": 0.8448676466941833, "adv/std_reasoning": 0.7928907871246338, "adv/std_step_conf": 0.9361749887466431, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 4.7421875, "calib/ece": 0.2972908366533865, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.49800796812749004, "calib/gap": 0.2891368120456906, "calib/mean_conf": 0.5741035856573706, "calib/mu_c": 0.6973611111111111, "calib/mu_w": 0.4082242990654205, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14884462151394423, "calib/std_conf": 0.45852384535599827, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.811062091503268, "calib/step_q_c_n": 612.0, "calib/step_q_gap": 0.013138503463400841, "calib/step_q_w": 0.7979235880398672, "calib/step_q_w_n": 602.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2624.0, "completions/max_terminated_length": 2624.0, "completions/mean_length": 492.203125, "completions/mean_terminated_length": 492.203125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.16853333333333334, "grad_norm": 1.1106908321380615, "kl": 0.06908416748046875, "learning_rate": 1.1944444444444446e-06, "loss": 0.104, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03861449286341667, "mask/share_reasoning": 0.8458616137504578, "mask/share_step_conf": 0.11552391946315765, "num_tokens": 37738577.0, "reward": 0.6030690670013428, "reward_std": 0.34117481112480164, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6703546643257141, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.22797095775604248, "step": 158 }, { "adv/mean_abs_final_conf": 0.615410327911377, "adv/mean_abs_reasoning": 0.48913106322288513, "adv/mean_abs_step_conf": 0.7589163780212402, "adv/ratio_final_to_reasoning": 1.258170609440418, "adv/ratio_step_to_reasoning": 1.5515603793812223, "adv/std_final_conf": 0.8271223902702332, "adv/std_reasoning": 0.7575496435165405, "adv/std_step_conf": 0.9364922046661377, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 4.85546875, "calib/ece": 0.23095999999999983, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.472, "calib/gap": 0.4849155844155844, "calib/mean_conf": 0.53528, "calib/mu_c": 0.7486428571428572, "calib/mu_w": 0.26372727272727275, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10311999999999985, "calib/std_conf": 0.47182212919701, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8135846153846155, "calib/step_q_c_n": 650.0, "calib/step_q_gap": 0.034950551303671085, "calib/step_q_w": 0.7786340640809444, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 472.08984375, "completions/mean_terminated_length": 473.9411926269531, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.1696, "grad_norm": 1.1364686489105225, "kl": 0.071441650390625, "learning_rate": 1.1666666666666668e-06, "loss": 0.0712, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03742946684360504, "mask/share_reasoning": 0.8430386185646057, "mask/share_step_conf": 0.11562561988830566, "num_tokens": 37964216.0, "reward": 0.7366165518760681, "reward_std": 0.3033605217933655, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7513078451156616, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.41645652055740356, "step": 159 }, { "adv/mean_abs_final_conf": 0.5849112272262573, "adv/mean_abs_reasoning": 0.48765817284584045, "adv/mean_abs_step_conf": 0.8051754832267761, "adv/ratio_final_to_reasoning": 1.1994287388087326, "adv/ratio_step_to_reasoning": 1.6511063036798728, "adv/std_final_conf": 0.7948373556137085, "adv/std_reasoning": 0.7393858432769775, "adv/std_step_conf": 0.9365575313568115, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 4.58203125, "calib/ece": 0.25127490039840633, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4342629482071713, "calib/gap": 0.4081174968071521, "calib/mean_conf": 0.5170916334661354, "calib/mu_c": 0.7057037037037038, "calib/mu_w": 0.29758620689655174, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11525896414342628, "calib/std_conf": 0.4678525020931935, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8138113207547171, "calib/step_q_c_n": 530.0, "calib/step_q_gap": 0.027172129463892913, "calib/step_q_w": 0.7866391912908242, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1907.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 482.98828125, "completions/mean_terminated_length": 482.98828125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.17066666666666666, "grad_norm": 0.8197259902954102, "kl": 0.068145751953125, "learning_rate": 1.138888888888889e-06, "loss": 0.0677, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035837285220623016, "mask/share_reasoning": 0.8586567044258118, "mask/share_step_conf": 0.10550597310066223, "num_tokens": 38192701.0, "reward": 0.624016284942627, "reward_std": 0.3329963684082031, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.719306230545044, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.22794502973556519, "step": 160 }, { "adv/mean_abs_final_conf": 0.5999249219894409, "adv/mean_abs_reasoning": 0.4801775813102722, "adv/mean_abs_step_conf": 0.772857666015625, "adv/ratio_final_to_reasoning": 1.249381365020023, "adv/ratio_step_to_reasoning": 1.6095246760723596, "adv/std_final_conf": 0.8302241563796997, "adv/std_reasoning": 0.7392877340316772, "adv/std_step_conf": 0.9364839196205139, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 4.69921875, "calib/ece": 0.2676470588235294, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4470588235294118, "calib/gap": 0.43562146892655373, "calib/mean_conf": 0.5373725490196078, "calib/mu_c": 0.6706214689265537, "calib/mu_w": 0.235, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05545098039215686, "calib/std_conf": 0.4581785746903353, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7975471698113207, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.02025139516343355, "calib/step_q_w": 0.7772957746478871, "calib/step_q_w_n": 355.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 437.78515625, "completions/mean_terminated_length": 439.5019836425781, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.17173333333333332, "grad_norm": 1.318349838256836, "kl": 0.075439453125, "learning_rate": 1.111111111111111e-06, "loss": 0.0834, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03923429921269417, "mask/share_reasoning": 0.83887779712677, "mask/share_step_conf": 0.11798164248466492, "num_tokens": 38408694.0, "reward": 0.7005940675735474, "reward_std": 0.32924798130989075, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7313761711120605, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.33309316635131836, "step": 161 }, { "adv/mean_abs_final_conf": 0.6051830649375916, "adv/mean_abs_reasoning": 0.44631415605545044, "adv/mean_abs_step_conf": 0.7811368703842163, "adv/ratio_final_to_reasoning": 1.3559575844204304, "adv/ratio_step_to_reasoning": 1.7501951479378288, "adv/std_final_conf": 0.8399561643600464, "adv/std_reasoning": 0.7205145955085754, "adv/std_step_conf": 0.9366507530212402, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 4.6328125, "calib/ece": 0.24625984251968502, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4763779527559055, "calib/gap": 0.4096654193264362, "calib/mean_conf": 0.5732677165354331, "calib/mu_c": 0.697457627118644, "calib/mu_w": 0.2877922077922078, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.06133858267716536, "calib/std_conf": 0.4535433019603018, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8089772727272728, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.015449353945547029, "calib/step_q_w": 0.7935279187817258, "calib/step_q_w_n": 394.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2862.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 428.17578125, "completions/mean_terminated_length": 429.85491943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.1728, "grad_norm": 1.532256841659546, "kl": 0.07913970947265625, "learning_rate": 1.0833333333333335e-06, "loss": 0.0135, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03943236172199249, "mask/share_reasoning": 0.8404712677001953, "mask/share_step_conf": 0.11619007587432861, "num_tokens": 38622451.0, "reward": 0.74653160572052, "reward_std": 0.31516486406326294, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7350714206695557, "rewards/format_reward_step": 0.9921875, "rewards/step_correlation_reward": 0.421272873878479, "step": 162 }, { "adv/mean_abs_final_conf": 0.5998756885528564, "adv/mean_abs_reasoning": 0.477761447429657, "adv/mean_abs_step_conf": 0.7583494186401367, "adv/ratio_final_to_reasoning": 1.2555966827800162, "adv/ratio_step_to_reasoning": 1.58729722274586, "adv/std_final_conf": 0.8168103098869324, "adv/std_reasoning": 0.7393505573272705, "adv/std_step_conf": 0.9366748332977295, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 4.84765625, "calib/ece": 0.23811999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.392, "calib/gap": 0.46971920289855074, "calib/mean_conf": 0.46116, "calib/mu_c": 0.6715942028985508, "calib/mu_w": 0.201875, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.07363999999999996, "calib/std_conf": 0.4592426966212963, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8115336463223786, "calib/step_q_c_n": 639.0, "calib/step_q_gap": 0.038510390508425085, "calib/step_q_w": 0.7730232558139535, "calib/step_q_w_n": 602.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 495.7265625, "completions/mean_terminated_length": 499.6299133300781, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.17386666666666667, "grad_norm": 1.1021077632904053, "kl": 0.0713958740234375, "learning_rate": 1.0555555555555557e-06, "loss": 0.004, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.037006720900535583, "mask/share_reasoning": 0.8422186374664307, "mask/share_step_conf": 0.11296215653419495, "num_tokens": 38854189.0, "reward": 0.6317862272262573, "reward_std": 0.3233771324157715, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7479175329208374, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2125299572944641, "step": 163 }, { "adv/mean_abs_final_conf": 0.5631370544433594, "adv/mean_abs_reasoning": 0.4589161276817322, "adv/mean_abs_step_conf": 0.7574629187583923, "adv/ratio_final_to_reasoning": 1.227102340656693, "adv/ratio_step_to_reasoning": 1.6505476122288483, "adv/std_final_conf": 0.8071234226226807, "adv/std_reasoning": 0.7573532462120056, "adv/std_step_conf": 0.9368436932563782, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.45703125, "calib/ece": 0.1839682539682539, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3412698412698413, "calib/gap": 0.5387037037037037, "calib/mean_conf": 0.4363492063492064, "calib/mu_c": 0.6672222222222222, "calib/mu_w": 0.1285185185185185, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0244444444444444, "calib/std_conf": 0.44685870636225844, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8247706422018349, "calib/step_q_c_n": 763.0, "calib/step_q_gap": 0.03860344977281294, "calib/step_q_w": 0.786167192429022, "calib/step_q_w_n": 634.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2229.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 560.57421875, "completions/mean_terminated_length": 560.57421875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.17493333333333333, "grad_norm": 1.3901984691619873, "kl": 0.06829071044921875, "learning_rate": 1.0277777777777777e-06, "loss": -0.002, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030896078795194626, "mask/share_reasoning": 0.8671608567237854, "mask/share_step_conf": 0.10194303095340729, "num_tokens": 39103832.0, "reward": 0.6638706922531128, "reward_std": 0.3134278655052185, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7885116934776306, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.22985464334487915, "step": 164 }, { "adv/mean_abs_final_conf": 0.5942582488059998, "adv/mean_abs_reasoning": 0.400866836309433, "adv/mean_abs_step_conf": 0.7760728001594543, "adv/ratio_final_to_reasoning": 1.4824330550190141, "adv/ratio_step_to_reasoning": 1.9359865418260649, "adv/std_final_conf": 0.8310674428939819, "adv/std_reasoning": 0.7011778950691223, "adv/std_step_conf": 0.9366821050643921, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 4.671875, "calib/ece": 0.22168627450980388, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4235294117647059, "calib/gap": 0.4614738461538461, "calib/mean_conf": 0.5170588235294119, "calib/mu_c": 0.75232, "calib/mu_w": 0.29084615384615387, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.12427450980392153, "calib/std_conf": 0.4584479106025796, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8212671232876713, "calib/step_q_c_n": 584.0, "calib/step_q_gap": 0.035907646163488294, "calib/step_q_w": 0.785359477124183, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 497.86328125, "completions/mean_terminated_length": 499.8157043457031, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.176, "grad_norm": 0.9651271104812622, "kl": 0.068145751953125, "learning_rate": 1.0000000000000002e-06, "loss": 0.0753, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034680455923080444, "mask/share_reasoning": 0.855582058429718, "mask/share_step_conf": 0.10583124309778214, "num_tokens": 39336861.0, "reward": 0.6190352439880371, "reward_std": 0.29815220832824707, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7590284943580627, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.18372955918312073, "step": 165 }, { "adv/mean_abs_final_conf": 0.49343326687812805, "adv/mean_abs_reasoning": 0.4054815173149109, "adv/mean_abs_step_conf": 0.7982088327407837, "adv/ratio_final_to_reasoning": 1.2169069262284298, "adv/ratio_step_to_reasoning": 1.9685455406858094, "adv/std_final_conf": 0.7402017712593079, "adv/std_reasoning": 0.6815526485443115, "adv/std_step_conf": 0.9365569353103638, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.16901960784313722, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5215686274509804, "calib/gap": 0.607141356122163, "calib/mean_conf": 0.5831372549019609, "calib/mu_c": 0.7807558139534884, "calib/mu_w": 0.17361445783132531, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03882352941176467, "calib/std_conf": 0.46288731613169276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8241399082568807, "calib/step_q_c_n": 872.0, "calib/step_q_gap": 0.03358020676434337, "calib/step_q_w": 0.7905597014925373, "calib/step_q_w_n": 536.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 525.7265625, "completions/mean_terminated_length": 527.7882690429688, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.17706666666666668, "grad_norm": 1.6964820623397827, "kl": 0.06235504150390625, "learning_rate": 9.722222222222224e-07, "loss": -0.0332, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033445168286561966, "mask/share_reasoning": 0.8463903069496155, "mask/share_step_conf": 0.11625826358795166, "num_tokens": 39577631.0, "reward": 0.7533150315284729, "reward_std": 0.283791184425354, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.8212109208106995, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.3518253564834595, "step": 166 }, { "adv/mean_abs_final_conf": 0.6175565719604492, "adv/mean_abs_reasoning": 0.40720972418785095, "adv/mean_abs_step_conf": 0.7839671969413757, "adv/ratio_final_to_reasoning": 1.5165565439089628, "adv/ratio_step_to_reasoning": 1.9252172784059594, "adv/std_final_conf": 0.8386374115943909, "adv/std_reasoning": 0.681658923625946, "adv/std_step_conf": 0.9366242289543152, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 4.92578125, "calib/ece": 0.2456573705179284, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6095617529880478, "calib/gap": 0.2968678286129265, "calib/mean_conf": 0.7263745019920318, "calib/mu_c": 0.8221764705882352, "calib/mu_w": 0.5253086419753087, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14737051792828698, "calib/std_conf": 0.40052356569935915, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8245610034207526, "calib/step_q_c_n": 877.0, "calib/step_q_gap": -0.0039025382459141333, "calib/step_q_w": 0.8284635416666667, "calib/step_q_w_n": 384.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2277.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 479.3984375, "completions/mean_terminated_length": 481.2784729003906, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.17813333333333334, "grad_norm": 1.481986165046692, "kl": 0.0680389404296875, "learning_rate": 9.444444444444445e-07, "loss": -0.022, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035307954996824265, "mask/share_reasoning": 0.8507791757583618, "mask/share_step_conf": 0.11000661551952362, "num_tokens": 39805965.0, "reward": 0.6165010929107666, "reward_std": 0.3056008219718933, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.730678915977478, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.1734171211719513, "step": 167 }, { "adv/mean_abs_final_conf": 0.6127045154571533, "adv/mean_abs_reasoning": 0.5175468921661377, "adv/mean_abs_step_conf": 0.7768701910972595, "adv/ratio_final_to_reasoning": 1.1838628049581044, "adv/ratio_step_to_reasoning": 1.5010624213117223, "adv/std_final_conf": 0.8142591714859009, "adv/std_reasoning": 0.7576119303703308, "adv/std_step_conf": 0.9366459846496582, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.18359375, "calib/ece": 0.2146586345381526, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5341365461847389, "calib/gap": 0.3924328440875104, "calib/mean_conf": 0.6541767068273092, "calib/mu_c": 0.7991719745222929, "calib/mu_w": 0.40673913043478255, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11915662650602407, "calib/std_conf": 0.4228809977435837, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8306224066390041, "calib/step_q_c_n": 723.0, "calib/step_q_gap": 0.046797903327745805, "calib/step_q_w": 0.7838245033112583, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 566.79296875, "completions/mean_terminated_length": 569.0157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.1792, "grad_norm": 0.8936233520507812, "kl": 0.05992889404296875, "learning_rate": 9.166666666666666e-07, "loss": 0.0427, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03287447243928909, "mask/share_reasoning": 0.8597174882888794, "mask/share_step_conf": 0.1035018116235733, "num_tokens": 40055736.0, "reward": 0.6701132655143738, "reward_std": 0.3157053589820862, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7455191612243652, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.27830106019973755, "step": 168 }, { "adv/mean_abs_final_conf": 0.5711694955825806, "adv/mean_abs_reasoning": 0.3959871828556061, "adv/mean_abs_step_conf": 0.7644123435020447, "adv/ratio_final_to_reasoning": 1.4423938963470277, "adv/ratio_step_to_reasoning": 1.9303966809975823, "adv/std_final_conf": 0.8118226528167725, "adv/std_reasoning": 0.6613566279411316, "adv/std_step_conf": 0.9363489151000977, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 4.4921875, "calib/ece": 0.19490039840637455, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5139442231075697, "calib/gap": 0.43578916827853, "calib/mean_conf": 0.6394422310756972, "calib/mu_c": 0.8304255319148937, "calib/mu_w": 0.3946363636363637, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1362948207171315, "calib/std_conf": 0.4251295650283073, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8396159317211949, "calib/step_q_c_n": 703.0, "calib/step_q_gap": 0.03263606594938273, "calib/step_q_w": 0.8069798657718121, "calib/step_q_w_n": 447.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2160.0, "completions/max_terminated_length": 2160.0, "completions/mean_length": 510.3125, "completions/mean_terminated_length": 510.3125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.18026666666666666, "grad_norm": 1.0554523468017578, "kl": 0.06801605224609375, "learning_rate": 8.88888888888889e-07, "loss": 0.0047, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0337769091129303, "mask/share_reasoning": 0.864791750907898, "mask/share_step_conf": 0.10143131017684937, "num_tokens": 40290560.0, "reward": 0.6543185114860535, "reward_std": 0.2583219110965729, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7663476467132568, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.23603929579257965, "step": 169 }, { "adv/mean_abs_final_conf": 0.6359504461288452, "adv/mean_abs_reasoning": 0.4752800464630127, "adv/mean_abs_step_conf": 0.7920522093772888, "adv/ratio_final_to_reasoning": 1.3380541658786769, "adv/ratio_step_to_reasoning": 1.6664958170907098, "adv/std_final_conf": 0.8587112426757812, "adv/std_reasoning": 0.7392920255661011, "adv/std_step_conf": 0.9366955757141113, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 4.9765625, "calib/ece": 0.16440476190476197, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.49966548501450225, "calib/mean_conf": 0.6411507936507936, "calib/mu_c": 0.8533103448275864, "calib/mu_w": 0.35364485981308413, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.11507936507936513, "calib/std_conf": 0.42726411978415135, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.830982019363762, "calib/step_q_c_n": 723.0, "calib/step_q_gap": 0.027602709018934468, "calib/step_q_w": 0.8033793103448276, "calib/step_q_w_n": 551.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2084.0, "completions/max_terminated_length": 2084.0, "completions/mean_length": 511.77734375, "completions/mean_terminated_length": 513.7843627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.18133333333333335, "grad_norm": 0.9533416032791138, "kl": 0.07131195068359375, "learning_rate": 8.611111111111112e-07, "loss": -0.0407, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03206608444452286, "mask/share_reasoning": 0.857288122177124, "mask/share_step_conf": 0.10673956573009491, "num_tokens": 40525727.0, "reward": 0.6545491218566895, "reward_std": 0.35225439071655273, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7924433350563049, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.2080610692501068, "step": 170 }, { "adv/mean_abs_final_conf": 0.6436096429824829, "adv/mean_abs_reasoning": 0.5360812544822693, "adv/mean_abs_step_conf": 0.7808128595352173, "adv/ratio_final_to_reasoning": 1.200582258008744, "adv/ratio_step_to_reasoning": 1.4565196096798838, "adv/std_final_conf": 0.8436787724494934, "adv/std_reasoning": 0.7928290367126465, "adv/std_step_conf": 0.9368202090263367, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 4.73828125, "calib/ece": 0.3090836653386454, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5338645418326693, "calib/gap": 0.3343702046035805, "calib/mean_conf": 0.6366533864541833, "calib/mu_c": 0.8178260869565217, "calib/mu_w": 0.4834558823529412, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.24378486055776888, "calib/std_conf": 0.44094514951291863, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8206283662477559, "calib/step_q_c_n": 557.0, "calib/step_q_gap": 0.046832634540438844, "calib/step_q_w": 0.7737957317073171, "calib/step_q_w_n": 656.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 483.97265625, "completions/mean_terminated_length": 485.87060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.1824, "grad_norm": 1.2724828720092773, "kl": 0.062896728515625, "learning_rate": 8.333333333333333e-07, "loss": -0.0012, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034824684262275696, "mask/share_reasoning": 0.8547459840774536, "mask/share_step_conf": 0.1065230667591095, "num_tokens": 40756520.0, "reward": 0.5631700754165649, "reward_std": 0.36437439918518066, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6779695153236389, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.16243301331996918, "step": 171 }, { "adv/mean_abs_final_conf": 0.5658808946609497, "adv/mean_abs_reasoning": 0.41651487350463867, "adv/mean_abs_step_conf": 0.7689204216003418, "adv/ratio_final_to_reasoning": 1.358609093354856, "adv/ratio_step_to_reasoning": 1.84608154597336, "adv/std_final_conf": 0.7835713624954224, "adv/std_reasoning": 0.6816684603691101, "adv/std_step_conf": 0.9365073442459106, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 4.88671875, "calib/ece": 0.20811999999999992, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.664, "calib/gap": 0.31057040998217467, "calib/mean_conf": 0.7898000000000001, "calib/mu_c": 0.8953939393939394, "calib/mu_w": 0.5848235294117647, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1689599999999999, "calib/std_conf": 0.34871243166827304, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8172129032258063, "calib/step_q_c_n": 775.0, "calib/step_q_gap": 0.013494415830848205, "calib/step_q_w": 0.8037184873949581, "calib/step_q_w_n": 476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 468.765625, "completions/mean_terminated_length": 468.765625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.18346666666666667, "grad_norm": 0.916139543056488, "kl": 0.0650482177734375, "learning_rate": 8.055555555555557e-07, "loss": 0.0749, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03676745295524597, "mask/share_reasoning": 0.8478963375091553, "mask/share_step_conf": 0.11533622443675995, "num_tokens": 40979876.0, "reward": 0.700234591960907, "reward_std": 0.33453425765037537, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7544296383857727, "rewards/format_reward_step": 0.97265625, "rewards/step_correlation_reward": 0.3226020336151123, "step": 172 }, { "adv/mean_abs_final_conf": 0.6322296857833862, "adv/mean_abs_reasoning": 0.5364935994148254, "adv/mean_abs_step_conf": 0.7635011672973633, "adv/ratio_final_to_reasoning": 1.178447769876442, "adv/ratio_step_to_reasoning": 1.4231319220399719, "adv/std_final_conf": 0.8339788913726807, "adv/std_reasoning": 0.7753890752792358, "adv/std_step_conf": 0.9367470741271973, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.4765625, "calib/ece": 0.25212851405622494, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7349397590361446, "calib/gap": 0.29304026845637565, "calib/mean_conf": 0.8283534136546186, "calib/mu_c": 0.9460402684563757, "calib/mu_w": 0.653, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.24104417670682732, "calib/std_conf": 0.32677089292691075, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8198890258939582, "calib/step_q_c_n": 811.0, "calib/step_q_gap": 0.017080227247596058, "calib/step_q_w": 0.8028087986463621, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 528.03125, "completions/mean_terminated_length": 530.1019897460938, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.18453333333333333, "grad_norm": 0.8001253604888916, "kl": 0.06053924560546875, "learning_rate": 7.777777777777779e-07, "loss": -0.0226, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.035726092755794525, "mask/share_reasoning": 0.8430019617080688, "mask/share_step_conf": 0.11736571043729782, "num_tokens": 41218212.0, "reward": 0.6385993361473083, "reward_std": 0.3808228373527527, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7167023420333862, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.25033998489379883, "step": 173 }, { "adv/mean_abs_final_conf": 0.7430212497711182, "adv/mean_abs_reasoning": 0.6782289147377014, "adv/mean_abs_step_conf": 0.7806899547576904, "adv/ratio_final_to_reasoning": 1.0955316613985333, "adv/ratio_step_to_reasoning": 1.1510714712887387, "adv/std_final_conf": 0.9063425064086914, "adv/std_reasoning": 0.8591742515563965, "adv/std_step_conf": 0.9367698431015015, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 5.29296875, "calib/ece": 0.3318930041152264, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5596707818930041, "calib/gap": 0.22395813575396295, "calib/mean_conf": 0.6604526748971192, "calib/mu_c": 0.7728925619834711, "calib/mu_w": 0.5489344262295082, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.24720164609053502, "calib/std_conf": 0.4274599598882447, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.817245696400626, "calib/step_q_c_n": 639.0, "calib/step_q_gap": 0.012832288579397, "calib/step_q_w": 0.804413407821229, "calib/step_q_w_n": 716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2670.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 557.79296875, "completions/mean_terminated_length": 559.9804077148438, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.1856, "grad_norm": 1.4570891857147217, "kl": 0.060298919677734375, "learning_rate": 7.5e-07, "loss": -0.0311, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03084418550133705, "mask/share_reasoning": 0.8569861650466919, "mask/share_step_conf": 0.10826341807842255, "num_tokens": 41465239.0, "reward": 0.5176540613174438, "reward_std": 0.39822250604629517, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.6196972727775574, "rewards/format_reward_step": 0.94921875, "rewards/step_correlation_reward": 0.13123580813407898, "step": 174 }, { "adv/mean_abs_final_conf": 0.6206293106079102, "adv/mean_abs_reasoning": 0.44335708022117615, "adv/mean_abs_step_conf": 0.7642242908477783, "adv/ratio_final_to_reasoning": 1.399840756570976, "adv/ratio_step_to_reasoning": 1.7237218597400816, "adv/std_final_conf": 0.8469446301460266, "adv/std_reasoning": 0.7206447720527649, "adv/std_step_conf": 0.9368031024932861, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 5.2421875, "calib/ece": 0.26934959349593496, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5121951219512195, "calib/gap": 0.4066698266522212, "calib/mean_conf": 0.604390243902439, "calib/mu_c": 0.8391346153846155, "calib/mu_w": 0.43246478873239436, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2254878048780488, "calib/std_conf": 0.4459759487872507, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.81841796875, "calib/step_q_c_n": 512.0, "calib/step_q_gap": 0.042393872364457796, "calib/step_q_w": 0.7760240963855422, "calib/step_q_w_n": 830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 542.46875, "completions/mean_terminated_length": 544.5961303710938, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.18666666666666668, "grad_norm": 0.8548311591148376, "kl": 0.0626068115234375, "learning_rate": 7.222222222222222e-07, "loss": -0.0291, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03319551423192024, "mask/share_reasoning": 0.8527624011039734, "mask/share_step_conf": 0.11013580858707428, "num_tokens": 41709935.0, "reward": 0.551446259021759, "reward_std": 0.3598180413246155, "rewards/accuracy_reward_step": 0.40625, "rewards/final_brier_reward_step": 0.6943405866622925, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.135114386677742, "step": 175 }, { "adv/mean_abs_final_conf": 0.6044785976409912, "adv/mean_abs_reasoning": 0.4800613522529602, "adv/mean_abs_step_conf": 0.7739962339401245, "adv/ratio_final_to_reasoning": 1.2591694682442827, "adv/ratio_step_to_reasoning": 1.6122860761602826, "adv/std_final_conf": 0.8305591344833374, "adv/std_reasoning": 0.7394341826438904, "adv/std_step_conf": 0.9366341233253479, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 5.21875, "calib/ece": 0.20890243902439026, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.556910569105691, "calib/gap": 0.4423638814016173, "calib/mean_conf": 0.6599593495934958, "calib/mu_c": 0.8505714285714286, "calib/mu_w": 0.40820754716981134, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.14987804878048783, "calib/std_conf": 0.42447241090068166, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8125276243093923, "calib/step_q_c_n": 724.0, "calib/step_q_gap": 0.038148539342072074, "calib/step_q_w": 0.7743790849673202, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2323.0, "completions/max_terminated_length": 2323.0, "completions/mean_length": 518.1171875, "completions/mean_terminated_length": 524.2608642578125, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.18773333333333334, "grad_norm": 1.0764228105545044, "kl": 0.060382843017578125, "learning_rate": 6.944444444444446e-07, "loss": -0.046, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.036576274782419205, "mask/share_reasoning": 0.832595944404602, "mask/share_step_conf": 0.11910906434059143, "num_tokens": 41946637.0, "reward": 0.6765834093093872, "reward_std": 0.31728595495224, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7453163862228394, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.30785030126571655, "step": 176 }, { "adv/mean_abs_final_conf": 0.575515627861023, "adv/mean_abs_reasoning": 0.4532824158668518, "adv/mean_abs_step_conf": 0.7793318033218384, "adv/ratio_final_to_reasoning": 1.2696623732037207, "adv/ratio_step_to_reasoning": 1.7193073810980593, "adv/std_final_conf": 0.8180306553840637, "adv/std_reasoning": 0.7392659783363342, "adv/std_step_conf": 0.9365831613540649, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.08203125, "calib/ece": 0.2024696356275304, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5465587044534413, "calib/gap": 0.41349404191206696, "calib/mean_conf": 0.6741295546558704, "calib/mu_c": 0.8381879194630873, "calib/mu_w": 0.4246938775510204, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.13668016194331983, "calib/std_conf": 0.4120954050844349, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8214230271668823, "calib/step_q_c_n": 773.0, "calib/step_q_gap": 0.031025299894154923, "calib/step_q_w": 0.7903977272727274, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2610.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 521.71875, "completions/mean_terminated_length": 523.7647094726562, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.1888, "grad_norm": 1.281508445739746, "kl": 0.059566497802734375, "learning_rate": 6.666666666666667e-07, "loss": 0.0594, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03327722102403641, "mask/share_reasoning": 0.8508939146995544, "mask/share_step_conf": 0.11192260682582855, "num_tokens": 42184029.0, "reward": 0.683879554271698, "reward_std": 0.34568291902542114, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7522894144058228, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.3068757951259613, "step": 177 }, { "adv/mean_abs_final_conf": 0.5984201431274414, "adv/mean_abs_reasoning": 0.49371129274368286, "adv/mean_abs_step_conf": 0.7932643890380859, "adv/ratio_final_to_reasoning": 1.212085184039166, "adv/ratio_step_to_reasoning": 1.606737380118871, "adv/std_final_conf": 0.7960706949234009, "adv/std_reasoning": 0.7206971645355225, "adv/std_step_conf": 0.9362597465515137, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 4.90234375, "calib/ece": 0.15505882352941175, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5725490196078431, "calib/gap": 0.4963874244567314, "calib/mean_conf": 0.6763137254901961, "calib/mu_c": 0.872922077922078, "calib/mu_w": 0.3765346534653466, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11372549019607839, "calib/std_conf": 0.41487127652576855, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8233197278911563, "calib/step_q_c_n": 735.0, "calib/step_q_gap": 0.03858895866038703, "calib/step_q_w": 0.7847307692307692, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 469.69921875, "completions/mean_terminated_length": 471.54119873046875, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.18986666666666666, "grad_norm": 1.0725016593933105, "kl": 0.06549835205078125, "learning_rate": 6.388888888888889e-07, "loss": 0.008, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.036850735545158386, "mask/share_reasoning": 0.8465697765350342, "mask/share_step_conf": 0.11267320811748505, "num_tokens": 42410344.0, "reward": 0.7041335701942444, "reward_std": 0.34978020191192627, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.8177062273025513, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.27102965116500854, "step": 178 }, { "adv/mean_abs_final_conf": 0.5919932126998901, "adv/mean_abs_reasoning": 0.4286684989929199, "adv/mean_abs_step_conf": 0.7745324373245239, "adv/ratio_final_to_reasoning": 1.3810047019799039, "adv/ratio_step_to_reasoning": 1.8068331103035318, "adv/std_final_conf": 0.8123120665550232, "adv/std_reasoning": 0.6817300915718079, "adv/std_step_conf": 0.9364674687385559, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 5.41796875, "calib/ece": 0.1706827309236948, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.570281124497992, "calib/gap": 0.49522882181110023, "calib/mean_conf": 0.6711646586345382, "calib/mu_c": 0.8521518987341772, "calib/mu_w": 0.35692307692307695, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10365461847389558, "calib/std_conf": 0.41613136175603455, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8231461988304094, "calib/step_q_c_n": 855.0, "calib/step_q_gap": 0.07295822890559733, "calib/step_q_w": 0.7501879699248121, "calib/step_q_w_n": 532.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 519.7734375, "completions/mean_terminated_length": 523.8661499023438, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.19093333333333334, "grad_norm": 1.594141960144043, "kl": 0.0595550537109375, "learning_rate": 6.111111111111112e-07, "loss": 0.0343, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03363651782274246, "mask/share_reasoning": 0.8469129204750061, "mask/share_step_conf": 0.11163808405399323, "num_tokens": 42649670.0, "reward": 0.7076936960220337, "reward_std": 0.29267340898513794, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7968624830245972, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.30133724212646484, "step": 179 }, { "adv/mean_abs_final_conf": 0.5924807190895081, "adv/mean_abs_reasoning": 0.3779870569705963, "adv/mean_abs_step_conf": 0.7664633393287659, "adv/ratio_final_to_reasoning": 1.5674629809760847, "adv/ratio_step_to_reasoning": 2.0277502237024723, "adv/std_final_conf": 0.8204371929168701, "adv/std_reasoning": 0.6815129518508911, "adv/std_step_conf": 0.9363333582878113, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.7265625, "calib/ece": 0.24959677419354842, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.657258064516129, "calib/gap": 0.26154863355862457, "calib/mean_conf": 0.7415322580645162, "calib/mu_c": 0.828012048192771, "calib/mu_w": 0.5664634146341464, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.16088709677419358, "calib/std_conf": 0.3865293135710924, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8110019455252918, "calib/step_q_c_n": 1028.0, "calib/step_q_gap": 0.01586495922392195, "calib/step_q_w": 0.7951369863013699, "calib/step_q_w_n": 438.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2669.0, "completions/max_terminated_length": 2669.0, "completions/mean_length": 580.7578125, "completions/mean_terminated_length": 587.644287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.192, "grad_norm": 1.2956640720367432, "kl": 0.058837890625, "learning_rate": 5.833333333333334e-07, "loss": -0.0525, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.030962135642766953, "mask/share_reasoning": 0.8441920876502991, "mask/share_step_conf": 0.11312699317932129, "num_tokens": 42902200.0, "reward": 0.7091296911239624, "reward_std": 0.28258341550827026, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7128117084503174, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.3827913999557495, "step": 180 }, { "adv/mean_abs_final_conf": 0.5850663185119629, "adv/mean_abs_reasoning": 0.32236939668655396, "adv/mean_abs_step_conf": 0.7583494186401367, "adv/ratio_final_to_reasoning": 1.814894107584394, "adv/ratio_step_to_reasoning": 2.3524237301517013, "adv/std_final_conf": 0.8155914545059204, "adv/std_reasoning": 0.6185604929924011, "adv/std_step_conf": 0.9355369210243225, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 4.5859375, "calib/ece": 0.17633466135458176, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.50199203187251, "calib/gap": 0.46306786350904006, "calib/mean_conf": 0.6708366533864542, "calib/mu_c": 0.8903787878787879, "calib/mu_w": 0.4273109243697478, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16063745019920325, "calib/std_conf": 0.3996209573744399, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8234738372093022, "calib/step_q_c_n": 688.0, "calib/step_q_gap": 0.017712520336874205, "calib/step_q_w": 0.805761316872428, "calib/step_q_w_n": 486.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 476.51171875, "completions/mean_terminated_length": 476.51171875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.19306666666666666, "grad_norm": 1.5248737335205078, "kl": 0.07073211669921875, "learning_rate": 5.555555555555555e-07, "loss": 0.0472, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036633823066949844, "mask/share_reasoning": 0.8504502177238464, "mask/share_step_conf": 0.1129159927368164, "num_tokens": 43130451.0, "reward": 0.6563092470169067, "reward_std": 0.3206620216369629, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7852367162704468, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.22816309332847595, "step": 181 }, { "adv/mean_abs_final_conf": 0.562563419342041, "adv/mean_abs_reasoning": 0.3157555162906647, "adv/mean_abs_step_conf": 0.772614598274231, "adv/ratio_final_to_reasoning": 1.7816424110360767, "adv/ratio_step_to_reasoning": 2.4468760113853736, "adv/std_final_conf": 0.7761539220809937, "adv/std_reasoning": 0.6184600591659546, "adv/std_step_conf": 0.9360139966011047, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.21226190476190473, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7023809523809523, "calib/gap": 0.3129258098223615, "calib/mean_conf": 0.8140873015873016, "calib/mu_c": 0.9221212121212121, "calib/mu_w": 0.6091954022988506, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1857936507936508, "calib/std_conf": 0.3285567893180786, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8237206572769953, "calib/step_q_c_n": 852.0, "calib/step_q_gap": 0.0330500458371531, "calib/step_q_w": 0.7906706114398422, "calib/step_q_w_n": 507.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2958.0, "completions/max_terminated_length": 2958.0, "completions/mean_length": 508.25390625, "completions/mean_terminated_length": 508.25390625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.19413333333333332, "grad_norm": 0.9305014610290527, "kl": 0.0578460693359375, "learning_rate": 5.277777777777779e-07, "loss": -0.0028, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03418903797864914, "mask/share_reasoning": 0.8501096963882446, "mask/share_step_conf": 0.11570125818252563, "num_tokens": 43366724.0, "reward": 0.7293093800544739, "reward_std": 0.2843572497367859, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7659671306610107, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.36687028408050537, "step": 182 }, { "adv/mean_abs_final_conf": 0.6345954537391663, "adv/mean_abs_reasoning": 0.5462435483932495, "adv/mean_abs_step_conf": 0.7914299368858337, "adv/ratio_final_to_reasoning": 1.1617445288018502, "adv/ratio_step_to_reasoning": 1.448859101794042, "adv/std_final_conf": 0.8290164470672607, "adv/std_reasoning": 0.7929679751396179, "adv/std_step_conf": 0.936626672744751, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 4.8984375, "calib/ece": 0.24947791164658645, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5622489959839357, "calib/gap": 0.3095480706344016, "calib/mean_conf": 0.6863453815261045, "calib/mu_c": 0.8230935251798561, "calib/mu_w": 0.5135454545454545, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.18879518072289167, "calib/std_conf": 0.3966930282009605, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.8219440353460973, "calib/step_q_c_n": 679.0, "calib/step_q_gap": 0.042092730998271266, "calib/step_q_w": 0.779851304347826, "calib/step_q_w_n": 575.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2768.0, "completions/max_terminated_length": 2768.0, "completions/mean_length": 542.26953125, "completions/mean_terminated_length": 542.26953125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.1952, "grad_norm": 1.066447138786316, "kl": 0.056621551513671875, "learning_rate": 5.000000000000001e-07, "loss": 0.0293, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03397630155086517, "mask/share_reasoning": 0.8612287044525146, "mask/share_step_conf": 0.10479501634836197, "num_tokens": 43612225.0, "reward": 0.6343309879302979, "reward_std": 0.33117324113845825, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7083578109741211, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.25796037912368774, "step": 183 }, { "adv/mean_abs_final_conf": 0.6710717678070068, "adv/mean_abs_reasoning": 0.5068272352218628, "adv/mean_abs_step_conf": 0.7556173801422119, "adv/ratio_final_to_reasoning": 1.324064140936006, "adv/ratio_step_to_reasoning": 1.4908776159423272, "adv/std_final_conf": 0.8614399433135986, "adv/std_reasoning": 0.7576295733451843, "adv/std_step_conf": 0.9365668892860413, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 5.8828125, "calib/ece": 0.208152610441767, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6907630522088354, "calib/gap": 0.3330428432327166, "calib/mean_conf": 0.7959437751004016, "calib/mu_c": 0.9176582278481011, "calib/mu_w": 0.5846153846153845, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1847791164658634, "calib/std_conf": 0.3502208323000122, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8196493506493506, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.09223087238848104, "calib/step_q_w": 0.7274184782608696, "calib/step_q_w_n": 736.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2841.0, "completions/max_terminated_length": 2841.0, "completions/mean_length": 515.59375, "completions/mean_terminated_length": 521.70751953125, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.19626666666666667, "grad_norm": 1.1740838289260864, "kl": 0.061229705810546875, "learning_rate": 4.7222222222222226e-07, "loss": -0.0545, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03360723331570625, "mask/share_reasoning": 0.8402235507965088, "mask/share_step_conf": 0.11445043981075287, "num_tokens": 43849497.0, "reward": 0.6816393136978149, "reward_std": 0.3882199227809906, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7494492530822754, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.29664188623428345, "step": 184 }, { "adv/mean_abs_final_conf": 0.5479968190193176, "adv/mean_abs_reasoning": 0.44736379384994507, "adv/mean_abs_step_conf": 0.7752357721328735, "adv/ratio_final_to_reasoning": 1.2249467358619703, "adv/ratio_step_to_reasoning": 1.732897884876449, "adv/std_final_conf": 0.7786417007446289, "adv/std_reasoning": 0.739273190498352, "adv/std_step_conf": 0.9366080164909363, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.7421875, "calib/ece": 0.22700404858299597, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6680161943319838, "calib/gap": 0.3598577319587628, "calib/mean_conf": 0.7632793522267205, "calib/mu_c": 0.9046, "calib/mu_w": 0.5447422680412372, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19149797570850202, "calib/std_conf": 0.3787334728008911, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8087356321839081, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.0324037544546506, "calib/step_q_w": 0.7763318777292575, "calib/step_q_w_n": 687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 546.95703125, "completions/mean_terminated_length": 546.95703125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.19733333333333333, "grad_norm": 0.9003777503967285, "kl": 0.05350494384765625, "learning_rate": 4.444444444444445e-07, "loss": 0.1472, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03338668495416641, "mask/share_reasoning": 0.8522862195968628, "mask/share_step_conf": 0.11432710289955139, "num_tokens": 44096438.0, "reward": 0.6480090618133545, "reward_std": 0.3556399345397949, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7384746074676514, "rewards/format_reward_step": 0.96484375, "rewards/step_correlation_reward": 0.2466059923171997, "step": 185 }, { "adv/mean_abs_final_conf": 0.5788049101829529, "adv/mean_abs_reasoning": 0.4567175507545471, "adv/mean_abs_step_conf": 0.7655304670333862, "adv/ratio_final_to_reasoning": 1.267314797136007, "adv/ratio_step_to_reasoning": 1.6761573225479216, "adv/std_final_conf": 0.8138347268104553, "adv/std_reasoning": 0.739247739315033, "adv/std_step_conf": 0.9366657137870789, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.20313492063492072, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6785714285714286, "calib/gap": 0.44572192513369, "calib/mean_conf": 0.7416269841269842, "calib/mu_c": 0.916732026143791, "calib/mu_w": 0.471010101010101, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1688095238095239, "calib/std_conf": 0.39794129410953466, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8141927409261577, "calib/step_q_c_n": 799.0, "calib/step_q_gap": 0.0210420559946507, "calib/step_q_w": 0.793150684931507, "calib/step_q_w_n": 584.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 520.62109375, "completions/mean_terminated_length": 520.62109375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.1984, "grad_norm": 1.04643714427948, "kl": 0.062358856201171875, "learning_rate": 4.1666666666666667e-07, "loss": 0.0845, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03430875763297081, "mask/share_reasoning": 0.8483922481536865, "mask/share_step_conf": 0.11729902029037476, "num_tokens": 44334757.0, "reward": 0.6961232423782349, "reward_std": 0.33872687816619873, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7852003574371338, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.29063987731933594, "step": 186 }, { "adv/mean_abs_final_conf": 0.6189491748809814, "adv/mean_abs_reasoning": 0.5102524757385254, "adv/mean_abs_step_conf": 0.7715613842010498, "adv/ratio_final_to_reasoning": 1.213025324345818, "adv/ratio_step_to_reasoning": 1.51211688504659, "adv/std_final_conf": 0.7987761497497559, "adv/std_reasoning": 0.7576136589050293, "adv/std_step_conf": 0.9364960193634033, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 5.9296875, "calib/ece": 0.25548387096774183, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6451612903225806, "calib/gap": 0.2761765480895917, "calib/mean_conf": 0.7639516129032259, "calib/mu_c": 0.8864492753623189, "calib/mu_w": 0.6102727272727272, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.23149193548387087, "calib/std_conf": 0.36010598473134825, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8084885290148447, "calib/step_q_c_n": 741.0, "calib/step_q_gap": 0.05521954574586152, "calib/step_q_w": 0.7532689832689832, "calib/step_q_w_n": 777.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 569.71875, "completions/mean_terminated_length": 574.2047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.19946666666666665, "grad_norm": 1.0760246515274048, "kl": 0.055908203125, "learning_rate": 3.8888888888888895e-07, "loss": -0.089, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03105928562581539, "mask/share_reasoning": 0.8503777980804443, "mask/share_step_conf": 0.11075045168399811, "num_tokens": 44582149.0, "reward": 0.6254994869232178, "reward_std": 0.3427451252937317, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6943827867507935, "rewards/format_reward_step": 0.96875, "rewards/step_correlation_reward": 0.25505363941192627, "step": 187 }, { "adv/mean_abs_final_conf": 0.652023434638977, "adv/mean_abs_reasoning": 0.48183199763298035, "adv/mean_abs_step_conf": 0.7877976894378662, "adv/ratio_final_to_reasoning": 1.3532173824944569, "adv/ratio_step_to_reasoning": 1.6350049255922292, "adv/std_final_conf": 0.8597961068153381, "adv/std_reasoning": 0.757516622543335, "adv/std_step_conf": 0.9366502165794373, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 5.48828125, "calib/ece": 0.2447560975609756, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6951219512195121, "calib/gap": 0.34118356164383556, "calib/mean_conf": 0.7906910569105691, "calib/mu_c": 0.9293835616438356, "calib/mu_w": 0.5882000000000001, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.22097560975609754, "calib/std_conf": 0.3620390678752621, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.8257086614173228, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.034231211961646246, "calib/step_q_w": 0.7914774494556766, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2161.0, "completions/max_terminated_length": 2161.0, "completions/mean_length": 539.09765625, "completions/mean_terminated_length": 549.836669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.20053333333333334, "grad_norm": 1.0041913986206055, "kl": 0.055118560791015625, "learning_rate": 3.611111111111111e-07, "loss": -0.0684, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03301866352558136, "mask/share_reasoning": 0.8338941335678101, "mask/share_step_conf": 0.11355596035718918, "num_tokens": 44824230.0, "reward": 0.6531383395195007, "reward_std": 0.36959201097488403, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7161867022514343, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.2854023873806, "step": 188 }, { "adv/mean_abs_final_conf": 0.682059645652771, "adv/mean_abs_reasoning": 0.4347081184387207, "adv/mean_abs_step_conf": 0.774594783782959, "adv/ratio_final_to_reasoning": 1.5690059990193594, "adv/ratio_step_to_reasoning": 1.7818732867583906, "adv/std_final_conf": 0.8766545057296753, "adv/std_reasoning": 0.7013657689094543, "adv/std_step_conf": 0.936470627784729, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 4.765625, "calib/ece": 0.3255118110236221, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5866141732283464, "calib/gap": 0.23952755905511813, "calib/mean_conf": 0.7081889763779528, "calib/mu_c": 0.8279527559055119, "calib/mu_w": 0.5884251968503937, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.26685039370078745, "calib/std_conf": 0.39350718074236285, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8075263157894736, "calib/step_q_c_n": 570.0, "calib/step_q_gap": 0.021187854251012106, "calib/step_q_w": 0.7863384615384615, "calib/step_q_w_n": 650.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 467.52734375, "completions/mean_terminated_length": 469.3608093261719, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.2016, "grad_norm": 0.9322103261947632, "kl": 0.0639190673828125, "learning_rate": 3.3333333333333335e-07, "loss": -0.0051, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.035988159477710724, "mask/share_reasoning": 0.8513275384902954, "mask/share_step_conf": 0.10877804458141327, "num_tokens": 45051685.0, "reward": 0.5799663662910461, "reward_std": 0.34250661730766296, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6630769371986389, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.20076197385787964, "step": 189 }, { "adv/mean_abs_final_conf": 0.6382098197937012, "adv/mean_abs_reasoning": 0.4994494318962097, "adv/mean_abs_step_conf": 0.7759018540382385, "adv/ratio_final_to_reasoning": 1.2778267008347046, "adv/ratio_step_to_reasoning": 1.553514338964106, "adv/std_final_conf": 0.8606169819831848, "adv/std_reasoning": 0.7575089335441589, "adv/std_step_conf": 0.9367921948432922, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.2109375, "calib/ece": 0.22261904761904755, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5873015873015873, "calib/gap": 0.39445306986591366, "calib/mean_conf": 0.7124603174603176, "calib/mu_c": 0.8830769230769229, "calib/mu_w": 0.4886238532110092, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.18380952380952376, "calib/std_conf": 0.397311773360283, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8043846153846155, "calib/step_q_c_n": 780.0, "calib/step_q_gap": 0.01230880311024729, "calib/step_q_w": 0.7920758122743682, "calib/step_q_w_n": 554.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 551.4765625, "completions/mean_terminated_length": 551.4765625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.20266666666666666, "grad_norm": 1.002307653427124, "kl": 0.0587310791015625, "learning_rate": 3.055555555555556e-07, "loss": 0.0118, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030747095122933388, "mask/share_reasoning": 0.8636980652809143, "mask/share_step_conf": 0.10555484890937805, "num_tokens": 45298471.0, "reward": 0.6581155061721802, "reward_std": 0.38210529088974, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7533788681030273, "rewards/format_reward_step": 0.98046875, "rewards/step_correlation_reward": 0.2550395727157593, "step": 190 }, { "adv/mean_abs_final_conf": 0.6476730704307556, "adv/mean_abs_reasoning": 0.4213106632232666, "adv/mean_abs_step_conf": 0.7631505131721497, "adv/ratio_final_to_reasoning": 1.5372814575251612, "adv/ratio_step_to_reasoning": 1.811372414202891, "adv/std_final_conf": 0.8605149388313293, "adv/std_reasoning": 0.7013201117515564, "adv/std_step_conf": 0.9366151094436646, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.7421875, "calib/ece": 0.29276679841897235, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6640316205533597, "calib/gap": 0.3406681249999999, "calib/mean_conf": 0.7594861660079052, "calib/mu_c": 0.9318399999999999, "calib/mu_w": 0.591171875, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27909090909090917, "calib/std_conf": 0.3732425654376393, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8143399089529592, "calib/step_q_c_n": 659.0, "calib/step_q_gap": 0.0407517461909368, "calib/step_q_w": 0.7735881627620224, "calib/step_q_w_n": 811.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 492.67578125, "completions/mean_terminated_length": 494.6078796386719, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.20373333333333332, "grad_norm": 1.014994502067566, "kl": 0.06876373291015625, "learning_rate": 2.7777777777777776e-07, "loss": 0.0584, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036824602633714676, "mask/share_reasoning": 0.8341710567474365, "mask/share_step_conf": 0.12509804964065552, "num_tokens": 45528764.0, "reward": 0.5843663811683655, "reward_std": 0.3251737952232361, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7022628784179688, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.17115738987922668, "step": 191 }, { "adv/mean_abs_final_conf": 0.6557433605194092, "adv/mean_abs_reasoning": 0.5446396470069885, "adv/mean_abs_step_conf": 0.8026284575462341, "adv/ratio_final_to_reasoning": 1.2039949058482609, "adv/ratio_step_to_reasoning": 1.473687165370712, "adv/std_final_conf": 0.8455314040184021, "adv/std_reasoning": 0.7753980755805969, "adv/std_step_conf": 0.9362693428993225, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 4.98828125, "calib/ece": 0.20356862745098037, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5725490196078431, "calib/gap": 0.4465033783783784, "calib/mean_conf": 0.6737647058823529, "calib/mu_c": 0.868125, "calib/mu_w": 0.42162162162162165, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15631372549019604, "calib/std_conf": 0.42222188146245787, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8037642045454544, "calib/step_q_c_n": 704.0, "calib/step_q_gap": 0.03118130751229553, "calib/step_q_w": 0.7725828970331589, "calib/step_q_w_n": 573.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 505.5859375, "completions/mean_terminated_length": 507.56866455078125, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.2048, "grad_norm": 0.9712722897529602, "kl": 0.0670928955078125, "learning_rate": 2.5000000000000004e-07, "loss": -0.0198, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0350809246301651, "mask/share_reasoning": 0.8480446338653564, "mask/share_step_conf": 0.11296822130680084, "num_tokens": 45763170.0, "reward": 0.6897858381271362, "reward_std": 0.3775734007358551, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7804738283157349, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.2873789668083191, "step": 192 }, { "adv/mean_abs_final_conf": 0.6855747699737549, "adv/mean_abs_reasoning": 0.6207780838012695, "adv/mean_abs_step_conf": 0.7821047306060791, "adv/ratio_final_to_reasoning": 1.1043797902395485, "adv/ratio_step_to_reasoning": 1.2598781287782306, "adv/std_final_conf": 0.8768498301506042, "adv/std_reasoning": 0.8266281485557556, "adv/std_step_conf": 0.9366915822029114, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 5.07421875, "calib/ece": 0.25944664031620557, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6086956521739131, "calib/gap": 0.33153026265702323, "calib/mean_conf": 0.7222924901185771, "calib/mu_c": 0.8677464788732394, "calib/mu_w": 0.5362162162162162, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21023715415019764, "calib/std_conf": 0.3870475573915073, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8110312499999999, "calib/step_q_c_n": 640.0, "calib/step_q_gap": 0.03435446699544742, "calib/step_q_w": 0.7766767830045525, "calib/step_q_w_n": 659.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2074.0, "completions/max_terminated_length": 2074.0, "completions/mean_length": 492.1171875, "completions/mean_terminated_length": 494.0470886230469, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.20586666666666667, "grad_norm": 1.2200297117233276, "kl": 0.0626068115234375, "learning_rate": 2.2222222222222224e-07, "loss": 0.0767, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03413480520248413, "mask/share_reasoning": 0.8514578342437744, "mask/share_step_conf": 0.11050112545490265, "num_tokens": 45994864.0, "reward": 0.6448402404785156, "reward_std": 0.40046432614326477, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7326070070266724, "rewards/format_reward_step": 0.98828125, "rewards/step_correlation_reward": 0.24847961962223053, "step": 193 }, { "adv/mean_abs_final_conf": 0.6273636817932129, "adv/mean_abs_reasoning": 0.5078442096710205, "adv/mean_abs_step_conf": 0.7637410163879395, "adv/ratio_final_to_reasoning": 1.2353467261143267, "adv/ratio_step_to_reasoning": 1.5038884008989448, "adv/std_final_conf": 0.8247886896133423, "adv/std_reasoning": 0.7754206657409668, "adv/std_step_conf": 0.936740517616272, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 5.234375, "calib/ece": 0.24341463414634146, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6382113821138211, "calib/gap": 0.37815767973856196, "calib/mean_conf": 0.7291056910569105, "calib/mu_c": 0.8859027777777777, "calib/mu_w": 0.5077450980392157, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19357723577235772, "calib/std_conf": 0.4016338747132833, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8111596009975062, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.045862203227989395, "calib/step_q_w": 0.7652973977695168, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 528.9453125, "completions/mean_terminated_length": 531.0196533203125, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.20693333333333333, "grad_norm": 1.1031935214996338, "kl": 0.062103271484375, "learning_rate": 1.9444444444444447e-07, "loss": 0.0994, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03461195155978203, "mask/share_reasoning": 0.849972128868103, "mask/share_step_conf": 0.11150962114334106, "num_tokens": 46236218.0, "reward": 0.6224148869514465, "reward_std": 0.37691283226013184, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.721427321434021, "rewards/format_reward_step": 0.953125, "rewards/step_correlation_reward": 0.22027736902236938, "step": 194 }, { "adv/mean_abs_final_conf": 0.657250165939331, "adv/mean_abs_reasoning": 0.5337613821029663, "adv/mean_abs_step_conf": 0.7565616369247437, "adv/ratio_final_to_reasoning": 1.2313557855194233, "adv/ratio_step_to_reasoning": 1.417415463711456, "adv/std_final_conf": 0.8767140507698059, "adv/std_reasoning": 0.8097855448722839, "adv/std_step_conf": 0.936643123626709, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.16796875, "calib/ece": 0.19291497975708505, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.611336032388664, "calib/gap": 0.41616734693877555, "calib/mean_conf": 0.7398785425101214, "calib/mu_c": 0.9083673469387755, "calib/mu_w": 0.49219999999999997, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1688259109311741, "calib/std_conf": 0.38091779403813597, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8145132743362832, "calib/step_q_c_n": 791.0, "calib/step_q_gap": 0.028009514937786917, "calib/step_q_w": 0.7865037593984963, "calib/step_q_w_n": 532.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2451.0, "completions/max_terminated_length": 2451.0, "completions/mean_length": 530.0078125, "completions/mean_terminated_length": 532.0863037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.208, "grad_norm": 1.0037895441055298, "kl": 0.058197021484375, "learning_rate": 1.6666666666666668e-07, "loss": -0.013, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03281273692846298, "mask/share_reasoning": 0.8520941138267517, "mask/share_step_conf": 0.1111869066953659, "num_tokens": 46477884.0, "reward": 0.6687745451927185, "reward_std": 0.34658920764923096, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7655781507492065, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.26493972539901733, "step": 195 }, { "adv/mean_abs_final_conf": 0.5276245474815369, "adv/mean_abs_reasoning": 0.3016800284385681, "adv/mean_abs_step_conf": 0.7574066519737244, "adv/ratio_final_to_reasoning": 1.7489541823912231, "adv/ratio_step_to_reasoning": 2.5106290790739467, "adv/std_final_conf": 0.7639484405517578, "adv/std_reasoning": 0.6183252334594727, "adv/std_step_conf": 0.9365251660346985, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 4.70703125, "calib/ece": 0.2711764705882354, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7647058823529411, "calib/gap": 0.2821241830065362, "calib/mean_conf": 0.837607843137255, "calib/mu_c": 0.9504575163398694, "calib/mu_w": 0.6683333333333332, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2543921568627452, "calib/std_conf": 0.32299481440816297, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8137500000000002, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.020504563894523375, "calib/step_q_w": 0.7932454361054768, "calib/step_q_w_n": 493.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 402.7734375, "completions/mean_terminated_length": 404.35296630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.20906666666666668, "grad_norm": 1.489843487739563, "kl": 0.07180023193359375, "learning_rate": 1.3888888888888888e-07, "loss": 0.0224, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04053875058889389, "mask/share_reasoning": 0.8314760327339172, "mask/share_step_conf": 0.12407892942428589, "num_tokens": 46683538.0, "reward": 0.6767395734786987, "reward_std": 0.30844956636428833, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7317668199539185, "rewards/format_reward_step": 0.99609375, "rewards/step_correlation_reward": 0.30296236276626587, "step": 196 }, { "adv/mean_abs_final_conf": 0.6408740878105164, "adv/mean_abs_reasoning": 0.46309763193130493, "adv/mean_abs_step_conf": 0.7852573394775391, "adv/ratio_final_to_reasoning": 1.3838854781826706, "adv/ratio_step_to_reasoning": 1.6956626105011539, "adv/std_final_conf": 0.8449645042419434, "adv/std_reasoning": 0.7206152677536011, "adv/std_step_conf": 0.936303436756134, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.67578125, "calib/ece": 0.2653968253968255, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.3625909090909092, "calib/mean_conf": 0.768095238095238, "calib/mu_c": 0.9407575757575758, "calib/mu_w": 0.5781666666666666, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25484126984127, "calib/std_conf": 0.3671127465881333, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8096092619392184, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.049307424668877164, "calib/step_q_w": 0.7603018372703413, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 535.0234375, "completions/mean_terminated_length": 537.12158203125, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.21013333333333334, "grad_norm": 1.3064262866973877, "kl": 0.058258056640625, "learning_rate": 1.1111111111111112e-07, "loss": 0.0482, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03409402072429657, "mask/share_reasoning": 0.8432488441467285, "mask/share_step_conf": 0.1187509298324585, "num_tokens": 46925560.0, "reward": 0.6909942030906677, "reward_std": 0.35764315724372864, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7254882454872131, "rewards/format_reward_step": 0.984375, "rewards/step_correlation_reward": 0.3565000593662262, "step": 197 }, { "adv/mean_abs_final_conf": 0.5573863387107849, "adv/mean_abs_reasoning": 0.46284085512161255, "adv/mean_abs_step_conf": 0.7740723490715027, "adv/ratio_final_to_reasoning": 1.2042721219247818, "adv/ratio_step_to_reasoning": 1.6724373842670248, "adv/std_final_conf": 0.7805414795875549, "adv/std_reasoning": 0.7206101417541504, "adv/std_step_conf": 0.9364205002784729, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.55859375, "calib/ece": 0.14584, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.54, "calib/gap": 0.46597498160412076, "calib/mean_conf": 0.67256, "calib/mu_c": 0.8570860927152318, "calib/mu_w": 0.3911111111111111, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1072, "calib/std_conf": 0.39974572718166734, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8100576036866358, "calib/step_q_c_n": 868.0, "calib/step_q_gap": 0.021787333416365517, "calib/step_q_w": 0.7882702702702703, "calib/step_q_w_n": 555.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 494.1015625, "completions/mean_terminated_length": 494.1015625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.2112, "grad_norm": 1.1771690845489502, "kl": 0.06976318359375, "learning_rate": 8.333333333333334e-08, "loss": 0.0553, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03889241814613342, "mask/share_reasoning": 0.825330913066864, "mask/share_step_conf": 0.13577669858932495, "num_tokens": 47157434.0, "reward": 0.7292279005050659, "reward_std": 0.2781729996204376, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.8000258207321167, "rewards/format_reward_step": 0.9765625, "rewards/step_correlation_reward": 0.3451487421989441, "step": 198 }, { "adv/mean_abs_final_conf": 0.6263066530227661, "adv/mean_abs_reasoning": 0.5689499378204346, "adv/mean_abs_step_conf": 0.7660895586013794, "adv/ratio_final_to_reasoning": 1.1008115325964476, "adv/ratio_step_to_reasoning": 1.3464973061358585, "adv/std_final_conf": 0.8343291878700256, "adv/std_reasoning": 0.7928807139396667, "adv/std_step_conf": 0.9363666772842407, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 5.87890625, "calib/ece": 0.3023770491803277, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.7131147540983607, "calib/gap": 0.27100135685210325, "calib/mean_conf": 0.8057377049180326, "calib/mu_c": 0.927910447761194, "calib/mu_w": 0.6569090909090908, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.279467213114754, "calib/std_conf": 0.34837660522793784, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8121082621082621, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.0423199682103792, "calib/step_q_w": 0.769788293897883, "calib/step_q_w_n": 803.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2532.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 550.546875, "completions/mean_terminated_length": 559.2857666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.21226666666666666, "grad_norm": 0.9838593602180481, "kl": 0.058837890625, "learning_rate": 5.555555555555556e-08, "loss": -0.0275, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03533787280321121, "mask/share_reasoning": 0.8256212472915649, "mask/share_step_conf": 0.12341582775115967, "num_tokens": 47402574.0, "reward": 0.6137939095497131, "reward_std": 0.35966432094573975, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6627289056777954, "rewards/format_reward_step": 0.9453125, "rewards/step_correlation_reward": 0.27110886573791504, "step": 199 }, { "adv/mean_abs_final_conf": 0.5365869998931885, "adv/mean_abs_reasoning": 0.39925652742385864, "adv/mean_abs_step_conf": 0.7651621103286743, "adv/ratio_final_to_reasoning": 1.3439655034707474, "adv/ratio_step_to_reasoning": 1.9164673781685304, "adv/std_final_conf": 0.7594389915466309, "adv/std_reasoning": 0.6612989902496338, "adv/std_step_conf": 0.9362245202064514, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 4.84375, "calib/ece": 0.17392712550607284, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6882591093117408, "calib/gap": 0.41600773464227264, "calib/mean_conf": 0.7652631578947368, "calib/mu_c": 0.9016867469879517, "calib/mu_w": 0.48567901234567906, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.13356275303643722, "calib/std_conf": 0.3747666366720727, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.8107654320987655, "calib/step_q_c_n": 810.0, "calib/step_q_gap": 0.04937008326155612, "calib/step_q_w": 0.7613953488372094, "calib/step_q_w_n": 430.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2559.0, "completions/max_terminated_length": 2559.0, "completions/mean_length": 517.73046875, "completions/mean_terminated_length": 517.73046875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.21333333333333335, "grad_norm": 1.1266281604766846, "kl": 0.05748748779296875, "learning_rate": 2.777777777777778e-08, "loss": -0.0016, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03594672679901123, "mask/share_reasoning": 0.8535377979278564, "mask/share_step_conf": 0.11051549017429352, "num_tokens": 47643161.0, "reward": 0.719478189945221, "reward_std": 0.2980797290802002, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7813234329223633, "rewards/format_reward_step": 0.9609375, "rewards/step_correlation_reward": 0.3357579708099365, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 0.02025045383197721, "train_runtime": 8390.0133, "train_samples_per_second": 6.102, "train_steps_per_second": 0.024 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 47643161, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }