{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7490277290344238, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5698086470140988, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9343300461769104, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04298506677150726, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.0135, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 0.8933746814727783, "reward_std": 0.19672557711601257, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420004606246948, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.7698483467102051, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.5081617139634353, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9345317482948303, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.04043004661798477, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0158, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 0.8337589502334595, "reward_std": 0.1928534209728241, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291916012763977, "step": 2 }, { "adv/mean_abs_final_conf": 0.7821295261383057, "adv/mean_abs_reasoning": 0.46012091636657715, "adv/mean_abs_step_conf": 0.7513811588287354, "adv/ratio_final_to_reasoning": 1.6998347571645387, "adv/ratio_step_to_reasoning": 1.633008046585111, "adv/std_final_conf": 0.9285802841186523, "adv/std_reasoning": 0.7206122875213623, "adv/std_step_conf": 0.9327278137207031, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.509598183569561, "calib/avg_num_step_conf": 4.88671875, "calib/ece": 0.22188235294117642, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2627450980392157, "calib/gap": 0.0024157148754645474, "calib/mean_conf": 0.8810196078431374, "calib/mu_c": 0.8818343195266273, "calib/mu_w": 0.8794186046511627, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.220078431372549, "calib/std_conf": 0.04377907558034709, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7875064599483204, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.019602896007020698, "calib/step_q_w": 0.7679035639412997, "calib/step_q_w_n": 477.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 495.42578125, "completions/mean_terminated_length": 495.42578125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.0032, "grad_norm": 0.04484294354915619, "kl": 0.0013908743858337402, "learning_rate": 7.5e-07, "loss": 0.0599, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033238910138607025, "mask/share_reasoning": 0.8569374084472656, "mask/share_step_conf": 0.10982376337051392, "num_tokens": 690746.0, "reward": 0.9005277156829834, "reward_std": 0.18030250072479248, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7222340106964111, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7491339445114136, "step": 3 }, { "adv/mean_abs_final_conf": 0.7650107145309448, "adv/mean_abs_reasoning": 0.44886037707328796, "adv/mean_abs_step_conf": 0.7658126354217529, "adv/ratio_final_to_reasoning": 1.7043400433761995, "adv/ratio_step_to_reasoning": 1.706126614282807, "adv/std_final_conf": 0.9289597868919373, "adv/std_reasoning": 0.7205116152763367, "adv/std_step_conf": 0.9340852499008179, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5186476242046838, "calib/avg_num_step_conf": 4.87109375, "calib/ece": 0.222549019607843, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.22745098039215686, "calib/gap": 0.0038682821172331128, "calib/mean_conf": 0.873529411764706, "calib/mu_c": 0.8748795180722891, "calib/mu_w": 0.871011235955056, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.222549019607843, "calib/std_conf": 0.05047319340595715, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7892537313432837, "calib/step_q_c_n": 804.0, "calib/step_q_gap": 0.0006307065125839006, "calib/step_q_w": 0.7886230248306998, "calib/step_q_w_n": 443.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 498.46484375, "completions/mean_terminated_length": 500.4196472167969, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.004266666666666667, "grad_norm": 0.04768529161810875, "kl": 0.00032705068588256836, "learning_rate": 1.0000000000000002e-06, "loss": 0.0365, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03280118107795715, "mask/share_reasoning": 0.8532472252845764, "mask/share_step_conf": 0.11004534363746643, "num_tokens": 924521.0, "reward": 0.8905909657478333, "reward_std": 0.18536730110645294, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7146550416946411, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7391830086708069, "step": 4 }, { "adv/mean_abs_final_conf": 0.762508749961853, "adv/mean_abs_reasoning": 0.3974190950393677, "adv/mean_abs_step_conf": 0.7614568471908569, "adv/ratio_final_to_reasoning": 1.918651517955674, "adv/ratio_step_to_reasoning": 1.9160046829542206, "adv/std_final_conf": 0.9297828674316406, "adv/std_reasoning": 0.6815359592437744, "adv/std_step_conf": 0.9332932233810425, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.4592454587796926, "calib/avg_num_step_conf": 4.765625, "calib/ece": 0.33922764227642277, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.2967479674796748, "calib/gap": -0.005463437354447986, "calib/mean_conf": 0.8798780487804878, "calib/mu_c": 0.8773684210526316, "calib/mu_w": 0.8828318584070796, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.33922764227642277, "calib/std_conf": 0.045989409414631144, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.8018296529968455, "calib/step_q_c_n": 634.0, "calib/step_q_gap": 0.010174362894456501, "calib/step_q_w": 0.791655290102389, "calib/step_q_w_n": 586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2953.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 525.87109375, "completions/mean_terminated_length": 527.933349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.005333333333333333, "grad_norm": 0.04267508536577225, "kl": 0.0002726316452026367, "learning_rate": 1.25e-06, "loss": 0.0323, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03395113721489906, "mask/share_reasoning": 0.851130485534668, "mask/share_step_conf": 0.11101210117340088, "num_tokens": 1165832.0, "reward": 0.7854695320129395, "reward_std": 0.16234935820102692, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6045964956283569, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6725925207138062, "step": 5 }, { "adv/mean_abs_final_conf": 0.7647924423217773, "adv/mean_abs_reasoning": 0.4423978328704834, "adv/mean_abs_step_conf": 0.75323486328125, "adv/ratio_final_to_reasoning": 1.7287436454185308, "adv/ratio_step_to_reasoning": 1.7026187908605046, "adv/std_final_conf": 0.9318432211875916, "adv/std_reasoning": 0.720503568649292, "adv/std_step_conf": 0.9338743090629578, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5081408557364634, "calib/avg_num_step_conf": 5.06640625, "calib/ece": 0.33411067193675875, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.308300395256917, "calib/gap": 1.640792629054033e-05, "calib/mean_conf": 0.8835177865612648, "calib/mu_c": 0.8835251798561151, "calib/mu_w": 0.8835087719298246, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.33411067193675875, "calib/std_conf": 0.04051882741810088, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.795828488372093, "calib/step_q_c_n": 688.0, "calib/step_q_gap": -0.008621429526100788, "calib/step_q_w": 0.8044499178981938, "calib/step_q_w_n": 609.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2746.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 433.66796875, "completions/mean_terminated_length": 437.0826721191406, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.0064, "grad_norm": 0.039841946214437485, "kl": 0.0005662441253662109, "learning_rate": 1.5e-06, "loss": -0.0496, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03768611699342728, "mask/share_reasoning": 0.8277445435523987, "mask/share_step_conf": 0.12675684690475464, "num_tokens": 1382803.0, "reward": 0.8113628625869751, "reward_std": 0.18264266848564148, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6277871131896973, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.6894698143005371, "step": 6 }, { "adv/mean_abs_final_conf": 0.8003997206687927, "adv/mean_abs_reasoning": 0.5040899515151978, "adv/mean_abs_step_conf": 0.7537604570388794, "adv/ratio_final_to_reasoning": 1.587811298882163, "adv/ratio_step_to_reasoning": 1.4952895902273393, "adv/std_final_conf": 0.929995596408844, "adv/std_reasoning": 0.7393463253974915, "adv/std_step_conf": 0.9334385395050049, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4976365378473331, "calib/avg_num_step_conf": 5.20703125, "calib/ece": 0.277109375, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.34375, "calib/gap": 0.004912168636218572, "calib/mean_conf": 0.882578125, "calib/mu_c": 0.8845161290322582, "calib/mu_w": 0.8796039603960396, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.277109375, "calib/std_conf": 0.05675806789773922, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7917788461538462, "calib/step_q_c_n": 832.0, "calib/step_q_gap": 0.001998407032089644, "calib/step_q_w": 0.7897804391217566, "calib/step_q_w_n": 501.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 525.24609375, "completions/mean_terminated_length": 527.305908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.007466666666666667, "grad_norm": 0.04400845989584923, "kl": 0.0003158748149871826, "learning_rate": 1.75e-06, "loss": 0.0579, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.030677923932671547, "mask/share_reasoning": 0.8573629856109619, "mask/share_step_conf": 0.10805287212133408, "num_tokens": 1624690.0, "reward": 0.876061737537384, "reward_std": 0.19940611720085144, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6821796894073486, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7496312260627747, "step": 7 }, { "adv/mean_abs_final_conf": 0.7505788207054138, "adv/mean_abs_reasoning": 0.4054693579673767, "adv/mean_abs_step_conf": 0.7549181580543518, "adv/ratio_final_to_reasoning": 1.851135741719363, "adv/ratio_step_to_reasoning": 1.86183775227496, "adv/std_final_conf": 0.9302457571029663, "adv/std_reasoning": 0.7013283967971802, "adv/std_step_conf": 0.934516966342926, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4533621140763998, "calib/avg_num_step_conf": 4.47265625, "calib/ece": 0.2893625498007968, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2749003984063745, "calib/gap": 0.0003623757195183597, "calib/mean_conf": 0.8750199203187251, "calib/mu_c": 0.8751700680272108, "calib/mu_w": 0.8748076923076924, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2893625498007968, "calib/std_conf": 0.047481382402938206, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7935275590551181, "calib/step_q_c_n": 635.0, "calib/step_q_gap": 0.02386089238845146, "calib/step_q_w": 0.7696666666666666, "calib/step_q_w_n": 510.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 545.45703125, "completions/mean_terminated_length": 545.45703125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.008533333333333334, "grad_norm": 0.03767187148332596, "kl": 0.00035335123538970947, "learning_rate": 2.0000000000000003e-06, "loss": -0.0414, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032488659024238586, "mask/share_reasoning": 0.8679938316345215, "mask/share_step_conf": 0.09951749444007874, "num_tokens": 1870839.0, "reward": 0.8519612550735474, "reward_std": 0.1759214848279953, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6545824408531189, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7391838431358337, "step": 8 }, { "adv/mean_abs_final_conf": 0.7705340385437012, "adv/mean_abs_reasoning": 0.47810250520706177, "adv/mean_abs_step_conf": 0.74006587266922, "adv/ratio_final_to_reasoning": 1.6116502845137572, "adv/ratio_step_to_reasoning": 1.5479230177819383, "adv/std_final_conf": 0.9310494661331177, "adv/std_reasoning": 0.7393581867218018, "adv/std_step_conf": 0.934630811214447, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5215696465696467, "calib/avg_num_step_conf": 4.5625, "calib/ece": 0.2912301587301586, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.28174603174603174, "calib/gap": 0.00011954261954272294, "calib/mean_conf": 0.8785317460317461, "calib/mu_c": 0.8785810810810811, "calib/mu_w": 0.8784615384615384, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.2912301587301586, "calib/std_conf": 0.04584120045173485, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.7793759512937595, "calib/step_q_c_n": 657.0, "calib/step_q_gap": 0.0028593172428788005, "calib/step_q_w": 0.7765166340508807, "calib/step_q_w_n": 511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 486.640625, "completions/mean_terminated_length": 488.5490417480469, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.0096, "grad_norm": 0.04332495480775833, "kl": 0.00040915608406066895, "learning_rate": 2.25e-06, "loss": 0.0087, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034009773284196854, "mask/share_reasoning": 0.8562586307525635, "mask/share_step_conf": 0.10582535713911057, "num_tokens": 2102955.0, "reward": 0.8214284181594849, "reward_std": 0.21525982022285461, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6493328213691711, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6857114434242249, "step": 9 }, { "adv/mean_abs_final_conf": 0.7739413976669312, "adv/mean_abs_reasoning": 0.4154004752635956, "adv/mean_abs_step_conf": 0.7806507349014282, "adv/ratio_final_to_reasoning": 1.863121117460977, "adv/ratio_step_to_reasoning": 1.8792726089349328, "adv/std_final_conf": 0.9283263683319092, "adv/std_reasoning": 0.6815720200538635, "adv/std_step_conf": 0.9333322048187256, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5619734132184984, "calib/avg_num_step_conf": 5.04296875, "calib/ece": 0.3107031250000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.35546875, "calib/gap": 0.012565686825188549, "calib/mean_conf": 0.884921875, "calib/mu_c": 0.8902721088435375, "calib/mu_w": 0.8777064220183489, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3107031250000001, "calib/std_conf": 0.043499211446696086, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7838218390804598, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.0015865449628127415, "calib/step_q_w": 0.782235294117647, "calib/step_q_w_n": 595.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1314.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 506.203125, "completions/mean_terminated_length": 508.1882629394531, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.010666666666666666, "grad_norm": 0.04865778982639313, "kl": 0.0004032254219055176, "learning_rate": 2.5e-06, "loss": 0.078, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03207051753997803, "mask/share_reasoning": 0.8554325103759766, "mask/share_step_conf": 0.1085907593369484, "num_tokens": 2339343.0, "reward": 0.8535362482070923, "reward_std": 0.16450417041778564, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6632242202758789, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.7290045022964478, "step": 10 }, { "adv/mean_abs_final_conf": 0.7742735743522644, "adv/mean_abs_reasoning": 0.3947698175907135, "adv/mean_abs_step_conf": 0.75969398021698, "adv/ratio_final_to_reasoning": 1.961329209709264, "adv/ratio_step_to_reasoning": 1.924397323111996, "adv/std_final_conf": 0.92948317527771, "adv/std_reasoning": 0.6816731095314026, "adv/std_step_conf": 0.9346326589584351, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.3907295216458412, "calib/avg_num_step_conf": 5.3203125, "calib/ece": 0.35083003952569175, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.383399209486166, "calib/gap": -0.028455761706424165, "calib/mean_conf": 0.8849802371541501, "calib/mu_c": 0.8721582733812951, "calib/mu_w": 0.9006140350877193, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.34320158102766807, "calib/std_conf": 0.0881905254513794, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7702435530085959, "calib/step_q_c_n": 698.0, "calib/step_q_gap": 0.023481504815824894, "calib/step_q_w": 0.746762048192771, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2673.0, "completions/max_terminated_length": 2673.0, "completions/mean_length": 519.19921875, "completions/mean_terminated_length": 521.2353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.011733333333333333, "grad_norm": 0.05495906248688698, "kl": 0.0007111132144927979, "learning_rate": 2.7500000000000004e-06, "loss": 0.0205, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03233877941966057, "mask/share_reasoning": 0.8515212535858154, "mask/share_step_conf": 0.1122337281703949, "num_tokens": 2576738.0, "reward": 0.8109133243560791, "reward_std": 0.17835165560245514, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6096394062042236, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.706718385219574, "step": 11 }, { "adv/mean_abs_final_conf": 0.7674424648284912, "adv/mean_abs_reasoning": 0.4356115460395813, "adv/mean_abs_step_conf": 0.7697429060935974, "adv/ratio_final_to_reasoning": 1.7617587775296446, "adv/ratio_step_to_reasoning": 1.7670397240197477, "adv/std_final_conf": 0.9264889359474182, "adv/std_reasoning": 0.7014715075492859, "adv/std_step_conf": 0.9340823292732239, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4732954545454545, "calib/avg_num_step_conf": 5.6875, "calib/ece": 0.1991633466135458, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4063745019920319, "calib/gap": -0.006133333333333546, "calib/mean_conf": 0.8918326693227091, "calib/mu_c": 0.8899999999999999, "calib/mu_w": 0.8961333333333334, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1949003984063745, "calib/std_conf": 0.04934458769820447, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7774606741573034, "calib/step_q_c_n": 890.0, "calib/step_q_gap": 0.01652427839758608, "calib/step_q_w": 0.7609363957597173, "calib/step_q_w_n": 566.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 477.0859375, "completions/mean_terminated_length": 482.74310302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.0128, "grad_norm": 0.0514008104801178, "kl": 0.00114363431930542, "learning_rate": 3e-06, "loss": 0.015, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036076635122299194, "mask/share_reasoning": 0.8272448778152466, "mask/share_step_conf": 0.12495977431535721, "num_tokens": 2803048.0, "reward": 0.9234218597412109, "reward_std": 0.18709824979305267, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7339004278182983, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7801306843757629, "step": 12 }, { "adv/mean_abs_final_conf": 0.737118124961853, "adv/mean_abs_reasoning": 0.469147264957428, "adv/mean_abs_step_conf": 0.7225381135940552, "adv/ratio_final_to_reasoning": 1.5711870877660166, "adv/ratio_step_to_reasoning": 1.5401094018093033, "adv/std_final_conf": 0.9289529323577881, "adv/std_reasoning": 0.7573885321617126, "adv/std_step_conf": 0.9343307614326477, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4623179850452578, "calib/avg_num_step_conf": 4.80859375, "calib/ece": 0.290395256916996, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4980237154150198, "calib/gap": -0.0016594516594514719, "calib/mean_conf": 0.899090909090909, "calib/mu_c": 0.8984415584415585, "calib/mu_w": 0.90010101010101, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.290395256916996, "calib/std_conf": 0.042755436012976406, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7684196185286103, "calib/step_q_c_n": 734.0, "calib/step_q_gap": -0.0028479871051925176, "calib/step_q_w": 0.7712676056338028, "calib/step_q_w_n": 497.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 478.9921875, "completions/mean_terminated_length": 480.87060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.013866666666666666, "grad_norm": 0.03835929557681084, "kl": 0.0017654895782470703, "learning_rate": 3.2500000000000002e-06, "loss": -0.0057, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03364657610654831, "mask/share_reasoning": 0.8520339727401733, "mask/share_step_conf": 0.11041321605443954, "num_tokens": 3030262.0, "reward": 0.8755779266357422, "reward_std": 0.18408486247062683, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.666958212852478, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7654476165771484, "step": 13 }, { "adv/mean_abs_final_conf": 0.7786264419555664, "adv/mean_abs_reasoning": 0.5486913919448853, "adv/mean_abs_step_conf": 0.7712372541427612, "adv/ratio_final_to_reasoning": 1.4190607933462487, "adv/ratio_step_to_reasoning": 1.4055938647206445, "adv/std_final_conf": 0.9291496276855469, "adv/std_reasoning": 0.792828381061554, "adv/std_step_conf": 0.9350869059562683, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4334833953154564, "calib/avg_num_step_conf": 5.3515625, "calib/ece": 0.3871370967741936, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6451612903225806, "calib/gap": -0.007322372284204293, "calib/mean_conf": 0.9153629032258064, "calib/mu_c": 0.9119083969465649, "calib/mu_w": 0.9192307692307692, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3871370967741936, "calib/std_conf": 0.03840401412936802, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7520642201834862, "calib/step_q_c_n": 654.0, "calib/step_q_gap": 0.06305584029521794, "calib/step_q_w": 0.6890083798882682, "calib/step_q_w_n": 716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2834.0, "completions/max_terminated_length": 2834.0, "completions/mean_length": 537.93359375, "completions/mean_terminated_length": 546.4722290039062, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.014933333333333333, "grad_norm": 0.04159555211663246, "kl": 0.003778219223022461, "learning_rate": 3.5e-06, "loss": -0.051, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0319044254720211, "mask/share_reasoning": 0.8418941497802734, "mask/share_step_conf": 0.11057643592357635, "num_tokens": 3273373.0, "reward": 0.8043519258499146, "reward_std": 0.2247830182313919, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5733602046966553, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7400311827659607, "step": 14 }, { "adv/mean_abs_final_conf": 0.7623423933982849, "adv/mean_abs_reasoning": 0.49396762251853943, "adv/mean_abs_step_conf": 0.7478142976760864, "adv/ratio_final_to_reasoning": 1.5433043759253127, "adv/ratio_step_to_reasoning": 1.5138933476313414, "adv/std_final_conf": 0.9219062924385071, "adv/std_reasoning": 0.739289402961731, "adv/std_step_conf": 0.9342023134231567, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4932983682983683, "calib/avg_num_step_conf": 4.90234375, "calib/ece": 0.3167450980392157, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.792156862745098, "calib/gap": -0.0031429681429681056, "calib/mean_conf": 0.9254509803921569, "calib/mu_c": 0.9242307692307693, "calib/mu_w": 0.9273737373737374, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3152156862745098, "calib/std_conf": 0.04695943753441511, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7143354838709679, "calib/step_q_c_n": 775.0, "calib/step_q_gap": 0.0030854838709679644, "calib/step_q_w": 0.7112499999999999, "calib/step_q_w_n": 480.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 467.5234375, "completions/mean_terminated_length": 469.3569030761719, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.016, "grad_norm": 0.040848296135663986, "kl": 0.008719921112060547, "learning_rate": 3.7500000000000005e-06, "loss": -0.0177, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03405185788869858, "mask/share_reasoning": 0.8486453294754028, "mask/share_step_conf": 0.11339649558067322, "num_tokens": 3500939.0, "reward": 0.8766745328903198, "reward_std": 0.20056243240833282, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6578144431114197, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7736595869064331, "step": 15 }, { "adv/mean_abs_final_conf": 0.7344765067100525, "adv/mean_abs_reasoning": 0.37689459323883057, "adv/mean_abs_step_conf": 0.7676898241043091, "adv/ratio_final_to_reasoning": 1.9487584058936855, "adv/ratio_step_to_reasoning": 2.0368820298195134, "adv/std_final_conf": 0.9199317097663879, "adv/std_reasoning": 0.6814736127853394, "adv/std_step_conf": 0.9343593716621399, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.47638728694092014, "calib/avg_num_step_conf": 6.484375, "calib/ece": 0.34105882352941186, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8941176470588236, "calib/gap": -0.001993464052287397, "calib/mean_conf": 0.9410588235294116, "calib/mu_c": 0.9402614379084968, "calib/mu_w": 0.9422549019607842, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34105882352941186, "calib/std_conf": 0.03228634857797129, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6677367896311067, "calib/step_q_c_n": 1003.0, "calib/step_q_gap": 0.019365404547392884, "calib/step_q_w": 0.6483713850837138, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2503.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 615.65234375, "completions/mean_terminated_length": 618.0667114257812, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.017066666666666667, "grad_norm": 0.03159378096461296, "kl": 0.009853363037109375, "learning_rate": 4.000000000000001e-06, "loss": -0.0392, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.026258613914251328, "mask/share_reasoning": 0.8569262623786926, "mask/share_step_conf": 0.11290884017944336, "num_tokens": 3767394.0, "reward": 0.8711050748825073, "reward_std": 0.15706929564476013, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6391730308532715, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7842870950698853, "step": 16 }, { "adv/mean_abs_final_conf": 0.7703932523727417, "adv/mean_abs_reasoning": 0.4900427758693695, "adv/mean_abs_step_conf": 0.7754462957382202, "adv/ratio_final_to_reasoning": 1.5720938871224277, "adv/ratio_step_to_reasoning": 1.5824053203570347, "adv/std_final_conf": 0.9156169295310974, "adv/std_reasoning": 0.739263117313385, "adv/std_step_conf": 0.9346990585327148, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5078077158603796, "calib/avg_num_step_conf": 5.88671875, "calib/ece": 0.22843137254901952, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8784313725490196, "calib/gap": -0.004872167789344961, "calib/mean_conf": 0.9390196078431372, "calib/mu_c": 0.9376630434782609, "calib/mu_w": 0.9425352112676059, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22294117647058817, "calib/std_conf": 0.04889584376101013, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6399820788530465, "calib/step_q_c_n": 1116.0, "calib/step_q_gap": 0.030442436909312476, "calib/step_q_w": 0.609539641943734, "calib/step_q_w_n": 391.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2383.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 524.06640625, "completions/mean_terminated_length": 524.06640625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.018133333333333335, "grad_norm": 0.03880058601498604, "kl": 0.014672279357910156, "learning_rate": 4.25e-06, "loss": 0.0472, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03221951052546501, "mask/share_reasoning": 0.8422893285751343, "mask/share_step_conf": 0.12549114227294922, "num_tokens": 4005083.0, "reward": 0.9624072313308716, "reward_std": 0.1909254938364029, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7406648397445679, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8419621586799622, "step": 17 }, { "adv/mean_abs_final_conf": 0.7564767003059387, "adv/mean_abs_reasoning": 0.43438613414764404, "adv/mean_abs_step_conf": 0.7423365712165833, "adv/ratio_final_to_reasoning": 1.7414844555070879, "adv/ratio_step_to_reasoning": 1.7089324747282784, "adv/std_final_conf": 0.9087929129600525, "adv/std_reasoning": 0.7205876708030701, "adv/std_step_conf": 0.934881865978241, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5161087339541908, "calib/avg_num_step_conf": 4.5859375, "calib/ece": 0.41592885375494065, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9209486166007905, "calib/gap": -0.00337528316133906, "calib/mean_conf": 0.9416205533596839, "calib/mu_c": 0.9400729927007299, "calib/mu_w": 0.9434482758620689, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.40802371541501975, "calib/std_conf": 0.09777031188382188, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6413122923588038, "calib/step_q_c_n": 602.0, "calib/step_q_gap": 0.020525579072090627, "calib/step_q_w": 0.6207867132867132, "calib/step_q_w_n": 572.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 504.70703125, "completions/mean_terminated_length": 506.6863098144531, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.0192, "grad_norm": 0.02734338492155075, "kl": 0.014324188232421875, "learning_rate": 4.5e-06, "loss": -0.0854, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03250627592206001, "mask/share_reasoning": 0.8629031777381897, "mask/share_step_conf": 0.1006842851638794, "num_tokens": 4245008.0, "reward": 0.8083161115646362, "reward_std": 0.19949162006378174, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5616640448570251, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7541868686676025, "step": 18 }, { "adv/mean_abs_final_conf": 0.7312251329421997, "adv/mean_abs_reasoning": 0.3974907696247101, "adv/mean_abs_step_conf": 0.7509687542915344, "adv/ratio_final_to_reasoning": 1.8396028004186968, "adv/ratio_step_to_reasoning": 1.889273441495408, "adv/std_final_conf": 0.9088385701179504, "adv/std_reasoning": 0.6815569400787354, "adv/std_step_conf": 0.9349246621131897, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.472909825713564, "calib/avg_num_step_conf": 4.671875, "calib/ece": 0.3792549019607844, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.984313725490196, "calib/gap": -0.0028624652690071306, "calib/mean_conf": 0.9596470588235294, "calib/mu_c": 0.9584459459459461, "calib/mu_w": 0.9613084112149533, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3792549019607844, "calib/std_conf": 0.021353076216285564, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6354014598540146, "calib/step_q_c_n": 685.0, "calib/step_q_gap": 0.02362063793620639, "calib/step_q_w": 0.6117808219178082, "calib/step_q_w_n": 511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 499.2421875, "completions/mean_terminated_length": 501.2000427246094, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.020266666666666665, "grad_norm": 0.02526911534368992, "kl": 0.023448944091796875, "learning_rate": 4.75e-06, "loss": -0.0057, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03129266947507858, "mask/share_reasoning": 0.8586671948432922, "mask/share_step_conf": 0.1061338409781456, "num_tokens": 4477574.0, "reward": 0.8613945841789246, "reward_std": 0.18759620189666748, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.60454261302948, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.804184079170227, "step": 19 }, { "adv/mean_abs_final_conf": 0.7194894552230835, "adv/mean_abs_reasoning": 0.4517136514186859, "adv/mean_abs_step_conf": 0.7461144924163818, "adv/ratio_final_to_reasoning": 1.5927998920630375, "adv/ratio_step_to_reasoning": 1.6517421824048897, "adv/std_final_conf": 0.8810336589813232, "adv/std_reasoning": 0.739335298538208, "adv/std_step_conf": 0.9348995685577393, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.49452873254564983, "calib/avg_num_step_conf": 5.44140625, "calib/ece": 0.35896, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.984, "calib/gap": 0.0010418904403868279, "calib/mean_conf": 0.96696, "calib/mu_c": 0.9673684210526318, "calib/mu_w": 0.9663265306122449, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35896, "calib/std_conf": 0.019543756036135944, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6032474226804124, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.044171247639893885, "calib/step_q_w": 0.5590761750405185, "calib/step_q_w_n": 617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2581.0, "completions/max_terminated_length": 2581.0, "completions/mean_length": 513.9375, "completions/mean_terminated_length": 515.9529418945312, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.021333333333333333, "grad_norm": 0.03067217580974102, "kl": 0.025417327880859375, "learning_rate": 5e-06, "loss": 0.0229, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03458629548549652, "mask/share_reasoning": 0.8356574773788452, "mask/share_step_conf": 0.12584996223449707, "num_tokens": 4714014.0, "reward": 0.8612836599349976, "reward_std": 0.20020237565040588, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6180921792984009, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7904125452041626, "step": 20 }, { "adv/mean_abs_final_conf": 0.7106478214263916, "adv/mean_abs_reasoning": 0.43507465720176697, "adv/mean_abs_step_conf": 0.7549261450767517, "adv/ratio_final_to_reasoning": 1.6333928204345556, "adv/ratio_step_to_reasoning": 1.735164603546772, "adv/std_final_conf": 0.8982265591621399, "adv/std_reasoning": 0.7205180525779724, "adv/std_step_conf": 0.9351524710655212, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.579504724666015, "calib/avg_num_step_conf": 5.8671875, "calib/ece": 0.36047244094488184, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9881889763779528, "calib/gap": 0.002816552623004176, "calib/mean_conf": 0.9707086614173228, "calib/mu_c": 0.9718064516129032, "calib/mu_w": 0.9689898989898991, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36047244094488184, "calib/std_conf": 0.021284762297996477, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5655878928987195, "calib/step_q_c_n": 859.0, "calib/step_q_gap": 0.008558343909605859, "calib/step_q_w": 0.5570295489891136, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1916.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 500.91796875, "completions/mean_terminated_length": 502.88238525390625, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.0224, "grad_norm": 0.027707798406481743, "kl": 0.031322479248046875, "learning_rate": 4.9722222222222224e-06, "loss": -0.0308, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03268470615148544, "mask/share_reasoning": 0.836051344871521, "mask/share_step_conf": 0.12735772132873535, "num_tokens": 4945209.0, "reward": 0.8745825886726379, "reward_std": 0.1900649070739746, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6281523704528809, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8014814853668213, "step": 21 }, { "adv/mean_abs_final_conf": 0.7099519968032837, "adv/mean_abs_reasoning": 0.3854062557220459, "adv/mean_abs_step_conf": 0.7544239163398743, "adv/ratio_final_to_reasoning": 1.842087372124285, "adv/ratio_step_to_reasoning": 1.9574770910931012, "adv/std_final_conf": 0.8757226467132568, "adv/std_reasoning": 0.6815925240516663, "adv/std_step_conf": 0.9346233606338501, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5026154891304349, "calib/avg_num_step_conf": 6.0546875, "calib/ece": 0.3418253968253968, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003532608695651085, "calib/mean_conf": 0.9767460317460317, "calib/mu_c": 0.976875, "calib/mu_w": 0.9765217391304349, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3418253968253968, "calib/std_conf": 0.013934618594137112, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5992184154175589, "calib/step_q_c_n": 934.0, "calib/step_q_gap": 0.06386127256041607, "calib/step_q_w": 0.5353571428571429, "calib/step_q_w_n": 616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 505.08984375, "completions/mean_terminated_length": 511.0790710449219, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.023466666666666667, "grad_norm": 0.02160457894206047, "kl": 0.03806304931640625, "learning_rate": 4.944444444444445e-06, "loss": -0.0644, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031118055805563927, "mask/share_reasoning": 0.8294792175292969, "mask/share_step_conf": 0.12768399715423584, "num_tokens": 5176328.0, "reward": 0.8897135257720947, "reward_std": 0.18438906967639923, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6411515474319458, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.816400408744812, "step": 22 }, { "adv/mean_abs_final_conf": 0.725068211555481, "adv/mean_abs_reasoning": 0.4642287790775299, "adv/mean_abs_step_conf": 0.771478533744812, "adv/ratio_final_to_reasoning": 1.5618769111993998, "adv/ratio_step_to_reasoning": 1.6618498647968762, "adv/std_final_conf": 0.8838828206062317, "adv/std_reasoning": 0.7392085194587708, "adv/std_step_conf": 0.9351414442062378, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5082431412031211, "calib/avg_num_step_conf": 5.7265625, "calib/ece": 0.44043478260869573, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00023345079285186365, "calib/mean_conf": 0.9819367588932807, "calib/mu_c": 0.9820437956204381, "calib/mu_w": 0.9818103448275862, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.44043478260869573, "calib/std_conf": 0.011820627134753209, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6099600000000001, "calib/step_q_c_n": 750.0, "calib/step_q_gap": 0.024080111731843656, "calib/step_q_w": 0.5858798882681564, "calib/step_q_w_n": 716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2293.0, "completions/max_terminated_length": 2293.0, "completions/mean_length": 528.421875, "completions/mean_terminated_length": 530.494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.024533333333333334, "grad_norm": 0.021991299465298653, "kl": 0.032794952392578125, "learning_rate": 4.9166666666666665e-06, "loss": 0.005, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03355323523283005, "mask/share_reasoning": 0.8336979150772095, "mask/share_step_conf": 0.12884265184402466, "num_tokens": 5415540.0, "reward": 0.8157504200935364, "reward_std": 0.20109497010707855, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5472742319107056, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7803203463554382, "step": 23 }, { "adv/mean_abs_final_conf": 0.7268941402435303, "adv/mean_abs_reasoning": 0.634642481803894, "adv/mean_abs_step_conf": 0.7647470235824585, "adv/ratio_final_to_reasoning": 1.1453600429921145, "adv/ratio_step_to_reasoning": 1.2050044639444213, "adv/std_final_conf": 0.8899421095848083, "adv/std_reasoning": 0.843029260635376, "adv/std_step_conf": 0.9352526068687439, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6084887334887334, "calib/avg_num_step_conf": 6.765625, "calib/ece": 0.45120481927710854, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0050796425796426625, "calib/mean_conf": 0.9813253012048194, "calib/mu_c": 0.9837121212121211, "calib/mu_w": 0.9786324786324785, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.45120481927710854, "calib/std_conf": 0.013089535751676469, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6250829383886255, "calib/step_q_c_n": 844.0, "calib/step_q_gap": 0.005251857307544405, "calib/step_q_w": 0.6198310810810811, "calib/step_q_w_n": 888.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 599.82421875, "completions/mean_terminated_length": 604.5472412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.0256, "grad_norm": 0.027556290850043297, "kl": 0.04826927185058594, "learning_rate": 4.888888888888889e-06, "loss": 0.0361, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030089421197772026, "mask/share_reasoning": 0.8356856107711792, "mask/share_step_conf": 0.12641246616840363, "num_tokens": 5673607.0, "reward": 0.7990022897720337, "reward_std": 0.2679530680179596, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5345726609230042, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7665569186210632, "step": 24 }, { "adv/mean_abs_final_conf": 0.6522500514984131, "adv/mean_abs_reasoning": 0.40848758816719055, "adv/mean_abs_step_conf": 0.759333610534668, "adv/ratio_final_to_reasoning": 1.5967438678490584, "adv/ratio_step_to_reasoning": 1.8588902882989902, "adv/std_final_conf": 0.8359845876693726, "adv/std_reasoning": 0.6815344095230103, "adv/std_step_conf": 0.9345144629478455, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.534307953950811, "calib/avg_num_step_conf": 5.859375, "calib/ece": 0.3708661417322835, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0013932496075353962, "calib/mean_conf": 0.9850393700787402, "calib/mu_c": 0.9855769230769231, "calib/mu_w": 0.9841836734693877, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3708661417322835, "calib/std_conf": 0.010712713452924015, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6474572405929304, "calib/step_q_c_n": 877.0, "calib/step_q_gap": 0.0247124572863493, "calib/step_q_w": 0.6227447833065811, "calib/step_q_w_n": 623.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2779.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 493.07421875, "completions/mean_terminated_length": 495.00787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.02666666666666667, "grad_norm": 0.02888176217675209, "kl": 0.034931182861328125, "learning_rate": 4.861111111111111e-06, "loss": 0.0065, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03211010992527008, "mask/share_reasoning": 0.8384443521499634, "mask/share_step_conf": 0.12553933262825012, "num_tokens": 5903058.0, "reward": 0.8710969686508179, "reward_std": 0.1723031848669052, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6211484670639038, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8007329702377319, "step": 25 }, { "adv/mean_abs_final_conf": 0.6101415753364563, "adv/mean_abs_reasoning": 0.4364333152770996, "adv/mean_abs_step_conf": 0.7746877670288086, "adv/ratio_final_to_reasoning": 1.3980178734729867, "adv/ratio_step_to_reasoning": 1.775042692460233, "adv/std_final_conf": 0.8025437593460083, "adv/std_reasoning": 0.7013906240463257, "adv/std_step_conf": 0.9353902339935303, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5800822047031872, "calib/avg_num_step_conf": 5.51171875, "calib/ece": 0.37240000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": 0.003988949531702479, "calib/mean_conf": 0.9844, "calib/mu_c": 0.9859477124183006, "calib/mu_w": 0.9819587628865981, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37240000000000006, "calib/std_conf": 0.015357083056361975, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.649262453066333, "calib/step_q_c_n": 799.0, "calib/step_q_gap": 0.008036962870254682, "calib/step_q_w": 0.6412254901960783, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2910.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 528.96484375, "completions/mean_terminated_length": 531.0392456054688, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.027733333333333332, "grad_norm": 0.02937263622879982, "kl": 0.042026519775390625, "learning_rate": 4.833333333333333e-06, "loss": 0.0452, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02984796278178692, "mask/share_reasoning": 0.8507357239723206, "mask/share_step_conf": 0.11551006883382797, "num_tokens": 6143713.0, "reward": 0.848415732383728, "reward_std": 0.206464484333992, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6108601093292236, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7703462839126587, "step": 26 }, { "adv/mean_abs_final_conf": 0.6748393774032593, "adv/mean_abs_reasoning": 0.5690269470214844, "adv/mean_abs_step_conf": 0.7577905654907227, "adv/ratio_final_to_reasoning": 1.18595328557925, "adv/ratio_step_to_reasoning": 1.3317305436188969, "adv/std_final_conf": 0.8738255500793457, "adv/std_reasoning": 0.8098766207695007, "adv/std_step_conf": 0.9352293014526367, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5000635404752828, "calib/avg_num_step_conf": 6.3203125, "calib/ece": 0.501593625498008, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000529927563858279, "calib/mean_conf": 0.9876494023904383, "calib/mu_c": 0.9873770491803279, "calib/mu_w": 0.9879069767441861, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.501593625498008, "calib/std_conf": 0.0074451908086178315, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6452361111111111, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.030247246968572172, "calib/step_q_w": 0.614988864142539, "calib/step_q_w_n": 898.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 539.953125, "completions/mean_terminated_length": 544.2047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.0288, "grad_norm": 0.02530396170914173, "kl": 0.03910064697265625, "learning_rate": 4.805555555555556e-06, "loss": 0.0118, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030356114730238914, "mask/share_reasoning": 0.8339431285858154, "mask/share_step_conf": 0.12788823246955872, "num_tokens": 6387157.0, "reward": 0.7715263366699219, "reward_std": 0.2471705675125122, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.48854607343673706, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.763100266456604, "step": 27 }, { "adv/mean_abs_final_conf": 0.5680310130119324, "adv/mean_abs_reasoning": 0.3548010289669037, "adv/mean_abs_step_conf": 0.7654111981391907, "adv/ratio_final_to_reasoning": 1.600984683347463, "adv/ratio_step_to_reasoning": 2.157297007756393, "adv/std_final_conf": 0.7786206603050232, "adv/std_reasoning": 0.6401953101158142, "adv/std_step_conf": 0.934259831905365, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5256724548859382, "calib/avg_num_step_conf": 5.7734375, "calib/ece": 0.3370866141732284, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.0014817841334693327, "calib/mean_conf": 0.9866929133858268, "calib/mu_c": 0.9872121212121211, "calib/mu_w": 0.9857303370786518, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3370866141732284, "calib/std_conf": 0.010615329734419499, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6254288702928871, "calib/step_q_c_n": 956.0, "calib/step_q_gap": -0.008191819362285302, "calib/step_q_w": 0.6336206896551724, "calib/step_q_w_n": 522.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 553.17578125, "completions/mean_terminated_length": 555.3451538085938, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.029866666666666666, "grad_norm": 0.025527317076921463, "kl": 0.041477203369140625, "learning_rate": 4.777777777777778e-06, "loss": -0.0162, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02951807528734207, "mask/share_reasoning": 0.8487849235534668, "mask/share_step_conf": 0.11779077351093292, "num_tokens": 6635714.0, "reward": 0.8864841461181641, "reward_std": 0.16156134009361267, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6463539004325867, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.800832986831665, "step": 28 }, { "adv/mean_abs_final_conf": 0.606778621673584, "adv/mean_abs_reasoning": 0.4561738073825836, "adv/mean_abs_step_conf": 0.7378709316253662, "adv/ratio_final_to_reasoning": 1.330147877527504, "adv/ratio_step_to_reasoning": 1.617521479058812, "adv/std_final_conf": 0.8212588429450989, "adv/std_reasoning": 0.7205647230148315, "adv/std_step_conf": 0.9352527260780334, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5364016997875265, "calib/avg_num_step_conf": 6.4140625, "calib/ece": 0.4859683794466403, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0014391951006123005, "calib/mean_conf": 0.9879446640316205, "calib/mu_c": 0.9886614173228344, "calib/mu_w": 0.9872222222222221, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4859683794466403, "calib/std_conf": 0.00837161241651468, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5999868073878627, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.024421196528134126, "calib/step_q_w": 0.5755656108597286, "calib/step_q_w_n": 884.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2556.0, "completions/max_terminated_length": 2556.0, "completions/mean_length": 584.58984375, "completions/mean_terminated_length": 586.8823852539062, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.030933333333333334, "grad_norm": 0.027113767340779305, "kl": 0.0341796875, "learning_rate": 4.75e-06, "loss": -0.0488, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.028182100504636765, "mask/share_reasoning": 0.849419355392456, "mask/share_step_conf": 0.11849230527877808, "num_tokens": 6892497.0, "reward": 0.8000890016555786, "reward_std": 0.19791510701179504, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5045530796051025, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7995311617851257, "step": 29 }, { "adv/mean_abs_final_conf": 0.695144534111023, "adv/mean_abs_reasoning": 0.5578685402870178, "adv/mean_abs_step_conf": 0.747866153717041, "adv/ratio_final_to_reasoning": 1.246072298239614, "adv/ratio_step_to_reasoning": 1.3405777521210844, "adv/std_final_conf": 0.8613123297691345, "adv/std_reasoning": 0.7755047082901001, "adv/std_step_conf": 0.9354647397994995, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5202317880794703, "calib/avg_num_step_conf": 6.5078125, "calib/ece": 0.3870517928286853, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00024172185430437754, "calib/mean_conf": 0.9886454183266933, "calib/mu_c": 0.9887417218543043, "calib/mu_w": 0.9884999999999999, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3870517928286853, "calib/std_conf": 0.007179941690125934, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5561439312567132, "calib/step_q_c_n": 931.0, "calib/step_q_gap": 0.0007017543859649811, "calib/step_q_w": 0.5554421768707483, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2962.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 625.05859375, "completions/mean_terminated_length": 627.5098266601562, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.032, "grad_norm": 0.031422603875398636, "kl": 0.03484344482421875, "learning_rate": 4.722222222222222e-06, "loss": 0.0001, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02694585919380188, "mask/share_reasoning": 0.8529764413833618, "mask/share_step_conf": 0.11617143452167511, "num_tokens": 7159496.0, "reward": 0.8489872813224792, "reward_std": 0.24480488896369934, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.5945906639099121, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7908838987350464, "step": 30 }, { "adv/mean_abs_final_conf": 0.7133569717407227, "adv/mean_abs_reasoning": 0.4811326265335083, "adv/mean_abs_step_conf": 0.7683807015419006, "adv/ratio_final_to_reasoning": 1.4826618117344432, "adv/ratio_step_to_reasoning": 1.5970247270029754, "adv/std_final_conf": 0.8834726214408875, "adv/std_reasoning": 0.720693826675415, "adv/std_step_conf": 0.9351906776428223, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4464917825537295, "calib/avg_num_step_conf": 7.140625, "calib/ece": 0.5399209486166008, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": -0.0009823008849556558, "calib/mean_conf": 0.9865612648221344, "calib/mu_c": 0.9860176991150444, "calib/mu_w": 0.9870000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5399209486166008, "calib/std_conf": 0.015336579589303448, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5300267379679144, "calib/step_q_c_n": 748.0, "calib/step_q_gap": 0.005850812041988562, "calib/step_q_w": 0.5241759259259259, "calib/step_q_w_n": 1080.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 620.8359375, "completions/mean_terminated_length": 623.2706298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.03306666666666667, "grad_norm": 0.09976796060800552, "kl": 0.056781768798828125, "learning_rate": 4.694444444444445e-06, "loss": 0.0319, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02804284356534481, "mask/share_reasoning": 0.84264075756073, "mask/share_step_conf": 0.12541010975837708, "num_tokens": 7424342.0, "reward": 0.7682323455810547, "reward_std": 0.21610067784786224, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.4552140533924103, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7953131198883057, "step": 31 }, { "adv/mean_abs_final_conf": 0.6915620565414429, "adv/mean_abs_reasoning": 0.4747565984725952, "adv/mean_abs_step_conf": 0.7673255205154419, "adv/ratio_final_to_reasoning": 1.4566665503257086, "adv/ratio_step_to_reasoning": 1.6162503543586555, "adv/std_final_conf": 0.8616219162940979, "adv/std_reasoning": 0.7393125295639038, "adv/std_step_conf": 0.93488609790802, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5823407071280016, "calib/avg_num_step_conf": 6.13671875, "calib/ece": 0.4732142857142858, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": 0.0052353942144072185, "calib/mean_conf": 0.9851190476190477, "calib/mu_c": 0.9876744186046511, "calib/mu_w": 0.9824390243902439, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4732142857142858, "calib/std_conf": 0.013728725037543098, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4811369680851063, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.035080802028940206, "calib/step_q_w": 0.4460561660561661, "calib/step_q_w_n": 819.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 552.30859375, "completions/mean_terminated_length": 558.8577270507812, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.034133333333333335, "grad_norm": 0.03148404136300087, "kl": 0.046142578125, "learning_rate": 4.666666666666667e-06, "loss": -0.0457, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.029168495908379555, "mask/share_reasoning": 0.838029146194458, "mask/share_step_conf": 0.12108359485864639, "num_tokens": 7672437.0, "reward": 0.8188750147819519, "reward_std": 0.19034592807292938, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5203777551651001, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8197159767150879, "step": 32 }, { "adv/mean_abs_final_conf": 0.6719990372657776, "adv/mean_abs_reasoning": 0.44133204221725464, "adv/mean_abs_step_conf": 0.7471027374267578, "adv/ratio_final_to_reasoning": 1.5226608833785344, "adv/ratio_step_to_reasoning": 1.692835928416413, "adv/std_final_conf": 0.8424936532974243, "adv/std_reasoning": 0.701448380947113, "adv/std_step_conf": 0.9346904754638672, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.540872713414634, "calib/avg_num_step_conf": 6.61328125, "calib/ece": 0.4918725099601594, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": 0.006772103658536421, "calib/mean_conf": 0.9819123505976096, "calib/mu_c": 0.9853658536585366, "calib/mu_w": 0.9785937500000002, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4918725099601594, "calib/std_conf": 0.02858426222007358, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4880940594059406, "calib/step_q_c_n": 808.0, "calib/step_q_gap": 0.07595846618560159, "calib/step_q_w": 0.412135593220339, "calib/step_q_w_n": 885.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 576.78515625, "completions/mean_terminated_length": 579.047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.0352, "grad_norm": 0.028174152597784996, "kl": 0.0565185546875, "learning_rate": 4.638888888888889e-06, "loss": 0.0181, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.028950443491339684, "mask/share_reasoning": 0.8444273471832275, "mask/share_step_conf": 0.12271599471569061, "num_tokens": 7926966.0, "reward": 0.8060372471809387, "reward_std": 0.18721714615821838, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.5007531046867371, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8191338777542114, "step": 33 }, { "adv/mean_abs_final_conf": 0.720805287361145, "adv/mean_abs_reasoning": 0.5097736120223999, "adv/mean_abs_step_conf": 0.7887823581695557, "adv/ratio_final_to_reasoning": 1.4139713597601287, "adv/ratio_step_to_reasoning": 1.5473189266118699, "adv/std_final_conf": 0.9024256467819214, "adv/std_reasoning": 0.7575446963310242, "adv/std_step_conf": 0.9344650506973267, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5065359477124183, "calib/avg_num_step_conf": 6.8125, "calib/ece": 0.44379446640316206, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9723320158102767, "calib/gap": 0.00667106586224242, "calib/mean_conf": 0.9747826086956521, "calib/mu_c": 0.9778676470588236, "calib/mu_w": 0.9711965811965811, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44051383399209487, "calib/std_conf": 0.061158546097693, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39326460481099657, "calib/step_q_c_n": 873.0, "calib/step_q_gap": 0.01340237748608264, "calib/step_q_w": 0.3798622273249139, "calib/step_q_w_n": 871.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 489.14453125, "completions/mean_terminated_length": 494.9446716308594, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.03626666666666667, "grad_norm": 0.028920266777276993, "kl": 0.06124114990234375, "learning_rate": 4.611111111111112e-06, "loss": -0.0353, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03213128075003624, "mask/share_reasoning": 0.8149935007095337, "mask/share_step_conf": 0.14115644991397858, "num_tokens": 8157299.0, "reward": 0.8410252332687378, "reward_std": 0.19692179560661316, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.553253173828125, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8248911499977112, "step": 34 }, { "adv/mean_abs_final_conf": 0.7265925407409668, "adv/mean_abs_reasoning": 0.492765873670578, "adv/mean_abs_step_conf": 0.7652642726898193, "adv/ratio_final_to_reasoning": 1.4745187919135523, "adv/ratio_step_to_reasoning": 1.5529977086063613, "adv/std_final_conf": 0.8988257050514221, "adv/std_reasoning": 0.7393568754196167, "adv/std_step_conf": 0.9353464841842651, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5056186868686869, "calib/avg_num_step_conf": 6.48046875, "calib/ece": 0.4386111111111111, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9087301587301587, "calib/gap": 0.010030303030302723, "calib/mean_conf": 0.9624206349206349, "calib/mu_c": 0.9671969696969696, "calib/mu_w": 0.9571666666666668, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4386111111111111, "calib/std_conf": 0.0845159751194443, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3924677002583979, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.041778434721674695, "calib/step_q_w": 0.3506892655367232, "calib/step_q_w_n": 885.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 592.49609375, "completions/mean_terminated_length": 594.8196411132812, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.037333333333333336, "grad_norm": 0.04211479797959328, "kl": 0.05974578857421875, "learning_rate": 4.583333333333333e-06, "loss": 0.0351, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.028332440182566643, "mask/share_reasoning": 0.8531514406204224, "mask/share_step_conf": 0.11460986733436584, "num_tokens": 8418234.0, "reward": 0.8261188268661499, "reward_std": 0.1979362666606903, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5472050905227661, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8058139085769653, "step": 35 }, { "adv/mean_abs_final_conf": 0.7033538818359375, "adv/mean_abs_reasoning": 0.40417081117630005, "adv/mean_abs_step_conf": 0.7413933277130127, "adv/ratio_final_to_reasoning": 1.7402391820153813, "adv/ratio_step_to_reasoning": 1.834356433497162, "adv/std_final_conf": 0.9182852506637573, "adv/std_reasoning": 0.7012822031974792, "adv/std_step_conf": 0.9347103834152222, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.517733490964364, "calib/avg_num_step_conf": 6.6171875, "calib/ece": 0.24321818181818186, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8932806324110671, "calib/gap": -0.03352055396047959, "calib/mean_conf": 0.9403391304347826, "calib/mu_c": 0.9321246073298429, "calib/mu_w": 0.9656451612903225, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21430830039525697, "calib/std_conf": 0.15752448260193302, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36025121555915723, "calib/step_q_c_n": 1234.0, "calib/step_q_gap": 0.007251215559157254, "calib/step_q_w": 0.353, "calib/step_q_w_n": 460.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2335.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 527.109375, "completions/mean_terminated_length": 527.109375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.0384, "grad_norm": 0.05301982909440994, "kl": 0.07215118408203125, "learning_rate": 4.555555555555556e-06, "loss": 0.0231, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0321599505841732, "mask/share_reasoning": 0.8275481462478638, "mask/share_step_conf": 0.14029183983802795, "num_tokens": 8655886.0, "reward": 0.9387648105621338, "reward_std": 0.18246082961559296, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7346935272216797, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7951799035072327, "step": 36 }, { "adv/mean_abs_final_conf": 0.7183753252029419, "adv/mean_abs_reasoning": 0.5007466077804565, "adv/mean_abs_step_conf": 0.7447097301483154, "adv/ratio_final_to_reasoning": 1.4346084707135964, "adv/ratio_step_to_reasoning": 1.4871987519780068, "adv/std_final_conf": 0.9124584197998047, "adv/std_reasoning": 0.7575705647468567, "adv/std_step_conf": 0.9348015785217285, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.4699532085561497, "calib/avg_num_step_conf": 6.80859375, "calib/ece": 0.5294707317073172, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8617886178861789, "calib/gap": -0.0421933689839572, "calib/mean_conf": 0.9406918699186994, "calib/mu_c": 0.9173654545454546, "calib/mu_w": 0.9595588235294118, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.5115040650406505, "calib/std_conf": 0.12652019157822678, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.40833070866141724, "calib/step_q_c_n": 635.0, "calib/step_q_gap": 0.07433251371556882, "calib/step_q_w": 0.3339981949458484, "calib/step_q_w_n": 1108.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3027.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 575.2734375, "completions/mean_terminated_length": 582.0949096679688, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.039466666666666664, "grad_norm": 0.030364058911800385, "kl": 0.06581878662109375, "learning_rate": 4.527777777777778e-06, "loss": -0.0514, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.029105212539434433, "mask/share_reasoning": 0.8325843811035156, "mask/share_step_conf": 0.12659165263175964, "num_tokens": 8910252.0, "reward": 0.7674344182014465, "reward_std": 0.20056301355361938, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.4537394344806671, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8037855625152588, "step": 37 }, { "adv/mean_abs_final_conf": 0.7179021835327148, "adv/mean_abs_reasoning": 0.4177252948284149, "adv/mean_abs_step_conf": 0.7406455278396606, "adv/ratio_final_to_reasoning": 1.7185987835081924, "adv/ratio_step_to_reasoning": 1.77304447925254, "adv/std_final_conf": 0.90470290184021, "adv/std_reasoning": 0.7013022899627686, "adv/std_step_conf": 0.9349610209465027, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5494932984635502, "calib/avg_num_step_conf": 6.11328125, "calib/ece": 0.4177016129032257, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8991935483870968, "calib/gap": 0.012976135992154303, "calib/mean_conf": 0.9505241935483871, "calib/mu_c": 0.9565413533834587, "calib/mu_w": 0.9435652173913044, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41596774193548375, "calib/std_conf": 0.09957230827877064, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.39140328697850824, "calib/step_q_c_n": 791.0, "calib/step_q_gap": 0.010447214627087031, "calib/step_q_w": 0.3809560723514212, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 551.609375, "completions/mean_terminated_length": 558.1502075195312, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.04053333333333333, "grad_norm": 0.04029100760817528, "kl": 0.11200714111328125, "learning_rate": 4.5e-06, "loss": -0.0309, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03054647706449032, "mask/share_reasoning": 0.8312405943870544, "mask/share_step_conf": 0.12649419903755188, "num_tokens": 9158352.0, "reward": 0.8352245092391968, "reward_std": 0.17706511914730072, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5582581758499146, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8145345449447632, "step": 38 }, { "adv/mean_abs_final_conf": 0.717568576335907, "adv/mean_abs_reasoning": 0.4351946711540222, "adv/mean_abs_step_conf": 0.7223755121231079, "adv/ratio_final_to_reasoning": 1.648845043146101, "adv/ratio_step_to_reasoning": 1.659890527169272, "adv/std_final_conf": 0.9146440625190735, "adv/std_reasoning": 0.7205963730812073, "adv/std_step_conf": 0.9348421692848206, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5192158385093167, "calib/avg_num_step_conf": 7.0234375, "calib/ece": 0.3912872, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.868, "calib/gap": 0.029644901656314726, "calib/mean_conf": 0.9404872000000001, "calib/mu_c": 0.953768115942029, "calib/mu_w": 0.9241232142857143, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3898872, "calib/std_conf": 0.13081278070647379, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4256962040332147, "calib/step_q_c_n": 843.0, "calib/step_q_gap": 0.03835588989708899, "calib/step_q_w": 0.3873403141361257, "calib/step_q_w_n": 955.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 569.48046875, "completions/mean_terminated_length": 573.9645385742188, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.0416, "grad_norm": 0.022142568603157997, "kl": 0.05435943603515625, "learning_rate": 4.472222222222223e-06, "loss": -0.012, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03002547100186348, "mask/share_reasoning": 0.8311752080917358, "mask/share_step_conf": 0.13098683953285217, "num_tokens": 9410227.0, "reward": 0.8570247888565063, "reward_std": 0.18772698938846588, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.585284948348999, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8256394863128662, "step": 39 }, { "adv/mean_abs_final_conf": 0.7763040661811829, "adv/mean_abs_reasoning": 0.5241175293922424, "adv/mean_abs_step_conf": 0.752269983291626, "adv/ratio_final_to_reasoning": 1.4811640951627234, "adv/ratio_step_to_reasoning": 1.4353078099943446, "adv/std_final_conf": 0.9272984266281128, "adv/std_reasoning": 0.7576014995574951, "adv/std_step_conf": 0.9347425103187561, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5090937499999999, "calib/avg_num_step_conf": 6.1796875, "calib/ece": 0.4414229249011857, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8656126482213439, "calib/gap": 0.01803937500000008, "calib/mean_conf": 0.9394466403162055, "calib/mu_c": 0.9483593750000001, "calib/mu_w": 0.93032, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4374703557312252, "calib/std_conf": 0.11467986941563385, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4363092269326683, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.012219483342924775, "calib/step_q_w": 0.42408974358974355, "calib/step_q_w_n": 780.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2608.0, "completions/max_terminated_length": 2608.0, "completions/mean_length": 597.90625, "completions/mean_terminated_length": 597.90625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.042666666666666665, "grad_norm": 0.020455822348594666, "kl": 0.05359649658203125, "learning_rate": 4.444444444444444e-06, "loss": -0.009, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030211972072720528, "mask/share_reasoning": 0.8481245040893555, "mask/share_step_conf": 0.12166358530521393, "num_tokens": 9670051.0, "reward": 0.8377189040184021, "reward_std": 0.20407214760780334, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5514258146286011, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8263556957244873, "step": 40 }, { "adv/mean_abs_final_conf": 0.6707231998443604, "adv/mean_abs_reasoning": 0.3246734142303467, "adv/mean_abs_step_conf": 0.7762365937232971, "adv/ratio_final_to_reasoning": 2.0658396112731947, "adv/ratio_step_to_reasoning": 2.3908227766766856, "adv/std_final_conf": 0.8706308007240295, "adv/std_reasoning": 0.6184402704238892, "adv/std_step_conf": 0.9344105124473572, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5263590755818734, "calib/avg_num_step_conf": 6.06640625, "calib/ece": 0.20425781250000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.890625, "calib/gap": 0.039305041533020724, "calib/mean_conf": 0.9556640625000001, "calib/mu_c": 0.9653367875647668, "calib/mu_w": 0.9260317460317461, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20300781250000002, "calib/std_conf": 0.08102582317382585, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4902378929481733, "calib/step_q_c_n": 1177.0, "calib/step_q_gap": 0.008642148267322292, "calib/step_q_w": 0.48159574468085103, "calib/step_q_w_n": 376.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 498.671875, "completions/mean_terminated_length": 500.6274719238281, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.04373333333333333, "grad_norm": 0.025205468758940697, "kl": 0.0605010986328125, "learning_rate": 4.416666666666667e-06, "loss": -0.0149, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033144064247608185, "mask/share_reasoning": 0.8285462856292725, "mask/share_step_conf": 0.13440342247486115, "num_tokens": 9904959.0, "reward": 0.9856371879577637, "reward_std": 0.14432279765605927, "rewards/accuracy_reward_step": 0.75390625, "rewards/final_brier_reward_step": 0.7817816138267517, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8387115001678467, "step": 41 }, { "adv/mean_abs_final_conf": 0.6925047636032104, "adv/mean_abs_reasoning": 0.3465518355369568, "adv/mean_abs_step_conf": 0.7726593017578125, "adv/ratio_final_to_reasoning": 1.9982718098440448, "adv/ratio_step_to_reasoning": 2.229563437633026, "adv/std_final_conf": 0.8931399583816528, "adv/std_reasoning": 0.6401625275611877, "adv/std_step_conf": 0.9346557855606079, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5946341153283744, "calib/avg_num_step_conf": 6.3515625, "calib/ece": 0.3798031496062992, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8937007874015748, "calib/gap": 0.024753639773666625, "calib/mean_conf": 0.9546062992125984, "calib/mu_c": 0.9650340136054423, "calib/mu_w": 0.9402803738317757, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37783464566929126, "calib/std_conf": 0.08252679426309702, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5342200854700855, "calib/step_q_c_n": 936.0, "calib/step_q_gap": 0.042683853586027554, "calib/step_q_w": 0.49153623188405793, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 444.89453125, "completions/mean_terminated_length": 446.6392517089844, "completions/min_length": 0.0, "completions/min_terminated_length": 193.0, "epoch": 0.0448, "grad_norm": 0.01810487173497677, "kl": 0.059814453125, "learning_rate": 4.388888888888889e-06, "loss": -0.0085, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03483423590660095, "mask/share_reasoning": 0.8130151033401489, "mask/share_step_conf": 0.1482444405555725, "num_tokens": 10123220.0, "reward": 0.8757699131965637, "reward_std": 0.14750750362873077, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6153386831283569, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8229199051856995, "step": 42 }, { "adv/mean_abs_final_conf": 0.7330894470214844, "adv/mean_abs_reasoning": 0.6000100374221802, "adv/mean_abs_step_conf": 0.7539304494857788, "adv/ratio_final_to_reasoning": 1.2217953055769741, "adv/ratio_step_to_reasoning": 1.2565297286106847, "adv/std_final_conf": 0.9077485799789429, "adv/std_reasoning": 0.8099002242088318, "adv/std_step_conf": 0.934234619140625, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6020503552287174, "calib/avg_num_step_conf": 6.51171875, "calib/ece": 0.38486274509803925, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8823529411764706, "calib/gap": 0.04346379159915226, "calib/mean_conf": 0.9417254901960785, "calib/mu_c": 0.9609859154929576, "calib/mu_w": 0.9175221238938054, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.38486274509803925, "calib/std_conf": 0.11158467784889671, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5562444444444444, "calib/step_q_c_n": 900.0, "calib/step_q_gap": 0.03515904679125015, "calib/step_q_w": 0.5210853976531943, "calib/step_q_w_n": 767.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2905.0, "completions/max_terminated_length": 2905.0, "completions/mean_length": 547.1015625, "completions/mean_terminated_length": 547.1015625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.04586666666666667, "grad_norm": 0.022294873371720314, "kl": 0.0514373779296875, "learning_rate": 4.361111111111112e-06, "loss": -0.0052, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.031736839562654495, "mask/share_reasoning": 0.8334815502166748, "mask/share_step_conf": 0.1347815990447998, "num_tokens": 10368502.0, "reward": 0.8724399209022522, "reward_std": 0.22675156593322754, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6117148399353027, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.82300865650177, "step": 43 }, { "adv/mean_abs_final_conf": 0.7103047370910645, "adv/mean_abs_reasoning": 0.3640681505203247, "adv/mean_abs_step_conf": 0.7505558729171753, "adv/ratio_final_to_reasoning": 1.9510213570615826, "adv/ratio_step_to_reasoning": 2.061580700878349, "adv/std_final_conf": 0.9105544686317444, "adv/std_reasoning": 0.6612716913223267, "adv/std_step_conf": 0.9344580173492432, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5427115987460815, "calib/avg_num_step_conf": 6.7578125, "calib/ece": 0.49834677419354845, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8991935483870968, "calib/gap": -0.015864681295715832, "calib/mean_conf": 0.9456854838709678, "calib/mu_c": 0.9372413793103448, "calib/mu_w": 0.9531060606060606, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.48814516129032265, "calib/std_conf": 0.10945398701350703, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5724705882352941, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.03525815300213331, "calib/step_q_w": 0.5372124352331608, "calib/step_q_w_n": 965.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2901.0, "completions/max_terminated_length": 2901.0, "completions/mean_length": 614.7890625, "completions/mean_terminated_length": 614.7890625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.046933333333333334, "grad_norm": 0.025560567155480385, "kl": 0.042316436767578125, "learning_rate": 4.333333333333334e-06, "loss": 0.009, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02792154811322689, "mask/share_reasoning": 0.8449289798736572, "mask/share_step_conf": 0.12714949250221252, "num_tokens": 10632208.0, "reward": 0.7812137603759766, "reward_std": 0.17581892013549805, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.48702070116996765, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7910317778587341, "step": 44 }, { "adv/mean_abs_final_conf": 0.7185328006744385, "adv/mean_abs_reasoning": 0.49002134799957275, "adv/mean_abs_step_conf": 0.7524739503860474, "adv/ratio_final_to_reasoning": 1.4663295866756094, "adv/ratio_step_to_reasoning": 1.5355942214719662, "adv/std_final_conf": 0.8993702530860901, "adv/std_reasoning": 0.7574601769447327, "adv/std_step_conf": 0.933860719203949, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6072868217054264, "calib/avg_num_step_conf": 6.75390625, "calib/ece": 0.44074803149606306, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.84251968503937, "calib/gap": 0.0285748837209302, "calib/mean_conf": 0.9354724409448818, "calib/mu_c": 0.9495348837209302, "calib/mu_w": 0.92096, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4341732283464568, "calib/std_conf": 0.1341361000438982, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5701189060642093, "calib/step_q_c_n": 841.0, "calib/step_q_gap": 0.04057949164979491, "calib/step_q_w": 0.5295394144144144, "calib/step_q_w_n": 888.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 525.51171875, "completions/mean_terminated_length": 527.5725708007812, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.048, "grad_norm": 0.028553619980812073, "kl": 0.05028533935546875, "learning_rate": 4.305555555555556e-06, "loss": 0.0151, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03267733007669449, "mask/share_reasoning": 0.8204450607299805, "mask/share_step_conf": 0.14297136664390564, "num_tokens": 10871787.0, "reward": 0.8383032083511353, "reward_std": 0.19443970918655396, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5568546652793884, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8213140964508057, "step": 45 }, { "adv/mean_abs_final_conf": 0.7056925296783447, "adv/mean_abs_reasoning": 0.3723521828651428, "adv/mean_abs_step_conf": 0.760047435760498, "adv/ratio_final_to_reasoning": 1.895228662950876, "adv/ratio_step_to_reasoning": 2.0412058012179544, "adv/std_final_conf": 0.899570107460022, "adv/std_reasoning": 0.640304684638977, "adv/std_step_conf": 0.9345961809158325, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5852539062500001, "calib/avg_num_step_conf": 7.61328125, "calib/ece": 0.4351701612903226, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8870967741935484, "calib/gap": 0.03033583333333334, "calib/mean_conf": 0.950008870967742, "calib/mu_c": 0.9646875000000001, "calib/mu_w": 0.9343516666666668, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.434525, "calib/std_conf": 0.10489374078938077, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5282490974729241, "calib/step_q_c_n": 831.0, "calib/step_q_gap": 0.07193424953016919, "calib/step_q_w": 0.45631484794275495, "calib/step_q_w_n": 1118.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 563.71484375, "completions/mean_terminated_length": 572.6627197265625, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.04906666666666667, "grad_norm": 0.02243691496551037, "kl": 0.04651641845703125, "learning_rate": 4.277777777777778e-06, "loss": -0.0769, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.032937683165073395, "mask/share_reasoning": 0.810023844242096, "mask/share_step_conf": 0.14141345024108887, "num_tokens": 11120866.0, "reward": 0.8275113105773926, "reward_std": 0.17046914994716644, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5484654307365417, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8128072023391724, "step": 46 }, { "adv/mean_abs_final_conf": 0.7315204739570618, "adv/mean_abs_reasoning": 0.43789246678352356, "adv/mean_abs_step_conf": 0.7421537637710571, "adv/ratio_final_to_reasoning": 1.6705482040610122, "adv/ratio_step_to_reasoning": 1.6948310831251365, "adv/std_final_conf": 0.9107234477996826, "adv/std_reasoning": 0.7205823063850403, "adv/std_step_conf": 0.93376624584198, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6430202217873451, "calib/avg_num_step_conf": 7.2890625, "calib/ece": 0.3410450199203188, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8007968127490039, "calib/gap": 0.0683270841487279, "calib/mean_conf": 0.9227183266932272, "calib/mu_c": 0.9513013698630136, "calib/mu_w": 0.8829742857142857, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3410450199203188, "calib/std_conf": 0.13605958057016465, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49314814814814817, "calib/step_q_c_n": 1026.0, "calib/step_q_gap": 0.051064814814814785, "calib/step_q_w": 0.4420833333333334, "calib/step_q_w_n": 840.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2595.0, "completions/max_terminated_length": 2595.0, "completions/mean_length": 601.453125, "completions/mean_terminated_length": 601.453125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.050133333333333335, "grad_norm": 0.02189200557768345, "kl": 0.07508087158203125, "learning_rate": 4.25e-06, "loss": 0.0298, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.028607338666915894, "mask/share_reasoning": 0.8348825573921204, "mask/share_step_conf": 0.13651013374328613, "num_tokens": 11380814.0, "reward": 0.9010224938392639, "reward_std": 0.19029831886291504, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6423037052154541, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8488037586212158, "step": 47 }, { "adv/mean_abs_final_conf": 0.7594094276428223, "adv/mean_abs_reasoning": 0.5068789124488831, "adv/mean_abs_step_conf": 0.759912371635437, "adv/ratio_final_to_reasoning": 1.498206788627069, "adv/ratio_step_to_reasoning": 1.4991990255899066, "adv/std_final_conf": 0.9114439487457275, "adv/std_reasoning": 0.7393609881401062, "adv/std_step_conf": 0.9339820146560669, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5656954887218045, "calib/avg_num_step_conf": 6.6171875, "calib/ece": 0.3942640316205534, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6996047430830039, "calib/gap": 0.03819072681704261, "calib/mean_conf": 0.8779098814229249, "calib/mu_c": 0.896024060150376, "calib/mu_w": 0.8578333333333333, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3732411067193676, "calib/std_conf": 0.20810508600068744, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48002675159235675, "calib/step_q_c_n": 785.0, "calib/step_q_gap": 0.03750749966716421, "calib/step_q_w": 0.44251925192519254, "calib/step_q_w_n": 909.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 528.59765625, "completions/mean_terminated_length": 528.59765625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.0512, "grad_norm": 0.04436640441417694, "kl": 0.05141448974609375, "learning_rate": 4.222222222222223e-06, "loss": 0.0771, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03390214592218399, "mask/share_reasoning": 0.8238903284072876, "mask/share_step_conf": 0.14220750331878662, "num_tokens": 11619823.0, "reward": 0.8720307946205139, "reward_std": 0.21540537476539612, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5952809453010559, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8472180962562561, "step": 48 }, { "adv/mean_abs_final_conf": 0.7109767198562622, "adv/mean_abs_reasoning": 0.4105602502822876, "adv/mean_abs_step_conf": 0.7638152837753296, "adv/ratio_final_to_reasoning": 1.7317232230042197, "adv/ratio_step_to_reasoning": 1.8604219070164625, "adv/std_final_conf": 0.9296323657035828, "adv/std_reasoning": 0.7013821005821228, "adv/std_step_conf": 0.9330959320068359, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6210515157939369, "calib/avg_num_step_conf": 6.69921875, "calib/ece": 0.262055241935484, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6411290322580645, "calib/gap": 0.05743836477987441, "calib/mean_conf": 0.8805254032258064, "calib/mu_c": 0.9011383647798743, "calib/mu_w": 0.8436999999999999, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2507258064516131, "calib/std_conf": 0.17448653971054612, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.44392989289191825, "calib/step_q_c_n": 1027.0, "calib/step_q_gap": 0.038566520798895, "calib/step_q_w": 0.40536337209302326, "calib/step_q_w_n": 688.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 540.40625, "completions/mean_terminated_length": 542.5255126953125, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.05226666666666667, "grad_norm": 0.026353314518928528, "kl": 0.05274200439453125, "learning_rate": 4.194444444444445e-06, "loss": 0.0058, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03257454186677933, "mask/share_reasoning": 0.8204771280288696, "mask/share_step_conf": 0.14304211735725403, "num_tokens": 11862703.0, "reward": 0.9203234314918518, "reward_std": 0.17258694767951965, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.686448872089386, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8362292647361755, "step": 49 }, { "adv/mean_abs_final_conf": 0.7396889925003052, "adv/mean_abs_reasoning": 0.4333023130893707, "adv/mean_abs_step_conf": 0.7512565851211548, "adv/ratio_final_to_reasoning": 1.707096800906624, "adv/ratio_step_to_reasoning": 1.7337931564796527, "adv/std_final_conf": 0.9285359382629395, "adv/std_reasoning": 0.701409637928009, "adv/std_step_conf": 0.9330706000328064, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6411962365591398, "calib/avg_num_step_conf": 6.4921875, "calib/ece": 0.2160474308300395, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4980237154150198, "calib/gap": 0.08282526881720431, "calib/mean_conf": 0.8113043478260871, "calib/mu_c": 0.84175, "calib/mu_w": 0.7589247311827957, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1974703557312253, "calib/std_conf": 0.21444725864414702, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4211234817813765, "calib/step_q_c_n": 988.0, "calib/step_q_gap": 0.02628668652915095, "calib/step_q_w": 0.39483679525222554, "calib/step_q_w_n": 674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2081.0, "completions/max_terminated_length": 2081.0, "completions/mean_length": 524.390625, "completions/mean_terminated_length": 526.4470825195312, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.05333333333333334, "grad_norm": 0.027222493663430214, "kl": 0.05865478515625, "learning_rate": 4.166666666666667e-06, "loss": -0.0132, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0337475948035717, "mask/share_reasoning": 0.8215821385383606, "mask/share_step_conf": 0.1407640427350998, "num_tokens": 12102307.0, "reward": 0.9507993459701538, "reward_std": 0.17239651083946228, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7195187211036682, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8594236373901367, "step": 50 }, { "adv/mean_abs_final_conf": 0.7422425746917725, "adv/mean_abs_reasoning": 0.32978689670562744, "adv/mean_abs_step_conf": 0.747180700302124, "adv/ratio_final_to_reasoning": 2.2506733351334725, "adv/ratio_step_to_reasoning": 2.265647021655528, "adv/std_final_conf": 0.9353103637695312, "adv/std_reasoning": 0.6185762286186218, "adv/std_step_conf": 0.9331775307655334, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6924078525641026, "calib/avg_num_step_conf": 6.62890625, "calib/ece": 0.19300833333333328, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4007936507936508, "calib/gap": 0.14890641025641027, "calib/mean_conf": 0.7490551587301587, "calib/mu_c": 0.8057814102564103, "calib/mu_w": 0.656875, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1615079365079365, "calib/std_conf": 0.26039730848654774, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41321656050955413, "calib/step_q_c_n": 942.0, "calib/step_q_gap": 0.04111060024465346, "calib/step_q_w": 0.37210596026490067, "calib/step_q_w_n": 755.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2370.0, "completions/max_terminated_length": 2370.0, "completions/mean_length": 511.4140625, "completions/mean_terminated_length": 517.478271484375, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.0544, "grad_norm": 0.04199579730629921, "kl": 0.055328369140625, "learning_rate": 4.138888888888889e-06, "loss": -0.0797, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032312776893377304, "mask/share_reasoning": 0.8194234371185303, "mask/share_step_conf": 0.1365450620651245, "num_tokens": 12342525.0, "reward": 0.9573413133621216, "reward_std": 0.16113626956939697, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7379820942878723, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8579504489898682, "step": 51 }, { "adv/mean_abs_final_conf": 0.731773316860199, "adv/mean_abs_reasoning": 0.32766151428222656, "adv/mean_abs_step_conf": 0.7813364267349243, "adv/ratio_final_to_reasoning": 2.2333209271257184, "adv/ratio_step_to_reasoning": 2.384584068246512, "adv/std_final_conf": 0.9118570685386658, "adv/std_reasoning": 0.6184503436088562, "adv/std_step_conf": 0.9324238896369934, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6821321321321321, "calib/avg_num_step_conf": 6.40234375, "calib/ece": 0.11342519685039368, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4015748031496063, "calib/gap": 0.1586441441441443, "calib/mean_conf": 0.7516141732283463, "calib/mu_c": 0.7978333333333334, "calib/mu_w": 0.6391891891891891, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07818897637795273, "calib/std_conf": 0.24566347724293106, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4085675917636526, "calib/step_q_c_n": 1117.0, "calib/step_q_gap": 0.04006184463721579, "calib/step_q_w": 0.3685057471264368, "calib/step_q_w_n": 522.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2640.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 500.421875, "completions/mean_terminated_length": 500.421875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.055466666666666664, "grad_norm": 0.5362410545349121, "kl": 1.5911026000976562, "learning_rate": 4.111111111111111e-06, "loss": 0.0556, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03464528173208237, "mask/share_reasoning": 0.8255054354667664, "mask/share_step_conf": 0.13984927535057068, "num_tokens": 12578585.0, "reward": 0.998847246170044, "reward_std": 0.11867427080869675, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7906261682510376, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8680056929588318, "step": 52 }, { "adv/mean_abs_final_conf": 0.7077825665473938, "adv/mean_abs_reasoning": 0.42978763580322266, "adv/mean_abs_step_conf": 0.758415937423706, "adv/ratio_final_to_reasoning": 1.6468192837251616, "adv/ratio_step_to_reasoning": 1.7646294919729733, "adv/std_final_conf": 0.8943660259246826, "adv/std_reasoning": 0.7013234496116638, "adv/std_step_conf": 0.9318793416023254, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6782853359814631, "calib/avg_num_step_conf": 7.140625, "calib/ece": 0.2157874015748033, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4448818897637795, "calib/gap": 0.1158788480635552, "calib/mean_conf": 0.7804330708661417, "calib/mu_c": 0.8237735849056604, "calib/mu_w": 0.7078947368421052, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1851181102362206, "calib/std_conf": 0.23070290028011436, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40601916376306624, "calib/step_q_c_n": 1148.0, "calib/step_q_gap": 0.033298575527772156, "calib/step_q_w": 0.3727205882352941, "calib/step_q_w_n": 680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2708.0, "completions/max_terminated_length": 2708.0, "completions/mean_length": 524.7890625, "completions/mean_terminated_length": 526.8471069335938, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.05653333333333333, "grad_norm": 0.03012523055076599, "kl": 0.06369400024414062, "learning_rate": 4.083333333333334e-06, "loss": -0.0806, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03163750469684601, "mask/share_reasoning": 0.8229259252548218, "mask/share_step_conf": 0.14153030514717102, "num_tokens": 12818755.0, "reward": 0.9539192318916321, "reward_std": 0.14013896882534027, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7372496128082275, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8479325771331787, "step": 53 }, { "adv/mean_abs_final_conf": 0.6720455884933472, "adv/mean_abs_reasoning": 0.2754554748535156, "adv/mean_abs_step_conf": 0.762405276298523, "adv/ratio_final_to_reasoning": 2.4397612312869588, "adv/ratio_step_to_reasoning": 2.7677985950504786, "adv/std_final_conf": 0.870083749294281, "adv/std_reasoning": 0.5726470351219177, "adv/std_step_conf": 0.9328941702842712, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.724006359300477, "calib/avg_num_step_conf": 6.20703125, "calib/ece": 0.16122529644268782, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6363636363636364, "calib/gap": 0.12781796502384746, "calib/mean_conf": 0.8553754940711463, "calib/mu_c": 0.8897297297297297, "calib/mu_w": 0.7619117647058823, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14268774703557321, "calib/std_conf": 0.2091568744618195, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4453250222617987, "calib/step_q_c_n": 1123.0, "calib/step_q_gap": 0.03588510809870865, "calib/step_q_w": 0.40943991416309006, "calib/step_q_w_n": 466.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2739.0, "completions/max_terminated_length": 2739.0, "completions/mean_length": 482.38671875, "completions/mean_terminated_length": 484.2784729003906, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.0576, "grad_norm": 0.042764466255903244, "kl": 0.05710601806640625, "learning_rate": 4.055555555555556e-06, "loss": 0.01, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036248303949832916, "mask/share_reasoning": 0.8180440664291382, "mask/share_step_conf": 0.14180132746696472, "num_tokens": 13048478.0, "reward": 0.9917140007019043, "reward_std": 0.13152074813842773, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7852355241775513, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8560049533843994, "step": 54 }, { "adv/mean_abs_final_conf": 0.7059681415557861, "adv/mean_abs_reasoning": 0.44087427854537964, "adv/mean_abs_step_conf": 0.7620362639427185, "adv/ratio_final_to_reasoning": 1.601291288493983, "adv/ratio_step_to_reasoning": 1.728466143357196, "adv/std_final_conf": 0.8803775310516357, "adv/std_reasoning": 0.7205796241760254, "adv/std_step_conf": 0.9334477782249451, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8048804012345679, "calib/avg_num_step_conf": 6.890625, "calib/ece": 0.26468253968253963, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5833333333333334, "calib/gap": 0.22023148148148142, "calib/mean_conf": 0.8249206349206349, "calib/mu_c": 0.9193055555555555, "calib/mu_w": 0.6990740740740741, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25908730158730153, "calib/std_conf": 0.22471762185952407, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4419675925925926, "calib/step_q_c_n": 864.0, "calib/step_q_gap": 0.07078981481481483, "calib/step_q_w": 0.3711777777777778, "calib/step_q_w_n": 900.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2216.0, "completions/max_terminated_length": 2216.0, "completions/mean_length": 498.51171875, "completions/mean_terminated_length": 504.4229431152344, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.058666666666666666, "grad_norm": 0.0345403328537941, "kl": 0.060577392578125, "learning_rate": 4.027777777777779e-06, "loss": -0.0871, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034203819930553436, "mask/share_reasoning": 0.8095616102218628, "mask/share_step_conf": 0.14451584219932556, "num_tokens": 13283921.0, "reward": 0.9504541754722595, "reward_std": 0.18151721358299255, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7330425977706909, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8592719435691833, "step": 55 }, { "adv/mean_abs_final_conf": 0.7152382135391235, "adv/mean_abs_reasoning": 0.5132660865783691, "adv/mean_abs_step_conf": 0.7421176433563232, "adv/ratio_final_to_reasoning": 1.3935037444363, "adv/ratio_step_to_reasoning": 1.4458731304527976, "adv/std_final_conf": 0.888146162033081, "adv/std_reasoning": 0.7575376629829407, "adv/std_step_conf": 0.9327020645141602, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.675488342785129, "calib/avg_num_step_conf": 6.74609375, "calib/ece": 0.36122529644268786, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7154150197628458, "calib/gap": 0.08611594202898532, "calib/mean_conf": 0.8881027667984189, "calib/mu_c": 0.9272463768115942, "calib/mu_w": 0.8411304347826088, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3519367588932808, "calib/std_conf": 0.19404259065574303, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4460133630289532, "calib/step_q_c_n": 898.0, "calib/step_q_gap": 0.05667681296863958, "calib/step_q_w": 0.38933655006031365, "calib/step_q_w_n": 829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2208.0, "completions/max_terminated_length": 2208.0, "completions/mean_length": 516.9765625, "completions/mean_terminated_length": 519.0039672851562, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.05973333333333333, "grad_norm": 0.030760738998651505, "kl": 0.05310821533203125, "learning_rate": 4.000000000000001e-06, "loss": -0.0016, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032516516745090485, "mask/share_reasoning": 0.8238645792007446, "mask/share_step_conf": 0.1397126466035843, "num_tokens": 13523107.0, "reward": 0.8982587456703186, "reward_std": 0.1786113828420639, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6322113275527954, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.858837366104126, "step": 56 }, { "adv/mean_abs_final_conf": 0.6410858035087585, "adv/mean_abs_reasoning": 0.45202022790908813, "adv/mean_abs_step_conf": 0.7799255847930908, "adv/ratio_final_to_reasoning": 1.41826795334853, "adv/ratio_step_to_reasoning": 1.7254218653019044, "adv/std_final_conf": 0.8326322436332703, "adv/std_reasoning": 0.7205796241760254, "adv/std_step_conf": 0.9327417612075806, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.749898097826087, "calib/avg_num_step_conf": 6.65234375, "calib/ece": 0.2768650793650795, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7579365079365079, "calib/gap": 0.128470108695652, "calib/mean_conf": 0.9117857142857142, "calib/mu_c": 0.9586874999999999, "calib/mu_w": 0.8302173913043479, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2768650793650795, "calib/std_conf": 0.1650408370408865, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4636482412060301, "calib/step_q_c_n": 995.0, "calib/step_q_gap": 0.06506067058456111, "calib/step_q_w": 0.398587570621469, "calib/step_q_w_n": 708.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2541.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 530.0, "completions/mean_terminated_length": 532.0784912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.0608, "grad_norm": 0.03361690789461136, "kl": 0.0550384521484375, "learning_rate": 3.972222222222223e-06, "loss": 0.0228, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032974641770124435, "mask/share_reasoning": 0.8260525465011597, "mask/share_step_conf": 0.1370665282011032, "num_tokens": 13765579.0, "reward": 0.9451895356178284, "reward_std": 0.18180344998836517, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7125582098960876, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.855945885181427, "step": 57 }, { "adv/mean_abs_final_conf": 0.7059772610664368, "adv/mean_abs_reasoning": 0.6071633696556091, "adv/mean_abs_step_conf": 0.7528630495071411, "adv/ratio_final_to_reasoning": 1.1627467932837847, "adv/ratio_step_to_reasoning": 1.2399678358959216, "adv/std_final_conf": 0.8767675757408142, "adv/std_reasoning": 0.826632022857666, "adv/std_step_conf": 0.9340095520019531, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5451073232323232, "calib/avg_num_step_conf": 7.3515625, "calib/ece": 0.39047619047619053, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6865079365079365, "calib/gap": 0.034446969696969476, "calib/mean_conf": 0.8624603174603176, "calib/mu_c": 0.8788636363636363, "calib/mu_w": 0.8444166666666668, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3645634920634921, "calib/std_conf": 0.22655174435378, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4163152507676561, "calib/step_q_c_n": 977.0, "calib/step_q_gap": 0.020679891651633975, "calib/step_q_w": 0.3956353591160221, "calib/step_q_w_n": 905.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2937.0, "completions/max_terminated_length": 2937.0, "completions/mean_length": 581.890625, "completions/mean_terminated_length": 584.172607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.06186666666666667, "grad_norm": 0.027973540127277374, "kl": 0.06337738037109375, "learning_rate": 3.944444444444445e-06, "loss": 0.0905, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030530782416462898, "mask/share_reasoning": 0.8286088705062866, "mask/share_step_conf": 0.13695411384105682, "num_tokens": 14020863.0, "reward": 0.8600574731826782, "reward_std": 0.22369416058063507, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.589409351348877, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8314868807792664, "step": 58 }, { "adv/mean_abs_final_conf": 0.59950852394104, "adv/mean_abs_reasoning": 0.46238672733306885, "adv/mean_abs_step_conf": 0.7642239332199097, "adv/ratio_final_to_reasoning": 1.2965521899792745, "adv/ratio_step_to_reasoning": 1.652780860791058, "adv/std_final_conf": 0.8129385113716125, "adv/std_reasoning": 0.7206915616989136, "adv/std_step_conf": 0.9333196878433228, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5510875402792696, "calib/avg_num_step_conf": 6.86328125, "calib/ece": 0.35996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.816, "calib/gap": 0.0012755102040816757, "calib/mean_conf": 0.927, "calib/mu_c": 0.9275000000000001, "calib/mu_w": 0.9262244897959184, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33948, "calib/std_conf": 0.15880868993855468, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4582951146560319, "calib/step_q_c_n": 1003.0, "calib/step_q_gap": 0.032300419695819815, "calib/step_q_w": 0.4259946949602121, "calib/step_q_w_n": 754.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 536.48046875, "completions/mean_terminated_length": 540.7047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.06293333333333333, "grad_norm": 0.04841792583465576, "kl": 0.07193756103515625, "learning_rate": 3.916666666666667e-06, "loss": 0.0297, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03387182950973511, "mask/share_reasoning": 0.8236122727394104, "mask/share_step_conf": 0.1347033530473709, "num_tokens": 14264450.0, "reward": 0.8812941312789917, "reward_std": 0.1919928938150406, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.62040114402771, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8281245231628418, "step": 59 }, { "adv/mean_abs_final_conf": 0.6542816162109375, "adv/mean_abs_reasoning": 0.4994843602180481, "adv/mean_abs_step_conf": 0.7410473227500916, "adv/ratio_final_to_reasoning": 1.3099141200843871, "adv/ratio_step_to_reasoning": 1.4836246773103967, "adv/std_final_conf": 0.8408299684524536, "adv/std_reasoning": 0.7753770351409912, "adv/std_step_conf": 0.9341368079185486, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6835099508917032, "calib/avg_num_step_conf": 6.91015625, "calib/ece": 0.3258734126984126, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7182539682539683, "calib/gap": 0.10675041354355141, "calib/mean_conf": 0.8592059523809524, "calib/mu_c": 0.9041089041095892, "calib/mu_w": 0.7973584905660378, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30285714285714277, "calib/std_conf": 0.24918923389658307, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47209534368070943, "calib/step_q_c_n": 902.0, "calib/step_q_gap": 0.06223375198520775, "calib/step_q_w": 0.4098615916955017, "calib/step_q_w_n": 867.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 524.90625, "completions/mean_terminated_length": 526.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.064, "grad_norm": 0.06499022990465164, "kl": 0.062458038330078125, "learning_rate": 3.88888888888889e-06, "loss": 0.0025, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03385772183537483, "mask/share_reasoning": 0.8223803043365479, "mask/share_step_conf": 0.13985571265220642, "num_tokens": 14507682.0, "reward": 0.908770740032196, "reward_std": 0.21273939311504364, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6553382873535156, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8528280854225159, "step": 60 }, { "adv/mean_abs_final_conf": 0.624822735786438, "adv/mean_abs_reasoning": 0.48236283659935, "adv/mean_abs_step_conf": 0.7643167972564697, "adv/ratio_final_to_reasoning": 1.2953376346142833, "adv/ratio_step_to_reasoning": 1.5845267074157092, "adv/std_final_conf": 0.837339460849762, "adv/std_reasoning": 0.7392899990081787, "adv/std_step_conf": 0.9319630861282349, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5754061889031215, "calib/avg_num_step_conf": 6.05078125, "calib/ece": 0.29094488188976375, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7677165354330708, "calib/gap": 0.07063574462347466, "calib/mean_conf": 0.9023622047244094, "calib/mu_c": 0.9276687116564416, "calib/mu_w": 0.8570329670329669, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27578740157480314, "calib/std_conf": 0.19318285855435954, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5004536082474226, "calib/step_q_c_n": 970.0, "calib/step_q_gap": 0.0710926410626212, "calib/step_q_w": 0.4293609671848014, "calib/step_q_w_n": 579.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 427.23046875, "completions/mean_terminated_length": 428.9059143066406, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.06506666666666666, "grad_norm": 0.03414275497198105, "kl": 0.07006072998046875, "learning_rate": 3.861111111111112e-06, "loss": -0.0542, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04184265434741974, "mask/share_reasoning": 0.8065690994262695, "mask/share_step_conf": 0.14768198132514954, "num_tokens": 14721117.0, "reward": 0.9312187433242798, "reward_std": 0.1829134076833725, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6918726563453674, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8440022468566895, "step": 61 }, { "adv/mean_abs_final_conf": 0.7021551132202148, "adv/mean_abs_reasoning": 0.6482760906219482, "adv/mean_abs_step_conf": 0.7477187514305115, "adv/ratio_final_to_reasoning": 1.0831112289619933, "adv/ratio_step_to_reasoning": 1.1533955397200584, "adv/std_final_conf": 0.9061055183410645, "adv/std_reasoning": 0.8589903712272644, "adv/std_step_conf": 0.9343938827514648, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5849002849002849, "calib/avg_num_step_conf": 6.49609375, "calib/ece": 0.2740079365079365, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5317460317460317, "calib/gap": 0.07605128205128187, "calib/mean_conf": 0.7631349206349206, "calib/mu_c": 0.7984444444444444, "calib/mu_w": 0.7223931623931625, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.25071428571428567, "calib/std_conf": 0.2770466654054787, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.48075178997613366, "calib/step_q_c_n": 838.0, "calib/step_q_gap": 0.04790330512764879, "calib/step_q_w": 0.43284848484848487, "calib/step_q_w_n": 825.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3048.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 534.63671875, "completions/mean_terminated_length": 536.7333374023438, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.06613333333333334, "grad_norm": 0.0399576835334301, "kl": 0.06037139892578125, "learning_rate": 3.833333333333334e-06, "loss": 0.021, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033392585813999176, "mask/share_reasoning": 0.8314791321754456, "mask/share_step_conf": 0.13122203946113586, "num_tokens": 14965064.0, "reward": 0.8811693787574768, "reward_std": 0.1998165249824524, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6458152532577515, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8165234327316284, "step": 62 }, { "adv/mean_abs_final_conf": 0.7243236899375916, "adv/mean_abs_reasoning": 0.5651422142982483, "adv/mean_abs_step_conf": 0.777940571308136, "adv/ratio_final_to_reasoning": 1.2816662277423445, "adv/ratio_step_to_reasoning": 1.3765394826753916, "adv/std_final_conf": 0.9065544009208679, "adv/std_reasoning": 0.77553391456604, "adv/std_step_conf": 0.9342755079269409, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7336436170212766, "calib/avg_num_step_conf": 5.91015625, "calib/ece": 0.10771653543307089, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.41338582677165353, "calib/gap": 0.2497606382978722, "calib/mean_conf": 0.6888188976377954, "calib/mu_c": 0.78125, "calib/mu_w": 0.5314893617021278, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.08330708661417326, "calib/std_conf": 0.28815354112855623, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.48752183406113536, "calib/step_q_c_n": 916.0, "calib/step_q_gap": 0.060670912788103604, "calib/step_q_w": 0.42685092127303176, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2383.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 542.08984375, "completions/mean_terminated_length": 544.2156982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.0672, "grad_norm": 0.04744715243577957, "kl": 0.0615386962890625, "learning_rate": 3.8055555555555556e-06, "loss": -0.077, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.032869912683963776, "mask/share_reasoning": 0.8419344425201416, "mask/share_step_conf": 0.12128940969705582, "num_tokens": 15212479.0, "reward": 0.9379910230636597, "reward_std": 0.19364088773727417, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7451125383377075, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8175883293151855, "step": 63 }, { "adv/mean_abs_final_conf": 0.7251790761947632, "adv/mean_abs_reasoning": 0.5222870111465454, "adv/mean_abs_step_conf": 0.7692461609840393, "adv/ratio_final_to_reasoning": 1.388468525385728, "adv/ratio_step_to_reasoning": 1.4728418370875418, "adv/std_final_conf": 0.9209416508674622, "adv/std_reasoning": 0.7927463054656982, "adv/std_step_conf": 0.9337198734283447, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6286424768842813, "calib/avg_num_step_conf": 6.71875, "calib/ece": 0.1541269841269841, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.38095238095238093, "calib/gap": 0.1363519193051277, "calib/mean_conf": 0.6988888888888889, "calib/mu_c": 0.745421686746988, "calib/mu_w": 0.6090697674418604, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09714285714285711, "calib/std_conf": 0.2715350900245463, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4557664233576642, "calib/step_q_c_n": 1096.0, "calib/step_q_gap": 0.04341065412689499, "calib/step_q_w": 0.4123557692307692, "calib/step_q_w_n": 624.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 512.3125, "completions/mean_terminated_length": 512.3125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.06826666666666667, "grad_norm": 0.12423020601272583, "kl": 0.06578826904296875, "learning_rate": 3.777777777777778e-06, "loss": 0.0643, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035044483840465546, "mask/share_reasoning": 0.8245336413383484, "mask/share_step_conf": 0.14042183756828308, "num_tokens": 15447407.0, "reward": 0.9516028165817261, "reward_std": 0.15794092416763306, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7492632865905762, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8273798823356628, "step": 64 }, { "adv/mean_abs_final_conf": 0.5918190479278564, "adv/mean_abs_reasoning": 0.4216136932373047, "adv/mean_abs_step_conf": 0.75426185131073, "adv/ratio_final_to_reasoning": 1.4036997787800785, "adv/ratio_step_to_reasoning": 1.7889880319570046, "adv/std_final_conf": 0.8399807810783386, "adv/std_reasoning": 0.7204056978225708, "adv/std_step_conf": 0.9331092238426208, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6555228758169934, "calib/avg_num_step_conf": 5.9921875, "calib/ece": 0.275494071146245, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7114624505928854, "calib/gap": 0.14319999999999977, "calib/mean_conf": 0.863399209486166, "calib/mu_c": 0.9199999999999999, "calib/mu_w": 0.7768000000000002, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2670750988142292, "calib/std_conf": 0.2268313845027238, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5217622080679406, "calib/step_q_c_n": 942.0, "calib/step_q_gap": 0.04120477563550817, "calib/step_q_w": 0.48055743243243243, "calib/step_q_w_n": 592.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2184.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 453.61328125, "completions/mean_terminated_length": 455.3921813964844, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.06933333333333333, "grad_norm": 0.04314102977514267, "kl": 0.0760955810546875, "learning_rate": 3.7500000000000005e-06, "loss": -0.0123, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.038080520927906036, "mask/share_reasoning": 0.8146101236343384, "mask/share_step_conf": 0.1434030830860138, "num_tokens": 15668556.0, "reward": 0.9371784925460815, "reward_std": 0.15816906094551086, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7027406096458435, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8536475896835327, "step": 65 }, { "adv/mean_abs_final_conf": 0.6618846654891968, "adv/mean_abs_reasoning": 0.5083737969398499, "adv/mean_abs_step_conf": 0.7653838396072388, "adv/ratio_final_to_reasoning": 1.301964557326526, "adv/ratio_step_to_reasoning": 1.5055532842456827, "adv/std_final_conf": 0.8603252172470093, "adv/std_reasoning": 0.7575896382331848, "adv/std_step_conf": 0.9338029026985168, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7378846275352247, "calib/avg_num_step_conf": 7.546875, "calib/ece": 0.28428571428571425, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5277777777777778, "calib/gap": 0.261530296329058, "calib/mean_conf": 0.7565079365079366, "calib/mu_c": 0.8945378151260505, "calib/mu_w": 0.6330075187969925, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.28428571428571425, "calib/std_conf": 0.2847076402438163, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4879167733674776, "calib/step_q_c_n": 781.0, "calib/step_q_gap": 0.08008010959684331, "calib/step_q_w": 0.40783666377063427, "calib/step_q_w_n": 1151.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 596.5625, "completions/mean_terminated_length": 598.9019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.0704, "grad_norm": 0.05358945205807686, "kl": 0.06067657470703125, "learning_rate": 3.7222222222222225e-06, "loss": 0.0073, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03181684762239456, "mask/share_reasoning": 0.831584095954895, "mask/share_step_conf": 0.13269281387329102, "num_tokens": 15927628.0, "reward": 0.9063929319381714, "reward_std": 0.20524170994758606, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6950390934944153, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8310280442237854, "step": 66 }, { "adv/mean_abs_final_conf": 0.5081011056900024, "adv/mean_abs_reasoning": 0.35968446731567383, "adv/mean_abs_step_conf": 0.7419739365577698, "adv/ratio_final_to_reasoning": 1.4126301018277558, "adv/ratio_step_to_reasoning": 2.0628467559222763, "adv/std_final_conf": 0.7767390608787537, "adv/std_reasoning": 0.6813791990280151, "adv/std_step_conf": 0.9321821928024292, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7599078341013825, "calib/avg_num_step_conf": 6.23046875, "calib/ece": 0.23509881422924903, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6719367588932806, "calib/gap": 0.22916919025674787, "calib/mean_conf": 0.8477470355731226, "calib/mu_c": 0.936516129032258, "calib/mu_w": 0.7073469387755101, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23509881422924903, "calib/std_conf": 0.23716887306077852, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4979411764705883, "calib/step_q_c_n": 918.0, "calib/step_q_gap": 0.0674094187157877, "calib/step_q_w": 0.43053175775480057, "calib/step_q_w_n": 677.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2449.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 525.47265625, "completions/mean_terminated_length": 527.5333862304688, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.07146666666666666, "grad_norm": 0.07478371262550354, "kl": 0.070709228515625, "learning_rate": 3.694444444444445e-06, "loss": -0.0358, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03388848155736923, "mask/share_reasoning": 0.8310776352882385, "mask/share_step_conf": 0.13112762570381165, "num_tokens": 16167157.0, "reward": 0.9643101096153259, "reward_std": 0.14082594215869904, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7473699450492859, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8632815480232239, "step": 67 }, { "adv/mean_abs_final_conf": 0.6216704845428467, "adv/mean_abs_reasoning": 0.4290942847728729, "adv/mean_abs_step_conf": 0.7697738409042358, "adv/ratio_final_to_reasoning": 1.448796934855256, "adv/ratio_step_to_reasoning": 1.7939503466276427, "adv/std_final_conf": 0.8364220857620239, "adv/std_reasoning": 0.7013837695121765, "adv/std_step_conf": 0.9322218894958496, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7144844093673178, "calib/avg_num_step_conf": 6.36328125, "calib/ece": 0.3388353413654618, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.714859437751004, "calib/gap": 0.19704942424634486, "calib/mean_conf": 0.8649397590361446, "calib/mu_c": 0.9583206106870228, "calib/mu_w": 0.7612711864406779, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3388353413654618, "calib/std_conf": 0.23456380175533872, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4838215712383489, "calib/step_q_c_n": 751.0, "calib/step_q_gap": 0.09090585369848553, "calib/step_q_w": 0.3929157175398634, "calib/step_q_w_n": 878.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2674.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 518.23828125, "completions/mean_terminated_length": 518.23828125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.07253333333333334, "grad_norm": 0.04946654289960861, "kl": 0.06878662109375, "learning_rate": 3.6666666666666666e-06, "loss": 0.0202, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03565322607755661, "mask/share_reasoning": 0.8229528665542603, "mask/share_step_conf": 0.14139388501644135, "num_tokens": 16403914.0, "reward": 0.9006872177124023, "reward_std": 0.1842219978570938, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6572574377059937, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8480231761932373, "step": 68 }, { "adv/mean_abs_final_conf": 0.7201113700866699, "adv/mean_abs_reasoning": 0.5343183279037476, "adv/mean_abs_step_conf": 0.7530443668365479, "adv/ratio_final_to_reasoning": 1.3477197626962016, "adv/ratio_step_to_reasoning": 1.4093552990984088, "adv/std_final_conf": 0.9012336134910583, "adv/std_reasoning": 0.7576988339424133, "adv/std_step_conf": 0.9333232045173645, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.714098237720285, "calib/avg_num_step_conf": 6.6953125, "calib/ece": 0.18699604743083, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.36363636363636365, "calib/gap": 0.2355143107111609, "calib/mean_conf": 0.662094861660079, "calib/mu_c": 0.7803174603174603, "calib/mu_w": 0.5448031496062994, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.17553359683794462, "calib/std_conf": 0.2990141226542987, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4238228155339806, "calib/step_q_c_n": 824.0, "calib/step_q_gap": 0.04873292789353112, "calib/step_q_w": 0.3750898876404495, "calib/step_q_w_n": 890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2613.0, "completions/max_terminated_length": 2613.0, "completions/mean_length": 613.66796875, "completions/mean_terminated_length": 613.66796875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.0736, "grad_norm": 0.060156576335430145, "kl": 0.06504058837890625, "learning_rate": 3.638888888888889e-06, "loss": -0.0109, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030466951429843903, "mask/share_reasoning": 0.8466631770133972, "mask/share_step_conf": 0.12286990880966187, "num_tokens": 16665509.0, "reward": 0.934023916721344, "reward_std": 0.16452187299728394, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7315406203269958, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8435383439064026, "step": 69 }, { "adv/mean_abs_final_conf": 0.6430877447128296, "adv/mean_abs_reasoning": 0.41097357869148254, "adv/mean_abs_step_conf": 0.7237237095832825, "adv/ratio_final_to_reasoning": 1.564790969678357, "adv/ratio_step_to_reasoning": 1.7609981446680327, "adv/std_final_conf": 0.8517022728919983, "adv/std_reasoning": 0.7013329863548279, "adv/std_step_conf": 0.9332301616668701, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8866443643849717, "calib/avg_num_step_conf": 7.02734375, "calib/ece": 0.13728, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.412, "calib/gap": 0.5055249613998971, "calib/mean_conf": 0.6012799999999999, "calib/mu_c": 0.8722413793103448, "calib/mu_w": 0.36671641791044773, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13728, "calib/std_conf": 0.3597743203732028, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4723941605839415, "calib/step_q_c_n": 685.0, "calib/step_q_gap": 0.13617333473115872, "calib/step_q_w": 0.3362208258527828, "calib/step_q_w_n": 1114.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 593.4765625, "completions/mean_terminated_length": 598.1495971679688, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.07466666666666667, "grad_norm": 0.05660438910126686, "kl": 0.10711669921875, "learning_rate": 3.6111111111111115e-06, "loss": -0.028, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03215644136071205, "mask/share_reasoning": 0.8270474672317505, "mask/share_step_conf": 0.13298359513282776, "num_tokens": 16924431.0, "reward": 0.9864169955253601, "reward_std": 0.1601521074771881, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.8269773721694946, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8614815473556519, "step": 70 }, { "adv/mean_abs_final_conf": 0.6957278251647949, "adv/mean_abs_reasoning": 0.547110915184021, "adv/mean_abs_step_conf": 0.7761636972427368, "adv/ratio_final_to_reasoning": 1.2716394534566844, "adv/ratio_step_to_reasoning": 1.4186587686368382, "adv/std_final_conf": 0.8768286108970642, "adv/std_reasoning": 0.7576953768730164, "adv/std_step_conf": 0.9334019422531128, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6453033268101761, "calib/avg_num_step_conf": 7.08984375, "calib/ece": 0.23900398406374493, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3705179282868526, "calib/gap": 0.17135746901500337, "calib/mean_conf": 0.5838645418326693, "calib/mu_c": 0.6555479452054795, "calib/mu_w": 0.48419047619047617, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12059760956175292, "calib/std_conf": 0.3530129360880522, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4009497816593886, "calib/step_q_c_n": 916.0, "calib/step_q_gap": 0.07640028221556217, "calib/step_q_w": 0.32454949944382644, "calib/step_q_w_n": 899.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2621.0, "completions/max_terminated_length": 2621.0, "completions/mean_length": 533.2734375, "completions/mean_terminated_length": 537.472412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.07573333333333333, "grad_norm": 0.08311379700899124, "kl": 0.07154083251953125, "learning_rate": 3.5833333333333335e-06, "loss": 0.0345, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03394794836640358, "mask/share_reasoning": 0.8204426765441895, "mask/share_step_conf": 0.13779692351818085, "num_tokens": 17165357.0, "reward": 0.9159102439880371, "reward_std": 0.17106956243515015, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6937718391418457, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8302360773086548, "step": 71 }, { "adv/mean_abs_final_conf": 0.7265099287033081, "adv/mean_abs_reasoning": 0.38411909341812134, "adv/mean_abs_step_conf": 0.7782511711120605, "adv/ratio_final_to_reasoning": 1.8913663526548197, "adv/ratio_step_to_reasoning": 2.0260673953661517, "adv/std_final_conf": 0.8927708268165588, "adv/std_reasoning": 0.6612346768379211, "adv/std_step_conf": 0.9314785003662109, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7946273291925465, "calib/avg_num_step_conf": 6.46875, "calib/ece": 0.13019607843137251, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2627450980392157, "calib/gap": 0.36564285714285705, "calib/mean_conf": 0.4747450980392157, "calib/mu_c": 0.6396428571428571, "calib/mu_w": 0.274, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.02796078431372547, "calib/std_conf": 0.3526024169468427, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.40148357870894674, "calib/step_q_c_n": 883.0, "calib/step_q_gap": 0.0988315735343025, "calib/step_q_w": 0.30265200517464425, "calib/step_q_w_n": 773.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1670.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 487.4609375, "completions/mean_terminated_length": 487.4609375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.0768, "grad_norm": 0.06640713661909103, "kl": 0.08196258544921875, "learning_rate": 3.555555555555556e-06, "loss": 0.0169, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034188974648714066, "mask/share_reasoning": 0.8254907727241516, "mask/share_step_conf": 0.14032024145126343, "num_tokens": 17394555.0, "reward": 0.9930468797683716, "reward_std": 0.1217828020453453, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7926070690155029, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8872367143630981, "step": 72 }, { "adv/mean_abs_final_conf": 0.7764418721199036, "adv/mean_abs_reasoning": 0.5932731628417969, "adv/mean_abs_step_conf": 0.7526953220367432, "adv/ratio_final_to_reasoning": 1.3087426176514085, "adv/ratio_step_to_reasoning": 1.2687162831221106, "adv/std_final_conf": 0.9236984252929688, "adv/std_reasoning": 0.7929815053939819, "adv/std_step_conf": 0.9335096478462219, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7226810456624121, "calib/avg_num_step_conf": 6.09375, "calib/ece": 0.26289682539682546, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.15079365079365079, "calib/gap": 0.22959866220735775, "calib/mean_conf": 0.40361111111111114, "calib/mu_c": 0.48652173913043467, "calib/mu_w": 0.2569230769230769, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.013809523809523827, "calib/std_conf": 0.30786395346775686, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.39301063829787236, "calib/step_q_c_n": 940.0, "calib/step_q_gap": 0.08380096087851752, "calib/step_q_w": 0.30920967741935484, "calib/step_q_w_n": 620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2520.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 502.7265625, "completions/mean_terminated_length": 504.69805908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.07786666666666667, "grad_norm": 0.10404765605926514, "kl": 0.09230804443359375, "learning_rate": 3.5277777777777784e-06, "loss": -0.0109, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03293906897306442, "mask/share_reasoning": 0.8376255631446838, "mask/share_step_conf": 0.12552911043167114, "num_tokens": 17630285.0, "reward": 0.9415270090103149, "reward_std": 0.15380805730819702, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7067304849624634, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8544486165046692, "step": 73 }, { "adv/mean_abs_final_conf": 0.73908931016922, "adv/mean_abs_reasoning": 0.44497135281562805, "adv/mean_abs_step_conf": 0.7388650178909302, "adv/ratio_final_to_reasoning": 1.660981781169761, "adv/ratio_step_to_reasoning": 1.6604777211288828, "adv/std_final_conf": 0.9192122220993042, "adv/std_reasoning": 0.7205416560173035, "adv/std_step_conf": 0.9333183765411377, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6947535256824946, "calib/avg_num_step_conf": 6.47265625, "calib/ece": 0.18489959839357437, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.15261044176706828, "calib/gap": 0.2503215163669299, "calib/mean_conf": 0.3292369477911647, "calib/mu_c": 0.46093220338983065, "calib/mu_w": 0.21061068702290076, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.02012048192771087, "calib/std_conf": 0.3090207036128271, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.37147937411095305, "calib/step_q_c_n": 703.0, "calib/step_q_gap": 0.07340809528495723, "calib/step_q_w": 0.2980712788259958, "calib/step_q_w_n": 954.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2502.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 530.578125, "completions/mean_terminated_length": 530.578125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.07893333333333333, "grad_norm": 0.09156779944896698, "kl": 0.091705322265625, "learning_rate": 3.5e-06, "loss": 0.0196, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03372114151716232, "mask/share_reasoning": 0.8263232111930847, "mask/share_step_conf": 0.13995562493801117, "num_tokens": 17870041.0, "reward": 0.9400572776794434, "reward_std": 0.145626500248909, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7383249998092651, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8550707697868347, "step": 74 }, { "adv/mean_abs_final_conf": 0.6418685913085938, "adv/mean_abs_reasoning": 0.4402565062046051, "adv/mean_abs_step_conf": 0.7393307685852051, "adv/ratio_final_to_reasoning": 1.4579423183136135, "adv/ratio_step_to_reasoning": 1.6793182114646774, "adv/std_final_conf": 0.8484848737716675, "adv/std_reasoning": 0.7205449342727661, "adv/std_step_conf": 0.9325771927833557, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8336708860759493, "calib/avg_num_step_conf": 6.04296875, "calib/ece": 0.137244094488189, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.42913385826771655, "calib/gap": 0.41448245931283917, "calib/mean_conf": 0.6235433070866142, "calib/mu_c": 0.7524571428571429, "calib/mu_w": 0.33797468354430377, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.035905511811023666, "calib/std_conf": 0.3644130812605217, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43970727101038715, "calib/step_q_c_n": 1059.0, "calib/step_q_gap": 0.08626464805956746, "calib/step_q_w": 0.3534426229508197, "calib/step_q_w_n": 488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 474.8359375, "completions/mean_terminated_length": 474.8359375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.08, "grad_norm": 0.0735108032822609, "kl": 0.07817840576171875, "learning_rate": 3.4722222222222224e-06, "loss": 0.0596, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03609136492013931, "mask/share_reasoning": 0.822197437286377, "mask/share_step_conf": 0.14171117544174194, "num_tokens": 18096351.0, "reward": 1.008620262145996, "reward_std": 0.12839573621749878, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.8198156356811523, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8622686862945557, "step": 75 }, { "adv/mean_abs_final_conf": 0.6263427734375, "adv/mean_abs_reasoning": 0.41424667835235596, "adv/mean_abs_step_conf": 0.7314493060112, "adv/ratio_final_to_reasoning": 1.5120043350226606, "adv/ratio_step_to_reasoning": 1.7657336660380731, "adv/std_final_conf": 0.8453238010406494, "adv/std_reasoning": 0.6816251873970032, "adv/std_step_conf": 0.9329613447189331, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7849274767378216, "calib/avg_num_step_conf": 5.84375, "calib/ece": 0.18490196078431378, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.42745098039215684, "calib/gap": 0.33634236453201966, "calib/mean_conf": 0.6125098039215686, "calib/mu_c": 0.7272619047619048, "calib/mu_w": 0.3909195402298851, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0692941176470589, "calib/std_conf": 0.36580823072832713, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44701639344262295, "calib/step_q_c_n": 915.0, "calib/step_q_gap": 0.09937439688496369, "calib/step_q_w": 0.34764199655765926, "calib/step_q_w_n": 581.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 484.93359375, "completions/mean_terminated_length": 486.8353271484375, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.08106666666666666, "grad_norm": 0.056952860206365585, "kl": 0.0942840576171875, "learning_rate": 3.444444444444445e-06, "loss": -0.0246, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03601346164941788, "mask/share_reasoning": 0.8301782608032227, "mask/share_step_conf": 0.12990202009677887, "num_tokens": 18323550.0, "reward": 0.997848391532898, "reward_std": 0.1068374365568161, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7835609316825867, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8824483156204224, "step": 76 }, { "adv/mean_abs_final_conf": 0.7339452505111694, "adv/mean_abs_reasoning": 0.4712091088294983, "adv/mean_abs_step_conf": 0.7212806940078735, "adv/ratio_final_to_reasoning": 1.5575786561815792, "adv/ratio_step_to_reasoning": 1.5307019335843544, "adv/std_final_conf": 0.9045832753181458, "adv/std_reasoning": 0.7392547726631165, "adv/std_step_conf": 0.9330065846443176, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6528925619834711, "calib/avg_num_step_conf": 6.3671875, "calib/ece": 0.2300395256916996, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.47035573122529645, "calib/gap": 0.15879545454545452, "calib/mean_conf": 0.7045849802371542, "calib/mu_c": 0.7598181818181818, "calib/mu_w": 0.6010227272727273, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14122529644268775, "calib/std_conf": 0.32628633502282633, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4560182370820669, "calib/step_q_c_n": 987.0, "calib/step_q_gap": 0.08223907689544169, "calib/step_q_w": 0.3737791601866252, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3025.0, "completions/max_terminated_length": 3025.0, "completions/mean_length": 508.41015625, "completions/mean_terminated_length": 508.41015625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.08213333333333334, "grad_norm": 0.09080254286527634, "kl": 0.0932464599609375, "learning_rate": 3.416666666666667e-06, "loss": 0.0611, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03744114935398102, "mask/share_reasoning": 0.8208218216896057, "mask/share_step_conf": 0.14173701405525208, "num_tokens": 18558367.0, "reward": 0.9562437534332275, "reward_std": 0.17072513699531555, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7273656129837036, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8593405485153198, "step": 77 }, { "adv/mean_abs_final_conf": 0.680761456489563, "adv/mean_abs_reasoning": 0.49786466360092163, "adv/mean_abs_step_conf": 0.7462135553359985, "adv/ratio_final_to_reasoning": 1.3673624706879133, "adv/ratio_step_to_reasoning": 1.4988281151324057, "adv/std_final_conf": 0.874717116355896, "adv/std_reasoning": 0.7575218081474304, "adv/std_step_conf": 0.9335688352584839, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6907335907335908, "calib/avg_num_step_conf": 6.72265625, "calib/ece": 0.23426877470355728, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6284584980237155, "calib/gap": 0.24018854568854564, "calib/mean_conf": 0.8066007905138339, "calib/mu_c": 0.9062837837837837, "calib/mu_w": 0.6660952380952381, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2279446640316205, "calib/std_conf": 0.292426651590856, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5035102925243771, "calib/step_q_c_n": 923.0, "calib/step_q_gap": 0.09295891407826179, "calib/step_q_w": 0.4105513784461153, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2771.0, "completions/max_terminated_length": 2771.0, "completions/mean_length": 577.2109375, "completions/mean_terminated_length": 577.2109375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.0832, "grad_norm": 0.06496492773294449, "kl": 0.0711212158203125, "learning_rate": 3.3888888888888893e-06, "loss": 0.0643, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.029832325875759125, "mask/share_reasoning": 0.8439598083496094, "mask/share_step_conf": 0.1262078881263733, "num_tokens": 18814157.0, "reward": 0.9481760859489441, "reward_std": 0.20212724804878235, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7228991985321045, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.86173415184021, "step": 78 }, { "adv/mean_abs_final_conf": 0.6379029750823975, "adv/mean_abs_reasoning": 0.48745274543762207, "adv/mean_abs_step_conf": 0.7569434642791748, "adv/ratio_final_to_reasoning": 1.308645773468165, "adv/ratio_step_to_reasoning": 1.5528550641347012, "adv/std_final_conf": 0.8597071170806885, "adv/std_reasoning": 0.7575080394744873, "adv/std_step_conf": 0.9334502816200256, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5977476864452912, "calib/avg_num_step_conf": 6.98828125, "calib/ece": 0.2859215686274509, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7764705882352941, "calib/gap": 0.08455906369080002, "calib/mean_conf": 0.8921960784313725, "calib/mu_c": 0.921377245508982, "calib/mu_w": 0.836818181818182, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2616078431372548, "calib/std_conf": 0.2300858339077255, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.504735494880546, "calib/step_q_c_n": 1172.0, "calib/step_q_gap": 0.059743598608260895, "calib/step_q_w": 0.44499189627228514, "calib/step_q_w_n": 617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 558.203125, "completions/mean_terminated_length": 558.203125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.08426666666666667, "grad_norm": 0.03557606413960457, "kl": 0.07753753662109375, "learning_rate": 3.3611111111111117e-06, "loss": -0.0357, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.030786633491516113, "mask/share_reasoning": 0.8402998447418213, "mask/share_step_conf": 0.1289135068655014, "num_tokens": 19063433.0, "reward": 0.9475151300430298, "reward_std": 0.1952834278345108, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6971789002418518, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8689451217651367, "step": 79 }, { "adv/mean_abs_final_conf": 0.5850554704666138, "adv/mean_abs_reasoning": 0.4936892092227936, "adv/mean_abs_step_conf": 0.7499476075172424, "adv/ratio_final_to_reasoning": 1.1850683781151639, "adv/ratio_step_to_reasoning": 1.5190682589515618, "adv/std_final_conf": 0.7985376119613647, "adv/std_reasoning": 0.7575132250785828, "adv/std_step_conf": 0.9337923526763916, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5993457300275482, "calib/avg_num_step_conf": 6.6953125, "calib/ece": 0.30507246376811586, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9288537549407114, "calib/gap": 0.09892929292929298, "calib/mean_conf": 0.9572463768115942, "calib/mu_c": 0.9916565656565657, "calib/mu_w": 0.8927272727272727, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.30507246376811586, "calib/std_conf": 0.15437612240392326, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.555724171539961, "calib/step_q_c_n": 1026.0, "calib/step_q_gap": 0.06543347386554244, "calib/step_q_w": 0.4902906976744186, "calib/step_q_w_n": 688.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2709.0, "completions/max_terminated_length": 2709.0, "completions/mean_length": 509.875, "completions/mean_terminated_length": 509.875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.08533333333333333, "grad_norm": 0.024001065641641617, "kl": 0.07498931884765625, "learning_rate": 3.3333333333333333e-06, "loss": -0.0313, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034538500010967255, "mask/share_reasoning": 0.8263256549835205, "mask/share_step_conf": 0.13913580775260925, "num_tokens": 19296121.0, "reward": 0.9185687303543091, "reward_std": 0.222752183675766, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.685109555721283, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8270277976989746, "step": 80 }, { "adv/mean_abs_final_conf": 0.563480794429779, "adv/mean_abs_reasoning": 0.46398669481277466, "adv/mean_abs_step_conf": 0.7496002316474915, "adv/ratio_final_to_reasoning": 1.2144330876926368, "adv/ratio_step_to_reasoning": 1.6155640668747322, "adv/std_final_conf": 0.7881554365158081, "adv/std_reasoning": 0.7393821477890015, "adv/std_step_conf": 0.9339177012443542, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7111616997792494, "calib/avg_num_step_conf": 7.3984375, "calib/ece": 0.30008097165991915, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8259109311740891, "calib/gap": 0.15856719094922733, "calib/mean_conf": 0.9114170040485831, "calib/mu_c": 0.9730463576158941, "calib/mu_w": 0.8144791666666668, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30008097165991915, "calib/std_conf": 0.21095314874616838, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5747247706422018, "calib/step_q_c_n": 872.0, "calib/step_q_gap": 0.14297330293183003, "calib/step_q_w": 0.4317514677103718, "calib/step_q_w_n": 1022.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 591.15625, "completions/mean_terminated_length": 593.4745483398438, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.0864, "grad_norm": 0.02519802935421467, "kl": 0.06589508056640625, "learning_rate": 3.3055555555555558e-06, "loss": 0.1002, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03203187882900238, "mask/share_reasoning": 0.8315613269805908, "mask/share_step_conf": 0.1325005143880844, "num_tokens": 19553705.0, "reward": 0.9012755751609802, "reward_std": 0.200510174036026, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6784765720367432, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8131371140480042, "step": 81 }, { "adv/mean_abs_final_conf": 0.5482626557350159, "adv/mean_abs_reasoning": 0.422796368598938, "adv/mean_abs_step_conf": 0.7653951644897461, "adv/ratio_final_to_reasoning": 1.2967534644440015, "adv/ratio_step_to_reasoning": 1.810316316164473, "adv/std_final_conf": 0.7778072357177734, "adv/std_reasoning": 0.6816251277923584, "adv/std_step_conf": 0.933276891708374, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6104530744336569, "calib/avg_num_step_conf": 6.21875, "calib/ece": 0.3651383399209486, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8893280632411067, "calib/gap": 0.06926731391585794, "calib/mean_conf": 0.9455335968379447, "calib/mu_c": 0.9737333333333333, "calib/mu_w": 0.9044660194174754, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.358893280632411, "calib/std_conf": 0.167837359685009, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6096951934349356, "calib/step_q_c_n": 853.0, "calib/step_q_gap": 0.12957340723737143, "calib/step_q_w": 0.4801217861975642, "calib/step_q_w_n": 739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2608.0, "completions/max_terminated_length": 2608.0, "completions/mean_length": 508.69140625, "completions/mean_terminated_length": 508.69140625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.08746666666666666, "grad_norm": 0.05241883173584938, "kl": 0.07489776611328125, "learning_rate": 3.277777777777778e-06, "loss": -0.0588, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03483765944838524, "mask/share_reasoning": 0.8307796120643616, "mask/share_step_conf": 0.13438266515731812, "num_tokens": 19789482.0, "reward": 0.8882876634597778, "reward_std": 0.1930658370256424, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6260604858398438, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8372336030006409, "step": 82 }, { "adv/mean_abs_final_conf": 0.5539191961288452, "adv/mean_abs_reasoning": 0.41217041015625, "adv/mean_abs_step_conf": 0.7351502776145935, "adv/ratio_final_to_reasoning": 1.3439082051495632, "adv/ratio_step_to_reasoning": 1.7836076037964608, "adv/std_final_conf": 0.7939161062240601, "adv/std_reasoning": 0.6817459464073181, "adv/std_step_conf": 0.9330217242240906, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6222155528878217, "calib/avg_num_step_conf": 6.38671875, "calib/ece": 0.3342629482071713, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8366533864541833, "calib/gap": 0.08308523409363733, "calib/mean_conf": 0.9169721115537849, "calib/mu_c": 0.9494117647058823, "calib/mu_w": 0.866326530612245, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3208366533864542, "calib/std_conf": 0.20582840883510017, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.576103752759382, "calib/step_q_c_n": 906.0, "calib/step_q_gap": 0.1191215854068442, "calib/step_q_w": 0.45698216735253777, "calib/step_q_w_n": 729.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 602.44921875, "completions/mean_terminated_length": 604.8118286132812, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.08853333333333334, "grad_norm": 0.04311411827802658, "kl": 0.06854248046875, "learning_rate": 3.2500000000000002e-06, "loss": -0.0854, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03103901445865631, "mask/share_reasoning": 0.84500652551651, "mask/share_step_conf": 0.12004822492599487, "num_tokens": 20050973.0, "reward": 0.9060057997703552, "reward_std": 0.20195919275283813, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6508910059928894, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8470580577850342, "step": 83 }, { "adv/mean_abs_final_conf": 0.6155233383178711, "adv/mean_abs_reasoning": 0.42707884311676025, "adv/mean_abs_step_conf": 0.7596107721328735, "adv/ratio_final_to_reasoning": 1.4412405302633815, "adv/ratio_step_to_reasoning": 1.778619532143861, "adv/std_final_conf": 0.8287082314491272, "adv/std_reasoning": 0.7204527854919434, "adv/std_step_conf": 0.9342764616012573, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7035398230088495, "calib/avg_num_step_conf": 5.83984375, "calib/ece": 0.3858232931726907, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8755020080321285, "calib/gap": 0.12827368558042673, "calib/mean_conf": 0.932008032128514, "calib/mu_c": 0.990220588235294, "calib/mu_w": 0.8619469026548673, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3858232931726907, "calib/std_conf": 0.18901983474612602, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6036246786632391, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.10646986694775795, "calib/step_q_w": 0.49715481171548115, "calib/step_q_w_n": 717.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2676.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 501.8203125, "completions/mean_terminated_length": 505.7716369628906, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.0896, "grad_norm": 0.05019519478082657, "kl": 0.30936431884765625, "learning_rate": 3.2222222222222227e-06, "loss": -0.0528, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03413093462586403, "mask/share_reasoning": 0.8325973153114319, "mask/share_step_conf": 0.12545925378799438, "num_tokens": 20285359.0, "reward": 0.862108588218689, "reward_std": 0.21464627981185913, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6097398996353149, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8152584433555603, "step": 84 }, { "adv/mean_abs_final_conf": 0.6300166249275208, "adv/mean_abs_reasoning": 0.5347933769226074, "adv/mean_abs_step_conf": 0.7569372057914734, "adv/ratio_final_to_reasoning": 1.1780561467549617, "adv/ratio_step_to_reasoning": 1.415382535489054, "adv/std_final_conf": 0.8321962356567383, "adv/std_reasoning": 0.7927958369255066, "adv/std_step_conf": 0.9346948266029358, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6801801801801802, "calib/avg_num_step_conf": 6.0234375, "calib/ece": 0.37691358024691374, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8436213991769548, "calib/gap": 0.14998771498771502, "calib/mean_conf": 0.9201234567901234, "calib/mu_c": 0.9886363636363636, "calib/mu_w": 0.8386486486486486, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.37691358024691374, "calib/std_conf": 0.19790566294345305, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5567567567567567, "calib/step_q_c_n": 740.0, "calib/step_q_gap": 0.09856473680663203, "calib/step_q_w": 0.4581920199501247, "calib/step_q_w_n": 802.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2524.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 580.41015625, "completions/mean_terminated_length": 582.686279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.09066666666666667, "grad_norm": 0.0329107902944088, "kl": 0.065185546875, "learning_rate": 3.1944444444444443e-06, "loss": 0.0589, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03241187334060669, "mask/share_reasoning": 0.8383945226669312, "mask/share_step_conf": 0.12528733909130096, "num_tokens": 20541768.0, "reward": 0.8522720336914062, "reward_std": 0.2573317587375641, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6064144372940063, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8067232370376587, "step": 85 }, { "adv/mean_abs_final_conf": 0.6762087941169739, "adv/mean_abs_reasoning": 0.4965750277042389, "adv/mean_abs_step_conf": 0.733121931552887, "adv/ratio_final_to_reasoning": 1.3617454692460396, "adv/ratio_step_to_reasoning": 1.476356825558163, "adv/std_final_conf": 0.8725427985191345, "adv/std_reasoning": 0.7575492262840271, "adv/std_step_conf": 0.9345494508743286, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.67103125, "calib/avg_num_step_conf": 5.9453125, "calib/ece": 0.3817786561264823, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7628458498023716, "calib/gap": 0.16379812500000002, "calib/mean_conf": 0.8758498023715414, "calib/mu_c": 0.95872, "calib/mu_w": 0.794921875, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3817786561264823, "calib/std_conf": 0.2462280049163768, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5691800878477306, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.1282504096594112, "calib/step_q_w": 0.44092967818831935, "calib/step_q_w_n": 839.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2411.0, "completions/max_terminated_length": 2411.0, "completions/mean_length": 547.69921875, "completions/mean_terminated_length": 549.8471069335938, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.09173333333333333, "grad_norm": 0.05290186405181885, "kl": 0.07259368896484375, "learning_rate": 3.1666666666666667e-06, "loss": -0.0715, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032537199556827545, "mask/share_reasoning": 0.8427928686141968, "mask/share_step_conf": 0.1207636296749115, "num_tokens": 20787491.0, "reward": 0.8759980201721191, "reward_std": 0.21968460083007812, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.615178108215332, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8430678248405457, "step": 86 }, { "adv/mean_abs_final_conf": 0.6011219620704651, "adv/mean_abs_reasoning": 0.4481174647808075, "adv/mean_abs_step_conf": 0.7793911695480347, "adv/ratio_final_to_reasoning": 1.341438371219248, "adv/ratio_step_to_reasoning": 1.7392564021785373, "adv/std_final_conf": 0.7962128520011902, "adv/std_reasoning": 0.7205802798271179, "adv/std_step_conf": 0.9328499436378479, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6277848729956008, "calib/avg_num_step_conf": 5.1875, "calib/ece": 0.2805490196078431, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9137254901960784, "calib/gap": 0.07362494678586606, "calib/mean_conf": 0.951843137254902, "calib/mu_c": 0.9752298850574712, "calib/mu_w": 0.9016049382716051, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2750196078431372, "calib/std_conf": 0.15608486537137664, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5707511210762332, "calib/step_q_c_n": 892.0, "calib/step_q_gap": 0.04361809355329738, "calib/step_q_w": 0.5271330275229358, "calib/step_q_w_n": 436.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 485.70703125, "completions/mean_terminated_length": 487.6117858886719, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.0928, "grad_norm": 0.03262199088931084, "kl": 0.0754241943359375, "learning_rate": 3.138888888888889e-06, "loss": 0.0488, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03634430468082428, "mask/share_reasoning": 0.8396223783493042, "mask/share_step_conf": 0.12012706696987152, "num_tokens": 21017328.0, "reward": 0.9499078989028931, "reward_std": 0.1929827332496643, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7115480303764343, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8546739816665649, "step": 87 }, { "adv/mean_abs_final_conf": 0.6190221309661865, "adv/mean_abs_reasoning": 0.5069084167480469, "adv/mean_abs_step_conf": 0.7607830166816711, "adv/ratio_final_to_reasoning": 1.2211715381200003, "adv/ratio_step_to_reasoning": 1.5008293244809343, "adv/std_final_conf": 0.8372948169708252, "adv/std_reasoning": 0.7575567960739136, "adv/std_step_conf": 0.933971107006073, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8295369805436918, "calib/avg_num_step_conf": 6.265625, "calib/ece": 0.2845564516129033, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7661290322580645, "calib/gap": 0.22631075859263772, "calib/mean_conf": 0.8853629032258065, "calib/mu_c": 0.9757046979865771, "calib/mu_w": 0.7493939393939394, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2845564516129033, "calib/std_conf": 0.2239128515191619, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.536989853438557, "calib/step_q_c_n": 887.0, "calib/step_q_gap": 0.10948636668820833, "calib/step_q_w": 0.42750348675034866, "calib/step_q_w_n": 717.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2901.0, "completions/max_terminated_length": 2901.0, "completions/mean_length": 576.59765625, "completions/mean_terminated_length": 578.85888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.09386666666666667, "grad_norm": 0.043756153434515, "kl": 0.06731414794921875, "learning_rate": 3.1111111111111116e-06, "loss": -0.0369, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.030716445297002792, "mask/share_reasoning": 0.8459240794181824, "mask/share_step_conf": 0.11945319920778275, "num_tokens": 21274785.0, "reward": 0.9229838848114014, "reward_std": 0.20685866475105286, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7071710824966431, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8302028775215149, "step": 88 }, { "adv/mean_abs_final_conf": 0.6980656385421753, "adv/mean_abs_reasoning": 0.548902690410614, "adv/mean_abs_step_conf": 0.736992597579956, "adv/ratio_final_to_reasoning": 1.27174752599587, "adv/ratio_step_to_reasoning": 1.3426653038057417, "adv/std_final_conf": 0.8767775297164917, "adv/std_reasoning": 0.8098074793815613, "adv/std_step_conf": 0.9346106648445129, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7770247933884298, "calib/avg_num_step_conf": 5.9765625, "calib/ece": 0.3142682926829268, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6260162601626016, "calib/gap": 0.22904661157024808, "calib/mean_conf": 0.8211788617886179, "calib/mu_c": 0.93384, "calib/mu_w": 0.7047933884297519, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3136585365853658, "calib/std_conf": 0.266238157720389, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5375903614457831, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.13856033835109488, "calib/step_q_w": 0.3990300230946882, "calib/step_q_w_n": 866.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 598.29296875, "completions/mean_terminated_length": 600.6392211914062, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.09493333333333333, "grad_norm": 0.03296685218811035, "kl": 0.08054351806640625, "learning_rate": 3.0833333333333336e-06, "loss": -0.084, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03184279054403305, "mask/share_reasoning": 0.8469675183296204, "mask/share_step_conf": 0.11728344857692719, "num_tokens": 21536836.0, "reward": 0.886458158493042, "reward_std": 0.24242013692855835, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6579160690307617, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8267190456390381, "step": 89 }, { "adv/mean_abs_final_conf": 0.6241170763969421, "adv/mean_abs_reasoning": 0.5056729912757874, "adv/mean_abs_step_conf": 0.7552558183670044, "adv/ratio_final_to_reasoning": 1.2342305940096314, "adv/ratio_step_to_reasoning": 1.4935656667395507, "adv/std_final_conf": 0.8450374603271484, "adv/std_reasoning": 0.7576045393943787, "adv/std_step_conf": 0.9340672492980957, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.717675046614801, "calib/avg_num_step_conf": 6.19140625, "calib/ece": 0.2933858267716536, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7913385826771654, "calib/gap": 0.16305021539252884, "calib/mean_conf": 0.8856692913385827, "calib/mu_c": 0.9517880794701987, "calib/mu_w": 0.7887378640776699, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.29228346456692916, "calib/std_conf": 0.23862826673908388, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5008129175946548, "calib/step_q_c_n": 898.0, "calib/step_q_gap": 0.06686823054953112, "calib/step_q_w": 0.43394468704512373, "calib/step_q_w_n": 687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 532.34375, "completions/mean_terminated_length": 534.431396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.096, "grad_norm": 0.04197279363870621, "kl": 0.0772247314453125, "learning_rate": 3.055555555555556e-06, "loss": 0.0217, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03273506462574005, "mask/share_reasoning": 0.8372526168823242, "mask/share_step_conf": 0.12610609829425812, "num_tokens": 21776436.0, "reward": 0.9231204986572266, "reward_std": 0.2121354043483734, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6835886240005493, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8485898375511169, "step": 90 }, { "adv/mean_abs_final_conf": 0.5773174166679382, "adv/mean_abs_reasoning": 0.4142797589302063, "adv/mean_abs_step_conf": 0.753173291683197, "adv/ratio_final_to_reasoning": 1.393544831055091, "adv/ratio_step_to_reasoning": 1.818030631349489, "adv/std_final_conf": 0.7948840856552124, "adv/std_reasoning": 0.7013592720031738, "adv/std_step_conf": 0.9329512715339661, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6713343732274532, "calib/avg_num_step_conf": 5.71484375, "calib/ece": 0.23400000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.764, "calib/gap": 0.16991917186613714, "calib/mean_conf": 0.86856, "calib/mu_c": 0.9270121951219512, "calib/mu_w": 0.7570930232558141, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.22328000000000003, "calib/std_conf": 0.2525397521183546, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4832835820895522, "calib/step_q_c_n": 871.0, "calib/step_q_gap": 0.08330047398144408, "calib/step_q_w": 0.3999831081081081, "calib/step_q_w_n": 592.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1869.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 528.02734375, "completions/mean_terminated_length": 530.0980834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 241.0, "epoch": 0.09706666666666666, "grad_norm": 0.030806271359324455, "kl": 0.08370208740234375, "learning_rate": 3.0277777777777776e-06, "loss": -0.0532, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030275100842118263, "mask/share_reasoning": 0.8504114747047424, "mask/share_step_conf": 0.11540718376636505, "num_tokens": 22019323.0, "reward": 0.9466753005981445, "reward_std": 0.17928370833396912, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7201827764511108, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8512927889823914, "step": 91 }, { "adv/mean_abs_final_conf": 0.6468561887741089, "adv/mean_abs_reasoning": 0.44251346588134766, "adv/mean_abs_step_conf": 0.7543888092041016, "adv/ratio_final_to_reasoning": 1.4617774116450328, "adv/ratio_step_to_reasoning": 1.7047815882881583, "adv/std_final_conf": 0.8452576994895935, "adv/std_reasoning": 0.7013332843780518, "adv/std_step_conf": 0.9340010285377502, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7756791307126878, "calib/avg_num_step_conf": 6.17578125, "calib/ece": 0.2511023622047244, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7007874015748031, "calib/gap": 0.27194375199744325, "calib/mean_conf": 0.8377165354330709, "calib/mu_c": 0.9501342281879194, "calib/mu_w": 0.6781904761904761, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2511023622047244, "calib/std_conf": 0.2746169529161331, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.48480239520958085, "calib/step_q_c_n": 835.0, "calib/step_q_gap": 0.11804636303799904, "calib/step_q_w": 0.3667560321715818, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 547.29296875, "completions/mean_terminated_length": 547.29296875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.09813333333333334, "grad_norm": 0.04027779400348663, "kl": 0.0789642333984375, "learning_rate": 3e-06, "loss": -0.0147, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03349082171916962, "mask/share_reasoning": 0.8417634963989258, "mask/share_step_conf": 0.12474566698074341, "num_tokens": 22266150.0, "reward": 0.9606250524520874, "reward_std": 0.17936253547668457, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7411539554595947, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.866033673286438, "step": 92 }, { "adv/mean_abs_final_conf": 0.6630334854125977, "adv/mean_abs_reasoning": 0.5095129609107971, "adv/mean_abs_step_conf": 0.7587285041809082, "adv/ratio_final_to_reasoning": 1.3013083793341973, "adv/ratio_step_to_reasoning": 1.4891250319218914, "adv/std_final_conf": 0.8640803694725037, "adv/std_reasoning": 0.7575684189796448, "adv/std_step_conf": 0.9342039823532104, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7449599298772853, "calib/avg_num_step_conf": 7.3046875, "calib/ece": 0.2682213438735179, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6126482213438735, "calib/gap": 0.2599242424242425, "calib/mean_conf": 0.7837944664031621, "calib/mu_c": 0.9081060606060607, "calib/mu_w": 0.6481818181818182, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.26513833992094876, "calib/std_conf": 0.3035427007764735, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4588609112709832, "calib/step_q_c_n": 834.0, "calib/step_q_gap": 0.08960415451422649, "calib/step_q_w": 0.36925675675675673, "calib/step_q_w_n": 1036.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2687.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 593.234375, "completions/mean_terminated_length": 593.234375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.0992, "grad_norm": 0.05038225278258324, "kl": 0.07889556884765625, "learning_rate": 2.9722222222222225e-06, "loss": -0.0754, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.031364671885967255, "mask/share_reasoning": 0.833544909954071, "mask/share_step_conf": 0.1350904256105423, "num_tokens": 22523794.0, "reward": 0.9259089231491089, "reward_std": 0.20582300424575806, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7031351327896118, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8494638204574585, "step": 93 }, { "adv/mean_abs_final_conf": 0.6444368958473206, "adv/mean_abs_reasoning": 0.5294855237007141, "adv/mean_abs_step_conf": 0.732408881187439, "adv/ratio_final_to_reasoning": 1.2171001226684743, "adv/ratio_step_to_reasoning": 1.3832462804052506, "adv/std_final_conf": 0.8604599833488464, "adv/std_reasoning": 0.7928929924964905, "adv/std_step_conf": 0.9340792298316956, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7979837653836083, "calib/avg_num_step_conf": 6.0625, "calib/ece": 0.2141935483870968, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5282258064516129, "calib/gap": 0.3021550144016759, "calib/mean_conf": 0.7528225806451613, "calib/mu_c": 0.8917164179104479, "calib/mu_w": 0.589561403508772, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21334677419354842, "calib/std_conf": 0.3001412583667654, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4648173207036536, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.12778165034695005, "calib/step_q_w": 0.33703567035670356, "calib/step_q_w_n": 813.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 537.80859375, "completions/mean_terminated_length": 546.3452758789062, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.10026666666666667, "grad_norm": 0.0642198994755745, "kl": 0.089935302734375, "learning_rate": 2.944444444444445e-06, "loss": -0.076, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.033078089356422424, "mask/share_reasoning": 0.831333339214325, "mask/share_step_conf": 0.11996357142925262, "num_tokens": 22770153.0, "reward": 0.9430240392684937, "reward_std": 0.2067991942167282, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7425273656845093, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8450832366943359, "step": 94 }, { "adv/mean_abs_final_conf": 0.591467559337616, "adv/mean_abs_reasoning": 0.4067404270172119, "adv/mean_abs_step_conf": 0.753157913684845, "adv/ratio_final_to_reasoning": 1.4541646712501164, "adv/ratio_step_to_reasoning": 1.8516918006111396, "adv/std_final_conf": 0.792325496673584, "adv/std_reasoning": 0.68165522813797, "adv/std_step_conf": 0.9338064789772034, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7732813607370659, "calib/avg_num_step_conf": 6.16796875, "calib/ece": 0.1265612648221344, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5889328063241107, "calib/gap": 0.31951736357193483, "calib/mean_conf": 0.7577075098814229, "calib/mu_c": 0.8625294117647059, "calib/mu_w": 0.543012048192771, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10616600790513835, "calib/std_conf": 0.31944907092670294, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.43998063891577927, "calib/step_q_c_n": 1033.0, "calib/step_q_gap": 0.0882407121758525, "calib/step_q_w": 0.35173992673992677, "calib/step_q_w_n": 546.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 519.26953125, "completions/mean_terminated_length": 523.3582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 91.0, "epoch": 0.10133333333333333, "grad_norm": 0.04013686254620552, "kl": 0.08258819580078125, "learning_rate": 2.916666666666667e-06, "loss": -0.0758, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.033396221697330475, "mask/share_reasoning": 0.8303040266036987, "mask/share_step_conf": 0.12848728895187378, "num_tokens": 23009214.0, "reward": 0.9940088987350464, "reward_std": 0.14248350262641907, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7942163944244385, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8648951649665833, "step": 95 }, { "adv/mean_abs_final_conf": 0.5519489645957947, "adv/mean_abs_reasoning": 0.42615142464637756, "adv/mean_abs_step_conf": 0.7463536262512207, "adv/ratio_final_to_reasoning": 1.295194460639911, "adv/ratio_step_to_reasoning": 1.7513812769030361, "adv/std_final_conf": 0.792453408241272, "adv/std_reasoning": 0.6817641258239746, "adv/std_step_conf": 0.9340023994445801, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8270086083213772, "calib/avg_num_step_conf": 6.0234375, "calib/ece": 0.1136904761904762, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6428571428571429, "calib/gap": 0.4094562410329986, "calib/mean_conf": 0.7882936507936507, "calib/mu_c": 0.921529411764706, "calib/mu_w": 0.5120731707317074, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1136904761904762, "calib/std_conf": 0.3097833745797808, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4784560570071259, "calib/step_q_c_n": 842.0, "calib/step_q_gap": 0.1548417712928402, "calib/step_q_w": 0.3236142857142857, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 484.4296875, "completions/mean_terminated_length": 486.3294372558594, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.1024, "grad_norm": 0.04179300367832184, "kl": 0.1007080078125, "learning_rate": 2.888888888888889e-06, "loss": 0.0869, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03455401211977005, "mask/share_reasoning": 0.8357703685760498, "mask/share_step_conf": 0.12576934695243835, "num_tokens": 23239044.0, "reward": 0.9944518208503723, "reward_std": 0.16103437542915344, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.8348710536956787, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8259074687957764, "step": 96 }, { "adv/mean_abs_final_conf": 0.6904168128967285, "adv/mean_abs_reasoning": 0.5510886907577515, "adv/mean_abs_step_conf": 0.7708159685134888, "adv/ratio_final_to_reasoning": 1.252823410234385, "adv/ratio_step_to_reasoning": 1.3987149100331029, "adv/std_final_conf": 0.8760626912117004, "adv/std_reasoning": 0.8098317384719849, "adv/std_step_conf": 0.9342261552810669, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7776057791537668, "calib/avg_num_step_conf": 6.1171875, "calib/ece": 0.15060000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.46, "calib/gap": 0.3279024767801859, "calib/mean_conf": 0.6617999999999999, "calib/mu_c": 0.8113235294117648, "calib/mu_w": 0.48342105263157886, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1342000000000001, "calib/std_conf": 0.3494446451156463, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.44049180327868853, "calib/step_q_c_n": 732.0, "calib/step_q_gap": 0.12093185124031919, "calib/step_q_w": 0.31955995203836934, "calib/step_q_w_n": 834.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 535.20703125, "completions/mean_terminated_length": 535.20703125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.10346666666666667, "grad_norm": 0.043487440794706345, "kl": 0.0912933349609375, "learning_rate": 2.861111111111111e-06, "loss": -0.0083, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03317775949835777, "mask/share_reasoning": 0.84007728099823, "mask/share_step_conf": 0.12674494087696075, "num_tokens": 23481129.0, "reward": 0.9605032205581665, "reward_std": 0.19906312227249146, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7593629360198975, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8608622550964355, "step": 97 }, { "adv/mean_abs_final_conf": 0.7032493352890015, "adv/mean_abs_reasoning": 0.48649489879608154, "adv/mean_abs_step_conf": 0.7983685731887817, "adv/ratio_final_to_reasoning": 1.4455430818068544, "adv/ratio_step_to_reasoning": 1.6410625787947362, "adv/std_final_conf": 0.8910401463508606, "adv/std_reasoning": 0.739440381526947, "adv/std_step_conf": 0.9343165159225464, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7257996632996633, "calib/avg_num_step_conf": 5.8203125, "calib/ece": 0.15135802469135806, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.48148148148148145, "calib/gap": 0.30894570707070707, "calib/mean_conf": 0.6682304526748971, "calib/mu_c": 0.7940972222222222, "calib/mu_w": 0.48515151515151517, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.11349794238683131, "calib/std_conf": 0.3540601602797552, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4261679790026247, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.11119545153009724, "calib/step_q_w": 0.31497252747252746, "calib/step_q_w_n": 728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3012.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 602.69140625, "completions/mean_terminated_length": 607.43701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 79.0, "epoch": 0.10453333333333334, "grad_norm": 0.04565083980560303, "kl": 0.07500457763671875, "learning_rate": 2.8333333333333335e-06, "loss": -0.0307, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03302436321973801, "mask/share_reasoning": 0.8465808629989624, "mask/share_step_conf": 0.112582266330719, "num_tokens": 23741602.0, "reward": 0.9159847497940063, "reward_std": 0.20621484518051147, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7263554334640503, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8063952922821045, "step": 98 }, { "adv/mean_abs_final_conf": 0.7148996591567993, "adv/mean_abs_reasoning": 0.6136833429336548, "adv/mean_abs_step_conf": 0.7592363357543945, "adv/ratio_final_to_reasoning": 1.1649324808773358, "adv/ratio_step_to_reasoning": 1.2371793116054568, "adv/std_final_conf": 0.8905500769615173, "adv/std_reasoning": 0.8266958594322205, "adv/std_step_conf": 0.9347360134124756, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7288732394366197, "calib/avg_num_step_conf": 6.87109375, "calib/ece": 0.1465702479338843, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.256198347107438, "calib/gap": 0.2946746478873239, "calib/mean_conf": 0.46599173553719014, "calib/mu_c": 0.6388999999999999, "calib/mu_w": 0.344225352112676, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.09966942148760331, "calib/std_conf": 0.35996792776941633, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3858032786885246, "calib/step_q_c_n": 610.0, "calib/step_q_gap": 0.0916692490975759, "calib/step_q_w": 0.2941340295909487, "calib/step_q_w_n": 1149.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2702.0, "completions/max_terminated_length": 2702.0, "completions/mean_length": 614.54296875, "completions/mean_terminated_length": 634.366943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 82.0, "epoch": 0.1056, "grad_norm": 0.04171948879957199, "kl": 0.079315185546875, "learning_rate": 2.805555555555556e-06, "loss": -0.1994, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.02855312079191208, "mask/share_reasoning": 0.8328218460083008, "mask/share_step_conf": 0.10737504065036774, "num_tokens": 24004725.0, "reward": 0.9038716554641724, "reward_std": 0.21844570338726044, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.7249144315719604, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8172037601470947, "step": 99 }, { "adv/mean_abs_final_conf": 0.718896210193634, "adv/mean_abs_reasoning": 0.38725799322128296, "adv/mean_abs_step_conf": 0.7504380345344543, "adv/ratio_final_to_reasoning": 1.856375395156401, "adv/ratio_step_to_reasoning": 1.937824519236319, "adv/std_final_conf": 0.9049928188323975, "adv/std_reasoning": 0.6815465688705444, "adv/std_step_conf": 0.9327692985534668, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8164866255144033, "calib/avg_num_step_conf": 6.13671875, "calib/ece": 0.12813492063492066, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3412698412698413, "calib/gap": 0.393148148148148, "calib/mean_conf": 0.5690079365079365, "calib/mu_c": 0.7374999999999999, "calib/mu_w": 0.3443518518518519, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06285714285714289, "calib/std_conf": 0.3656406205949707, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41403669724770636, "calib/step_q_c_n": 872.0, "calib/step_q_gap": 0.11046015933640452, "calib/step_q_w": 0.30357653791130185, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1933.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 566.171875, "completions/mean_terminated_length": 570.6299438476562, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.10666666666666667, "grad_norm": 0.05875632166862488, "kl": 0.085693359375, "learning_rate": 2.7777777777777783e-06, "loss": -0.0158, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030663391575217247, "mask/share_reasoning": 0.8413369655609131, "mask/share_step_conf": 0.1201871708035469, "num_tokens": 24257073.0, "reward": 0.9939379692077637, "reward_std": 0.14616990089416504, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.801247239112854, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8764723539352417, "step": 100 }, { "adv/mean_abs_final_conf": 0.7209446430206299, "adv/mean_abs_reasoning": 0.42586550116539, "adv/mean_abs_step_conf": 0.7728959321975708, "adv/ratio_final_to_reasoning": 1.6928928054696835, "adv/ratio_step_to_reasoning": 1.8148827037703796, "adv/std_final_conf": 0.8900034427642822, "adv/std_reasoning": 0.6817054152488708, "adv/std_step_conf": 0.9328939914703369, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7362021169354839, "calib/avg_num_step_conf": 6.328125, "calib/ece": 0.15242063492063493, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.21825396825396826, "calib/gap": 0.3034173387096776, "calib/mean_conf": 0.4427380952380952, "calib/mu_c": 0.5968548387096776, "calib/mu_w": 0.2934375, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.05154761904761904, "calib/std_conf": 0.34981860783843766, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3698863636363637, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.046706411945542425, "calib/step_q_w": 0.32317995169082125, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2451.0, "completions/max_terminated_length": 2451.0, "completions/mean_length": 629.3203125, "completions/mean_terminated_length": 629.3203125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.10773333333333333, "grad_norm": 0.06255262345075607, "kl": 0.08121490478515625, "learning_rate": 2.7500000000000004e-06, "loss": -0.0331, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.028444774448871613, "mask/share_reasoning": 0.8561521172523499, "mask/share_step_conf": 0.11540311574935913, "num_tokens": 24525171.0, "reward": 0.9472886919975281, "reward_std": 0.15424081683158875, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.754944920539856, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8474448919296265, "step": 101 }, { "adv/mean_abs_final_conf": 0.6129434108734131, "adv/mean_abs_reasoning": 0.38685131072998047, "adv/mean_abs_step_conf": 0.7592980265617371, "adv/ratio_final_to_reasoning": 1.5844418614397389, "adv/ratio_step_to_reasoning": 1.9627645183079703, "adv/std_final_conf": 0.8420135378837585, "adv/std_reasoning": 0.6815320253372192, "adv/std_step_conf": 0.9337236285209656, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8452188552188553, "calib/avg_num_step_conf": 5.7109375, "calib/ece": 0.12887550200803208, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.357429718875502, "calib/gap": 0.4105111111111111, "calib/mean_conf": 0.5828514056224899, "calib/mu_c": 0.7460666666666667, "calib/mu_w": 0.33555555555555555, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.05465863453815256, "calib/std_conf": 0.3598854988704338, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4501040312093628, "calib/step_q_c_n": 769.0, "calib/step_q_gap": 0.12143159253692415, "calib/step_q_w": 0.32867243867243867, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2673.0, "completions/max_terminated_length": 2673.0, "completions/mean_length": 499.125, "completions/mean_terminated_length": 505.0434875488281, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.1088, "grad_norm": 0.09262681007385254, "kl": 0.09638214111328125, "learning_rate": 2.7222222222222224e-06, "loss": -0.0662, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03711201623082161, "mask/share_reasoning": 0.8191013336181641, "mask/share_step_conf": 0.13206785917282104, "num_tokens": 24759643.0, "reward": 0.9702571034431458, "reward_std": 0.13695275783538818, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.8009117245674133, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8294462561607361, "step": 102 }, { "adv/mean_abs_final_conf": 0.5940297842025757, "adv/mean_abs_reasoning": 0.3715140223503113, "adv/mean_abs_step_conf": 0.7678797245025635, "adv/ratio_final_to_reasoning": 1.5989431043398084, "adv/ratio_step_to_reasoning": 2.066892979287085, "adv/std_final_conf": 0.8287274241447449, "adv/std_reasoning": 0.661227285861969, "adv/std_step_conf": 0.9333446621894836, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7930283224400871, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.16393574297188757, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4979919678714859, "calib/gap": 0.37458946078431377, "calib/mean_conf": 0.6471485943775099, "calib/mu_c": 0.7915686274509804, "calib/mu_w": 0.4169791666666667, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.09831325301204821, "calib/std_conf": 0.3757759068069405, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4832968369829683, "calib/step_q_c_n": 822.0, "calib/step_q_gap": 0.1433922264901225, "calib/step_q_w": 0.3399046104928458, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2695.0, "completions/max_terminated_length": 2695.0, "completions/mean_length": 630.10546875, "completions/mean_terminated_length": 630.10546875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.10986666666666667, "grad_norm": 0.04487231746315956, "kl": 0.0799102783203125, "learning_rate": 2.6944444444444444e-06, "loss": -0.0508, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03193045035004616, "mask/share_reasoning": 0.8594948649406433, "mask/share_step_conf": 0.10857468843460083, "num_tokens": 25025502.0, "reward": 0.9651437997817993, "reward_std": 0.16764892637729645, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7688324451446533, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8489552140235901, "step": 103 }, { "adv/mean_abs_final_conf": 0.6918395757675171, "adv/mean_abs_reasoning": 0.4783649444580078, "adv/mean_abs_step_conf": 0.7479958534240723, "adv/ratio_final_to_reasoning": 1.4462589363678773, "adv/ratio_step_to_reasoning": 1.5636510619973605, "adv/std_final_conf": 0.9062270522117615, "adv/std_reasoning": 0.7393895387649536, "adv/std_step_conf": 0.9339094161987305, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8109590809345553, "calib/avg_num_step_conf": 6.24609375, "calib/ece": 0.11160682730923693, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.2931726907630522, "calib/gap": 0.39794164192590675, "calib/mean_conf": 0.5027714859437751, "calib/mu_c": 0.7057377049180328, "calib/mu_w": 0.30779606299212603, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.06220923694779116, "calib/std_conf": 0.36954813071351916, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4920183098591549, "calib/step_q_c_n": 710.0, "calib/step_q_gap": 0.14827252808187702, "calib/step_q_w": 0.3437457817772779, "calib/step_q_w_n": 889.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2712.0, "completions/max_terminated_length": 2712.0, "completions/mean_length": 604.265625, "completions/mean_terminated_length": 606.6353149414062, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.11093333333333333, "grad_norm": 0.06312678009271622, "kl": 0.087371826171875, "learning_rate": 2.666666666666667e-06, "loss": -0.0637, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030438464134931564, "mask/share_reasoning": 0.8470672369003296, "mask/share_step_conf": 0.11858808249235153, "num_tokens": 25286874.0, "reward": 0.9490529894828796, "reward_std": 0.187879741191864, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7746831774711609, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8367041349411011, "step": 104 }, { "adv/mean_abs_final_conf": 0.757011890411377, "adv/mean_abs_reasoning": 0.6426866054534912, "adv/mean_abs_step_conf": 0.7516645789146423, "adv/ratio_final_to_reasoning": 1.1778865219654233, "adv/ratio_step_to_reasoning": 1.1695662746608113, "adv/std_final_conf": 0.9064919948577881, "adv/std_reasoning": 0.8431361317634583, "adv/std_step_conf": 0.9346654415130615, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7000324464633354, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.23377510040160643, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.40963855421686746, "calib/gap": 0.25921674237508113, "calib/mean_conf": 0.5697590361445783, "calib/mu_c": 0.6894776119402986, "calib/mu_w": 0.4302608695652174, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13269076305220884, "calib/std_conf": 0.389880970107651, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.48018099547511317, "calib/step_q_c_n": 663.0, "calib/step_q_gap": 0.12751256746982914, "calib/step_q_w": 0.35266842800528403, "calib/step_q_w_n": 757.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 569.80859375, "completions/mean_terminated_length": 572.0431518554688, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.112, "grad_norm": 0.040814757347106934, "kl": 0.07910919189453125, "learning_rate": 2.6388888888888893e-06, "loss": 0.0893, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033313244581222534, "mask/share_reasoning": 0.8503282070159912, "mask/share_step_conf": 0.11245226860046387, "num_tokens": 25538505.0, "reward": 0.9136497974395752, "reward_std": 0.25790584087371826, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6958324313163757, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8345921635627747, "step": 105 }, { "adv/mean_abs_final_conf": 0.6224170327186584, "adv/mean_abs_reasoning": 0.430540531873703, "adv/mean_abs_step_conf": 0.7094933986663818, "adv/ratio_final_to_reasoning": 1.4456641980022487, "adv/ratio_step_to_reasoning": 1.6479131374197966, "adv/std_final_conf": 0.8420764803886414, "adv/std_reasoning": 0.7014119029045105, "adv/std_step_conf": 0.9333073496818542, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8074145712443583, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.17131474103585662, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5219123505976095, "calib/gap": 0.3739458413926499, "calib/mean_conf": 0.6982470119521913, "calib/mu_c": 0.8621276595744681, "calib/mu_w": 0.4881818181818182, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.15390438247011956, "calib/std_conf": 0.35938764145933155, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5282806052269601, "calib/step_q_c_n": 727.0, "calib/step_q_gap": 0.12972047864468156, "calib/step_q_w": 0.3985601265822785, "calib/step_q_w_n": 632.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2456.0, "completions/max_terminated_length": 2456.0, "completions/mean_length": 528.19921875, "completions/mean_terminated_length": 528.19921875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.11306666666666666, "grad_norm": 0.04006750136613846, "kl": 0.08618927001953125, "learning_rate": 2.6111111111111113e-06, "loss": -0.0261, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03361281752586365, "mask/share_reasoning": 0.8508884310722351, "mask/share_step_conf": 0.11549879610538483, "num_tokens": 25778308.0, "reward": 0.9733978509902954, "reward_std": 0.1720713973045349, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7717820405960083, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8695447444915771, "step": 106 }, { "adv/mean_abs_final_conf": 0.6800111532211304, "adv/mean_abs_reasoning": 0.5244839191436768, "adv/mean_abs_step_conf": 0.7717577815055847, "adv/ratio_final_to_reasoning": 1.2965338467028358, "adv/ratio_step_to_reasoning": 1.4714612847723363, "adv/std_final_conf": 0.8938814997673035, "adv/std_reasoning": 0.7752963900566101, "adv/std_step_conf": 0.9338552355766296, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6842861705525392, "calib/avg_num_step_conf": 5.85546875, "calib/ece": 0.21613281250000008, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.61328125, "calib/gap": 0.23877419354838714, "calib/mean_conf": 0.7645703125000001, "calib/mu_c": 0.8587741935483871, "calib/mu_w": 0.62, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18761718750000006, "calib/std_conf": 0.32905494847920513, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5311538461538462, "calib/step_q_c_n": 858.0, "calib/step_q_gap": 0.09338473538941561, "calib/step_q_w": 0.4377691107644306, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 485.51171875, "completions/mean_terminated_length": 487.41571044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.11413333333333334, "grad_norm": 32082.970703125, "kl": 260096.09117126465, "learning_rate": 2.5833333333333337e-06, "loss": 1538.7279, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03437138348817825, "mask/share_reasoning": 0.8336739540100098, "mask/share_step_conf": 0.12804844975471497, "num_tokens": 26007215.0, "reward": 0.9511657953262329, "reward_std": 0.18870285153388977, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.730099618434906, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8534818887710571, "step": 107 }, { "adv/mean_abs_final_conf": 0.5230342745780945, "adv/mean_abs_reasoning": 0.45457059144973755, "adv/mean_abs_step_conf": 0.735778272151947, "adv/ratio_final_to_reasoning": 1.150611773872149, "adv/ratio_step_to_reasoning": 1.6186226869744673, "adv/std_final_conf": 0.7766129374504089, "adv/std_reasoning": 0.7206103801727295, "adv/std_step_conf": 0.9345055818557739, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7777818772136955, "calib/avg_num_step_conf": 5.91015625, "calib/ece": 0.15731225296442697, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6837944664031621, "calib/gap": 0.34880681818181825, "calib/mean_conf": 0.7835573122529644, "calib/mu_c": 0.8897159090909091, "calib/mu_w": 0.5409090909090909, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.12260869565217403, "calib/std_conf": 0.3358041402787481, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5856619144602851, "calib/step_q_c_n": 982.0, "calib/step_q_gap": 0.12730033253938117, "calib/step_q_w": 0.458361581920904, "calib/step_q_w_n": 531.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2072.0, "completions/max_terminated_length": 2072.0, "completions/mean_length": 541.06640625, "completions/mean_terminated_length": 543.1882934570312, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.1152, "grad_norm": 0.02788160741329193, "kl": 0.0900421142578125, "learning_rate": 2.5555555555555557e-06, "loss": -0.031, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03286145627498627, "mask/share_reasoning": 0.8382688164710999, "mask/share_step_conf": 0.12496345490217209, "num_tokens": 26248960.0, "reward": 0.9895857572555542, "reward_std": 0.1817062944173813, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7988402843475342, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8467375040054321, "step": 108 }, { "adv/mean_abs_final_conf": 0.6572158336639404, "adv/mean_abs_reasoning": 0.49283796548843384, "adv/mean_abs_step_conf": 0.7203115224838257, "adv/ratio_final_to_reasoning": 1.3335332902216608, "adv/ratio_step_to_reasoning": 1.4615585099454564, "adv/std_final_conf": 0.863275408744812, "adv/std_reasoning": 0.7575352191925049, "adv/std_step_conf": 0.9334295392036438, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8282544774124566, "calib/avg_num_step_conf": 6.390625, "calib/ece": 0.15795918367346948, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4775510204081633, "calib/gap": 0.47229751403368075, "calib/mean_conf": 0.608734693877551, "calib/mu_c": 0.8574137931034482, "calib/mu_w": 0.3851162790697675, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.14661224489795927, "calib/std_conf": 0.4024958245700629, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5868759571209802, "calib/step_q_c_n": 653.0, "calib/step_q_gap": 0.20669284420134637, "calib/step_q_w": 0.3801831129196338, "calib/step_q_w_n": 983.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 580.15234375, "completions/mean_terminated_length": 587.0316162109375, "completions/min_length": 0.0, "completions/min_terminated_length": 39.0, "epoch": 0.11626666666666667, "grad_norm": 0.02776014618575573, "kl": 0.0984039306640625, "learning_rate": 2.5277777777777778e-06, "loss": -0.0688, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.029165200889110565, "mask/share_reasoning": 0.8402441740036011, "mask/share_step_conf": 0.11887191236019135, "num_tokens": 26502079.0, "reward": 0.9375416040420532, "reward_std": 0.19762474298477173, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.7645038366317749, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8301106095314026, "step": 109 }, { "adv/mean_abs_final_conf": 0.7246900200843811, "adv/mean_abs_reasoning": 0.526329517364502, "adv/mean_abs_step_conf": 0.7481073141098022, "adv/ratio_final_to_reasoning": 1.3768751251366877, "adv/ratio_step_to_reasoning": 1.4213668233083558, "adv/std_final_conf": 0.9108691215515137, "adv/std_reasoning": 0.7754833102226257, "adv/std_step_conf": 0.9347033500671387, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6953282828282827, "calib/avg_num_step_conf": 5.1953125, "calib/ece": 0.23301587301587312, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.47619047619047616, "calib/gap": 0.2693863636363637, "calib/mean_conf": 0.6821428571428572, "calib/mu_c": 0.8232500000000001, "calib/mu_w": 0.5538636363636364, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.2194841269841271, "calib/std_conf": 0.3538812175006936, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.6097535211267605, "calib/step_q_c_n": 568.0, "calib/step_q_gap": 0.11592150012938518, "calib/step_q_w": 0.49383202099737533, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 502.828125, "completions/mean_terminated_length": 504.8000183105469, "completions/min_length": 0.0, "completions/min_terminated_length": 89.0, "epoch": 0.11733333333333333, "grad_norm": 0.055533409118652344, "kl": 0.113800048828125, "learning_rate": 2.5e-06, "loss": -0.0275, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03461432084441185, "mask/share_reasoning": 0.8436667919158936, "mask/share_step_conf": 0.11781267821788788, "num_tokens": 26735723.0, "reward": 0.8851801156997681, "reward_std": 0.21155953407287598, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.694117546081543, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7887426614761353, "step": 110 }, { "adv/mean_abs_final_conf": 0.7247201204299927, "adv/mean_abs_reasoning": 0.5945273637771606, "adv/mean_abs_step_conf": 0.7475019693374634, "adv/ratio_final_to_reasoning": 1.218985305950746, "adv/ratio_step_to_reasoning": 1.257304566417973, "adv/std_final_conf": 0.8908548355102539, "adv/std_reasoning": 0.8100446462631226, "adv/std_step_conf": 0.9354099035263062, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.759775641025641, "calib/avg_num_step_conf": 5.8203125, "calib/ece": 0.22508000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.548, "calib/gap": 0.30952564102564106, "calib/mean_conf": 0.6821200000000001, "calib/mu_c": 0.8306923076923077, "calib/mu_w": 0.5211666666666667, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.19360000000000005, "calib/std_conf": 0.37661532841879924, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5931643625192012, "calib/step_q_c_n": 651.0, "calib/step_q_gap": 0.16203206216163268, "calib/step_q_w": 0.43113230035756855, "calib/step_q_w_n": 839.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2714.0, "completions/max_terminated_length": 2714.0, "completions/mean_length": 575.546875, "completions/mean_terminated_length": 575.546875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.1184, "grad_norm": 0.03187034651637077, "kl": 0.11324310302734375, "learning_rate": 2.4722222222222226e-06, "loss": 0.0254, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03376447781920433, "mask/share_reasoning": 0.8519448041915894, "mask/share_step_conf": 0.1142907366156578, "num_tokens": 26990471.0, "reward": 0.8868539929389954, "reward_std": 0.24743622541427612, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6920046806335449, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7910782098770142, "step": 111 }, { "adv/mean_abs_final_conf": 0.6615912318229675, "adv/mean_abs_reasoning": 0.582384467124939, "adv/mean_abs_step_conf": 0.7320601940155029, "adv/ratio_final_to_reasoning": 1.1360042534943438, "adv/ratio_step_to_reasoning": 1.2570050118772382, "adv/std_final_conf": 0.8920271396636963, "adv/std_reasoning": 0.8428117632865906, "adv/std_step_conf": 0.9354498982429504, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8028235294117647, "calib/avg_num_step_conf": 5.109375, "calib/ece": 0.1567213114754099, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.42213114754098363, "calib/gap": 0.42403092436974793, "calib/mean_conf": 0.5799180327868853, "calib/mu_c": 0.78672, "calib/mu_w": 0.36268907563025204, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.11217213114754104, "calib/std_conf": 0.3957654875683685, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.563958, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.16862905263157896, "calib/step_q_w": 0.395328947368421, "calib/step_q_w_n": 608.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 599.2265625, "completions/mean_terminated_length": 603.9448852539062, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.11946666666666667, "grad_norm": 0.04947257414460182, "kl": 0.1170196533203125, "learning_rate": 2.4444444444444447e-06, "loss": -0.1128, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.02901388704776764, "mask/share_reasoning": 0.8649810552597046, "mask/share_step_conf": 0.09819258004426956, "num_tokens": 27251793.0, "reward": 0.8923689126968384, "reward_std": 0.2590276598930359, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7300676107406616, "rewards/format_reward_step": 0.9140625, "rewards/step_l2_reward": 0.7710765600204468, "step": 112 }, { "adv/mean_abs_final_conf": 0.7774215340614319, "adv/mean_abs_reasoning": 0.6023589372634888, "adv/mean_abs_step_conf": 0.7576698064804077, "adv/ratio_final_to_reasoning": 1.290628371172263, "adv/ratio_step_to_reasoning": 1.257837743592707, "adv/std_final_conf": 0.921485960483551, "adv/std_reasoning": 0.8266767859458923, "adv/std_step_conf": 0.9352647662162781, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7012999487704917, "calib/avg_num_step_conf": 5.98046875, "calib/ece": 0.22724, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.464, "calib/gap": 0.2638716700819671, "calib/mean_conf": 0.6508399999999999, "calib/mu_c": 0.7796093749999999, "calib/mu_w": 0.5157377049180328, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.18303999999999998, "calib/std_conf": 0.37026651806502836, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5322963951935915, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.07087695785343806, "calib/step_q_w": 0.46141943734015345, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 524.98046875, "completions/mean_terminated_length": 524.98046875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.12053333333333334, "grad_norm": 0.033785078674554825, "kl": 0.1403656005859375, "learning_rate": 2.4166666666666667e-06, "loss": -0.0499, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03336464241147041, "mask/share_reasoning": 0.8438401222229004, "mask/share_step_conf": 0.12279525399208069, "num_tokens": 27491388.0, "reward": 0.9093856811523438, "reward_std": 0.25819456577301025, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7008984088897705, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8241229057312012, "step": 113 }, { "adv/mean_abs_final_conf": 0.6561824679374695, "adv/mean_abs_reasoning": 0.5457957983016968, "adv/mean_abs_step_conf": 0.7403881549835205, "adv/ratio_final_to_reasoning": 1.2022490278951448, "adv/ratio_step_to_reasoning": 1.3565295982257817, "adv/std_final_conf": 0.863419234752655, "adv/std_reasoning": 0.7754489183425903, "adv/std_step_conf": 0.9345146417617798, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8388490590193317, "calib/avg_num_step_conf": 5.6328125, "calib/ece": 0.15166007905138346, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5335968379446641, "calib/gap": 0.4147695557547048, "calib/mean_conf": 0.7094466403162055, "calib/mu_c": 0.8848630136986301, "calib/mu_w": 0.47009345794392526, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.14201581027667992, "calib/std_conf": 0.3577273595261266, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.603311345646438, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.16277040997392334, "calib/step_q_w": 0.4405409356725146, "calib/step_q_w_n": 684.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 523.515625, "completions/mean_terminated_length": 523.515625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1216, "grad_norm": 0.032592494040727615, "kl": 0.1292572021484375, "learning_rate": 2.388888888888889e-06, "loss": -0.0674, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036023080348968506, "mask/share_reasoning": 0.8391364216804504, "mask/share_step_conf": 0.12484048306941986, "num_tokens": 27730432.0, "reward": 0.9680483341217041, "reward_std": 0.21276646852493286, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7889527082443237, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8393313884735107, "step": 114 }, { "adv/mean_abs_final_conf": 0.7770059704780579, "adv/mean_abs_reasoning": 0.6370693445205688, "adv/mean_abs_step_conf": 0.7334473133087158, "adv/ratio_final_to_reasoning": 1.2196568194044863, "adv/ratio_step_to_reasoning": 1.1512833251467733, "adv/std_final_conf": 0.9263416528701782, "adv/std_reasoning": 0.8590085506439209, "adv/std_step_conf": 0.9356162548065186, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6730133502860776, "calib/avg_num_step_conf": 5.7734375, "calib/ece": 0.29470119521912347, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.4860557768924303, "calib/gap": 0.21137698664971394, "calib/mean_conf": 0.6537450199203186, "calib/mu_c": 0.7632231404958677, "calib/mu_w": 0.5518461538461538, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.23318725099601592, "calib/std_conf": 0.38411941868882943, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.559277108433735, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.11944909860572522, "calib/step_q_w": 0.4398280098280098, "calib/step_q_w_n": 814.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2949.0, "completions/max_terminated_length": 2949.0, "completions/mean_length": 509.8515625, "completions/mean_terminated_length": 511.85101318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 79.0, "epoch": 0.12266666666666666, "grad_norm": 0.03090801276266575, "kl": 0.1342315673828125, "learning_rate": 2.361111111111111e-06, "loss": -0.0815, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03292369842529297, "mask/share_reasoning": 0.8425235748291016, "mask/share_step_conf": 0.12064643204212189, "num_tokens": 27966218.0, "reward": 0.8426992893218994, "reward_std": 0.26547738909721375, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6314203143119812, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7711657285690308, "step": 115 }, { "adv/mean_abs_final_conf": 0.7391272187232971, "adv/mean_abs_reasoning": 0.6228404641151428, "adv/mean_abs_step_conf": 0.7303839921951294, "adv/ratio_final_to_reasoning": 1.1867039174684333, "adv/ratio_step_to_reasoning": 1.1726662512731436, "adv/std_final_conf": 0.9049105644226074, "adv/std_reasoning": 0.8430219888687134, "adv/std_step_conf": 0.9353899955749512, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7813555953090837, "calib/avg_num_step_conf": 6.15234375, "calib/ece": 0.1734146341463415, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.45121951219512196, "calib/gap": 0.3678552971576228, "calib/mean_conf": 0.6584552845528456, "calib/mu_c": 0.8334108527131783, "calib/mu_w": 0.4655555555555555, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.153739837398374, "calib/std_conf": 0.36947673946902104, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.5523137876386688, "calib/step_q_c_n": 631.0, "calib/step_q_gap": 0.17788476221493998, "calib/step_q_w": 0.3744290254237288, "calib/step_q_w_n": 944.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 623.42578125, "completions/mean_terminated_length": 625.87060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.12373333333333333, "grad_norm": 0.036150332540273666, "kl": 0.125274658203125, "learning_rate": 2.3333333333333336e-06, "loss": -0.0167, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03112485446035862, "mask/share_reasoning": 0.8542196750640869, "mask/share_step_conf": 0.11074923723936081, "num_tokens": 28230335.0, "reward": 0.9023261070251465, "reward_std": 0.274005651473999, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7138823866844177, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.8056135773658752, "step": 116 }, { "adv/mean_abs_final_conf": 0.7267533540725708, "adv/mean_abs_reasoning": 0.6318784952163696, "adv/mean_abs_step_conf": 0.7376343011856079, "adv/ratio_final_to_reasoning": 1.1501473140397251, "adv/ratio_step_to_reasoning": 1.1673673131303908, "adv/std_final_conf": 0.906491756439209, "adv/std_reasoning": 0.8590298891067505, "adv/std_step_conf": 0.9355345368385315, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.722940797940798, "calib/avg_num_step_conf": 6.35546875, "calib/ece": 0.2083665338645419, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.3784860557768924, "calib/gap": 0.2679929214929216, "calib/mean_conf": 0.6077290836653387, "calib/mu_c": 0.7572072072072074, "calib/mu_w": 0.48921428571428577, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.18693227091633471, "calib/std_conf": 0.3590126210351204, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5319690846286702, "calib/step_q_c_n": 579.0, "calib/step_q_gap": 0.136272519743174, "calib/step_q_w": 0.3956965648854962, "calib/step_q_w_n": 1048.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 530.328125, "completions/mean_terminated_length": 536.6166381835938, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.1248, "grad_norm": 0.03738539293408394, "kl": 0.1431121826171875, "learning_rate": 2.305555555555556e-06, "loss": -0.14, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03171270340681076, "mask/share_reasoning": 0.8342646360397339, "mask/share_step_conf": 0.12230387330055237, "num_tokens": 28472699.0, "reward": 0.8889857530593872, "reward_std": 0.25273334980010986, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.6912992596626282, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8101098537445068, "step": 117 }, { "adv/mean_abs_final_conf": 0.7259243726730347, "adv/mean_abs_reasoning": 0.5425605773925781, "adv/mean_abs_step_conf": 0.7316884994506836, "adv/ratio_final_to_reasoning": 1.3379600415527073, "adv/ratio_step_to_reasoning": 1.3485839737325018, "adv/std_final_conf": 0.9222829341888428, "adv/std_reasoning": 0.7928994297981262, "adv/std_step_conf": 0.9354918003082275, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7732473544973544, "calib/avg_num_step_conf": 6.796875, "calib/ece": 0.19256097560975607, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.45934959349593496, "calib/gap": 0.3434523809523811, "calib/mean_conf": 0.6615853658536586, "calib/mu_c": 0.8375000000000001, "calib/mu_w": 0.494047619047619, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.18317073170731707, "calib/std_conf": 0.3702310951971517, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5220641562064157, "calib/step_q_c_n": 717.0, "calib/step_q_gap": 0.13714724516047233, "calib/step_q_w": 0.38491691104594333, "calib/step_q_w_n": 1023.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 602.109375, "completions/mean_terminated_length": 604.4706420898438, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.12586666666666665, "grad_norm": 0.02952709048986435, "kl": 0.1322174072265625, "learning_rate": 2.277777777777778e-06, "loss": -0.0579, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03140158951282501, "mask/share_reasoning": 0.8391228914260864, "mask/share_step_conf": 0.12556925415992737, "num_tokens": 28730847.0, "reward": 0.8890880346298218, "reward_std": 0.27732986211776733, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7022339701652527, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7954732179641724, "step": 118 }, { "adv/mean_abs_final_conf": 0.734850287437439, "adv/mean_abs_reasoning": 0.6560388207435608, "adv/mean_abs_step_conf": 0.7462862133979797, "adv/ratio_final_to_reasoning": 1.120132321749729, "adv/ratio_step_to_reasoning": 1.1375641041365991, "adv/std_final_conf": 0.9075103402137756, "adv/std_reasoning": 0.8748774528503418, "adv/std_step_conf": 0.9354383945465088, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7938529742605703, "calib/avg_num_step_conf": 5.625, "calib/ece": 0.19991869918699193, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.3983739837398374, "calib/gap": 0.38824852775755964, "calib/mean_conf": 0.5930081300813008, "calib/mu_c": 0.7934453781512605, "calib/mu_w": 0.40519685039370085, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.1545934959349594, "calib/std_conf": 0.3947746809439399, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.5407395498392283, "calib/step_q_c_n": 622.0, "calib/step_q_gap": 0.1718886940935071, "calib/step_q_w": 0.36885085574572124, "calib/step_q_w_n": 818.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2510.0, "completions/max_terminated_length": 2510.0, "completions/mean_length": 588.47265625, "completions/mean_terminated_length": 590.7804565429688, "completions/min_length": 0.0, "completions/min_terminated_length": 92.0, "epoch": 0.12693333333333334, "grad_norm": 0.033969517797231674, "kl": 0.1355133056640625, "learning_rate": 2.25e-06, "loss": -0.1945, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03004167228937149, "mask/share_reasoning": 0.8571265935897827, "mask/share_step_conf": 0.10892552137374878, "num_tokens": 28986560.0, "reward": 0.8886620998382568, "reward_std": 0.2910217344760895, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7106913924217224, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": 0.7908514738082886, "step": 119 }, { "adv/mean_abs_final_conf": 0.7197772264480591, "adv/mean_abs_reasoning": 0.43707719445228577, "adv/mean_abs_step_conf": 0.7259597778320312, "adv/ratio_final_to_reasoning": 1.6467965741155473, "adv/ratio_step_to_reasoning": 1.6609417902522978, "adv/std_final_conf": 0.909675121307373, "adv/std_reasoning": 0.7208017706871033, "adv/std_step_conf": 0.9347007870674133, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7929282596835788, "calib/avg_num_step_conf": 5.8984375, "calib/ece": 0.14759999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.368, "calib/gap": 0.42851882160392807, "calib/mean_conf": 0.5608, "calib/mu_c": 0.721923076923077, "calib/mu_w": 0.29340425531914893, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.0422, "calib/std_conf": 0.392979592345455, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.486977829638273, "calib/step_q_c_n": 857.0, "calib/step_q_gap": 0.15777415429370945, "calib/step_q_w": 0.3292036753445636, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 514.51953125, "completions/mean_terminated_length": 520.6205444335938, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.128, "grad_norm": 0.048599161207675934, "kl": 0.1397857666015625, "learning_rate": 2.222222222222222e-06, "loss": -0.1298, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03219369053840637, "mask/share_reasoning": 0.8342239260673523, "mask/share_step_conf": 0.12186359614133835, "num_tokens": 29224965.0, "reward": 0.9581763744354248, "reward_std": 0.22807276248931885, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7615699172019958, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8438452482223511, "step": 120 }, { "adv/mean_abs_final_conf": 0.781947135925293, "adv/mean_abs_reasoning": 0.6653647422790527, "adv/mean_abs_step_conf": 0.7622631788253784, "adv/ratio_final_to_reasoning": 1.1752157669895678, "adv/ratio_step_to_reasoning": 1.1456320577108168, "adv/std_final_conf": 0.9308911561965942, "adv/std_reasoning": 0.8591954112052917, "adv/std_step_conf": 0.9346886873245239, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6882398353062275, "calib/avg_num_step_conf": 5.71875, "calib/ece": 0.21704, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.372, "calib/gap": 0.257421513124035, "calib/mean_conf": 0.5561600000000001, "calib/mu_c": 0.6941379310344827, "calib/mu_w": 0.43671641791044774, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.15460000000000002, "calib/std_conf": 0.3859612084134881, "calib/step_conf_rate": 0.91796875, "calib/step_q_c": 0.4705891980360065, "calib/step_q_c_n": 611.0, "calib/step_q_gap": 0.09674394598442387, "calib/step_q_w": 0.37384525205158264, "calib/step_q_w_n": 853.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2591.0, "completions/max_terminated_length": 2591.0, "completions/mean_length": 612.71484375, "completions/mean_terminated_length": 615.11767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.12906666666666666, "grad_norm": 0.031079187989234924, "kl": 0.1669921875, "learning_rate": 2.1944444444444445e-06, "loss": -0.1448, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.028758013620972633, "mask/share_reasoning": 0.8577010631561279, "mask/share_step_conf": 0.10963468253612518, "num_tokens": 29486876.0, "reward": 0.863829493522644, "reward_std": 0.26976099610328674, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6560484170913696, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": 0.7989541888237, "step": 121 }, { "adv/mean_abs_final_conf": 0.7424711585044861, "adv/mean_abs_reasoning": 0.5156237483024597, "adv/mean_abs_step_conf": 0.7362110018730164, "adv/ratio_final_to_reasoning": 1.4399475605009564, "adv/ratio_step_to_reasoning": 1.4278066211976768, "adv/std_final_conf": 0.92244553565979, "adv/std_reasoning": 0.7754589319229126, "adv/std_step_conf": 0.9350674152374268, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8241998983997968, "calib/avg_num_step_conf": 6.1484375, "calib/ece": 0.16760956175298813, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.46613545816733065, "calib/gap": 0.43796355092710176, "calib/mean_conf": 0.622191235059761, "calib/mu_c": 0.8437903225806451, "calib/mu_w": 0.40582677165354336, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.14788844621513952, "calib/std_conf": 0.3943085875731503, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5155096418732782, "calib/step_q_c_n": 726.0, "calib/step_q_gap": 0.1537407739487499, "calib/step_q_w": 0.3617688679245283, "calib/step_q_w_n": 848.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 537.39453125, "completions/mean_terminated_length": 541.6259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.13013333333333332, "grad_norm": 0.03416607528924942, "kl": 0.1433258056640625, "learning_rate": 2.166666666666667e-06, "loss": -0.058, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031357355415821075, "mask/share_reasoning": 0.8387341499328613, "mask/share_step_conf": 0.122095987200737, "num_tokens": 29731793.0, "reward": 0.9425933361053467, "reward_std": 0.2181842178106308, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7617121338844299, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8351932764053345, "step": 122 }, { "adv/mean_abs_final_conf": 0.7870303392410278, "adv/mean_abs_reasoning": 0.6281484365463257, "adv/mean_abs_step_conf": 0.7771914601325989, "adv/ratio_final_to_reasoning": 1.2529368751887113, "adv/ratio_step_to_reasoning": 1.2372735724787263, "adv/std_final_conf": 0.9210789203643799, "adv/std_reasoning": 0.8268639445304871, "adv/std_step_conf": 0.934878945350647, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7303689064558629, "calib/avg_num_step_conf": 6.3515625, "calib/ece": 0.18068825910931174, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.29959514170040485, "calib/gap": 0.30387088274044816, "calib/mean_conf": 0.5317813765182184, "calib/mu_c": 0.6941739130434784, "calib/mu_w": 0.39030303030303026, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.12344129554655871, "calib/std_conf": 0.37659144732146543, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.45963909774436085, "calib/step_q_c_n": 665.0, "calib/step_q_gap": 0.13994086673499556, "calib/step_q_w": 0.3196982310093653, "calib/step_q_w_n": 961.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 668.88671875, "completions/mean_terminated_length": 668.88671875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.1312, "grad_norm": 0.039897166192531586, "kl": 0.130279541015625, "learning_rate": 2.138888888888889e-06, "loss": -0.0651, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.027274832129478455, "mask/share_reasoning": 0.8642194867134094, "mask/share_step_conf": 0.10850568860769272, "num_tokens": 30008316.0, "reward": 0.8898562788963318, "reward_std": 0.2588973343372345, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7077487707138062, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.7969638109207153, "step": 123 }, { "adv/mean_abs_final_conf": 0.7224191427230835, "adv/mean_abs_reasoning": 0.43076780438423157, "adv/mean_abs_step_conf": 0.7511848211288452, "adv/ratio_final_to_reasoning": 1.6770499915975798, "adv/ratio_step_to_reasoning": 1.7438276804429227, "adv/std_final_conf": 0.8989644050598145, "adv/std_reasoning": 0.7015025615692139, "adv/std_step_conf": 0.9342661499977112, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7611060448150079, "calib/avg_num_step_conf": 5.6640625, "calib/ece": 0.18237154150197626, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.391304347826087, "calib/gap": 0.3329872329338197, "calib/mean_conf": 0.6119367588932806, "calib/mu_c": 0.7448684210526316, "calib/mu_w": 0.4118811881188119, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.09675889328063239, "calib/std_conf": 0.3763510628160573, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.46166259168704155, "calib/step_q_c_n": 818.0, "calib/step_q_gap": 0.08511195877564914, "calib/step_q_w": 0.3765506329113924, "calib/step_q_w_n": 632.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 535.0078125, "completions/mean_terminated_length": 537.1058959960938, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.13226666666666667, "grad_norm": 0.0538254976272583, "kl": 0.146026611328125, "learning_rate": 2.1111111111111114e-06, "loss": -0.135, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030877020210027695, "mask/share_reasoning": 0.8481591939926147, "mask/share_step_conf": 0.11705756187438965, "num_tokens": 30252094.0, "reward": 0.9374511241912842, "reward_std": 0.21843618154525757, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7460086345672607, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8195186853408813, "step": 124 }, { "adv/mean_abs_final_conf": 0.7272511124610901, "adv/mean_abs_reasoning": 0.5495245456695557, "adv/mean_abs_step_conf": 0.7719345092773438, "adv/ratio_final_to_reasoning": 1.323418795742759, "adv/ratio_step_to_reasoning": 1.404731627295734, "adv/std_final_conf": 0.9073876142501831, "adv/std_reasoning": 0.792988121509552, "adv/std_step_conf": 0.9352187514305115, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.662, "calib/avg_num_step_conf": 6.171875, "calib/ece": 0.2656275303643724, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4251012145748988, "calib/gap": 0.2216177049180329, "calib/mean_conf": 0.638582995951417, "calib/mu_c": 0.7507377049180328, "calib/mu_w": 0.5291199999999999, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.2051417004048583, "calib/std_conf": 0.37554496999452713, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4674045801526717, "calib/step_q_c_n": 655.0, "calib/step_q_gap": 0.10480998555807713, "calib/step_q_w": 0.3625945945945946, "calib/step_q_w_n": 925.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2824.0, "completions/max_terminated_length": 2824.0, "completions/mean_length": 586.6875, "completions/mean_terminated_length": 591.3070678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.13333333333333333, "grad_norm": 0.03176412731409073, "kl": 0.1277313232421875, "learning_rate": 2.0833333333333334e-06, "loss": -0.1226, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.030877090990543365, "mask/share_reasoning": 0.8451837301254272, "mask/share_step_conf": 0.1161266565322876, "num_tokens": 30507094.0, "reward": 0.8768726587295532, "reward_std": 0.2407217025756836, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6630710959434509, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8055179119110107, "step": 125 }, { "adv/mean_abs_final_conf": 0.7116903066635132, "adv/mean_abs_reasoning": 0.6008867621421814, "adv/mean_abs_step_conf": 0.7131746411323547, "adv/ratio_final_to_reasoning": 1.1844000425742671, "adv/ratio_step_to_reasoning": 1.1868702824969273, "adv/std_final_conf": 0.9004759788513184, "adv/std_reasoning": 0.8428918719291687, "adv/std_step_conf": 0.9355723857879639, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7877714512711864, "calib/avg_num_step_conf": 6.53515625, "calib/ece": 0.18051490514905144, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.45528455284552843, "calib/gap": 0.4178425141242937, "calib/mean_conf": 0.5843631436314363, "calib/mu_c": 0.7847916666666666, "calib/mu_w": 0.36694915254237287, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.1222764227642276, "calib/std_conf": 0.4152145166846133, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.4872044506258692, "calib/step_q_c_n": 719.0, "calib/step_q_gap": 0.18707834999693834, "calib/step_q_w": 0.30012610062893086, "calib/step_q_w_n": 954.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2949.0, "completions/max_terminated_length": 2949.0, "completions/mean_length": 600.23046875, "completions/mean_terminated_length": 604.9566650390625, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.1344, "grad_norm": 0.02776472456753254, "kl": 0.122283935546875, "learning_rate": 2.0555555555555555e-06, "loss": -0.139, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03054964542388916, "mask/share_reasoning": 0.8413506746292114, "mask/share_step_conf": 0.12028719484806061, "num_tokens": 30766217.0, "reward": 0.8982517123222351, "reward_std": 0.28006142377853394, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.716654896736145, "rewards/format_reward_step": 0.9140625, "rewards/step_l2_reward": 0.7954734563827515, "step": 126 }, { "adv/mean_abs_final_conf": 0.7346285581588745, "adv/mean_abs_reasoning": 0.6094740629196167, "adv/mean_abs_step_conf": 0.7600812315940857, "adv/ratio_final_to_reasoning": 1.2053483533650626, "adv/ratio_step_to_reasoning": 1.2471100541227338, "adv/std_final_conf": 0.9214181900024414, "adv/std_reasoning": 0.8266828060150146, "adv/std_step_conf": 0.9350873231887817, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7964355332776385, "calib/avg_num_step_conf": 6.65625, "calib/ece": 0.1770833333333333, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.42083333333333334, "calib/gap": 0.41462823725981607, "calib/mean_conf": 0.5909166666666666, "calib/mu_c": 0.8085964912280701, "calib/mu_w": 0.393968253968254, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.14649999999999996, "calib/std_conf": 0.4043946006755722, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.4948462664714495, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.1757375495272771, "calib/step_q_w": 0.3191087169441724, "calib/step_q_w_n": 1021.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 619.01171875, "completions/mean_terminated_length": 626.351806640625, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.13546666666666668, "grad_norm": 0.029080456122756004, "kl": 0.13339996337890625, "learning_rate": 2.027777777777778e-06, "loss": -0.0763, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.030351759865880013, "mask/share_reasoning": 0.8314367532730103, "mask/share_step_conf": 0.1264927238225937, "num_tokens": 31028356.0, "reward": 0.9085278511047363, "reward_std": 0.23661072552204132, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7260656356811523, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.8159900903701782, "step": 127 }, { "adv/mean_abs_final_conf": 0.717476487159729, "adv/mean_abs_reasoning": 0.6013306379318237, "adv/mean_abs_step_conf": 0.7633137106895447, "adv/ratio_final_to_reasoning": 1.1931480651432789, "adv/ratio_step_to_reasoning": 1.269374388297983, "adv/std_final_conf": 0.8905273675918579, "adv/std_reasoning": 0.8101703524589539, "adv/std_step_conf": 0.9351193308830261, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7363270209157716, "calib/avg_num_step_conf": 5.4609375, "calib/ece": 0.21970588235294114, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.37815126050420167, "calib/gap": 0.3302925381571509, "calib/mean_conf": 0.5816386554621849, "calib/mu_c": 0.7509482758620689, "calib/mu_w": 0.42065573770491804, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.15697478991596636, "calib/std_conf": 0.3872845615500299, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.47670175438596496, "calib/step_q_c_n": 570.0, "calib/step_q_gap": 0.17309064327485385, "calib/step_q_w": 0.3036111111111111, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 606.43359375, "completions/mean_terminated_length": 611.2086791992188, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 0.13653333333333334, "grad_norm": 0.02509251795709133, "kl": 0.1387786865234375, "learning_rate": 2.0000000000000003e-06, "loss": -0.0994, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.03150024265050888, "mask/share_reasoning": 0.8502755165100098, "mask/share_step_conf": 0.11041173338890076, "num_tokens": 31290267.0, "reward": 0.8657711744308472, "reward_std": 0.2659390866756439, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6803550720214844, "rewards/format_reward_step": 0.8984375, "rewards/step_l2_reward": 0.7808746099472046, "step": 128 }, { "adv/mean_abs_final_conf": 0.717547595500946, "adv/mean_abs_reasoning": 0.4793207049369812, "adv/mean_abs_step_conf": 0.7387598752975464, "adv/ratio_final_to_reasoning": 1.4970093887249993, "adv/ratio_step_to_reasoning": 1.5412642677196158, "adv/std_final_conf": 0.8871719837188721, "adv/std_reasoning": 0.7394227385520935, "adv/std_step_conf": 0.934581995010376, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7197559709241953, "calib/avg_num_step_conf": 6.1171875, "calib/ece": 0.20844621513944228, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4063745019920319, "calib/gap": 0.28078725337487015, "calib/mean_conf": 0.6385657370517929, "calib/mu_c": 0.7582638888888888, "calib/mu_w": 0.4774766355140187, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.1366533864541833, "calib/std_conf": 0.36887512687148855, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4696181384248211, "calib/step_q_c_n": 838.0, "calib/step_q_gap": 0.11189835820504085, "calib/step_q_w": 0.35771978021978024, "calib/step_q_w_n": 728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2642.0, "completions/max_terminated_length": 2642.0, "completions/mean_length": 546.26953125, "completions/mean_terminated_length": 546.26953125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.1376, "grad_norm": 0.0597921647131443, "kl": 0.1490020751953125, "learning_rate": 1.9722222222222224e-06, "loss": 0.0022, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03373418003320694, "mask/share_reasoning": 0.837746262550354, "mask/share_step_conf": 0.12851959466934204, "num_tokens": 31532496.0, "reward": 0.9151557087898254, "reward_std": 0.21401160955429077, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7176144123077393, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8087906837463379, "step": 129 }, { "adv/mean_abs_final_conf": 0.6780507564544678, "adv/mean_abs_reasoning": 0.5096803307533264, "adv/mean_abs_step_conf": 0.7484921216964722, "adv/ratio_final_to_reasoning": 1.330345150758091, "adv/ratio_step_to_reasoning": 1.468552103217664, "adv/std_final_conf": 0.8762167096138, "adv/std_reasoning": 0.7575826644897461, "adv/std_step_conf": 0.9352434277534485, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7863839285714286, "calib/avg_num_step_conf": 5.875, "calib/ece": 0.18468253968253967, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5198412698412699, "calib/gap": 0.36919642857142865, "calib/mean_conf": 0.7069841269841269, "calib/mu_c": 0.8710714285714286, "calib/mu_w": 0.501875, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.16805555555555554, "calib/std_conf": 0.36071637242485816, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5037385620915033, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.13467225627282936, "calib/step_q_w": 0.3690663058186739, "calib/step_q_w_n": 739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 521.73828125, "completions/mean_terminated_length": 521.73828125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.13866666666666666, "grad_norm": 0.05114923045039177, "kl": 0.1488189697265625, "learning_rate": 1.944444444444445e-06, "loss": -0.021, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034622110426425934, "mask/share_reasoning": 0.8403658270835876, "mask/share_step_conf": 0.12501206994056702, "num_tokens": 31771349.0, "reward": 0.9483653903007507, "reward_std": 0.2190546840429306, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7516831755638123, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8427038192749023, "step": 130 }, { "adv/mean_abs_final_conf": 0.720390796661377, "adv/mean_abs_reasoning": 0.5134299397468567, "adv/mean_abs_step_conf": 0.7589023113250732, "adv/ratio_final_to_reasoning": 1.4030946403642939, "adv/ratio_step_to_reasoning": 1.4781029553890939, "adv/std_final_conf": 0.9076294898986816, "adv/std_reasoning": 0.7752957344055176, "adv/std_step_conf": 0.9347165822982788, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8260456918175039, "calib/avg_num_step_conf": 5.77734375, "calib/ece": 0.16827956989247317, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.36693548387096775, "calib/gap": 0.43755316023772395, "calib/mean_conf": 0.5579569892473119, "calib/mu_c": 0.8208417508417508, "calib/mu_w": 0.38328859060402687, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.1635215053763441, "calib/std_conf": 0.3985211708279797, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.47491039426523296, "calib/step_q_c_n": 558.0, "calib/step_q_gap": 0.14866718036729593, "calib/step_q_w": 0.32624321389793703, "calib/step_q_w_n": 921.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 554.61328125, "completions/mean_terminated_length": 554.61328125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.13973333333333332, "grad_norm": 0.04064655676484108, "kl": 0.139007568359375, "learning_rate": 1.916666666666667e-06, "loss": -0.0294, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.031148342415690422, "mask/share_reasoning": 0.8485188484191895, "mask/share_step_conf": 0.12033282965421677, "num_tokens": 32019538.0, "reward": 0.9247491359710693, "reward_std": 0.21482647955417633, "rewards/accuracy_reward_step": 0.38671875, "rewards/final_brier_reward_step": 0.7481170296669006, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8334124088287354, "step": 131 }, { "adv/mean_abs_final_conf": 0.6843870878219604, "adv/mean_abs_reasoning": 0.609015941619873, "adv/mean_abs_step_conf": 0.7606111764907837, "adv/ratio_final_to_reasoning": 1.123758905229334, "adv/ratio_step_to_reasoning": 1.248918336140257, "adv/std_final_conf": 0.8533295392990112, "adv/std_reasoning": 0.8100547194480896, "adv/std_step_conf": 0.9349188804626465, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7723787680209698, "calib/avg_num_step_conf": 6.56640625, "calib/ece": 0.2086345381526105, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5261044176706827, "calib/gap": 0.3636952817824377, "calib/mean_conf": 0.655863453815261, "calib/mu_c": 0.8150714285714284, "calib/mu_w": 0.45137614678899074, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.15112449799196792, "calib/std_conf": 0.40146938625769135, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4591356542617047, "calib/step_q_c_n": 833.0, "calib/step_q_gap": 0.1655743335069877, "calib/step_q_w": 0.293561320754717, "calib/step_q_w_n": 848.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2947.0, "completions/max_terminated_length": 2947.0, "completions/mean_length": 584.30859375, "completions/mean_terminated_length": 584.30859375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.1408, "grad_norm": 0.03089536912739277, "kl": 0.143707275390625, "learning_rate": 1.888888888888889e-06, "loss": 0.0929, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03282865881919861, "mask/share_reasoning": 0.8385095596313477, "mask/share_step_conf": 0.12866178154945374, "num_tokens": 32274713.0, "reward": 0.9386829733848572, "reward_std": 0.23207098245620728, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7341241836547852, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8416792154312134, "step": 132 }, { "adv/mean_abs_final_conf": 0.7646936178207397, "adv/mean_abs_reasoning": 0.6460624933242798, "adv/mean_abs_step_conf": 0.721850574016571, "adv/ratio_final_to_reasoning": 1.1836217482399418, "adv/ratio_step_to_reasoning": 1.1173076621463163, "adv/std_final_conf": 0.9069320559501648, "adv/std_reasoning": 0.8592305183410645, "adv/std_step_conf": 0.9352459907531738, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7253191489361701, "calib/avg_num_step_conf": 6.62109375, "calib/ece": 0.2306557377049181, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.3319672131147541, "calib/gap": 0.3234411347517731, "calib/mean_conf": 0.5007377049180328, "calib/mu_c": 0.6995744680851065, "calib/mu_w": 0.3761333333333334, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.17307377049180334, "calib/std_conf": 0.4016985821152735, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.3731669266770671, "calib/step_q_c_n": 641.0, "calib/step_q_gap": 0.0708728090300082, "calib/step_q_w": 0.3022941176470589, "calib/step_q_w_n": 1054.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 623.90625, "completions/mean_terminated_length": 628.8189086914062, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.14186666666666667, "grad_norm": 0.0348166786134243, "kl": 0.1337127685546875, "learning_rate": 1.8611111111111113e-06, "loss": -0.1335, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.02746543101966381, "mask/share_reasoning": 0.8509870767593384, "mask/share_step_conf": 0.11373503506183624, "num_tokens": 32540777.0, "reward": 0.8784596920013428, "reward_std": 0.28305697441101074, "rewards/accuracy_reward_step": 0.3671875, "rewards/final_brier_reward_step": 0.6788663864135742, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.8210216760635376, "step": 133 }, { "adv/mean_abs_final_conf": 0.7420914769172668, "adv/mean_abs_reasoning": 0.5315876007080078, "adv/mean_abs_step_conf": 0.7382508516311646, "adv/ratio_final_to_reasoning": 1.3959909447265029, "adv/ratio_step_to_reasoning": 1.3887661236791589, "adv/std_final_conf": 0.9098237156867981, "adv/std_reasoning": 0.7755208015441895, "adv/std_step_conf": 0.9349122047424316, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7413217623497997, "calib/avg_num_step_conf": 6.3984375, "calib/ece": 0.24170040485829958, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.4331983805668016, "calib/gap": 0.35083711615487323, "calib/mean_conf": 0.5846963562753036, "calib/mu_c": 0.783551401869159, "calib/mu_w": 0.4327142857142857, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.19659919028340078, "calib/std_conf": 0.41255279248347543, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.40404494382022466, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.11917453345305401, "calib/step_q_w": 0.28487041036717065, "calib/step_q_w_n": 926.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 635.078125, "completions/mean_terminated_length": 637.5686645507812, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.14293333333333333, "grad_norm": 0.027463382109999657, "kl": 0.124725341796875, "learning_rate": 1.8333333333333333e-06, "loss": -0.1329, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.027732379734516144, "mask/share_reasoning": 0.859741747379303, "mask/share_step_conf": 0.10861961543560028, "num_tokens": 32812309.0, "reward": 0.8942639827728271, "reward_std": 0.24431997537612915, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.6869269609451294, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.829725980758667, "step": 134 }, { "adv/mean_abs_final_conf": 0.7548559308052063, "adv/mean_abs_reasoning": 0.6046093106269836, "adv/mean_abs_step_conf": 0.6980199813842773, "adv/ratio_final_to_reasoning": 1.2485019954826961, "adv/ratio_step_to_reasoning": 1.1544975724247883, "adv/std_final_conf": 0.9214678406715393, "adv/std_reasoning": 0.8430864214897156, "adv/std_step_conf": 0.9351249933242798, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6043043043043043, "calib/avg_num_step_conf": 6.984375, "calib/ece": 0.26634146341463416, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.3821138211382114, "calib/gap": 0.18251051051051054, "calib/mean_conf": 0.5685365853658536, "calib/mu_c": 0.650888888888889, "calib/mu_w": 0.4683783783783784, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.14304878048780492, "calib/std_conf": 0.3985658903954455, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.3298093220338983, "calib/step_q_c_n": 944.0, "calib/step_q_gap": 0.039714535304040455, "calib/step_q_w": 0.29009478672985783, "calib/step_q_w_n": 844.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2364.0, "completions/max_terminated_length": 2364.0, "completions/mean_length": 596.37890625, "completions/mean_terminated_length": 598.7177124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.144, "grad_norm": 0.041083741933107376, "kl": 0.1374969482421875, "learning_rate": 1.8055555555555557e-06, "loss": -0.1289, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03051985614001751, "mask/share_reasoning": 0.8427517414093018, "mask/share_step_conf": 0.12282220274209976, "num_tokens": 33070862.0, "reward": 0.8903567790985107, "reward_std": 0.24555067718029022, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6455437541007996, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8398572206497192, "step": 135 }, { "adv/mean_abs_final_conf": 0.6729371547698975, "adv/mean_abs_reasoning": 0.6114941835403442, "adv/mean_abs_step_conf": 0.7410287261009216, "adv/ratio_final_to_reasoning": 1.1004800583283054, "adv/ratio_step_to_reasoning": 1.2118328285816493, "adv/std_final_conf": 0.8736879229545593, "adv/std_reasoning": 0.8429493308067322, "adv/std_step_conf": 0.935082197189331, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8081365337462898, "calib/avg_num_step_conf": 6.83203125, "calib/ece": 0.1584337349397591, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.44314556716995746, "calib/mean_conf": 0.4978714859437751, "calib/mu_c": 0.7221138211382114, "calib/mu_w": 0.278968253968254, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.08116465863453821, "calib/std_conf": 0.40774712558630793, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.3916142857142857, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.15982210268282718, "calib/step_q_w": 0.23179218303145852, "calib/step_q_w_n": 1049.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2936.0, "completions/max_terminated_length": 2936.0, "completions/mean_length": 570.48828125, "completions/mean_terminated_length": 572.7255249023438, "completions/min_length": 0.0, "completions/min_terminated_length": 59.0, "epoch": 0.14506666666666668, "grad_norm": 0.038291603326797485, "kl": 0.137298583984375, "learning_rate": 1.777777777777778e-06, "loss": -0.1012, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03168662637472153, "mask/share_reasoning": 0.8357338905334473, "mask/share_step_conf": 0.1286732256412506, "num_tokens": 33325395.0, "reward": 0.9253720641136169, "reward_std": 0.2551695704460144, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7408660054206848, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.8294093608856201, "step": 136 }, { "adv/mean_abs_final_conf": 0.7062271237373352, "adv/mean_abs_reasoning": 0.5881997346878052, "adv/mean_abs_step_conf": 0.7120691537857056, "adv/ratio_final_to_reasoning": 1.200658691408922, "adv/ratio_step_to_reasoning": 1.2105907428939011, "adv/std_final_conf": 0.8991337418556213, "adv/std_reasoning": 0.8267337083816528, "adv/std_step_conf": 0.9343873262405396, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7438038793103448, "calib/avg_num_step_conf": 6.62890625, "calib/ece": 0.1884016393442623, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.35655737704918034, "calib/gap": 0.3403933189655174, "calib/mean_conf": 0.5397950819672132, "calib/mu_c": 0.7183620689655174, "calib/mu_w": 0.37796874999999996, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.12639344262295082, "calib/std_conf": 0.39818795377709504, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.3570138888888889, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.07836189298305468, "calib/step_q_w": 0.2786519959058342, "calib/step_q_w_n": 977.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 589.61328125, "completions/mean_terminated_length": 594.2559204101562, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.14613333333333334, "grad_norm": 0.03523816913366318, "kl": 0.13226318359375, "learning_rate": 1.75e-06, "loss": -0.0867, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.029486628249287605, "mask/share_reasoning": 0.8430880308151245, "mask/share_step_conf": 0.11961278319358826, "num_tokens": 33583320.0, "reward": 0.8976198434829712, "reward_std": 0.2519051134586334, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.697229266166687, "rewards/format_reward_step": 0.9140625, "rewards/step_l2_reward": 0.8245728611946106, "step": 137 }, { "adv/mean_abs_final_conf": 0.6920844316482544, "adv/mean_abs_reasoning": 0.59056556224823, "adv/mean_abs_step_conf": 0.7281466126441956, "adv/ratio_final_to_reasoning": 1.171901099369139, "adv/ratio_step_to_reasoning": 1.2329649054919607, "adv/std_final_conf": 0.8724373579025269, "adv/std_reasoning": 0.8266708254814148, "adv/std_step_conf": 0.9192236661911011, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.816345600920069, "calib/avg_num_step_conf": 6.30078125, "calib/ece": 0.1528512396694215, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.3760330578512397, "calib/gap": 0.46143617021276595, "calib/mean_conf": 0.5332644628099174, "calib/mu_c": 0.7125, "calib/mu_w": 0.25106382978723407, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.03727272727272726, "calib/std_conf": 0.41157087233183076, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.37403567447045705, "calib/step_q_c_n": 897.0, "calib/step_q_gap": 0.1139379091073286, "calib/step_q_w": 0.26009776536312845, "calib/step_q_w_n": 716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 573.640625, "completions/mean_terminated_length": 575.8902587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 18.0, "epoch": 0.1472, "grad_norm": 0.047151170670986176, "kl": 0.141998291015625, "learning_rate": 1.7222222222222224e-06, "loss": -0.1102, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03197343647480011, "mask/share_reasoning": 0.8420336246490479, "mask/share_step_conf": 0.12208672612905502, "num_tokens": 33834508.0, "reward": 0.9239314794540405, "reward_std": 0.19209349155426025, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7442609667778015, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.8028206825256348, "step": 138 }, { "adv/mean_abs_final_conf": 0.7184191942214966, "adv/mean_abs_reasoning": 0.5137280225753784, "adv/mean_abs_step_conf": 0.7425556778907776, "adv/ratio_final_to_reasoning": 1.3984426829978585, "adv/ratio_step_to_reasoning": 1.4454256829679244, "adv/std_final_conf": 0.9192709922790527, "adv/std_reasoning": 0.7754294872283936, "adv/std_step_conf": 0.9344884753227234, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8256553079947575, "calib/avg_num_step_conf": 6.0703125, "calib/ece": 0.12702811244979922, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3132530120481928, "calib/gap": 0.44588401048492793, "calib/mean_conf": 0.5270281124497993, "calib/mu_c": 0.7222142857142857, "calib/mu_w": 0.27633027522935777, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.04590361445783132, "calib/std_conf": 0.389914009690165, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3720024570024571, "calib/step_q_c_n": 814.0, "calib/step_q_gap": 0.09763759213759221, "calib/step_q_w": 0.2743648648648649, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 518.35546875, "completions/mean_terminated_length": 524.5020141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.14826666666666666, "grad_norm": 0.05162464454770088, "kl": 0.1543731689453125, "learning_rate": 1.6944444444444446e-06, "loss": -0.0241, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03312474489212036, "mask/share_reasoning": 0.8317595720291138, "mask/share_step_conf": 0.12339693307876587, "num_tokens": 34070303.0, "reward": 0.9758179187774658, "reward_std": 0.17906738817691803, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.793757438659668, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8547533750534058, "step": 139 }, { "adv/mean_abs_final_conf": 0.6915757060050964, "adv/mean_abs_reasoning": 0.5147801637649536, "adv/mean_abs_step_conf": 0.7419753074645996, "adv/ratio_final_to_reasoning": 1.3434389175898138, "adv/ratio_step_to_reasoning": 1.4413440137980575, "adv/std_final_conf": 0.8760222792625427, "adv/std_reasoning": 0.7577025890350342, "adv/std_step_conf": 0.9336758255958557, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7467915365938258, "calib/avg_num_step_conf": 5.85546875, "calib/ece": 0.19032096774193544, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5645161290322581, "calib/gap": 0.38188817204301057, "calib/mean_conf": 0.6862112903225808, "calib/mu_c": 0.8294193548387095, "calib/mu_w": 0.447531182795699, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.125766129032258, "calib/std_conf": 0.39289094095617916, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.42450601092896173, "calib/step_q_c_n": 915.0, "calib/step_q_gap": 0.09055053147690689, "calib/step_q_w": 0.33395547945205484, "calib/step_q_w_n": 584.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2782.0, "completions/max_terminated_length": 2782.0, "completions/mean_length": 543.66015625, "completions/mean_terminated_length": 543.66015625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.14933333333333335, "grad_norm": 0.03653091937303543, "kl": 0.1446380615234375, "learning_rate": 1.6666666666666667e-06, "loss": 0.0809, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03261617571115494, "mask/share_reasoning": 0.8453826308250427, "mask/share_step_conf": 0.12200117111206055, "num_tokens": 34314496.0, "reward": 0.9528766870498657, "reward_std": 0.21324776113033295, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7541335821151733, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8399009704589844, "step": 140 }, { "adv/mean_abs_final_conf": 0.7118234038352966, "adv/mean_abs_reasoning": 0.5580950379371643, "adv/mean_abs_step_conf": 0.7270782589912415, "adv/ratio_final_to_reasoning": 1.2754519489482372, "adv/ratio_step_to_reasoning": 1.3027857435871038, "adv/std_final_conf": 0.9149222373962402, "adv/std_reasoning": 0.8099385499954224, "adv/std_step_conf": 0.9347255825996399, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8293533389687235, "calib/avg_num_step_conf": 6.03125, "calib/ece": 0.12299595141700409, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5465587044534413, "calib/gap": 0.5450915750915752, "calib/mean_conf": 0.6455870445344128, "calib/mu_c": 0.8464102564102566, "calib/mu_w": 0.3013186813186813, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.06850202429149801, "calib/std_conf": 0.42056396432797855, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.41401574803149604, "calib/step_q_c_n": 889.0, "calib/step_q_gap": 0.16306918314599983, "calib/step_q_w": 0.2509465648854962, "calib/step_q_w_n": 655.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2371.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 558.2734375, "completions/mean_terminated_length": 562.6693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.1504, "grad_norm": 0.04100858420133591, "kl": 0.148345947265625, "learning_rate": 1.638888888888889e-06, "loss": -0.1071, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.031309887766838074, "mask/share_reasoning": 0.8481752872467041, "mask/share_step_conf": 0.11270233988761902, "num_tokens": 34564510.0, "reward": 0.9690626859664917, "reward_std": 0.2599467635154724, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7910523414611816, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8376979231834412, "step": 141 }, { "adv/mean_abs_final_conf": 0.73613041639328, "adv/mean_abs_reasoning": 0.5790297985076904, "adv/mean_abs_step_conf": 0.7079474925994873, "adv/ratio_final_to_reasoning": 1.271316982805511, "adv/ratio_step_to_reasoning": 1.2226443171388608, "adv/std_final_conf": 0.910010814666748, "adv/std_reasoning": 0.8266434073448181, "adv/std_step_conf": 0.9345359802246094, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8122915832999866, "calib/avg_num_step_conf": 6.58203125, "calib/ece": 0.16934693877551027, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.42448979591836733, "calib/gap": 0.45676937441643306, "calib/mean_conf": 0.579795918367347, "calib/mu_c": 0.8147058823529411, "calib/mu_w": 0.357936507936508, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.13171428571428576, "calib/std_conf": 0.41512668239023115, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.42689896373057, "calib/step_q_c_n": 772.0, "calib/step_q_gap": 0.15124178957941997, "calib/step_q_w": 0.27565717415115004, "calib/step_q_w_n": 913.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 592.953125, "completions/mean_terminated_length": 595.2784423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.15146666666666667, "grad_norm": 0.037801120430231094, "kl": 0.13177490234375, "learning_rate": 1.6111111111111113e-06, "loss": -0.0517, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.031895555555820465, "mask/share_reasoning": 0.834732174873352, "mask/share_step_conf": 0.1294659972190857, "num_tokens": 34821466.0, "reward": 0.9343152046203613, "reward_std": 0.235845148563385, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7522230744361877, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8343761563301086, "step": 142 }, { "adv/mean_abs_final_conf": 0.7163206934928894, "adv/mean_abs_reasoning": 0.6285260915756226, "adv/mean_abs_step_conf": 0.759658694267273, "adv/ratio_final_to_reasoning": 1.1396833052661006, "adv/ratio_step_to_reasoning": 1.208635097968519, "adv/std_final_conf": 0.8896148204803467, "adv/std_reasoning": 0.8430455923080444, "adv/std_step_conf": 0.9334934949874878, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7944293478260871, "calib/avg_num_step_conf": 6.28125, "calib/ece": 0.17691358024691362, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.43621399176954734, "calib/gap": 0.42025747282608705, "calib/mean_conf": 0.5980658436213991, "calib/mu_c": 0.796953125, "calib/mu_w": 0.376695652173913, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.1241152263374486, "calib/std_conf": 0.4057670222221783, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.4118918918918919, "calib/step_q_c_n": 740.0, "calib/step_q_gap": 0.15042875825133895, "calib/step_q_w": 0.26146313364055296, "calib/step_q_w_n": 868.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2854.0, "completions/max_terminated_length": 2854.0, "completions/mean_length": 607.9296875, "completions/mean_terminated_length": 607.9296875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.15253333333333333, "grad_norm": 0.030441317707300186, "kl": 0.1333465576171875, "learning_rate": 1.5833333333333333e-06, "loss": -0.0023, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.029380368068814278, "mask/share_reasoning": 0.8472878932952881, "mask/share_step_conf": 0.12333173304796219, "num_tokens": 35084432.0, "reward": 0.9215575456619263, "reward_std": 0.26581722497940063, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.734161376953125, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.8245788812637329, "step": 143 }, { "adv/mean_abs_final_conf": 0.7154229879379272, "adv/mean_abs_reasoning": 0.5836465358734131, "adv/mean_abs_step_conf": 0.7379428148269653, "adv/ratio_final_to_reasoning": 1.2257812630847091, "adv/ratio_step_to_reasoning": 1.2643659637637557, "adv/std_final_conf": 0.9021844267845154, "adv/std_reasoning": 0.8100001811981201, "adv/std_step_conf": 0.9352454543113708, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7495052117693626, "calib/avg_num_step_conf": 6.140625, "calib/ece": 0.22224859437751004, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.4819277108433735, "calib/gap": 0.3838138672648106, "calib/mean_conf": 0.5932534136546185, "calib/mu_c": 0.7566440559440559, "calib/mu_w": 0.37283018867924533, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.12060240963855423, "calib/std_conf": 0.4278204237038552, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.41317606444188715, "calib/step_q_c_n": 869.0, "calib/step_q_gap": 0.11862414410049316, "calib/step_q_w": 0.294551920341394, "calib/step_q_w_n": 703.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2855.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 544.4375, "completions/mean_terminated_length": 546.5725708007812, "completions/min_length": 0.0, "completions/min_terminated_length": 79.0, "epoch": 0.1536, "grad_norm": 0.024527657777071, "kl": 0.16070556640625, "learning_rate": 1.5555555555555558e-06, "loss": -0.1027, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03337983787059784, "mask/share_reasoning": 0.8375337719917297, "mask/share_step_conf": 0.12518012523651123, "num_tokens": 35327936.0, "reward": 0.9219741225242615, "reward_std": 0.24387477338314056, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7212362885475159, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.821149468421936, "step": 144 }, { "adv/mean_abs_final_conf": 0.6954442262649536, "adv/mean_abs_reasoning": 0.6353583931922913, "adv/mean_abs_step_conf": 0.7420624494552612, "adv/ratio_final_to_reasoning": 1.0945699839908738, "adv/ratio_step_to_reasoning": 1.167943097008362, "adv/std_final_conf": 0.8785400390625, "adv/std_reasoning": 0.8430765867233276, "adv/std_step_conf": 0.9350871443748474, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7888645391042323, "calib/avg_num_step_conf": 6.890625, "calib/ece": 0.15010796221322534, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.4534412955465587, "calib/gap": 0.4093300004565585, "calib/mean_conf": 0.6260053981106612, "calib/mu_c": 0.7884116331096197, "calib/mu_w": 0.3790816326530612, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.08643724696356272, "calib/std_conf": 0.39533294414075476, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4404112554112555, "calib/step_q_c_n": 924.0, "calib/step_q_gap": 0.1496493506493507, "calib/step_q_w": 0.2907619047619048, "calib/step_q_w_n": 840.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 548.89453125, "completions/mean_terminated_length": 551.047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.15466666666666667, "grad_norm": 0.029142454266548157, "kl": 0.1400146484375, "learning_rate": 1.527777777777778e-06, "loss": 0.006, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033541686832904816, "mask/share_reasoning": 0.8256252408027649, "mask/share_step_conf": 0.1369268149137497, "num_tokens": 35571157.0, "reward": 0.9419224262237549, "reward_std": 0.22772300243377686, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7556396722793579, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8219549655914307, "step": 145 }, { "adv/mean_abs_final_conf": 0.7435042858123779, "adv/mean_abs_reasoning": 0.54443359375, "adv/mean_abs_step_conf": 0.752627968788147, "adv/ratio_final_to_reasoning": 1.365647333940583, "adv/ratio_step_to_reasoning": 1.3824054529848655, "adv/std_final_conf": 0.9029568433761597, "adv/std_reasoning": 0.8098462820053101, "adv/std_step_conf": 0.9342830181121826, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7773940132737369, "calib/avg_num_step_conf": 6.75390625, "calib/ece": 0.19571428571428573, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.3469387755102041, "calib/gap": 0.4054591629418936, "calib/mean_conf": 0.519469387755102, "calib/mu_c": 0.7478504672897197, "calib/mu_w": 0.3423913043478261, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.1392244897959184, "calib/std_conf": 0.41661919685168064, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.38796969696969696, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.1105889673158148, "calib/step_q_w": 0.27738072965388216, "calib/step_q_w_n": 1069.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 609.6015625, "completions/mean_terminated_length": 614.4015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.15573333333333333, "grad_norm": 0.057016413658857346, "kl": 0.2330474853515625, "learning_rate": 1.5e-06, "loss": -0.0669, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.030465461313724518, "mask/share_reasoning": 0.8398491144180298, "mask/share_step_conf": 0.12187288701534271, "num_tokens": 35834431.0, "reward": 0.9122192859649658, "reward_std": 0.24562488496303558, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.7225179672241211, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8316080570220947, "step": 146 }, { "adv/mean_abs_final_conf": 0.7490389347076416, "adv/mean_abs_reasoning": 0.575813353061676, "adv/mean_abs_step_conf": 0.7271679043769836, "adv/ratio_final_to_reasoning": 1.3008363399787486, "adv/ratio_step_to_reasoning": 1.2628534932552977, "adv/std_final_conf": 0.9133417010307312, "adv/std_reasoning": 0.8266557455062866, "adv/std_step_conf": 0.9348275065422058, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7377620013522652, "calib/avg_num_step_conf": 6.15234375, "calib/ece": 0.24530364372469646, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.47368421052631576, "calib/gap": 0.3844475997295469, "calib/mean_conf": 0.5774493927125506, "calib/mu_c": 0.8031372549019608, "calib/mu_w": 0.41868965517241385, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.20489878542510132, "calib/std_conf": 0.4354433411011676, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.4652090032154341, "calib/step_q_c_n": 622.0, "calib/step_q_gap": 0.16378193081249603, "calib/step_q_w": 0.3014270724029381, "calib/step_q_w_n": 953.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2338.0, "completions/max_terminated_length": 2338.0, "completions/mean_length": 565.53515625, "completions/mean_terminated_length": 567.7529907226562, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.1568, "grad_norm": 0.043372150510549545, "kl": 0.1708984375, "learning_rate": 1.4722222222222225e-06, "loss": -0.1202, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.030713319778442383, "mask/share_reasoning": 0.8421392440795898, "mask/share_step_conf": 0.12324120104312897, "num_tokens": 36082888.0, "reward": 0.9022128582000732, "reward_std": 0.2670474946498871, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.6899276971817017, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8441853523254395, "step": 147 }, { "adv/mean_abs_final_conf": 0.6602885127067566, "adv/mean_abs_reasoning": 0.5762661695480347, "adv/mean_abs_step_conf": 0.7480043172836304, "adv/ratio_final_to_reasoning": 1.1458047471789305, "adv/ratio_step_to_reasoning": 1.2980187920284993, "adv/std_final_conf": 0.8627505898475647, "adv/std_reasoning": 0.7929645776748657, "adv/std_step_conf": 0.9347200989723206, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8135960591133004, "calib/avg_num_step_conf": 6.89453125, "calib/ece": 0.16544000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.512, "calib/gap": 0.484623973727422, "calib/mean_conf": 0.63032, "calib/mu_c": 0.8338620689655172, "calib/mu_w": 0.3492380952380952, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10788000000000003, "calib/std_conf": 0.4194709734892273, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.44010273972602737, "calib/step_q_c_n": 876.0, "calib/step_q_gap": 0.17063142364053802, "calib/step_q_w": 0.26947131608548935, "calib/step_q_w_n": 889.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 543.12109375, "completions/mean_terminated_length": 545.2510375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.15786666666666666, "grad_norm": 0.02674427255988121, "kl": 0.15020751953125, "learning_rate": 1.4444444444444445e-06, "loss": 0.0487, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03537040203809738, "mask/share_reasoning": 0.8225604891777039, "mask/share_step_conf": 0.13816285133361816, "num_tokens": 36327039.0, "reward": 0.9648141860961914, "reward_std": 0.20383086800575256, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7801804542541504, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.843979001045227, "step": 148 }, { "adv/mean_abs_final_conf": 0.7208631038665771, "adv/mean_abs_reasoning": 0.5675036907196045, "adv/mean_abs_step_conf": 0.7488936185836792, "adv/ratio_final_to_reasoning": 1.2702350938942975, "adv/ratio_step_to_reasoning": 1.319627750145676, "adv/std_final_conf": 0.9123585224151611, "adv/std_reasoning": 0.8098829388618469, "adv/std_step_conf": 0.933817446231842, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8856749311294766, "calib/avg_num_step_conf": 6.6015625, "calib/ece": 0.0940080971659919, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.43724696356275305, "calib/gap": 0.5695808736717829, "calib/mean_conf": 0.5731174089068826, "calib/mu_c": 0.8521428571428572, "calib/mu_w": 0.28256198347107436, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.07850202429149797, "calib/std_conf": 0.41840017066719615, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.4510818307905687, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.17016335813834987, "calib/step_q_w": 0.2809184726522188, "calib/step_q_w_n": 969.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2757.0, "completions/max_terminated_length": 2757.0, "completions/mean_length": 591.9296875, "completions/mean_terminated_length": 598.9486694335938, "completions/min_length": 0.0, "completions/min_terminated_length": 78.0, "epoch": 0.15893333333333334, "grad_norm": 0.05520571395754814, "kl": 0.131378173828125, "learning_rate": 1.4166666666666667e-06, "loss": -0.0464, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03246816247701645, "mask/share_reasoning": 0.8309919834136963, "mask/share_step_conf": 0.12482112646102905, "num_tokens": 36583029.0, "reward": 0.9735437035560608, "reward_std": 0.22069458663463593, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.8104152679443359, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8483908772468567, "step": 149 }, { "adv/mean_abs_final_conf": 0.7165673971176147, "adv/mean_abs_reasoning": 0.6197329759597778, "adv/mean_abs_step_conf": 0.7352439165115356, "adv/ratio_final_to_reasoning": 1.1562518454143413, "adv/ratio_step_to_reasoning": 1.1863882430539807, "adv/std_final_conf": 0.9072695374488831, "adv/std_reasoning": 0.8589852452278137, "adv/std_step_conf": 0.9348389506340027, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.793323628664064, "calib/avg_num_step_conf": 5.984375, "calib/ece": 0.20621951219512194, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4634146341463415, "calib/gap": 0.3964619863693509, "calib/mean_conf": 0.6045934959349594, "calib/mu_c": 0.7963779527559055, "calib/mu_w": 0.3999159663865546, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.14727642276422764, "calib/std_conf": 0.4129662152247405, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.44794330042313124, "calib/step_q_c_n": 709.0, "calib/step_q_gap": 0.13494208535630253, "calib/step_q_w": 0.3130012150668287, "calib/step_q_w_n": 823.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2953.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 507.625, "completions/mean_terminated_length": 507.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.16, "grad_norm": 0.027483409270644188, "kl": 0.159912109375, "learning_rate": 1.3888888888888892e-06, "loss": -0.0122, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03679945319890976, "mask/share_reasoning": 0.8243503570556641, "mask/share_step_conf": 0.1388501524925232, "num_tokens": 36817941.0, "reward": 0.934812068939209, "reward_std": 0.23240233957767487, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7398660182952881, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.838351845741272, "step": 150 }, { "adv/mean_abs_final_conf": 0.7235825657844543, "adv/mean_abs_reasoning": 0.6277079582214355, "adv/mean_abs_step_conf": 0.7511394023895264, "adv/ratio_final_to_reasoning": 1.1527376008337897, "adv/ratio_step_to_reasoning": 1.1966383292603535, "adv/std_final_conf": 0.8992826342582703, "adv/std_reasoning": 0.8590068221092224, "adv/std_step_conf": 0.9348013997077942, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7804633867276887, "calib/avg_num_step_conf": 6.67578125, "calib/ece": 0.2073360655737705, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.3319672131147541, "calib/gap": 0.38922196796338665, "calib/mean_conf": 0.4737295081967213, "calib/mu_c": 0.716195652173913, "calib/mu_w": 0.32697368421052636, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.15200819672131147, "calib/std_conf": 0.42256683508395787, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.44220802919708024, "calib/step_q_c_n": 548.0, "calib/step_q_gap": 0.15332775357261852, "calib/step_q_w": 0.2888802756244617, "calib/step_q_w_n": 1161.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 589.71875, "completions/mean_terminated_length": 594.3621826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.16106666666666666, "grad_norm": 0.0312860906124115, "kl": 0.1264190673828125, "learning_rate": 1.3611111111111112e-06, "loss": -0.0412, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03059176355600357, "mask/share_reasoning": 0.8369712233543396, "mask/share_step_conf": 0.12462448328733444, "num_tokens": 37075933.0, "reward": 0.8848069906234741, "reward_std": 0.24623329937458038, "rewards/accuracy_reward_step": 0.359375, "rewards/final_brier_reward_step": 0.7045589685440063, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.8080236911773682, "step": 151 }, { "adv/mean_abs_final_conf": 0.7726234197616577, "adv/mean_abs_reasoning": 0.6658381223678589, "adv/mean_abs_step_conf": 0.7652601003646851, "adv/ratio_final_to_reasoning": 1.160377265594298, "adv/ratio_step_to_reasoning": 1.1493185425359258, "adv/std_final_conf": 0.9337043762207031, "adv/std_reasoning": 0.8431472182273865, "adv/std_step_conf": 0.9347693920135498, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7234513274336284, "calib/avg_num_step_conf": 6.65234375, "calib/ece": 0.19057613168724277, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.3004115226337449, "calib/gap": 0.34416814159292025, "calib/mean_conf": 0.4730452674897119, "calib/mu_c": 0.6571681415929203, "calib/mu_w": 0.313, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.09930041152263372, "calib/std_conf": 0.40861578289777173, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.4087291981845689, "calib/step_q_c_n": 661.0, "calib/step_q_gap": 0.10920904463370512, "calib/step_q_w": 0.29952015355086375, "calib/step_q_w_n": 1042.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2884.0, "completions/max_terminated_length": 2884.0, "completions/mean_length": 585.3671875, "completions/mean_terminated_length": 587.6627807617188, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.16213333333333332, "grad_norm": 0.026001159101724625, "kl": 0.13336181640625, "learning_rate": 1.3333333333333334e-06, "loss": -0.0659, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03251414746046066, "mask/share_reasoning": 0.8299437761306763, "mask/share_step_conf": 0.1336357444524765, "num_tokens": 37331179.0, "reward": 0.8980525732040405, "reward_std": 0.24627810716629028, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.7018734216690063, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.8200129270553589, "step": 152 }, { "adv/mean_abs_final_conf": 0.7555943727493286, "adv/mean_abs_reasoning": 0.5032080411911011, "adv/mean_abs_step_conf": 0.7314299941062927, "adv/ratio_final_to_reasoning": 1.5015546471809658, "adv/ratio_step_to_reasoning": 1.4535339943594439, "adv/std_final_conf": 0.9189577698707581, "adv/std_reasoning": 0.7753830552101135, "adv/std_step_conf": 0.9348416924476624, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6744253615702478, "calib/avg_num_step_conf": 6.59765625, "calib/ece": 0.2887550200803213, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.30522088353413657, "calib/gap": 0.21644176136363646, "calib/mean_conf": 0.46489959839357425, "calib/mu_c": 0.5700781250000001, "calib/mu_w": 0.35363636363636364, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.11979919678714858, "calib/std_conf": 0.41180386064046676, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.3873626373626374, "calib/step_q_c_n": 728.0, "calib/step_q_gap": 0.1172481732627414, "calib/step_q_w": 0.270114464099896, "calib/step_q_w_n": 961.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 584.50390625, "completions/mean_terminated_length": 584.50390625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.1632, "grad_norm": 0.04574429616332054, "kl": 0.139495849609375, "learning_rate": 1.3055555555555556e-06, "loss": 0.0269, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03054162487387657, "mask/share_reasoning": 0.8452918529510498, "mask/share_step_conf": 0.1241665631532669, "num_tokens": 37588132.0, "reward": 0.8886984586715698, "reward_std": 0.22010064125061035, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6561176180839539, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8298731446266174, "step": 153 }, { "adv/mean_abs_final_conf": 0.6796963810920715, "adv/mean_abs_reasoning": 0.5204298496246338, "adv/mean_abs_step_conf": 0.7172330617904663, "adv/ratio_final_to_reasoning": 1.3060288174906005, "adv/ratio_step_to_reasoning": 1.3781551198644335, "adv/std_final_conf": 0.9021387100219727, "adv/std_reasoning": 0.7927193641662598, "adv/std_step_conf": 0.9346775412559509, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7935254638768259, "calib/avg_num_step_conf": 6.34765625, "calib/ece": 0.1801593625498008, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3705179282868526, "calib/gap": 0.44687261481773927, "calib/mean_conf": 0.5024701195219123, "calib/mu_c": 0.7677450980392158, "calib/mu_w": 0.3208724832214765, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.13812749003984065, "calib/std_conf": 0.429219937891115, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.45802980132450327, "calib/step_q_c_n": 604.0, "calib/step_q_gap": 0.17397495313645234, "calib/step_q_w": 0.28405484818805093, "calib/step_q_w_n": 1021.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 532.17578125, "completions/mean_terminated_length": 534.2627563476562, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.16426666666666667, "grad_norm": 0.026894347742199898, "kl": 0.1571197509765625, "learning_rate": 1.2777777777777779e-06, "loss": -0.0654, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03247256577014923, "mask/share_reasoning": 0.8317904472351074, "mask/share_step_conf": 0.13183078169822693, "num_tokens": 37828809.0, "reward": 0.948471188545227, "reward_std": 0.20287421345710754, "rewards/accuracy_reward_step": 0.3984375, "rewards/final_brier_reward_step": 0.7616676092147827, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8610559105873108, "step": 154 }, { "adv/mean_abs_final_conf": 0.7484700679779053, "adv/mean_abs_reasoning": 0.46591895818710327, "adv/mean_abs_step_conf": 0.729465901851654, "adv/ratio_final_to_reasoning": 1.6064383189948142, "adv/ratio_step_to_reasoning": 1.565649753103019, "adv/std_final_conf": 0.9147512912750244, "adv/std_reasoning": 0.7575052976608276, "adv/std_step_conf": 0.9337078928947449, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7547084623323014, "calib/avg_num_step_conf": 6.0546875, "calib/ece": 0.192244094488189, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.2755905511811024, "calib/gap": 0.38964912280701747, "calib/mean_conf": 0.4051574803149606, "calib/mu_c": 0.6383333333333332, "calib/mu_w": 0.24868421052631576, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.09791338582677167, "calib/std_conf": 0.4184680709218479, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.3828700906344411, "calib/step_q_c_n": 662.0, "calib/step_q_gap": 0.1125885591029096, "calib/step_q_w": 0.2702815315315315, "calib/step_q_w_n": 888.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 477.25390625, "completions/mean_terminated_length": 479.1255187988281, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.16533333333333333, "grad_norm": 0.03665679320693016, "kl": 0.157745361328125, "learning_rate": 1.25e-06, "loss": -0.1132, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03502430021762848, "mask/share_reasoning": 0.8271464109420776, "mask/share_step_conf": 0.1339230239391327, "num_tokens": 38058202.0, "reward": 0.9258972406387329, "reward_std": 0.214618980884552, "rewards/accuracy_reward_step": 0.3984375, "rewards/final_brier_reward_step": 0.7388640642166138, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8402740955352783, "step": 155 }, { "adv/mean_abs_final_conf": 0.7042100429534912, "adv/mean_abs_reasoning": 0.5381708145141602, "adv/mean_abs_step_conf": 0.7314833998680115, "adv/ratio_final_to_reasoning": 1.3085251447335078, "adv/ratio_step_to_reasoning": 1.3592030265118826, "adv/std_final_conf": 0.9068632125854492, "adv/std_reasoning": 0.792761504650116, "adv/std_step_conf": 0.9347695112228394, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7569124423963133, "calib/avg_num_step_conf": 6.62109375, "calib/ece": 0.21859999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.392, "calib/gap": 0.3890847414234512, "calib/mean_conf": 0.5012399999999999, "calib/mu_c": 0.6973387096774194, "calib/mu_w": 0.3082539682539682, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.11191999999999998, "calib/std_conf": 0.4379913953492694, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3803166869671133, "calib/step_q_c_n": 821.0, "calib/step_q_gap": 0.07485902106322312, "calib/step_q_w": 0.3054576659038902, "calib/step_q_w_n": 874.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2618.0, "completions/max_terminated_length": 2618.0, "completions/mean_length": 529.1640625, "completions/mean_terminated_length": 533.3306884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.1664, "grad_norm": 0.03280864655971527, "kl": 0.1468963623046875, "learning_rate": 1.2222222222222223e-06, "loss": -0.0633, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033598169684410095, "mask/share_reasoning": 0.8221753835678101, "mask/share_step_conf": 0.13641397655010223, "num_tokens": 38298428.0, "reward": 0.9258990287780762, "reward_std": 0.21684980392456055, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7279226779937744, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8340317010879517, "step": 156 }, { "adv/mean_abs_final_conf": 0.7056574821472168, "adv/mean_abs_reasoning": 0.575699508190155, "adv/mean_abs_step_conf": 0.7127252221107483, "adv/ratio_final_to_reasoning": 1.2257392478336742, "adv/ratio_step_to_reasoning": 1.2380160343568216, "adv/std_final_conf": 0.9043259620666504, "adv/std_reasoning": 0.8265734314918518, "adv/std_step_conf": 0.9344764947891235, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8329150579150579, "calib/avg_num_step_conf": 7.14453125, "calib/ece": 0.17561752988047816, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.40239043824701193, "calib/gap": 0.5030920205920206, "calib/mean_conf": 0.5243027888446214, "calib/mu_c": 0.7467857142857143, "calib/mu_w": 0.24369369369369365, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.07107569721115545, "calib/std_conf": 0.43926211664983866, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4437849462365592, "calib/step_q_c_n": 930.0, "calib/step_q_gap": 0.17983611420096407, "calib/step_q_w": 0.2639488320355951, "calib/step_q_w_n": 899.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2560.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 547.28515625, "completions/mean_terminated_length": 547.28515625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.16746666666666668, "grad_norm": 0.03486839309334755, "kl": 0.1378326416015625, "learning_rate": 1.1944444444444446e-06, "loss": -0.0098, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03291250020265579, "mask/share_reasoning": 0.8276211619377136, "mask/share_step_conf": 0.13946637511253357, "num_tokens": 38542261.0, "reward": 0.9581904411315918, "reward_std": 0.22784942388534546, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7759785056114197, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8380587100982666, "step": 157 }, { "adv/mean_abs_final_conf": 0.7180853486061096, "adv/mean_abs_reasoning": 0.6382078528404236, "adv/mean_abs_step_conf": 0.7345453500747681, "adv/ratio_final_to_reasoning": 1.1251590612841589, "adv/ratio_step_to_reasoning": 1.1509500342334906, "adv/std_final_conf": 0.8862758874893188, "adv/std_reasoning": 0.8430286645889282, "adv/std_step_conf": 0.9345474243164062, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7362024500322373, "calib/avg_num_step_conf": 6.296875, "calib/ece": 0.21848605577689245, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4302788844621514, "calib/gap": 0.35798001289490644, "calib/mean_conf": 0.563187250996016, "calib/mu_c": 0.7200709219858156, "calib/mu_w": 0.36209090909090913, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.10996015936254981, "calib/std_conf": 0.422513113963258, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4147051597051597, "calib/step_q_c_n": 814.0, "calib/step_q_gap": 0.09493448301343033, "calib/step_q_w": 0.31977067669172937, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2601.0, "completions/max_terminated_length": 2601.0, "completions/mean_length": 527.61328125, "completions/mean_terminated_length": 527.61328125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.16853333333333334, "grad_norm": 0.03904188424348831, "kl": 0.1563262939453125, "learning_rate": 1.1666666666666668e-06, "loss": -0.0257, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03514157235622406, "mask/share_reasoning": 0.8268975019454956, "mask/share_step_conf": 0.13796088099479675, "num_tokens": 38782570.0, "reward": 0.9303572773933411, "reward_std": 0.20081466436386108, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7250785231590271, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8332923054695129, "step": 158 }, { "adv/mean_abs_final_conf": 0.7105034589767456, "adv/mean_abs_reasoning": 0.549357533454895, "adv/mean_abs_step_conf": 0.7166721820831299, "adv/ratio_final_to_reasoning": 1.2933352429125857, "adv/ratio_step_to_reasoning": 1.3045642199097507, "adv/std_final_conf": 0.897257924079895, "adv/std_reasoning": 0.8098735809326172, "adv/std_step_conf": 0.9343751072883606, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7527901785714285, "calib/avg_num_step_conf": 6.2109375, "calib/ece": 0.22258064516129036, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.43548387096774194, "calib/gap": 0.39795168067226894, "calib/mean_conf": 0.5548387096774194, "calib/mu_c": 0.7345588235294118, "calib/mu_w": 0.3366071428571429, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11451612903225813, "calib/std_conf": 0.43352732523725346, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4350317662007624, "calib/step_q_c_n": 787.0, "calib/step_q_gap": 0.16991096918955445, "calib/step_q_w": 0.26512079701120794, "calib/step_q_w_n": 803.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2926.0, "completions/max_terminated_length": 2926.0, "completions/mean_length": 500.1171875, "completions/mean_terminated_length": 500.1171875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.1696, "grad_norm": 0.034035515040159225, "kl": 0.15728759765625, "learning_rate": 1.138888888888889e-06, "loss": 0.0961, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03542216867208481, "mask/share_reasoning": 0.8289942145347595, "mask/share_step_conf": 0.13558357954025269, "num_tokens": 39015384.0, "reward": 0.9383366107940674, "reward_std": 0.21544626355171204, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7279398441314697, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8510770797729492, "step": 159 }, { "adv/mean_abs_final_conf": 0.6851258873939514, "adv/mean_abs_reasoning": 0.505584716796875, "adv/mean_abs_step_conf": 0.7561661005020142, "adv/ratio_final_to_reasoning": 1.3551158977560813, "adv/ratio_step_to_reasoning": 1.4956268957113539, "adv/std_final_conf": 0.8633057475090027, "adv/std_reasoning": 0.7754652500152588, "adv/std_step_conf": 0.9348845481872559, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7667584940312214, "calib/avg_num_step_conf": 6.75, "calib/ece": 0.21218623481781382, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4048582995951417, "calib/gap": 0.40031680440771367, "calib/mean_conf": 0.5177732793522267, "calib/mu_c": 0.7219834710743803, "calib/mu_w": 0.32166666666666666, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.12004048582995956, "calib/std_conf": 0.4386465903080075, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4451233671988389, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.15851124015360313, "calib/step_q_w": 0.28661212704523575, "calib/step_q_w_n": 1039.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2861.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 559.60546875, "completions/mean_terminated_length": 564.0117797851562, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.17066666666666666, "grad_norm": 0.025723211467266083, "kl": 0.1482696533203125, "learning_rate": 1.111111111111111e-06, "loss": 0.0134, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03189799189567566, "mask/share_reasoning": 0.8309605121612549, "mask/share_step_conf": 0.12932898104190826, "num_tokens": 39263483.0, "reward": 0.9138648509979248, "reward_std": 0.22909203171730042, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7227054834365845, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8190867304801941, "step": 160 }, { "adv/mean_abs_final_conf": 0.6754765510559082, "adv/mean_abs_reasoning": 0.5887191891670227, "adv/mean_abs_step_conf": 0.7318241596221924, "adv/ratio_final_to_reasoning": 1.1473662885214226, "adv/ratio_step_to_reasoning": 1.2430784881628345, "adv/std_final_conf": 0.8675520420074463, "adv/std_reasoning": 0.8266057372093201, "adv/std_step_conf": 0.9342828989028931, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.742180774748924, "calib/avg_num_step_conf": 6.15625, "calib/ece": 0.257710843373494, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4578313253012048, "calib/gap": 0.37000215208034437, "calib/mean_conf": 0.5787550200803213, "calib/mu_c": 0.7050609756097561, "calib/mu_w": 0.33505882352941174, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.0889156626506024, "calib/std_conf": 0.4327511414503791, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.44351409978308026, "calib/step_q_c_n": 922.0, "calib/step_q_gap": 0.1667862710369029, "calib/step_q_w": 0.27672782874617735, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 514.06640625, "completions/mean_terminated_length": 514.06640625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.17173333333333332, "grad_norm": 0.024139799177646637, "kl": 0.1499176025390625, "learning_rate": 1.0833333333333335e-06, "loss": 0.1239, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03729427233338356, "mask/share_reasoning": 0.8286226987838745, "mask/share_step_conf": 0.13408304750919342, "num_tokens": 39499004.0, "reward": 0.9434548616409302, "reward_std": 0.22313153743743896, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7232269048690796, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8425890207290649, "step": 161 }, { "adv/mean_abs_final_conf": 0.693047285079956, "adv/mean_abs_reasoning": 0.589208722114563, "adv/mean_abs_step_conf": 0.7283810377120972, "adv/ratio_final_to_reasoning": 1.1762339202867456, "adv/ratio_step_to_reasoning": 1.236202062824308, "adv/std_final_conf": 0.8884176015853882, "adv/std_reasoning": 0.8100014328956604, "adv/std_step_conf": 0.9344998598098755, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7707106812447435, "calib/avg_num_step_conf": 5.82421875, "calib/ece": 0.21223107569721117, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5059760956175299, "calib/gap": 0.397907905803196, "calib/mean_conf": 0.6206772908366535, "calib/mu_c": 0.7585975609756098, "calib/mu_w": 0.3606896551724138, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.0897609561752988, "calib/std_conf": 0.4271847285111425, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.43395248380129586, "calib/step_q_c_n": 926.0, "calib/step_q_gap": 0.0908374395535082, "calib/step_q_w": 0.34311504424778766, "calib/step_q_w_n": 565.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2955.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 468.21875, "completions/mean_terminated_length": 468.21875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.1728, "grad_norm": 0.037547219544649124, "kl": 0.1618194580078125, "learning_rate": 1.0555555555555557e-06, "loss": 0.0287, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03762802481651306, "mask/share_reasoning": 0.8248691558837891, "mask/share_step_conf": 0.1375027745962143, "num_tokens": 39723012.0, "reward": 0.9379310607910156, "reward_std": 0.22288298606872559, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7288386821746826, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8274922370910645, "step": 162 }, { "adv/mean_abs_final_conf": 0.6022693514823914, "adv/mean_abs_reasoning": 0.5638035535812378, "adv/mean_abs_step_conf": 0.7278131246566772, "adv/ratio_final_to_reasoning": 1.0682255329126993, "adv/ratio_step_to_reasoning": 1.2908984344523247, "adv/std_final_conf": 0.82899010181427, "adv/std_reasoning": 0.8099051713943481, "adv/std_step_conf": 0.9342523813247681, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8520267611176703, "calib/avg_num_step_conf": 7.03125, "calib/ece": 0.1543319838056681, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4008097165991903, "calib/gap": 0.5143998425816607, "calib/mean_conf": 0.5124696356275305, "calib/mu_c": 0.7748760330578512, "calib/mu_w": 0.2604761904761905, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08846153846153852, "calib/std_conf": 0.43631041174122165, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.43778807947019865, "calib/step_q_c_n": 755.0, "calib/step_q_gap": 0.1376608067429259, "calib/step_q_w": 0.30012727272727274, "calib/step_q_w_n": 1045.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 556.234375, "completions/mean_terminated_length": 560.6141967773438, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.17386666666666667, "grad_norm": 0.03478431701660156, "kl": 0.1400909423828125, "learning_rate": 1.0277777777777777e-06, "loss": -0.0236, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033809371292591095, "mask/share_reasoning": 0.8174687623977661, "mask/share_step_conf": 0.14090941846370697, "num_tokens": 39970240.0, "reward": 0.9545302391052246, "reward_std": 0.2042197585105896, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7759039402008057, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8480002880096436, "step": 163 }, { "adv/mean_abs_final_conf": 0.7186143398284912, "adv/mean_abs_reasoning": 0.5782505869865417, "adv/mean_abs_step_conf": 0.7410357594490051, "adv/ratio_final_to_reasoning": 1.242738625780619, "adv/ratio_step_to_reasoning": 1.2815131988205868, "adv/std_final_conf": 0.9063172340393066, "adv/std_reasoning": 0.8099116683006287, "adv/std_step_conf": 0.9345961213111877, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.728862590401052, "calib/avg_num_step_conf": 6.88671875, "calib/ece": 0.25417004048583, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.38461538461538464, "calib/gap": 0.3163589743589743, "calib/mean_conf": 0.5063157894736843, "calib/mu_c": 0.6728205128205128, "calib/mu_w": 0.3564615384615385, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1434008097165992, "calib/std_conf": 0.4343376332177941, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.418381414701803, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.13701865078625597, "calib/step_q_w": 0.28136276391554704, "calib/step_q_w_n": 1042.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2974.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 590.64453125, "completions/mean_terminated_length": 590.64453125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.17493333333333333, "grad_norm": 0.0259377583861351, "kl": 0.1478271484375, "learning_rate": 1.0000000000000002e-06, "loss": -0.0438, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.029251031577587128, "mask/share_reasoning": 0.8463806509971619, "mask/share_step_conf": 0.1243683397769928, "num_tokens": 40227581.0, "reward": 0.9030520915985107, "reward_std": 0.21659719944000244, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6856386661529541, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8368717432022095, "step": 164 }, { "adv/mean_abs_final_conf": 0.6869238018989563, "adv/mean_abs_reasoning": 0.45050010085105896, "adv/mean_abs_step_conf": 0.7137496471405029, "adv/ratio_final_to_reasoning": 1.5248027705238227, "adv/ratio_step_to_reasoning": 1.584349583478734, "adv/std_final_conf": 0.8749675154685974, "adv/std_reasoning": 0.7206880450248718, "adv/std_step_conf": 0.9337314963340759, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7735745614035088, "calib/avg_num_step_conf": 7.0546875, "calib/ece": 0.20679999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.408, "calib/gap": 0.43031733746130024, "calib/mean_conf": 0.53696, "calib/mu_c": 0.7710526315789473, "calib/mu_w": 0.3407352941176471, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14387999999999998, "calib/std_conf": 0.43471135066846367, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4286647314949202, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.13150716658892203, "calib/step_q_w": 0.29715756490599815, "calib/step_q_w_n": 1117.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2957.0, "completions/max_terminated_length": 2957.0, "completions/mean_length": 561.78515625, "completions/mean_terminated_length": 563.98828125, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.176, "grad_norm": 0.02698436565697193, "kl": 0.147064208984375, "learning_rate": 9.722222222222224e-07, "loss": 0.0705, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032273873686790466, "mask/share_reasoning": 0.8308225870132446, "mask/share_step_conf": 0.13299725949764252, "num_tokens": 40476974.0, "reward": 0.9402658939361572, "reward_std": 0.1786276400089264, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.744081974029541, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8536372184753418, "step": 165 }, { "adv/mean_abs_final_conf": 0.6894981861114502, "adv/mean_abs_reasoning": 0.5755706429481506, "adv/mean_abs_step_conf": 0.7388315200805664, "adv/ratio_final_to_reasoning": 1.1979384191308773, "adv/ratio_step_to_reasoning": 1.283650459127261, "adv/std_final_conf": 0.8869600892066956, "adv/std_reasoning": 0.8099501132965088, "adv/std_step_conf": 0.9338527917861938, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.830402344790834, "calib/avg_num_step_conf": 7.05859375, "calib/ece": 0.14919028340080978, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4493927125506073, "calib/gap": 0.5527331468158806, "calib/mean_conf": 0.5517004048582995, "calib/mu_c": 0.7933812949640288, "calib/mu_w": 0.24064814814814814, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.06906882591093122, "calib/std_conf": 0.44479120257315763, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.45416470588235297, "calib/step_q_c_n": 850.0, "calib/step_q_gap": 0.16483032761694028, "calib/step_q_w": 0.2893343782654127, "calib/step_q_w_n": 957.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2567.0, "completions/max_terminated_length": 2567.0, "completions/mean_length": 585.5625, "completions/mean_terminated_length": 587.85888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.17706666666666668, "grad_norm": 0.02961987629532814, "kl": 0.1471710205078125, "learning_rate": 9.444444444444445e-07, "loss": -0.1233, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.030628211796283722, "mask/share_reasoning": 0.8332484364509583, "mask/share_step_conf": 0.13221710920333862, "num_tokens": 40733062.0, "reward": 0.9741063117980957, "reward_std": 0.23434357345104218, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7911171913146973, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8570953607559204, "step": 166 }, { "adv/mean_abs_final_conf": 0.6714800000190735, "adv/mean_abs_reasoning": 0.5729876756668091, "adv/mean_abs_step_conf": 0.7475663423538208, "adv/ratio_final_to_reasoning": 1.1718925703552783, "adv/ratio_step_to_reasoning": 1.3046813641913806, "adv/std_final_conf": 0.8594887852668762, "adv/std_reasoning": 0.8099749088287354, "adv/std_step_conf": 0.9354498386383057, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7133557046979866, "calib/avg_num_step_conf": 6.296875, "calib/ece": 0.22803212851405624, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5943775100401606, "calib/gap": 0.33976510067114096, "calib/mean_conf": 0.6983132530120482, "calib/mu_c": 0.834765100671141, "calib/mu_w": 0.495, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.16397590361445782, "calib/std_conf": 0.40734171817593934, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.44661474558670816, "calib/step_q_c_n": 963.0, "calib/step_q_gap": 0.05849456068686226, "calib/step_q_w": 0.3881201848998459, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2567.0, "completions/max_terminated_length": 2567.0, "completions/mean_length": 529.953125, "completions/mean_terminated_length": 529.953125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.17813333333333334, "grad_norm": 0.046685412526130676, "kl": 0.1434783935546875, "learning_rate": 9.166666666666666e-07, "loss": -0.1107, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033018916845321655, "mask/share_reasoning": 0.8381372690200806, "mask/share_step_conf": 0.12884379923343658, "num_tokens": 40974338.0, "reward": 0.9086949825286865, "reward_std": 0.263832151889801, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7043409943580627, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8075802326202393, "step": 167 }, { "adv/mean_abs_final_conf": 0.6658957600593567, "adv/mean_abs_reasoning": 0.5945309400558472, "adv/mean_abs_step_conf": 0.7367751598358154, "adv/ratio_final_to_reasoning": 1.1200355022680668, "adv/ratio_step_to_reasoning": 1.2392545285643277, "adv/std_final_conf": 0.8759635090827942, "adv/std_reasoning": 0.8100911974906921, "adv/std_step_conf": 0.9350863695144653, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8082469431153643, "calib/avg_num_step_conf": 7.3203125, "calib/ece": 0.17772357723577242, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4715447154471545, "calib/gap": 0.4370175438596491, "calib/mean_conf": 0.6041463414634146, "calib/mu_c": 0.8066666666666666, "calib/mu_w": 0.36964912280701756, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.12264227642276426, "calib/std_conf": 0.4214173275389015, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.46009720534629406, "calib/step_q_c_n": 823.0, "calib/step_q_gap": 0.17525419868597053, "calib/step_q_w": 0.28484300666032353, "calib/step_q_w_n": 1051.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2884.0, "completions/max_terminated_length": 2884.0, "completions/mean_length": 543.21484375, "completions/mean_terminated_length": 554.035888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.1792, "grad_norm": 0.04968612268567085, "kl": 0.1490478515625, "learning_rate": 8.88888888888889e-07, "loss": -0.1385, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.031077388674020767, "mask/share_reasoning": 0.820555567741394, "mask/share_step_conf": 0.12883584201335907, "num_tokens": 41218073.0, "reward": 0.9409385919570923, "reward_std": 0.24200639128684998, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7511484622955322, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8377599716186523, "step": 168 }, { "adv/mean_abs_final_conf": 0.7448919415473938, "adv/mean_abs_reasoning": 0.6670287251472473, "adv/mean_abs_step_conf": 0.7577471733093262, "adv/ratio_final_to_reasoning": 1.1167314291943846, "adv/ratio_step_to_reasoning": 1.1360038102437833, "adv/std_final_conf": 0.9090984463691711, "adv/std_reasoning": 0.8747960329055786, "adv/std_step_conf": 0.934616208076477, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7648950131233596, "calib/avg_num_step_conf": 6.8359375, "calib/ece": 0.22121457489878543, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4777327935222672, "calib/gap": 0.40010236220472434, "calib/mean_conf": 0.5932793522267207, "calib/mu_c": 0.7989999999999999, "calib/mu_w": 0.3988976377952756, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.164331983805668, "calib/std_conf": 0.4297803022442411, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.45931645569620255, "calib/step_q_c_n": 790.0, "calib/step_q_gap": 0.1249518723628692, "calib/step_q_w": 0.33436458333333335, "calib/step_q_w_n": 960.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2946.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 569.7265625, "completions/mean_terminated_length": 571.9608154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.18026666666666666, "grad_norm": 0.03378553315997124, "kl": 0.14874267578125, "learning_rate": 8.611111111111112e-07, "loss": -0.0706, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03206229209899902, "mask/share_reasoning": 0.8360965251922607, "mask/share_step_conf": 0.12793496251106262, "num_tokens": 41468107.0, "reward": 0.9184834957122803, "reward_std": 0.2511550784111023, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7157703042030334, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8368216156959534, "step": 169 }, { "adv/mean_abs_final_conf": 0.6869648098945618, "adv/mean_abs_reasoning": 0.6289730072021484, "adv/mean_abs_step_conf": 0.7552691698074341, "adv/ratio_final_to_reasoning": 1.0922007813187047, "adv/ratio_step_to_reasoning": 1.2007974287594423, "adv/std_final_conf": 0.8834342956542969, "adv/std_reasoning": 0.8429259657859802, "adv/std_step_conf": 0.9347723722457886, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.812171743697479, "calib/avg_num_step_conf": 6.87890625, "calib/ece": 0.21886639676113367, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5465587044534413, "calib/gap": 0.45621192226890755, "calib/mean_conf": 0.6547368421052631, "calib/mu_c": 0.87453125, "calib/mu_w": 0.4183193277310924, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.17769230769230776, "calib/std_conf": 0.4249189391366208, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.47638487208008895, "calib/step_q_c_n": 899.0, "calib/step_q_gap": 0.15251712266013534, "calib/step_q_w": 0.3238677494199536, "calib/step_q_w_n": 862.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2524.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 554.39453125, "completions/mean_terminated_length": 558.7598266601562, "completions/min_length": 0.0, "completions/min_terminated_length": 50.0, "epoch": 0.18133333333333335, "grad_norm": 0.03549743816256523, "kl": 0.141571044921875, "learning_rate": 8.333333333333333e-07, "loss": -0.1443, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.030027873814105988, "mask/share_reasoning": 0.8298295140266418, "mask/share_step_conf": 0.13233011960983276, "num_tokens": 41714184.0, "reward": 0.924481213092804, "reward_std": 0.2552022635936737, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7320507764816284, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8270678520202637, "step": 170 }, { "adv/mean_abs_final_conf": 0.7088747024536133, "adv/mean_abs_reasoning": 0.6031726002693176, "adv/mean_abs_step_conf": 0.7413415312767029, "adv/ratio_final_to_reasoning": 1.1752435407992663, "adv/ratio_step_to_reasoning": 1.2290703041645006, "adv/std_final_conf": 0.9034948348999023, "adv/std_reasoning": 0.8428179621696472, "adv/std_step_conf": 0.9349961280822754, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7144636015325669, "calib/avg_num_step_conf": 6.56640625, "calib/ece": 0.3018181818181818, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4743083003952569, "calib/gap": 0.2947420178799488, "calib/mean_conf": 0.5753359683794467, "calib/mu_c": 0.7442592592592592, "calib/mu_w": 0.4495172413793104, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22513833992094867, "calib/std_conf": 0.43983870760345284, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4494075144508671, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.13023663477442626, "calib/step_q_w": 0.3191708796764408, "calib/step_q_w_n": 989.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 520.73828125, "completions/mean_terminated_length": 520.73828125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.1824, "grad_norm": 0.030517173931002617, "kl": 0.145599365234375, "learning_rate": 8.055555555555557e-07, "loss": -0.0771, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03335035592317581, "mask/share_reasoning": 0.830026388168335, "mask/share_step_conf": 0.1366232931613922, "num_tokens": 41954389.0, "reward": 0.8957168459892273, "reward_std": 0.23421956598758698, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.6607366800308228, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8510094881057739, "step": 171 }, { "adv/mean_abs_final_conf": 0.7520140409469604, "adv/mean_abs_reasoning": 0.5973731279373169, "adv/mean_abs_step_conf": 0.7457846403121948, "adv/ratio_final_to_reasoning": 1.2588682111356542, "adv/ratio_step_to_reasoning": 1.248440221754419, "adv/std_final_conf": 0.9215742349624634, "adv/std_reasoning": 0.8100011348724365, "adv/std_step_conf": 0.9346137642860413, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7111765887631419, "calib/avg_num_step_conf": 6.78515625, "calib/ece": 0.2620149051490514, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5447154471544715, "calib/gap": 0.28152911895354804, "calib/mean_conf": 0.6972452574525745, "calib/mu_c": 0.8219878345498783, "calib/mu_w": 0.5404587155963303, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2011747967479674, "calib/std_conf": 0.3830020412565528, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.440875, "calib/step_q_c_n": 880.0, "calib/step_q_gap": 0.09097348308051345, "calib/step_q_w": 0.34990151691948657, "calib/step_q_w_n": 857.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 525.3359375, "completions/mean_terminated_length": 531.5652465820312, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.18346666666666667, "grad_norm": 0.033332545310258865, "kl": 0.1470794677734375, "learning_rate": 7.777777777777779e-07, "loss": -0.078, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.033402156084775925, "mask/share_reasoning": 0.8161604404449463, "mask/share_step_conf": 0.13871869444847107, "num_tokens": 42192227.0, "reward": 0.9112281799316406, "reward_std": 0.25344720482826233, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.693537712097168, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8296998739242554, "step": 172 }, { "adv/mean_abs_final_conf": 0.7470325231552124, "adv/mean_abs_reasoning": 0.6900979280471802, "adv/mean_abs_step_conf": 0.7407292127609253, "adv/ratio_final_to_reasoning": 1.0825021968536324, "adv/ratio_step_to_reasoning": 1.0733682607293433, "adv/std_final_conf": 0.9220872521400452, "adv/std_reasoning": 0.8749023079872131, "adv/std_step_conf": 0.9353823661804199, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7292431561996779, "calib/avg_num_step_conf": 6.66015625, "calib/ece": 0.26159999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.604, "calib/gap": 0.31702415458937194, "calib/mean_conf": 0.70928, "calib/mu_c": 0.855111111111111, "calib/mu_w": 0.5380869565217391, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.2154399999999999, "calib/std_conf": 0.3978244356496971, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.42527352297593, "calib/step_q_c_n": 914.0, "calib/step_q_gap": 0.03423686052333835, "calib/step_q_w": 0.39103666245259167, "calib/step_q_w_n": 791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 568.78515625, "completions/mean_terminated_length": 568.78515625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.18453333333333333, "grad_norm": 0.03139273077249527, "kl": 0.147369384765625, "learning_rate": 7.5e-07, "loss": -0.095, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03458606079220772, "mask/share_reasoning": 0.834449052810669, "mask/share_step_conf": 0.13096490502357483, "num_tokens": 42440996.0, "reward": 0.8875725269317627, "reward_std": 0.2774200439453125, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.681814432144165, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7972366809844971, "step": 173 }, { "adv/mean_abs_final_conf": 0.7353447675704956, "adv/mean_abs_reasoning": 0.6441205739974976, "adv/mean_abs_step_conf": 0.7374603748321533, "adv/ratio_final_to_reasoning": 1.141625958330827, "adv/ratio_step_to_reasoning": 1.1449104478302512, "adv/std_final_conf": 0.8920470476150513, "adv/std_reasoning": 0.8101984262466431, "adv/std_step_conf": 0.9349736571311951, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7004191863661353, "calib/avg_num_step_conf": 6.8046875, "calib/ece": 0.25469135802469134, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.411522633744856, "calib/gap": 0.30320368334249603, "calib/mean_conf": 0.5339506172839507, "calib/mu_c": 0.7036448598130842, "calib/mu_w": 0.4004411764705882, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.17415637860082303, "calib/std_conf": 0.4292907702974546, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.43064371257485035, "calib/step_q_c_n": 668.0, "calib/step_q_gap": 0.10949846117820233, "calib/step_q_w": 0.321145251396648, "calib/step_q_w_n": 1074.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2698.0, "completions/max_terminated_length": 2698.0, "completions/mean_length": 589.47265625, "completions/mean_terminated_length": 594.1141967773438, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.1856, "grad_norm": 0.06488266587257385, "kl": 0.141143798828125, "learning_rate": 7.222222222222222e-07, "loss": -0.1133, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.02945755049586296, "mask/share_reasoning": 0.839420735836029, "mask/share_step_conf": 0.12330922484397888, "num_tokens": 42696133.0, "reward": 0.8655682802200317, "reward_std": 0.25477084517478943, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.6679195165634155, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7921232581138611, "step": 174 }, { "adv/mean_abs_final_conf": 0.6823322176933289, "adv/mean_abs_reasoning": 0.5172361135482788, "adv/mean_abs_step_conf": 0.7226990461349487, "adv/ratio_final_to_reasoning": 1.3191890508427153, "adv/ratio_step_to_reasoning": 1.3972323803478814, "adv/std_final_conf": 0.8660195469856262, "adv/std_reasoning": 0.7754583358764648, "adv/std_step_conf": 0.9350190758705139, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8058183538315988, "calib/avg_num_step_conf": 7.16015625, "calib/ece": 0.19823293172690765, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.3534136546184739, "calib/gap": 0.4353682930125693, "calib/mean_conf": 0.4786345381526105, "calib/mu_c": 0.7426530612244898, "calib/mu_w": 0.3072847682119205, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.14164658634538152, "calib/std_conf": 0.43176733909865955, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4574626865671642, "calib/step_q_c_n": 603.0, "calib/step_q_gap": 0.18419520689236746, "calib/step_q_w": 0.27326747967479675, "calib/step_q_w_n": 1230.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2934.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 573.8203125, "completions/mean_terminated_length": 578.3385620117188, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.18666666666666668, "grad_norm": 0.03821876645088196, "kl": 0.139007568359375, "learning_rate": 6.944444444444446e-07, "loss": -0.0861, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030831394717097282, "mask/share_reasoning": 0.8289897441864014, "mask/share_step_conf": 0.1323663592338562, "num_tokens": 42948855.0, "reward": 0.9255420565605164, "reward_std": 0.23187220096588135, "rewards/accuracy_reward_step": 0.3828125, "rewards/final_brier_reward_step": 0.7417949438095093, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8421017527580261, "step": 175 }, { "adv/mean_abs_final_conf": 0.7288157939910889, "adv/mean_abs_reasoning": 0.6108537912368774, "adv/mean_abs_step_conf": 0.7525293827056885, "adv/ratio_final_to_reasoning": 1.193110044410722, "adv/ratio_step_to_reasoning": 1.2319304447336596, "adv/std_final_conf": 0.9071044325828552, "adv/std_reasoning": 0.8099657893180847, "adv/std_step_conf": 0.9341826438903809, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7841346153846154, "calib/avg_num_step_conf": 6.4765625, "calib/ece": 0.19042666666666674, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.368, "calib/gap": 0.41957905982905985, "calib/mean_conf": 0.5190133333333332, "calib/mu_c": 0.7371944444444445, "calib/mu_w": 0.31761538461538463, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.11472000000000007, "calib/std_conf": 0.4250891721352905, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.43961379310344834, "calib/step_q_c_n": 725.0, "calib/step_q_gap": 0.14185387884835732, "calib/step_q_w": 0.297759914255091, "calib/step_q_w_n": 933.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2674.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 551.00390625, "completions/mean_terminated_length": 553.1647338867188, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.18773333333333334, "grad_norm": 0.037594687193632126, "kl": 0.1331939697265625, "learning_rate": 6.666666666666667e-07, "loss": -0.0645, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03376203775405884, "mask/share_reasoning": 0.8257086277008057, "mask/share_step_conf": 0.1366230547428131, "num_tokens": 43193976.0, "reward": 0.9356297254562378, "reward_std": 0.23185500502586365, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.743251383304596, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8436331152915955, "step": 176 }, { "adv/mean_abs_final_conf": 0.7497272491455078, "adv/mean_abs_reasoning": 0.6292704343795776, "adv/mean_abs_step_conf": 0.7332544922828674, "adv/ratio_final_to_reasoning": 1.1914229688618587, "adv/ratio_step_to_reasoning": 1.1652454210816559, "adv/std_final_conf": 0.9185965061187744, "adv/std_reasoning": 0.8429964184761047, "adv/std_step_conf": 0.9343379139900208, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7373385012919897, "calib/avg_num_step_conf": 6.734375, "calib/ece": 0.21686746987951808, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3855421686746988, "calib/gap": 0.3612693798449613, "calib/mean_conf": 0.5465863453815262, "calib/mu_c": 0.7337500000000001, "calib/mu_w": 0.3724806201550388, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.14076305220883534, "calib/std_conf": 0.4245110506249608, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4502612481857765, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.1357974800698345, "calib/step_q_w": 0.314463768115942, "calib/step_q_w_n": 1035.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 542.15625, "completions/mean_terminated_length": 546.4251708984375, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.1888, "grad_norm": 0.03646094724535942, "kl": 0.137420654296875, "learning_rate": 6.388888888888889e-07, "loss": -0.0679, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033001139760017395, "mask/share_reasoning": 0.8247432112693787, "mask/share_step_conf": 0.13444316387176514, "num_tokens": 43436600.0, "reward": 0.9250853061676025, "reward_std": 0.22668462991714478, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7222949266433716, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8411569595336914, "step": 177 }, { "adv/mean_abs_final_conf": 0.6748043894767761, "adv/mean_abs_reasoning": 0.5698914527893066, "adv/mean_abs_step_conf": 0.7398583889007568, "adv/ratio_final_to_reasoning": 1.1840928411436566, "adv/ratio_step_to_reasoning": 1.298244402999124, "adv/std_final_conf": 0.8798059225082397, "adv/std_reasoning": 0.7929153442382812, "adv/std_step_conf": 0.9339677095413208, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8274903137107861, "calib/avg_num_step_conf": 6.19140625, "calib/ece": 0.13019762845849808, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.391304347826087, "calib/gap": 0.5169072615923009, "calib/mean_conf": 0.5277470355731225, "calib/mu_c": 0.7872222222222222, "calib/mu_w": 0.2703149606299213, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.07996047430830047, "calib/std_conf": 0.4283981368738465, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.44836879432624116, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.16823243068987753, "calib/step_q_w": 0.28013636363636363, "calib/step_q_w_n": 880.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2337.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 505.48046875, "completions/mean_terminated_length": 507.4627685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.18986666666666666, "grad_norm": 0.031120702624320984, "kl": 0.1497802734375, "learning_rate": 6.111111111111112e-07, "loss": -0.0717, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03448161482810974, "mask/share_reasoning": 0.8300520181655884, "mask/share_step_conf": 0.13156011700630188, "num_tokens": 43672075.0, "reward": 0.9877314567565918, "reward_std": 0.1797032356262207, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.8070328235626221, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8746801614761353, "step": 178 }, { "adv/mean_abs_final_conf": 0.7142655849456787, "adv/mean_abs_reasoning": 0.6147536039352417, "adv/mean_abs_step_conf": 0.7718060612678528, "adv/ratio_final_to_reasoning": 1.1618729526324496, "adv/ratio_step_to_reasoning": 1.2554722027284855, "adv/std_final_conf": 0.8914516568183899, "adv/std_reasoning": 0.8100534677505493, "adv/std_step_conf": 0.9345685839653015, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.770054908263024, "calib/avg_num_step_conf": 6.53125, "calib/ece": 0.2055510204081633, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.39183673469387753, "calib/gap": 0.40611557519753577, "calib/mean_conf": 0.5335510204081632, "calib/mu_c": 0.7225190839694656, "calib/mu_w": 0.31640350877192985, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10220408163265306, "calib/std_conf": 0.431957153072843, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4421166032953105, "calib/step_q_c_n": 789.0, "calib/step_q_gap": 0.14329440623981782, "calib/step_q_w": 0.2988221970554927, "calib/step_q_w_n": 883.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 550.17578125, "completions/mean_terminated_length": 556.6996459960938, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.19093333333333334, "grad_norm": 0.03112025186419487, "kl": 0.1467742919921875, "learning_rate": 5.833333333333334e-07, "loss": -0.0728, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.0336512066423893, "mask/share_reasoning": 0.8248656988143921, "mask/share_step_conf": 0.12976431846618652, "num_tokens": 43919184.0, "reward": 0.9264711141586304, "reward_std": 0.23171232640743256, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7263144254684448, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8344402313232422, "step": 179 }, { "adv/mean_abs_final_conf": 0.7504141926765442, "adv/mean_abs_reasoning": 0.6587625741958618, "adv/mean_abs_step_conf": 0.7519873380661011, "adv/ratio_final_to_reasoning": 1.1391269359716734, "adv/ratio_step_to_reasoning": 1.1415149668817128, "adv/std_final_conf": 0.8830128908157349, "adv/std_reasoning": 0.8430957198143005, "adv/std_step_conf": 0.9344708919525146, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6929724234337273, "calib/avg_num_step_conf": 6.7265625, "calib/ece": 0.2840637450199204, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4063745019920319, "calib/gap": 0.2824666412504766, "calib/mean_conf": 0.5317131474103586, "calib/mu_c": 0.6768852459016393, "calib/mu_w": 0.39441860465116274, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.16486055776892433, "calib/std_conf": 0.4366340073345297, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.41103666245259163, "calib/step_q_c_n": 791.0, "calib/step_q_gap": 0.10133204376301058, "calib/step_q_w": 0.30970461868958105, "calib/step_q_w_n": 931.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2782.0, "completions/max_terminated_length": 2782.0, "completions/mean_length": 622.55078125, "completions/mean_terminated_length": 622.55078125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.192, "grad_norm": 0.0328676737844944, "kl": 0.135284423828125, "learning_rate": 5.555555555555555e-07, "loss": -0.063, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02904374897480011, "mask/share_reasoning": 0.8486019372940063, "mask/share_step_conf": 0.12235426902770996, "num_tokens": 44182413.0, "reward": 0.8859990835189819, "reward_std": 0.24790097773075104, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6629855632781982, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8238563537597656, "step": 180 }, { "adv/mean_abs_final_conf": 0.6953284740447998, "adv/mean_abs_reasoning": 0.5843336582183838, "adv/mean_abs_step_conf": 0.7292240858078003, "adv/ratio_final_to_reasoning": 1.1899510908969988, "adv/ratio_step_to_reasoning": 1.247958380544402, "adv/std_final_conf": 0.9049976468086243, "adv/std_reasoning": 0.8265220522880554, "adv/std_step_conf": 0.934059739112854, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8302599564604944, "calib/avg_num_step_conf": 6.78125, "calib/ece": 0.13549800796812747, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.3147410358565737, "calib/gap": 0.49808810347035476, "calib/mean_conf": 0.46848605577689245, "calib/mu_c": 0.7403508771929825, "calib/mu_w": 0.24226277372262778, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.07490039840637447, "calib/std_conf": 0.4211595863991668, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.43901217861975644, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.1599359499336983, "calib/step_q_w": 0.27907622868605814, "calib/step_q_w_n": 997.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 507.296875, "completions/mean_terminated_length": 511.2913513183594, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.19306666666666666, "grad_norm": 0.03237759321928024, "kl": 0.1660614013671875, "learning_rate": 5.277777777777779e-07, "loss": -0.1071, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032008811831474304, "mask/share_reasoning": 0.8223521709442139, "mask/share_step_conf": 0.13782651722431183, "num_tokens": 44418545.0, "reward": 0.9796043634414673, "reward_std": 0.191335991024971, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.8015214800834656, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8733123540878296, "step": 181 }, { "adv/mean_abs_final_conf": 0.7607433795928955, "adv/mean_abs_reasoning": 0.6217296123504639, "adv/mean_abs_step_conf": 0.7305428981781006, "adv/ratio_final_to_reasoning": 1.2235919996103881, "adv/ratio_step_to_reasoning": 1.1750170551089973, "adv/std_final_conf": 0.9192807078361511, "adv/std_reasoning": 0.8429564833641052, "adv/std_step_conf": 0.9337859749794006, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.713858868404323, "calib/avg_num_step_conf": 6.73046875, "calib/ece": 0.23800796812749003, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.44223107569721115, "calib/gap": 0.3338232676414495, "calib/mean_conf": 0.5732270916334661, "calib/mu_c": 0.7341538461538462, "calib/mu_w": 0.4003305785123967, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14665338645418324, "calib/std_conf": 0.4292033111404569, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.40531690140845067, "calib/step_q_c_n": 852.0, "calib/step_q_gap": 0.08159703918112571, "calib/step_q_w": 0.32371986222732496, "calib/step_q_w_n": 871.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 572.48828125, "completions/mean_terminated_length": 572.48828125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.19413333333333332, "grad_norm": 0.057896681129932404, "kl": 0.139312744140625, "learning_rate": 5.000000000000001e-07, "loss": -0.0456, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03143026679754257, "mask/share_reasoning": 0.8360381126403809, "mask/share_step_conf": 0.13253158330917358, "num_tokens": 44671262.0, "reward": 0.9323064088821411, "reward_std": 0.24068082869052887, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7115858793258667, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8561519384384155, "step": 182 }, { "adv/mean_abs_final_conf": 0.6727540493011475, "adv/mean_abs_reasoning": 0.5549894571304321, "adv/mean_abs_step_conf": 0.7398919463157654, "adv/ratio_final_to_reasoning": 1.2121924852043426, "adv/ratio_step_to_reasoning": 1.333163967008256, "adv/std_final_conf": 0.8799149990081787, "adv/std_reasoning": 0.8099837303161621, "adv/std_step_conf": 0.9335006475448608, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8204834605597965, "calib/avg_num_step_conf": 6.47265625, "calib/ece": 0.1960159362549801, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.32669322709163345, "calib/gap": 0.4512843511450381, "calib/mean_conf": 0.4352191235059761, "calib/mu_c": 0.67075, "calib/mu_w": 0.21946564885496184, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.07657370517928286, "calib/std_conf": 0.43562595412257965, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4167121418826739, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.12872512889566096, "calib/step_q_w": 0.28798701298701296, "calib/step_q_w_n": 924.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 523.46875, "completions/mean_terminated_length": 527.590576171875, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.1952, "grad_norm": 0.02976737916469574, "kl": 0.143646240234375, "learning_rate": 4.7222222222222226e-07, "loss": -0.1384, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03223654627799988, "mask/share_reasoning": 0.8284767270088196, "mask/share_step_conf": 0.13147422671318054, "num_tokens": 44911950.0, "reward": 0.9484266638755798, "reward_std": 0.2068886160850525, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7585452795028687, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8515892028808594, "step": 183 }, { "adv/mean_abs_final_conf": 0.6752747893333435, "adv/mean_abs_reasoning": 0.5506070852279663, "adv/mean_abs_step_conf": 0.7362768650054932, "adv/ratio_final_to_reasoning": 1.226418633995168, "adv/ratio_step_to_reasoning": 1.3372092091780012, "adv/std_final_conf": 0.8800711035728455, "adv/std_reasoning": 0.7928036451339722, "adv/std_step_conf": 0.9339790344238281, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8077777777777778, "calib/avg_num_step_conf": 6.71484375, "calib/ece": 0.1896031746031746, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.44047619047619047, "calib/gap": 0.48925490196078436, "calib/mean_conf": 0.5456349206349206, "calib/mu_c": 0.7436666666666667, "calib/mu_w": 0.25441176470588234, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.07000000000000002, "calib/std_conf": 0.437287809825313, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4014785214785215, "calib/step_q_c_n": 1001.0, "calib/step_q_gap": 0.11227239334481676, "calib/step_q_w": 0.2892061281337047, "calib/step_q_w_n": 718.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 521.51171875, "completions/mean_terminated_length": 525.6181030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.19626666666666667, "grad_norm": 0.03793696314096451, "kl": 0.159942626953125, "learning_rate": 4.444444444444445e-07, "loss": 0.0157, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031797416508197784, "mask/share_reasoning": 0.8241387605667114, "mask/share_step_conf": 0.1362513303756714, "num_tokens": 45150737.0, "reward": 0.9748945832252502, "reward_std": 0.18860071897506714, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7809671759605408, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8563219308853149, "step": 184 }, { "adv/mean_abs_final_conf": 0.7041515111923218, "adv/mean_abs_reasoning": 0.5635975003242493, "adv/mean_abs_step_conf": 0.737541139125824, "adv/ratio_final_to_reasoning": 1.2493872147892935, "adv/ratio_step_to_reasoning": 1.3086309621698133, "adv/std_final_conf": 0.896740734577179, "adv/std_reasoning": 0.8265684843063354, "adv/std_step_conf": 0.9342312216758728, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8083511063716342, "calib/avg_num_step_conf": 6.796875, "calib/ece": 0.18306081632653065, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.4530612244897959, "calib/gap": 0.4999691815515862, "calib/mean_conf": 0.5642861224489795, "calib/mu_c": 0.8112096774193548, "calib/mu_w": 0.3112404958677686, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.1206122448979592, "calib/std_conf": 0.4405038356191336, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.44092207792207794, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.18606640781898515, "calib/step_q_w": 0.2548556701030928, "calib/step_q_w_n": 970.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 556.76953125, "completions/mean_terminated_length": 558.9530029296875, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.19733333333333333, "grad_norm": 0.028952667489647865, "kl": 0.13585662841796875, "learning_rate": 4.1666666666666667e-07, "loss": 0.0111, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03193461149930954, "mask/share_reasoning": 0.8293842077255249, "mask/share_step_conf": 0.13477492332458496, "num_tokens": 45400190.0, "reward": 0.9280073642730713, "reward_std": 0.24359695613384247, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7500066757202148, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8224143981933594, "step": 185 }, { "adv/mean_abs_final_conf": 0.7179276347160339, "adv/mean_abs_reasoning": 0.5549343824386597, "adv/mean_abs_step_conf": 0.7462427616119385, "adv/ratio_final_to_reasoning": 1.2937162616616045, "adv/ratio_step_to_reasoning": 1.3447405409132769, "adv/std_final_conf": 0.9043472409248352, "adv/std_reasoning": 0.8097760081291199, "adv/std_step_conf": 0.9339725375175476, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.77057535959975, "calib/avg_num_step_conf": 6.828125, "calib/ece": 0.20245059288537548, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.41106719367588934, "calib/gap": 0.42446341463414633, "calib/mean_conf": 0.5266403162055336, "calib/mu_c": 0.733, "calib/mu_w": 0.30853658536585366, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10762845849802372, "calib/std_conf": 0.4359000695429722, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4225626716604245, "calib/step_q_c_n": 801.0, "calib/step_q_gap": 0.1352553854935818, "calib/step_q_w": 0.2873072861668427, "calib/step_q_w_n": 947.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2131.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 553.19921875, "completions/mean_terminated_length": 553.19921875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.1984, "grad_norm": 0.03475416079163551, "kl": 0.146728515625, "learning_rate": 3.8888888888888895e-07, "loss": -0.1265, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03105587139725685, "mask/share_reasoning": 0.8362254500389099, "mask/share_step_conf": 0.13271866738796234, "num_tokens": 45646849.0, "reward": 0.960451602935791, "reward_std": 0.20122194290161133, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7569808959960938, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8662659525871277, "step": 186 }, { "adv/mean_abs_final_conf": 0.7896785736083984, "adv/mean_abs_reasoning": 0.679695188999176, "adv/mean_abs_step_conf": 0.7941338419914246, "adv/ratio_final_to_reasoning": 1.1618128043118394, "adv/ratio_step_to_reasoning": 1.1683676077812981, "adv/std_final_conf": 0.9174456000328064, "adv/std_reasoning": 0.8591005206108093, "adv/std_step_conf": 0.9346184134483337, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6905064183946961, "calib/avg_num_step_conf": 7.2734375, "calib/ece": 0.28705394190871375, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.3941908713692946, "calib/gap": 0.2636711807024969, "calib/mean_conf": 0.5146887966804979, "calib/mu_c": 0.666764705882353, "calib/mu_w": 0.40309352517985614, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.18925311203319506, "calib/std_conf": 0.4275016762955222, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.42846666666666666, "calib/step_q_c_n": 600.0, "calib/step_q_gap": 0.14417982039091393, "calib/step_q_w": 0.28428684627575274, "calib/step_q_w_n": 1262.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 667.828125, "completions/mean_terminated_length": 667.828125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.19946666666666665, "grad_norm": 0.029744060710072517, "kl": 0.123138427734375, "learning_rate": 3.611111111111111e-07, "loss": 0.1127, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.028970632702112198, "mask/share_reasoning": 0.8479760885238647, "mask/share_step_conf": 0.12305329740047455, "num_tokens": 45919357.0, "reward": 0.8574899435043335, "reward_std": 0.24872718751430511, "rewards/accuracy_reward_step": 0.3984375, "rewards/final_brier_reward_step": 0.6425703167915344, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.8075656890869141, "step": 187 }, { "adv/mean_abs_final_conf": 0.7544801235198975, "adv/mean_abs_reasoning": 0.6453492045402527, "adv/mean_abs_step_conf": 0.7617857456207275, "adv/ratio_final_to_reasoning": 1.1691036700934492, "adv/ratio_step_to_reasoning": 1.1804240870854166, "adv/std_final_conf": 0.9098480939865112, "adv/std_reasoning": 0.8589980602264404, "adv/std_step_conf": 0.9342439770698547, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7976353332030487, "calib/avg_num_step_conf": 6.9375, "calib/ece": 0.20193548387096763, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.3588709677419355, "calib/gap": 0.4378411829848219, "calib/mean_conf": 0.4895967741935484, "calib/mu_c": 0.6996899224806202, "calib/mu_w": 0.26184873949579834, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08568548387096765, "calib/std_conf": 0.4361837789531521, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.41524390243902437, "calib/step_q_c_n": 820.0, "calib/step_q_gap": 0.11278574344320852, "calib/step_q_w": 0.30245815899581585, "calib/step_q_w_n": 956.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 599.65625, "completions/mean_terminated_length": 602.0078735351562, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.20053333333333334, "grad_norm": 0.03370795398950577, "kl": 0.1395416259765625, "learning_rate": 3.3333333333333335e-07, "loss": 0.0514, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03147049620747566, "mask/share_reasoning": 0.8312385678291321, "mask/share_step_conf": 0.13338470458984375, "num_tokens": 46176941.0, "reward": 0.9409340620040894, "reward_std": 0.22601254284381866, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7418223023414612, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8478583097457886, "step": 188 }, { "adv/mean_abs_final_conf": 0.6637656092643738, "adv/mean_abs_reasoning": 0.5426450371742249, "adv/mean_abs_step_conf": 0.7172122001647949, "adv/ratio_final_to_reasoning": 1.2232040538339268, "adv/ratio_step_to_reasoning": 1.3216967834067235, "adv/std_final_conf": 0.8593613505363464, "adv/std_reasoning": 0.7928246259689331, "adv/std_step_conf": 0.9344122409820557, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8002862347124642, "calib/avg_num_step_conf": 6.57421875, "calib/ece": 0.18821236559139787, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.31451612903225806, "calib/gap": 0.453576632838928, "calib/mean_conf": 0.41638440860215054, "calib/mu_c": 0.6468306010928963, "calib/mu_w": 0.19325396825396826, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.05633064516129035, "calib/std_conf": 0.43000735707708637, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.42563421828908554, "calib/step_q_c_n": 678.0, "calib/step_q_gap": 0.19948496455774226, "calib/step_q_w": 0.22614925373134329, "calib/step_q_w_n": 1005.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 522.41796875, "completions/mean_terminated_length": 526.531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 78.0, "epoch": 0.2016, "grad_norm": 0.025214437395334244, "kl": 0.1561431884765625, "learning_rate": 3.055555555555556e-07, "loss": -0.1702, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.032530926167964935, "mask/share_reasoning": 0.833452582359314, "mask/share_step_conf": 0.1262039989233017, "num_tokens": 46418448.0, "reward": 0.9340362548828125, "reward_std": 0.21618613600730896, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7459970712661743, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8369191288948059, "step": 189 }, { "adv/mean_abs_final_conf": 0.736014723777771, "adv/mean_abs_reasoning": 0.519279956817627, "adv/mean_abs_step_conf": 0.7525043487548828, "adv/ratio_final_to_reasoning": 1.4173755680623392, "adv/ratio_step_to_reasoning": 1.4491303561311246, "adv/std_final_conf": 0.9154024720191956, "adv/std_reasoning": 0.775436520576477, "adv/std_step_conf": 0.9342000484466553, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7823073919833421, "calib/avg_num_step_conf": 6.703125, "calib/ece": 0.20297188755020082, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3092369477911647, "calib/gap": 0.40319104633003644, "calib/mean_conf": 0.4552610441767068, "calib/mu_c": 0.638235294117647, "calib/mu_w": 0.2350442477876106, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.05602409638554215, "calib/std_conf": 0.4188893712544085, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.3838032786885246, "calib/step_q_c_n": 915.0, "calib/step_q_gap": 0.10417781052373054, "calib/step_q_w": 0.27962546816479406, "calib/step_q_w_n": 801.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 563.4609375, "completions/mean_terminated_length": 565.6705932617188, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.20266666666666666, "grad_norm": 0.06035936623811722, "kl": 0.1408233642578125, "learning_rate": 2.7777777777777776e-07, "loss": -0.1432, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030015822499990463, "mask/share_reasoning": 0.8396799564361572, "mask/share_step_conf": 0.1263979971408844, "num_tokens": 46668302.0, "reward": 0.9438740015029907, "reward_std": 0.20643171668052673, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.739453136920929, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8490761518478394, "step": 190 }, { "adv/mean_abs_final_conf": 0.7668330669403076, "adv/mean_abs_reasoning": 0.6082367300987244, "adv/mean_abs_step_conf": 0.7396363019943237, "adv/ratio_final_to_reasoning": 1.2607477138314898, "adv/ratio_step_to_reasoning": 1.2160336023677354, "adv/std_final_conf": 0.9332967400550842, "adv/std_reasoning": 0.8266222476959229, "adv/std_step_conf": 0.9346864819526672, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7542235437221247, "calib/avg_num_step_conf": 7.11328125, "calib/ece": 0.21726907630522088, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.37349397590361444, "calib/gap": 0.40706649547236107, "calib/mean_conf": 0.5136546184738956, "calib/mu_c": 0.7605102040816326, "calib/mu_w": 0.35344370860927155, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.16867469879518074, "calib/std_conf": 0.438051427805417, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.45821848739495796, "calib/step_q_c_n": 595.0, "calib/step_q_gap": 0.15850396863476218, "calib/step_q_w": 0.2997145187601958, "calib/step_q_w_n": 1226.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2903.0, "completions/max_terminated_length": 2903.0, "completions/mean_length": 576.875, "completions/mean_terminated_length": 576.875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.20373333333333332, "grad_norm": 0.04359288513660431, "kl": 0.1428985595703125, "learning_rate": 2.5000000000000004e-07, "loss": -0.0011, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03338399529457092, "mask/share_reasoning": 0.8276382684707642, "mask/share_step_conf": 0.13897772133350372, "num_tokens": 46920150.0, "reward": 0.9028844833374023, "reward_std": 0.24160155653953552, "rewards/accuracy_reward_step": 0.38671875, "rewards/final_brier_reward_step": 0.7154718637466431, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8215470314025879, "step": 191 }, { "adv/mean_abs_final_conf": 0.7420070171356201, "adv/mean_abs_reasoning": 0.6264379024505615, "adv/mean_abs_step_conf": 0.7037802934646606, "adv/ratio_final_to_reasoning": 1.1844861465645102, "adv/ratio_step_to_reasoning": 1.1234637794289641, "adv/std_final_conf": 0.916337788105011, "adv/std_reasoning": 0.8430317044258118, "adv/std_step_conf": 0.9342029094696045, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8022857142857143, "calib/avg_num_step_conf": 6.87890625, "calib/ece": 0.19637450199203188, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.36254980079681276, "calib/gap": 0.4784342857142856, "calib/mean_conf": 0.4474501992031873, "calib/mu_c": 0.6857142857142856, "calib/mu_w": 0.20728, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.07091633466135458, "calib/std_conf": 0.4440952489605836, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.38982278481012655, "calib/step_q_c_n": 790.0, "calib/step_q_gap": 0.1389473986103325, "calib/step_q_w": 0.25087538619979405, "calib/step_q_w_n": 971.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2570.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 567.75390625, "completions/mean_terminated_length": 572.2244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.2048, "grad_norm": 0.029248613864183426, "kl": 0.1450042724609375, "learning_rate": 2.2222222222222224e-07, "loss": 0.0297, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03411424905061722, "mask/share_reasoning": 0.8246064186096191, "mask/share_step_conf": 0.13346685469150543, "num_tokens": 47170471.0, "reward": 0.953478217124939, "reward_std": 0.22533637285232544, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7501965165138245, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8669161200523376, "step": 192 }, { "adv/mean_abs_final_conf": 0.766300618648529, "adv/mean_abs_reasoning": 0.6802315711975098, "adv/mean_abs_step_conf": 0.7448439002037048, "adv/ratio_final_to_reasoning": 1.1265290396614487, "adv/ratio_step_to_reasoning": 1.0949857838742307, "adv/std_final_conf": 0.9214043021202087, "adv/std_reasoning": 0.8592211008071899, "adv/std_step_conf": 0.9346270561218262, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7196492805755397, "calib/avg_num_step_conf": 6.125, "calib/ece": 0.2221513944223108, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.21115537848605578, "calib/gap": 0.3011208890030832, "calib/mean_conf": 0.4049402390438247, "calib/mu_c": 0.5716964285714285, "calib/mu_w": 0.27057553956834535, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.09043824701195222, "calib/std_conf": 0.39759573305878976, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3862333825701625, "calib/step_q_c_n": 677.0, "calib/step_q_gap": 0.06997187864199189, "calib/step_q_w": 0.3162615039281706, "calib/step_q_w_n": 891.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 509.3515625, "completions/mean_terminated_length": 511.34906005859375, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.20586666666666667, "grad_norm": 0.03636451065540314, "kl": 0.1459808349609375, "learning_rate": 1.9444444444444447e-07, "loss": -0.0495, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03263282775878906, "mask/share_reasoning": 0.8374545574188232, "mask/share_step_conf": 0.12600642442703247, "num_tokens": 47406577.0, "reward": 0.927412211894989, "reward_std": 0.2111629843711853, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.7223039269447327, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8504891991615295, "step": 193 }, { "adv/mean_abs_final_conf": 0.6695102453231812, "adv/mean_abs_reasoning": 0.5588866472244263, "adv/mean_abs_step_conf": 0.7345038652420044, "adv/ratio_final_to_reasoning": 1.1979356612796888, "adv/ratio_step_to_reasoning": 1.3142268989422776, "adv/std_final_conf": 0.8768190741539001, "adv/std_reasoning": 0.7928152680397034, "adv/std_step_conf": 0.9336667060852051, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8134920634920635, "calib/avg_num_step_conf": 6.26171875, "calib/ece": 0.16755020080321287, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.37751004016064255, "calib/gap": 0.4849632210607821, "calib/mean_conf": 0.5169477911646586, "calib/mu_c": 0.7565079365079366, "calib/mu_w": 0.27154471544715447, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08923694779116467, "calib/std_conf": 0.43803174373464926, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4300995024875622, "calib/step_q_c_n": 804.0, "calib/step_q_gap": 0.16561890173662352, "calib/step_q_w": 0.26448060075093865, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2510.0, "completions/max_terminated_length": 2510.0, "completions/mean_length": 511.00390625, "completions/mean_terminated_length": 513.0078735351562, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.20693333333333333, "grad_norm": 0.03281570225954056, "kl": 0.1405181884765625, "learning_rate": 1.6666666666666668e-07, "loss": -0.015, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0331403985619545, "mask/share_reasoning": 0.8325739502906799, "mask/share_step_conf": 0.13037940859794617, "num_tokens": 47643338.0, "reward": 0.9630546569824219, "reward_std": 0.19925682246685028, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7746968865394592, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8584436178207397, "step": 194 }, { "adv/mean_abs_final_conf": 0.7317264080047607, "adv/mean_abs_reasoning": 0.6601746082305908, "adv/mean_abs_step_conf": 0.7371601462364197, "adv/ratio_final_to_reasoning": 1.1083831442198966, "adv/ratio_step_to_reasoning": 1.1166139034219547, "adv/std_final_conf": 0.919392466545105, "adv/std_reasoning": 0.859114408493042, "adv/std_step_conf": 0.9345622062683105, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7941827893948277, "calib/avg_num_step_conf": 6.72265625, "calib/ece": 0.21443548387096772, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.3588709677419355, "calib/gap": 0.44475278483486413, "calib/mean_conf": 0.4640322580645161, "calib/mu_c": 0.6774418604651162, "calib/mu_w": 0.23268907563025207, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.07915322580645162, "calib/std_conf": 0.4345695299704894, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.4276574307304787, "calib/step_q_c_n": 794.0, "calib/step_q_gap": 0.17257436708430823, "calib/step_q_w": 0.25508306364617045, "calib/step_q_w_n": 927.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 574.30859375, "completions/mean_terminated_length": 576.560791015625, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.208, "grad_norm": 0.04601413384079933, "kl": 0.14105224609375, "learning_rate": 1.3888888888888888e-07, "loss": 0.01, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03239461034536362, "mask/share_reasoning": 0.8322038650512695, "mask/share_step_conf": 0.13149525225162506, "num_tokens": 47896345.0, "reward": 0.9279034733772278, "reward_std": 0.23658813536167145, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7333793044090271, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8325839042663574, "step": 195 }, { "adv/mean_abs_final_conf": 0.7037650346755981, "adv/mean_abs_reasoning": 0.5706368684768677, "adv/mean_abs_step_conf": 0.761438250541687, "adv/ratio_final_to_reasoning": 1.2332975199343033, "adv/ratio_step_to_reasoning": 1.3343656756250302, "adv/std_final_conf": 0.9014235138893127, "adv/std_reasoning": 0.7928955554962158, "adv/std_step_conf": 0.9342949390411377, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.702474690663667, "calib/avg_num_step_conf": 5.85546875, "calib/ece": 0.26434782608695656, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4505928853754941, "calib/gap": 0.29691163604549436, "calib/mean_conf": 0.5812648221343873, "calib/mu_c": 0.7291338582677166, "calib/mu_w": 0.43222222222222223, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17181818181818181, "calib/std_conf": 0.4268674168044643, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.44906030855539975, "calib/step_q_c_n": 713.0, "calib/step_q_gap": 0.09664300575641754, "calib/step_q_w": 0.3524173027989822, "calib/step_q_w_n": 786.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 435.875, "completions/mean_terminated_length": 437.5843505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.20906666666666668, "grad_norm": 0.02775087207555771, "kl": 0.162445068359375, "learning_rate": 1.1111111111111112e-07, "loss": -0.0394, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03831184655427933, "mask/share_reasoning": 0.8148221969604492, "mask/share_step_conf": 0.14295971393585205, "num_tokens": 48110473.0, "reward": 0.9260044693946838, "reward_std": 0.21588167548179626, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6996980309486389, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.856217086315155, "step": 196 }, { "adv/mean_abs_final_conf": 0.7061256170272827, "adv/mean_abs_reasoning": 0.5036745071411133, "adv/mean_abs_step_conf": 0.7423129677772522, "adv/ratio_final_to_reasoning": 1.4019482960042868, "adv/ratio_step_to_reasoning": 1.4737949950865394, "adv/std_final_conf": 0.8724972009658813, "adv/std_reasoning": 0.7576428651809692, "adv/std_step_conf": 0.933771014213562, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8175076064908722, "calib/avg_num_step_conf": 7.34375, "calib/ece": 0.1787698412698412, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.3412698412698413, "calib/gap": 0.49297920892494945, "calib/mean_conf": 0.4517063492063492, "calib/mu_c": 0.7177586206896553, "calib/mu_w": 0.2247794117647059, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.085079365079365, "calib/std_conf": 0.4412687617749729, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.43830136986301377, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.1627970220369268, "calib/step_q_w": 0.27550434782608696, "calib/step_q_w_n": 1150.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2272.0, "completions/max_terminated_length": 2272.0, "completions/mean_length": 564.4765625, "completions/mean_terminated_length": 566.6902465820312, "completions/min_length": 0.0, "completions/min_terminated_length": 46.0, "epoch": 0.21013333333333334, "grad_norm": 0.027106324210762978, "kl": 0.147796630859375, "learning_rate": 8.333333333333334e-08, "loss": -0.1334, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031168133020401, "mask/share_reasoning": 0.8300639986991882, "mask/share_step_conf": 0.13486161828041077, "num_tokens": 48360035.0, "reward": 0.9748759269714355, "reward_std": 0.19350528717041016, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.7844051122665405, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8794092535972595, "step": 197 }, { "adv/mean_abs_final_conf": 0.6879289150238037, "adv/mean_abs_reasoning": 0.5308552980422974, "adv/mean_abs_step_conf": 0.7343833446502686, "adv/ratio_final_to_reasoning": 1.2958878202040494, "adv/ratio_step_to_reasoning": 1.383396468601797, "adv/std_final_conf": 0.8872906565666199, "adv/std_reasoning": 0.7755385637283325, "adv/std_step_conf": 0.9340898394584656, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8280604338842976, "calib/avg_num_step_conf": 7.02734375, "calib/ece": 0.1369076305220884, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.2931726907630522, "calib/gap": 0.49805397727272727, "calib/mean_conf": 0.44124497991967876, "calib/mu_c": 0.6972727272727273, "calib/mu_w": 0.19921875, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.04610441767068277, "calib/std_conf": 0.421304695929381, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4291873589164785, "calib/step_q_c_n": 886.0, "calib/step_q_gap": 0.13000882660541613, "calib/step_q_w": 0.2991785323110624, "calib/step_q_w_n": 913.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2773.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 528.96484375, "completions/mean_terminated_length": 533.1299438476562, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.2112, "grad_norm": 0.028157765045762062, "kl": 0.1432647705078125, "learning_rate": 5.555555555555556e-08, "loss": -0.0526, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.034415990114212036, "mask/share_reasoning": 0.8118489384651184, "mask/share_step_conf": 0.14592257142066956, "num_tokens": 48600834.0, "reward": 0.9752619862556458, "reward_std": 0.17956218123435974, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7932171821594238, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.869025468826294, "step": 198 }, { "adv/mean_abs_final_conf": 0.7226166725158691, "adv/mean_abs_reasoning": 0.691262423992157, "adv/mean_abs_step_conf": 0.7821977138519287, "adv/ratio_final_to_reasoning": 1.0453579529791828, "adv/ratio_step_to_reasoning": 1.1315495920270122, "adv/std_final_conf": 0.8770599365234375, "adv/std_reasoning": 0.8748891353607178, "adv/std_step_conf": 0.9344019293785095, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7468637394901908, "calib/avg_num_step_conf": 6.82421875, "calib/ece": 0.21922448979591835, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.3224489795918367, "calib/gap": 0.34982517015881476, "calib/mean_conf": 0.4873061224489796, "calib/mu_c": 0.66864406779661, "calib/mu_w": 0.31881889763779525, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.11244897959183672, "calib/std_conf": 0.41766221114146396, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.39829530201342284, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.11073043175394182, "calib/step_q_w": 0.287564870259481, "calib/step_q_w_n": 1002.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 585.66015625, "completions/mean_terminated_length": 592.604736328125, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.21226666666666666, "grad_norm": 0.030716493725776672, "kl": 0.1441192626953125, "learning_rate": 2.777777777777778e-08, "loss": -0.1249, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.033089008182287216, "mask/share_reasoning": 0.8238986134529114, "mask/share_step_conf": 0.1312936544418335, "num_tokens": 48854963.0, "reward": 0.9157102704048157, "reward_std": 0.23339220881462097, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7084863185882568, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.841684103012085, "step": 199 }, { "adv/mean_abs_final_conf": 0.6958111524581909, "adv/mean_abs_reasoning": 0.5497347116470337, "adv/mean_abs_step_conf": 0.7385485172271729, "adv/ratio_final_to_reasoning": 1.2657216976048404, "adv/ratio_step_to_reasoning": 1.3434634953547744, "adv/std_final_conf": 0.8847732543945312, "adv/std_reasoning": 0.7929638028144836, "adv/std_step_conf": 0.9344330430030823, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8473539518900344, "calib/avg_num_step_conf": 6.02734375, "calib/ece": 0.14963562753036444, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.48582995951417, "calib/gap": 0.5701237113402062, "calib/mean_conf": 0.5821052631578948, "calib/mu_c": 0.806, "calib/mu_w": 0.2358762886597938, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.06222672064777332, "calib/std_conf": 0.44081955906755055, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.44430604982206406, "calib/step_q_c_n": 843.0, "calib/step_q_gap": 0.18290604982206404, "calib/step_q_w": 0.2614, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3050.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 546.375, "completions/mean_terminated_length": 548.5177001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 99.0, "epoch": 0.21333333333333335, "grad_norm": 0.05999818071722984, "kl": 0.1330413818359375, "learning_rate": 0.0, "loss": -0.0525, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03310883790254593, "mask/share_reasoning": 0.8392375707626343, "mask/share_step_conf": 0.12374730408191681, "num_tokens": 49102883.0, "reward": 0.9791679382324219, "reward_std": 0.22837120294570923, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7976699471473694, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.852853536605835, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 7.663144015993457, "train_runtime": 15537.4107, "train_samples_per_second": 3.295, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 49102883, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }