{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7502421140670776, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5723537492194897, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9357826709747314, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04299500212073326, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.0136, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 1.264374852180481, "reward_std": 0.26098379492759705, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420004606246948, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.770934522151947, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.5102895722914849, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9358851313591003, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.04044683277606964, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0158, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 1.198354721069336, "reward_std": 0.24474793672561646, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291916012763977, "step": 2 }, { "adv/mean_abs_final_conf": 0.7638648152351379, "adv/mean_abs_reasoning": 0.4602765142917633, "adv/mean_abs_step_conf": 0.7489376068115234, "adv/ratio_final_to_reasoning": 1.659578082993246, "adv/ratio_step_to_reasoning": 1.6271471247320293, "adv/std_final_conf": 0.9298396706581116, "adv/std_reasoning": 0.7392901182174683, "adv/std_step_conf": 0.9354903697967529, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.37831234083557025, "calib/avg_num_step_conf": 5.04296875, "calib/ece": 0.23330708661417332, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.33070866141732286, "calib/gap": -0.01997935164154463, "calib/mean_conf": 0.8830708661417324, "calib/mu_c": 0.8762275449101795, "calib/mu_w": 0.8962068965517241, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.22944881889763788, "calib/std_conf": 0.046108598460652704, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7731469440832249, "calib/step_q_c_n": 769.0, "calib/step_q_gap": -0.006010144039380494, "calib/step_q_w": 0.7791570881226054, "calib/step_q_w_n": 522.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1909.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 499.40625, "completions/mean_terminated_length": 503.3385925292969, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.04884733632206917, "kl": 0.00032722949981689453, "learning_rate": 7.5e-07, "loss": -0.0388, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033127740025520325, "mask/share_reasoning": 0.8509975671768188, "mask/share_step_conf": 0.10806218534708023, "num_tokens": 691765.0, "reward": 1.2584209442138672, "reward_std": 0.24406561255455017, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7049156427383423, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7422913312911987, "step": 3 }, { "adv/mean_abs_final_conf": 0.7799098491668701, "adv/mean_abs_reasoning": 0.49395906925201416, "adv/mean_abs_step_conf": 0.7630683183670044, "adv/ratio_final_to_reasoning": 1.5788956974671235, "adv/ratio_step_to_reasoning": 1.5448007048893615, "adv/std_final_conf": 0.930604100227356, "adv/std_reasoning": 0.739273726940155, "adv/std_step_conf": 0.935773491859436, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5133688061190556, "calib/avg_num_step_conf": 4.98046875, "calib/ece": 0.26297619047619053, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2619047619047619, "calib/gap": 0.002369803791153924, "calib/mean_conf": 0.8780555555555557, "calib/mu_c": 0.8789677419354839, "calib/mu_w": 0.87659793814433, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26297619047619053, "calib/std_conf": 0.04478109109802321, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7936363636363637, "calib/step_q_c_n": 759.0, "calib/step_q_gap": 0.012570472163495494, "calib/step_q_w": 0.7810658914728682, "calib/step_q_w_n": 516.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 513.3203125, "completions/mean_terminated_length": 515.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.004266666666666667, "grad_norm": 0.04170985519886017, "kl": 0.0006768703460693359, "learning_rate": 1.0000000000000002e-06, "loss": -0.0059, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03208368271589279, "mask/share_reasoning": 0.8533977270126343, "mask/share_step_conf": 0.11061234772205353, "num_tokens": 929343.0, "reward": 1.2184948921203613, "reward_std": 0.2545901834964752, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6823722124099731, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7179338335990906, "step": 4 }, { "adv/mean_abs_final_conf": 0.7979696989059448, "adv/mean_abs_reasoning": 0.4005493223667145, "adv/mean_abs_step_conf": 0.7814561128616333, "adv/ratio_final_to_reasoning": 1.9921883631983293, "adv/ratio_step_to_reasoning": 1.9509610158476005, "adv/std_final_conf": 0.9299609065055847, "adv/std_reasoning": 0.640365481376648, "adv/std_step_conf": 0.9355432391166687, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5075853350189633, "calib/avg_num_step_conf": 4.875, "calib/ece": 0.33939024390243894, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3089430894308943, "calib/gap": 0.00040255506021680265, "calib/mean_conf": 0.8800406504065041, "calib/mu_c": 0.8802255639097744, "calib/mu_w": 0.8798230088495576, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.33939024390243894, "calib/std_conf": 0.0459364882111211, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.7968796433878157, "calib/step_q_c_n": 673.0, "calib/step_q_gap": 0.011749208605206896, "calib/step_q_w": 0.7851304347826088, "calib/step_q_w_n": 575.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 531.83984375, "completions/mean_terminated_length": 531.83984375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.005333333333333333, "grad_norm": 0.04463869705796242, "kl": 0.00034159421920776367, "learning_rate": 1.25e-06, "loss": 0.0342, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03385140001773834, "mask/share_reasoning": 0.8535634279251099, "mask/share_step_conf": 0.11258512735366821, "num_tokens": 1172182.0, "reward": 1.1342566013336182, "reward_std": 0.20135310292243958, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6097691059112549, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6813250780105591, "step": 5 }, { "adv/mean_abs_final_conf": 0.7624006271362305, "adv/mean_abs_reasoning": 0.37178319692611694, "adv/mean_abs_step_conf": 0.7744120359420776, "adv/ratio_final_to_reasoning": 2.0506591837385795, "adv/ratio_step_to_reasoning": 2.0829667460629575, "adv/std_final_conf": 0.9308306574821472, "adv/std_reasoning": 0.6402367949485779, "adv/std_step_conf": 0.9357687830924988, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.45032532532532527, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.29960937499999996, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.25390625, "calib/gap": -0.009524524524524569, "calib/mean_conf": 0.877734375, "calib/mu_c": 0.8737162162162162, "calib/mu_w": 0.8832407407407408, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29960937499999996, "calib/std_conf": 0.04397646465280463, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7874965986394558, "calib/step_q_c_n": 735.0, "calib/step_q_gap": -0.007189812510369986, "calib/step_q_w": 0.7946864111498257, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 428.1796875, "completions/mean_terminated_length": 429.8588562011719, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.0064, "grad_norm": 0.04509090259671211, "kl": 0.007303744554519653, "learning_rate": 1.5e-06, "loss": -0.0218, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0379551500082016, "mask/share_reasoning": 0.8295083045959473, "mask/share_step_conf": 0.12863025069236755, "num_tokens": 1387748.0, "reward": 1.2086174488067627, "reward_std": 0.20253591239452362, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6597577929496765, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.7209259867668152, "step": 6 }, { "adv/mean_abs_final_conf": 0.7563202977180481, "adv/mean_abs_reasoning": 0.45961794257164, "adv/mean_abs_step_conf": 0.733751654624939, "adv/ratio_final_to_reasoning": 1.6455412804084808, "adv/ratio_step_to_reasoning": 1.5964382298033766, "adv/std_final_conf": 0.9309816360473633, "adv/std_reasoning": 0.7393760681152344, "adv/std_step_conf": 0.9360485672950745, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.3975895502289254, "calib/avg_num_step_conf": 5.3359375, "calib/ece": 0.2612698412698413, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2619047619047619, "calib/gap": -0.01635335308376007, "calib/mean_conf": 0.8776190476190477, "calib/mu_c": 0.8715189873417721, "calib/mu_w": 0.8878723404255322, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.255952380952381, "calib/std_conf": 0.046750410768564156, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7927122641509434, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.02499025642893571, "calib/step_q_w": 0.7677220077220077, "calib/step_q_w_n": 518.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2233.0, "completions/max_terminated_length": 2233.0, "completions/mean_length": 516.0234375, "completions/mean_terminated_length": 520.0866088867188, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.007466666666666667, "grad_norm": 0.04311549291014671, "kl": 0.0002957582473754883, "learning_rate": 1.75e-06, "loss": 0.0227, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031259916722774506, "mask/share_reasoning": 0.8517844080924988, "mask/share_step_conf": 0.10914316773414612, "num_tokens": 1627274.0, "reward": 1.2275378704071045, "reward_std": 0.27290546894073486, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6782523393630981, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7290366888046265, "step": 7 }, { "adv/mean_abs_final_conf": 0.7779233455657959, "adv/mean_abs_reasoning": 0.45598095655441284, "adv/mean_abs_step_conf": 0.7926830053329468, "adv/ratio_final_to_reasoning": 1.7060434967374898, "adv/ratio_step_to_reasoning": 1.7384125234588712, "adv/std_final_conf": 0.9291698336601257, "adv/std_reasoning": 0.720556914806366, "adv/std_step_conf": 0.9360244870185852, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4444444444444444, "calib/avg_num_step_conf": 4.72265625, "calib/ece": 0.33764, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.272, "calib/gap": -0.008557165861513893, "calib/mean_conf": 0.8736399999999999, "calib/mu_c": 0.8697037037037036, "calib/mu_w": 0.8782608695652175, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.33564, "calib/std_conf": 0.05211478101268392, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7928643216080402, "calib/step_q_c_n": 597.0, "calib/step_q_gap": 0.03082183794790949, "calib/step_q_w": 0.7620424836601307, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 524.265625, "completions/mean_terminated_length": 524.265625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.008533333333333334, "grad_norm": 0.03729280084371567, "kl": 0.000550001859664917, "learning_rate": 2.0000000000000003e-06, "loss": 0.0413, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03387031704187393, "mask/share_reasoning": 0.8588097095489502, "mask/share_step_conf": 0.10732000321149826, "num_tokens": 1867998.0, "reward": 1.191420555114746, "reward_std": 0.2365012764930725, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6184738874435425, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7317929863929749, "step": 8 }, { "adv/mean_abs_final_conf": 0.7659152150154114, "adv/mean_abs_reasoning": 0.37201279401779175, "adv/mean_abs_step_conf": 0.7619096040725708, "adv/ratio_final_to_reasoning": 2.058841059586733, "adv/ratio_step_to_reasoning": 2.0480736585530765, "adv/std_final_conf": 0.9287598133087158, "adv/std_reasoning": 0.6404236555099487, "adv/std_step_conf": 0.9355725646018982, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4804807889870557, "calib/avg_num_step_conf": 4.87109375, "calib/ece": 0.25428, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.288, "calib/gap": 0.005627696733100596, "calib/mean_conf": 0.87676, "calib/mu_c": 0.8788535031847134, "calib/mu_w": 0.8732258064516129, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.25152, "calib/std_conf": 0.07093872285289607, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7624759284731774, "calib/step_q_c_n": 727.0, "calib/step_q_gap": 0.013918236165485043, "calib/step_q_w": 0.7485576923076923, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 495.79296875, "completions/mean_terminated_length": 503.6627197265625, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.0096, "grad_norm": 0.06231572851538658, "kl": 0.000376969575881958, "learning_rate": 2.25e-06, "loss": -0.0138, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03376583009958267, "mask/share_reasoning": 0.8470334410667419, "mask/share_step_conf": 0.1035757064819336, "num_tokens": 2102457.0, "reward": 1.214963674545288, "reward_std": 0.2494477927684784, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6849726438522339, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.713883638381958, "step": 9 }, { "adv/mean_abs_final_conf": 0.7786163091659546, "adv/mean_abs_reasoning": 0.5259579420089722, "adv/mean_abs_step_conf": 0.7487469911575317, "adv/ratio_final_to_reasoning": 1.4803775111597657, "adv/ratio_step_to_reasoning": 1.42358719462926, "adv/std_final_conf": 0.9327322840690613, "adv/std_reasoning": 0.7754129767417908, "adv/std_step_conf": 0.9361671805381775, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4564078282828283, "calib/avg_num_step_conf": 4.96484375, "calib/ece": 0.3181102362204725, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3346456692913386, "calib/gap": -0.007467171717171706, "calib/mean_conf": 0.8850393700787402, "calib/mu_c": 0.8818055555555555, "calib/mu_w": 0.8892727272727272, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3181102362204725, "calib/std_conf": 0.047989073817926636, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.782121661721068, "calib/step_q_c_n": 674.0, "calib/step_q_gap": -0.006169795565364122, "calib/step_q_w": 0.7882914572864321, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 515.734375, "completions/mean_terminated_length": 515.734375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.010666666666666666, "grad_norm": 0.04843964800238609, "kl": 0.00044846534729003906, "learning_rate": 2.5e-06, "loss": 0.0816, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.032743364572525024, "mask/share_reasoning": 0.8583770990371704, "mask/share_step_conf": 0.10887955129146576, "num_tokens": 2341285.0, "reward": 1.1771084070205688, "reward_std": 0.3006208539009094, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6394945383071899, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7030642032623291, "step": 10 }, { "adv/mean_abs_final_conf": 0.7562990188598633, "adv/mean_abs_reasoning": 0.4163981080055237, "adv/mean_abs_step_conf": 0.7739204168319702, "adv/ratio_final_to_reasoning": 1.8162883171645696, "adv/ratio_step_to_reasoning": 1.8586069483814847, "adv/std_final_conf": 0.9316168427467346, "adv/std_reasoning": 0.7013375163078308, "adv/std_step_conf": 0.9359068870544434, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5338258680095415, "calib/avg_num_step_conf": 5.55078125, "calib/ece": 0.2791269841269842, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.34523809523809523, "calib/gap": 0.0005565862708720726, "calib/mean_conf": 0.8788095238095239, "calib/mu_c": 0.879025974025974, "calib/mu_w": 0.878469387755102, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2734126984126985, "calib/std_conf": 0.07808325982470639, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7691400233372229, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.0038918915563941336, "calib/step_q_w": 0.773031914893617, "calib/step_q_w_n": 564.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2736.0, "completions/max_terminated_length": 2736.0, "completions/mean_length": 530.22265625, "completions/mean_terminated_length": 532.302001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.011733333333333333, "grad_norm": 0.04254964739084244, "kl": 0.0007021427154541016, "learning_rate": 2.7500000000000004e-06, "loss": 0.0465, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03229888156056404, "mask/share_reasoning": 0.8458642959594727, "mask/share_step_conf": 0.11793056130409241, "num_tokens": 2581502.0, "reward": 1.2219064235687256, "reward_std": 0.22927621006965637, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6688515543937683, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7296680212020874, "step": 11 }, { "adv/mean_abs_final_conf": 0.7611274123191833, "adv/mean_abs_reasoning": 0.43331027030944824, "adv/mean_abs_step_conf": 0.7788050174713135, "adv/ratio_final_to_reasoning": 1.7565413618643857, "adv/ratio_step_to_reasoning": 1.7973380065862052, "adv/std_final_conf": 0.9290933012962341, "adv/std_reasoning": 0.7015314102172852, "adv/std_step_conf": 0.9357473254203796, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49364010475121584, "calib/avg_num_step_conf": 5.51953125, "calib/ece": 0.21764227642276426, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.3780487804878049, "calib/gap": -0.0031739618406285697, "calib/mean_conf": 0.8824390243902438, "calib/mu_c": 0.8813939393939394, "calib/mu_w": 0.884567901234568, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.21467479674796752, "calib/std_conf": 0.05675996332316081, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7721990740740741, "calib/step_q_c_n": 864.0, "calib/step_q_gap": 0.027444975713418307, "calib/step_q_w": 0.7447540983606558, "calib/step_q_w_n": 549.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2818.0, "completions/max_terminated_length": 2818.0, "completions/mean_length": 501.53515625, "completions/mean_terminated_length": 509.4960632324219, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.0128, "grad_norm": 0.05267712473869324, "kl": 0.001438736915588379, "learning_rate": 3e-06, "loss": -0.0246, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.035315074026584625, "mask/share_reasoning": 0.8239625692367554, "mask/share_step_conf": 0.1250973641872406, "num_tokens": 2814071.0, "reward": 1.2636216878890991, "reward_std": 0.25986015796661377, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6973953247070312, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7547677755355835, "step": 12 }, { "adv/mean_abs_final_conf": 0.790421187877655, "adv/mean_abs_reasoning": 0.45352885127067566, "adv/mean_abs_step_conf": 0.7578126788139343, "adv/ratio_final_to_reasoning": 1.7428244877102976, "adv/ratio_step_to_reasoning": 1.6709249625260458, "adv/std_final_conf": 0.9323068857192993, "adv/std_reasoning": 0.7014724016189575, "adv/std_step_conf": 0.9361147880554199, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5766365257259297, "calib/avg_num_step_conf": 4.88671875, "calib/ece": 0.2895686274509803, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3764705882352941, "calib/gap": 0.01803298522669372, "calib/mean_conf": 0.8782745098039215, "calib/mu_c": 0.8856291390728475, "calib/mu_w": 0.8675961538461537, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28784313725490185, "calib/std_conf": 0.06189937388620991, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7543232044198895, "calib/step_q_c_n": 724.0, "calib/step_q_gap": 0.010736866658978661, "calib/step_q_w": 0.7435863377609109, "calib/step_q_w_n": 527.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 472.28515625, "completions/mean_terminated_length": 474.13726806640625, "completions/min_length": 0.0, "completions/min_terminated_length": 81.0, "epoch": 0.013866666666666666, "grad_norm": 0.057831306010484695, "kl": 0.0020656585693359375, "learning_rate": 3.2500000000000002e-06, "loss": -0.003, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03480253368616104, "mask/share_reasoning": 0.8483515977859497, "mask/share_step_conf": 0.11293961852788925, "num_tokens": 3039568.0, "reward": 1.2582135200500488, "reward_std": 0.24947398900985718, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6782461404800415, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7608873844146729, "step": 13 }, { "adv/mean_abs_final_conf": 0.7915438413619995, "adv/mean_abs_reasoning": 0.5042266249656677, "adv/mean_abs_step_conf": 0.7892253994941711, "adv/ratio_final_to_reasoning": 1.5698176220184623, "adv/ratio_step_to_reasoning": 1.5652196064575303, "adv/std_final_conf": 0.9285059571266174, "adv/std_reasoning": 0.7393607497215271, "adv/std_step_conf": 0.9360507130622864, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.42527734778121773, "calib/avg_num_step_conf": 5.125, "calib/ece": 0.35727999999999993, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": -0.010869453044375699, "calib/mean_conf": 0.90128, "calib/mu_c": 0.8963235294117646, "calib/mu_w": 0.9071929824561403, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.35727999999999993, "calib/std_conf": 0.041616842744254395, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7304722222222223, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.019728978978979073, "calib/step_q_w": 0.7107432432432432, "calib/step_q_w_n": 592.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2478.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 540.88671875, "completions/mean_terminated_length": 545.1456909179688, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.014933333333333333, "grad_norm": 0.04276634752750397, "kl": 0.004217386245727539, "learning_rate": 3.5e-06, "loss": 0.0095, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03212915360927582, "mask/share_reasoning": 0.847929060459137, "mask/share_step_conf": 0.11212927103042603, "num_tokens": 3283435.0, "reward": 1.2056703567504883, "reward_std": 0.2518189251422882, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6026976108551025, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7535402178764343, "step": 14 }, { "adv/mean_abs_final_conf": 0.7444257140159607, "adv/mean_abs_reasoning": 0.3867869973182678, "adv/mean_abs_step_conf": 0.752798318862915, "adv/ratio_final_to_reasoning": 1.9246399676755672, "adv/ratio_step_to_reasoning": 1.9462865196667267, "adv/std_final_conf": 0.9274711608886719, "adv/std_reasoning": 0.6815034747123718, "adv/std_step_conf": 0.9360182881355286, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4767543859649123, "calib/avg_num_step_conf": 4.96484375, "calib/ece": 0.34996062992125976, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5590551181102362, "calib/gap": 0.002071428571428724, "calib/mean_conf": 0.9011417322834645, "calib/mu_c": 0.9020714285714286, "calib/mu_w": 0.8999999999999999, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34996062992125976, "calib/std_conf": 0.06886506541632316, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6757304964539007, "calib/step_q_c_n": 705.0, "calib/step_q_gap": -0.008103425807583498, "calib/step_q_w": 0.6838339222614842, "calib/step_q_w_n": 566.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 480.51171875, "completions/mean_terminated_length": 480.51171875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.016, "grad_norm": 0.04166586697101593, "kl": 0.007984638214111328, "learning_rate": 3.7500000000000005e-06, "loss": 0.0245, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03449499234557152, "mask/share_reasoning": 0.8479899168014526, "mask/share_step_conf": 0.11751505732536316, "num_tokens": 3514326.0, "reward": 1.2254838943481445, "reward_std": 0.22204411029815674, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6215355396270752, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7608097791671753, "step": 15 }, { "adv/mean_abs_final_conf": 0.7652251720428467, "adv/mean_abs_reasoning": 0.3866991400718689, "adv/mean_abs_step_conf": 0.757154107093811, "adv/ratio_final_to_reasoning": 1.9788644264909094, "adv/ratio_step_to_reasoning": 1.9579927355232605, "adv/std_final_conf": 0.9266905188560486, "adv/std_reasoning": 0.6613523960113525, "adv/std_step_conf": 0.9358673691749573, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49171952078928827, "calib/avg_num_step_conf": 6.12890625, "calib/ece": 0.25693227091633475, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.649402390438247, "calib/gap": 0.004776603241719357, "calib/mean_conf": 0.9143027888446216, "calib/mu_c": 0.9159393939393939, "calib/mu_w": 0.9111627906976746, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25693227091633475, "calib/std_conf": 0.05200953743549559, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6268780019212296, "calib/step_q_c_n": 1041.0, "calib/step_q_gap": 0.04862042616365381, "calib/step_q_w": 0.5782575757575757, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 616.54296875, "completions/mean_terminated_length": 618.9608154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.017066666666666667, "grad_norm": 0.033337414264678955, "kl": 0.009404182434082031, "learning_rate": 4.000000000000001e-06, "loss": -0.0097, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02647656947374344, "mask/share_reasoning": 0.8592157363891602, "mask/share_step_conf": 0.1104014441370964, "num_tokens": 3781009.0, "reward": 1.3121362924575806, "reward_std": 0.2243286669254303, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6943659782409668, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8024532794952393, "step": 16 }, { "adv/mean_abs_final_conf": 0.756464958190918, "adv/mean_abs_reasoning": 0.4089164435863495, "adv/mean_abs_step_conf": 0.7704800963401794, "adv/ratio_final_to_reasoning": 1.849925504478222, "adv/ratio_step_to_reasoning": 1.8841993478735706, "adv/std_final_conf": 0.9228365421295166, "adv/std_reasoning": 0.6816248893737793, "adv/std_step_conf": 0.9359196424484253, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5401098901098901, "calib/avg_num_step_conf": 5.92578125, "calib/ece": 0.20666666666666672, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7103174603174603, "calib/gap": 0.007098901098900989, "calib/mean_conf": 0.9191269841269841, "calib/mu_c": 0.921098901098901, "calib/mu_w": 0.914, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20178571428571432, "calib/std_conf": 0.053652613381367925, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6106968325791856, "calib/step_q_c_n": 1105.0, "calib/step_q_gap": 0.04331819180248664, "calib/step_q_w": 0.567378640776699, "calib/step_q_w_n": 412.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 538.015625, "completions/mean_terminated_length": 540.1255493164062, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.018133333333333335, "grad_norm": 0.053884051740169525, "kl": 0.014090538024902344, "learning_rate": 4.25e-06, "loss": 0.0679, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03222394734621048, "mask/share_reasoning": 0.8395468592643738, "mask/share_step_conf": 0.12432297319173813, "num_tokens": 4022269.0, "reward": 1.3697597980499268, "reward_std": 0.22861257195472717, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7480968236923218, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8265707492828369, "step": 17 }, { "adv/mean_abs_final_conf": 0.7751752734184265, "adv/mean_abs_reasoning": 0.3621034622192383, "adv/mean_abs_step_conf": 0.7781474590301514, "adv/ratio_final_to_reasoning": 2.140756314970252, "adv/ratio_step_to_reasoning": 2.1489644265235337, "adv/std_final_conf": 0.9170754551887512, "adv/std_reasoning": 0.6403347849845886, "adv/std_step_conf": 0.9358564615249634, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4807206866086078, "calib/avg_num_step_conf": 4.78125, "calib/ece": 0.38624505928853753, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7865612648221344, "calib/gap": -0.0065663258866592855, "calib/mean_conf": 0.924901185770751, "calib/mu_c": 0.9219424460431653, "calib/mu_w": 0.9285087719298246, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.38086956521739124, "calib/std_conf": 0.07853204903647092, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5981441717791411, "calib/step_q_c_n": 652.0, "calib/step_q_gap": -0.0037089750740056937, "calib/step_q_w": 0.6018531468531468, "calib/step_q_w_n": 572.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2744.0, "completions/max_terminated_length": 2744.0, "completions/mean_length": 521.83203125, "completions/mean_terminated_length": 521.83203125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.0192, "grad_norm": 0.03436724469065666, "kl": 0.015043258666992188, "learning_rate": 4.5e-06, "loss": -0.0453, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032233573496341705, "mask/share_reasoning": 0.8630064725875854, "mask/share_step_conf": 0.10475993156433105, "num_tokens": 4266578.0, "reward": 1.2287178039550781, "reward_std": 0.21607418358325958, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5942296981811523, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.778868556022644, "step": 18 }, { "adv/mean_abs_final_conf": 0.7444548606872559, "adv/mean_abs_reasoning": 0.42004138231277466, "adv/mean_abs_step_conf": 0.7566590309143066, "adv/ratio_final_to_reasoning": 1.7723369459176614, "adv/ratio_step_to_reasoning": 1.8013916313390212, "adv/std_final_conf": 0.9174063205718994, "adv/std_reasoning": 0.7013636231422424, "adv/std_step_conf": 0.9357441067695618, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.555988515176374, "calib/avg_num_step_conf": 4.80078125, "calib/ece": 0.3030677290836654, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8645418326693227, "calib/gap": 0.02592904019688269, "calib/mean_conf": 0.9365338645418326, "calib/mu_c": 0.9460377358490567, "calib/mu_w": 0.920108695652174, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3030677290836654, "calib/std_conf": 0.0807505741632516, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5960899182561308, "calib/step_q_c_n": 734.0, "calib/step_q_gap": 0.029726281892494377, "calib/step_q_w": 0.5663636363636364, "calib/step_q_w_n": 495.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2435.0, "completions/max_terminated_length": 2435.0, "completions/mean_length": 494.65234375, "completions/mean_terminated_length": 498.5472412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.020266666666666665, "grad_norm": 66.31549835205078, "kl": 78.52267646789551, "learning_rate": 4.75e-06, "loss": 3.7526, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03115307353436947, "mask/share_reasoning": 0.8544229865074158, "mask/share_step_conf": 0.10661141574382782, "num_tokens": 4497969.0, "reward": 1.2911115884780884, "reward_std": 0.2555881142616272, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6604000329971313, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.801536500453949, "step": 19 }, { "adv/mean_abs_final_conf": 0.7562682032585144, "adv/mean_abs_reasoning": 0.3773419260978699, "adv/mean_abs_step_conf": 0.7754217386245728, "adv/ratio_final_to_reasoning": 2.004198714622461, "adv/ratio_step_to_reasoning": 2.054957811455741, "adv/std_final_conf": 0.9134057760238647, "adv/std_reasoning": 0.6403542757034302, "adv/std_step_conf": 0.9357218146324158, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47950980392156856, "calib/avg_num_step_conf": 5.45703125, "calib/ece": 0.3364822134387352, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8932806324110671, "calib/gap": -0.0012810457516337914, "calib/mean_conf": 0.9412252964426877, "calib/mu_c": 0.940718954248366, "calib/mu_w": 0.9419999999999998, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3364822134387352, "calib/std_conf": 0.034965830757530916, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5691792929292929, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.02997268135904496, "calib/step_q_w": 0.539206611570248, "calib/step_q_w_n": 605.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2286.0, "completions/max_terminated_length": 2286.0, "completions/mean_length": 472.0, "completions/mean_terminated_length": 473.85101318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.021333333333333333, "grad_norm": 0.05530129000544548, "kl": 0.04509735107421875, "learning_rate": 5e-06, "loss": -0.0349, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0354749858379364, "mask/share_reasoning": 0.8304793238639832, "mask/share_step_conf": 0.13013947010040283, "num_tokens": 4723673.0, "reward": 1.2788856029510498, "reward_std": 0.21537214517593384, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.634465217590332, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.803449809551239, "step": 20 }, { "adv/mean_abs_final_conf": 0.7452648878097534, "adv/mean_abs_reasoning": 0.46478623151779175, "adv/mean_abs_step_conf": 0.7354752421379089, "adv/ratio_final_to_reasoning": 1.6034573256958131, "adv/ratio_step_to_reasoning": 1.5823946413734404, "adv/std_final_conf": 0.9150353670120239, "adv/std_reasoning": 0.7205474972724915, "adv/std_step_conf": 0.9360754489898682, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5659854851031322, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.34500000000000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9453125, "calib/gap": 0.024924879042526493, "calib/mean_conf": 0.9465625, "calib/mu_c": 0.9564935064935066, "calib/mu_w": 0.9315686274509801, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34500000000000003, "calib/std_conf": 0.08631734236959568, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5779044117647059, "calib/step_q_c_n": 816.0, "calib/step_q_gap": 0.03806314192343596, "calib/step_q_w": 0.5398412698412699, "calib/step_q_w_n": 630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 492.2421875, "completions/mean_terminated_length": 494.1725769042969, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.0224, "grad_norm": 0.029058117419481277, "kl": 0.027462005615234375, "learning_rate": 4.9722222222222224e-06, "loss": -0.0362, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033350586891174316, "mask/share_reasoning": 0.8396820425987244, "mask/share_step_conf": 0.12306112051010132, "num_tokens": 4952647.0, "reward": 1.3049554824829102, "reward_std": 0.23162305355072021, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6419129371643066, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8242334127426147, "step": 21 }, { "adv/mean_abs_final_conf": 0.7346631288528442, "adv/mean_abs_reasoning": 0.3285854458808899, "adv/mean_abs_step_conf": 0.7635392546653748, "adv/ratio_final_to_reasoning": 2.2358358778895973, "adv/ratio_step_to_reasoning": 2.3237159899716096, "adv/std_final_conf": 0.8875892162322998, "adv/std_reasoning": 0.6184437274932861, "adv/std_step_conf": 0.9357779622077942, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5129274965800273, "calib/avg_num_step_conf": 5.9296875, "calib/ece": 0.2939843750000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.95703125, "calib/gap": 0.0002462380300959355, "calib/mean_conf": 0.95609375, "calib/mu_c": 0.9561764705882354, "calib/mu_w": 0.9559302325581395, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2930078125000001, "calib/std_conf": 0.026684920103637176, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5590693069306931, "calib/step_q_c_n": 1010.0, "calib/step_q_gap": 0.029167732127543533, "calib/step_q_w": 0.5299015748031496, "calib/step_q_w_n": 508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1668.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 474.53515625, "completions/mean_terminated_length": 476.3961181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.023466666666666667, "grad_norm": 0.03352082520723343, "kl": 0.046070098876953125, "learning_rate": 4.944444444444445e-06, "loss": -0.05, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03306357562541962, "mask/share_reasoning": 0.830856204032898, "mask/share_step_conf": 0.13217401504516602, "num_tokens": 5175944.0, "reward": 1.3375494480133057, "reward_std": 0.17658598721027374, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6910320520401001, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8256270289421082, "step": 22 }, { "adv/mean_abs_final_conf": 0.7495639324188232, "adv/mean_abs_reasoning": 0.44568973779678345, "adv/mean_abs_step_conf": 0.7528454065322876, "adv/ratio_final_to_reasoning": 1.6818065772037007, "adv/ratio_step_to_reasoning": 1.6891692643718774, "adv/std_final_conf": 0.9074048399925232, "adv/std_reasoning": 0.7206984162330627, "adv/std_step_conf": 0.9363330006599426, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5708492975734355, "calib/avg_num_step_conf": 5.7109375, "calib/ece": 0.41808764940239046, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9800796812749004, "calib/gap": 0.014082375478927545, "calib/mean_conf": 0.9559362549800797, "calib/mu_c": 0.9624444444444445, "calib/mu_w": 0.948362068965517, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.41808764940239046, "calib/std_conf": 0.06459358016031481, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5693057409879839, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.012652164550396305, "calib/step_q_w": 0.5566535764375876, "calib/step_q_w_n": 713.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 521.40234375, "completions/mean_terminated_length": 525.5078735351562, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.024533333333333334, "grad_norm": 0.029922185465693474, "kl": 0.03609466552734375, "learning_rate": 4.9166666666666665e-06, "loss": 0.0052, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033446379005908966, "mask/share_reasoning": 0.8323598504066467, "mask/share_step_conf": 0.1263812780380249, "num_tokens": 5413359.0, "reward": 1.2099261283874512, "reward_std": 0.2760199308395386, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5681461095809937, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7750717997550964, "step": 23 }, { "adv/mean_abs_final_conf": 0.7759602665901184, "adv/mean_abs_reasoning": 0.6207110285758972, "adv/mean_abs_step_conf": 0.7428168654441833, "adv/ratio_final_to_reasoning": 1.250115159658772, "adv/ratio_step_to_reasoning": 1.196719296495238, "adv/std_final_conf": 0.9179912209510803, "adv/std_reasoning": 0.826636552810669, "adv/std_step_conf": 0.936242938041687, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5058531746031745, "calib/avg_num_step_conf": 6.40625, "calib/ece": 0.47865853658536595, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.959349593495935, "calib/gap": 0.0039642857142857535, "calib/mean_conf": 0.9628861788617886, "calib/mu_c": 0.9649166666666668, "calib/mu_w": 0.960952380952381, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.4768699186991871, "calib/std_conf": 0.04071768822243461, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.555012853470437, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.04475763305280356, "calib/step_q_w": 0.5102552204176335, "calib/step_q_w_n": 862.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2633.0, "completions/max_terminated_length": 2633.0, "completions/mean_length": 592.5078125, "completions/mean_terminated_length": 594.8314208984375, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.0256, "grad_norm": 0.04278721660375595, "kl": 0.029100418090820312, "learning_rate": 4.888888888888889e-06, "loss": -0.048, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.030732743442058563, "mask/share_reasoning": 0.8376131057739258, "mask/share_step_conf": 0.12774784862995148, "num_tokens": 5669553.0, "reward": 1.1673494577407837, "reward_std": 0.30620497465133667, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.5042706727981567, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7722452878952026, "step": 24 }, { "adv/mean_abs_final_conf": 0.7407515048980713, "adv/mean_abs_reasoning": 0.48812973499298096, "adv/mean_abs_step_conf": 0.7623431086540222, "adv/ratio_final_to_reasoning": 1.5175299757322158, "adv/ratio_step_to_reasoning": 1.5617633059477196, "adv/std_final_conf": 0.9006784558296204, "adv/std_reasoning": 0.7394152283668518, "adv/std_step_conf": 0.9362833499908447, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4982081188505897, "calib/avg_num_step_conf": 5.96875, "calib/ece": 0.3764285714285715, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": 0.0003766208379486491, "calib/mean_conf": 0.9676984126984127, "calib/mu_c": 0.9678523489932884, "calib/mu_w": 0.9674757281553398, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3764285714285715, "calib/std_conf": 0.0166744124102322, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5621410891089109, "calib/step_q_c_n": 808.0, "calib/step_q_gap": 0.018224422442244204, "calib/step_q_w": 0.5439166666666667, "calib/step_q_w_n": 720.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 489.6875, "completions/mean_terminated_length": 495.49407958984375, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.02666666666666667, "grad_norm": 0.02991301380097866, "kl": 0.04373931884765625, "learning_rate": 4.861111111111111e-06, "loss": -0.0677, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03178451210260391, "mask/share_reasoning": 0.8335721492767334, "mask/share_step_conf": 0.12292458117008209, "num_tokens": 5898137.0, "reward": 1.237117052078247, "reward_std": 0.27862468361854553, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6068245768547058, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7774547338485718, "step": 25 }, { "adv/mean_abs_final_conf": 0.6941282749176025, "adv/mean_abs_reasoning": 0.42233434319496155, "adv/mean_abs_step_conf": 0.7388712167739868, "adv/ratio_final_to_reasoning": 1.6435515749595888, "adv/ratio_step_to_reasoning": 1.7494935675475078, "adv/std_final_conf": 0.8959683179855347, "adv/std_reasoning": 0.7012890577316284, "adv/std_step_conf": 0.9362882375717163, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4266475252939568, "calib/avg_num_step_conf": 5.6015625, "calib/ece": 0.3290438247011955, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": 0.0070508613617716875, "calib/mean_conf": 0.9625099601593626, "calib/mu_c": 0.9650943396226412, "calib/mu_w": 0.9580434782608696, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3290438247011955, "calib/std_conf": 0.06376371737738691, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.609272076372315, "calib/step_q_c_n": 838.0, "calib/step_q_gap": 0.07155395556694588, "calib/step_q_w": 0.5377181208053691, "calib/step_q_w_n": 596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 520.453125, "completions/mean_terminated_length": 522.494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 293.0, "epoch": 0.027733333333333332, "grad_norm": 109.62390899658203, "kl": 1576.0367279052734, "learning_rate": 4.833333333333333e-06, "loss": 8.7786, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.029633793979883194, "mask/share_reasoning": 0.8511450290679932, "mask/share_step_conf": 0.11531488597393036, "num_tokens": 6136613.0, "reward": 1.2813361883163452, "reward_std": 0.25779348611831665, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6458855271339417, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7978465557098389, "step": 26 }, { "adv/mean_abs_final_conf": 0.7589015364646912, "adv/mean_abs_reasoning": 0.53789222240448, "adv/mean_abs_step_conf": 0.7639832496643066, "adv/ratio_final_to_reasoning": 1.4108802932160978, "adv/ratio_step_to_reasoning": 1.4203277493940274, "adv/std_final_conf": 0.9002174735069275, "adv/std_reasoning": 0.7754333019256592, "adv/std_step_conf": 0.9361553192138672, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.576093023255814, "calib/avg_num_step_conf": 5.72265625, "calib/ece": 0.46306299212598434, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.968503937007874, "calib/gap": -0.0014078759689922782, "calib/mean_conf": 0.9551889763779527, "calib/mu_c": 0.9544961240310077, "calib/mu_w": 0.955904, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4551889763779528, "calib/std_conf": 0.10139509005828919, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5801657381615598, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.04477885730479936, "calib/step_q_w": 0.5353868808567604, "calib/step_q_w_n": 747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2123.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 501.37109375, "completions/mean_terminated_length": 501.37109375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.0288, "grad_norm": 0.17514687776565552, "kl": 0.9313392639160156, "learning_rate": 4.805555555555556e-06, "loss": -0.0095, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03224721550941467, "mask/share_reasoning": 0.8422399163246155, "mask/share_step_conf": 0.12551286816596985, "num_tokens": 6370180.0, "reward": 1.2039215564727783, "reward_std": 0.2828693687915802, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5347757935523987, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7869243621826172, "step": 27 }, { "adv/mean_abs_final_conf": 0.7057210206985474, "adv/mean_abs_reasoning": 0.41700735688209534, "adv/mean_abs_step_conf": 0.772865891456604, "adv/ratio_final_to_reasoning": 1.6923466913752385, "adv/ratio_step_to_reasoning": 1.8533627253850202, "adv/std_final_conf": 0.8598683476448059, "adv/std_reasoning": 0.6817318797111511, "adv/std_step_conf": 0.9360816478729248, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5477935839744519, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.3089598393574297, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9879518072289156, "calib/gap": -0.0040903614457831194, "calib/mean_conf": 0.9646224899598392, "calib/mu_c": 0.9632590361445782, "calib/mu_w": 0.9673493975903613, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3034578313253012, "calib/std_conf": 0.06454818299485003, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5916505894962486, "calib/step_q_c_n": 933.0, "calib/step_q_gap": 0.05791964212782752, "calib/step_q_w": 0.5337309473684211, "calib/step_q_w_n": 475.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 555.08984375, "completions/mean_terminated_length": 555.08984375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.029866666666666666, "grad_norm": 0.02628139778971672, "kl": 0.032978057861328125, "learning_rate": 4.777777777777778e-06, "loss": 0.0195, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030390523374080658, "mask/share_reasoning": 0.8543544411659241, "mask/share_step_conf": 0.11525503545999527, "num_tokens": 6619227.0, "reward": 1.279329538345337, "reward_std": 0.25941693782806396, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6638847589492798, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7860590219497681, "step": 28 }, { "adv/mean_abs_final_conf": 0.7725566625595093, "adv/mean_abs_reasoning": 0.5233668088912964, "adv/mean_abs_step_conf": 0.7560856342315674, "adv/ratio_final_to_reasoning": 1.4761285000019362, "adv/ratio_step_to_reasoning": 1.4446572105580482, "adv/std_final_conf": 0.896633505821228, "adv/std_reasoning": 0.7575854063034058, "adv/std_step_conf": 0.9362297058105469, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.46384673272293997, "calib/avg_num_step_conf": 6.22265625, "calib/ece": 0.49857707509881427, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": -0.0002063213345036008, "calib/mean_conf": 0.9689328063241106, "calib/mu_c": 0.9688235294117648, "calib/mu_w": 0.9690298507462684, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.49857707509881427, "calib/std_conf": 0.018249680023997687, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5732078907435508, "calib/step_q_c_n": 659.0, "calib/step_q_gap": 0.051212173398797134, "calib/step_q_w": 0.5219957173447537, "calib/step_q_w_n": 934.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2474.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 594.38671875, "completions/mean_terminated_length": 596.7177124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.030933333333333334, "grad_norm": 0.029749587178230286, "kl": 0.037349700927734375, "learning_rate": 4.75e-06, "loss": -0.0868, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02785518765449524, "mask/share_reasoning": 0.8521197438240051, "mask/share_step_conf": 0.11611880362033844, "num_tokens": 6878518.0, "reward": 1.176987648010254, "reward_std": 0.27402737736701965, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.49552401900291443, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7846944332122803, "step": 29 }, { "adv/mean_abs_final_conf": 0.7688121795654297, "adv/mean_abs_reasoning": 0.6039595603942871, "adv/mean_abs_step_conf": 0.7581928968429565, "adv/ratio_final_to_reasoning": 1.2729530749766105, "adv/ratio_step_to_reasoning": 1.2553703038461386, "adv/std_final_conf": 0.9099051356315613, "adv/std_reasoning": 0.8266823887825012, "adv/std_step_conf": 0.936536967754364, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.49207281268349967, "calib/avg_num_step_conf": 6.078125, "calib/ece": 0.4412379032258064, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9798387096774194, "calib/gap": 0.002922881190056792, "calib/mean_conf": 0.9589798387096776, "calib/mu_c": 0.9603587786259542, "calib/mu_w": 0.9574358974358974, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4359959677419355, "calib/std_conf": 0.094493789247895, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5554888888888889, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.031199381935665182, "calib/step_q_w": 0.5242895069532237, "calib/step_q_w_n": 791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 591.859375, "completions/mean_terminated_length": 598.8775024414062, "completions/min_length": 0.0, "completions/min_terminated_length": 31.0, "epoch": 0.032, "grad_norm": 0.026807932183146477, "kl": 0.03380012512207031, "learning_rate": 4.722222222222222e-06, "loss": -0.0778, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.027108782902359962, "mask/share_reasoning": 0.8485285043716431, "mask/share_step_conf": 0.11264392733573914, "num_tokens": 7137018.0, "reward": 1.1777077913284302, "reward_std": 0.3551919758319855, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5364526510238647, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7618252038955688, "step": 30 }, { "adv/mean_abs_final_conf": 0.7549532651901245, "adv/mean_abs_reasoning": 0.38709408044815063, "adv/mean_abs_step_conf": 0.7758951187133789, "adv/ratio_final_to_reasoning": 1.9503095069707397, "adv/ratio_step_to_reasoning": 2.0044096717136606, "adv/std_final_conf": 0.8967974781990051, "adv/std_reasoning": 0.6613409519195557, "adv/std_step_conf": 0.9362436532974243, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.49223726114649685, "calib/avg_num_step_conf": 6.50390625, "calib/ece": 0.5870316205533596, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.0065144639065818355, "calib/mean_conf": 0.9664782608695652, "calib/mu_c": 0.9705208333333334, "calib/mu_w": 0.9640063694267516, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.5870316205533596, "calib/std_conf": 0.06328001421734168, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5654389799635701, "calib/step_q_c_n": 549.0, "calib/step_q_gap": 0.07273351401374922, "calib/step_q_w": 0.49270546594982084, "calib/step_q_w_n": 1116.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2672.0, "completions/max_terminated_length": 2672.0, "completions/mean_length": 612.84375, "completions/mean_terminated_length": 612.84375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.03306666666666667, "grad_norm": 0.03330698609352112, "kl": 0.03545379638671875, "learning_rate": 4.694444444444445e-06, "loss": -0.0565, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02880018949508667, "mask/share_reasoning": 0.8523030281066895, "mask/share_step_conf": 0.11889677494764328, "num_tokens": 7399818.0, "reward": 1.1024078130722046, "reward_std": 0.2412220537662506, "rewards/accuracy_reward_step": 0.375, "rewards/final_brier_reward_step": 0.4129304587841034, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7600051164627075, "step": 31 }, { "adv/mean_abs_final_conf": 0.7467050552368164, "adv/mean_abs_reasoning": 0.4965146780014038, "adv/mean_abs_step_conf": 0.7660564184188843, "adv/ratio_final_to_reasoning": 1.503893214682981, "adv/ratio_step_to_reasoning": 1.5428676177358012, "adv/std_final_conf": 0.9030807614326477, "adv/std_reasoning": 0.7575876116752625, "adv/std_step_conf": 0.9363879561424255, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5172176308539945, "calib/avg_num_step_conf": 5.57421875, "calib/ece": 0.4602874493927124, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.0008768201495474859, "calib/mean_conf": 0.9689514170040485, "calib/mu_c": 0.9693809523809525, "calib/mu_w": 0.968504132231405, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4595587044534411, "calib/std_conf": 0.01734076391436663, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5947622047244094, "calib/step_q_c_n": 635.0, "calib/step_q_gap": 0.07739248250218733, "calib/step_q_w": 0.5173697222222221, "calib/step_q_w_n": 792.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2519.0, "completions/max_terminated_length": 2519.0, "completions/mean_length": 565.86328125, "completions/mean_terminated_length": 572.5731201171875, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.034133333333333335, "grad_norm": 0.027324911206960678, "kl": 0.04216766357421875, "learning_rate": 4.666666666666667e-06, "loss": -0.0062, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.028847582638263702, "mask/share_reasoning": 0.8494766354560852, "mask/share_step_conf": 0.10995703935623169, "num_tokens": 7651383.0, "reward": 1.1622298955917358, "reward_std": 0.29328110814094543, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5207406282424927, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7557656764984131, "step": 32 }, { "adv/mean_abs_final_conf": 0.7327004671096802, "adv/mean_abs_reasoning": 0.4044535458087921, "adv/mean_abs_step_conf": 0.7745844125747681, "adv/ratio_final_to_reasoning": 1.8115812673727152, "adv/ratio_step_to_reasoning": 1.9151381428139527, "adv/std_final_conf": 0.8804807066917419, "adv/std_reasoning": 0.6816219687461853, "adv/std_step_conf": 0.9363637566566467, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4688944530046225, "calib/avg_num_step_conf": 5.75, "calib/ece": 0.49979999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.988, "calib/gap": -0.009604519774011222, "calib/mean_conf": 0.9638, "calib/mu_c": 0.958728813559322, "calib/mu_w": 0.9683333333333333, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.49579999999999996, "calib/std_conf": 0.06370211927400844, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5878157894736843, "calib/step_q_c_n": 684.0, "calib/step_q_gap": 0.03392619556505483, "calib/step_q_w": 0.5538895939086295, "calib/step_q_w_n": 788.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2509.0, "completions/max_terminated_length": 2509.0, "completions/mean_length": 585.109375, "completions/mean_terminated_length": 585.109375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.0352, "grad_norm": 0.027998236939311028, "kl": 0.0415496826171875, "learning_rate": 4.638888888888889e-06, "loss": 0.022, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.028615616261959076, "mask/share_reasoning": 0.8616238236427307, "mask/share_step_conf": 0.10976054519414902, "num_tokens": 7908043.0, "reward": 1.1462751626968384, "reward_std": 0.24753305315971375, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.48835116624832153, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7583495378494263, "step": 33 }, { "adv/mean_abs_final_conf": 0.7453949451446533, "adv/mean_abs_reasoning": 0.6234875917434692, "adv/mean_abs_step_conf": 0.7739483714103699, "adv/ratio_final_to_reasoning": 1.1955249070158596, "adv/ratio_step_to_reasoning": 1.2413212093702852, "adv/std_final_conf": 0.9072105884552002, "adv/std_reasoning": 0.8429659008979797, "adv/std_step_conf": 0.936280369758606, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4666860916860917, "calib/avg_num_step_conf": 5.90625, "calib/ece": 0.39868514342629496, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": 0.006329069347319027, "calib/mean_conf": 0.9672907211155379, "calib/mu_c": 0.9700139860139857, "calib/mu_w": 0.9636849166666667, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39812737450199215, "calib/std_conf": 0.06283266440044762, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5647769869513641, "calib/step_q_c_n": 843.0, "calib/step_q_gap": 0.02021490473910703, "calib/step_q_w": 0.5445620822122571, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 498.63671875, "completions/mean_terminated_length": 502.56298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.03626666666666667, "grad_norm": 0.02821074239909649, "kl": 0.0689544677734375, "learning_rate": 4.611111111111112e-06, "loss": 0.0305, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03244972601532936, "mask/share_reasoning": 0.8274343013763428, "mask/share_step_conf": 0.13230347633361816, "num_tokens": 8140806.0, "reward": 1.2510292530059814, "reward_std": 0.3187180757522583, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.58431476354599, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8049654960632324, "step": 34 }, { "adv/mean_abs_final_conf": 0.699095606803894, "adv/mean_abs_reasoning": 0.41293585300445557, "adv/mean_abs_step_conf": 0.7703684568405151, "adv/ratio_final_to_reasoning": 1.6929883944864212, "adv/ratio_step_to_reasoning": 1.8655886894669882, "adv/std_final_conf": 0.8799118399620056, "adv/std_reasoning": 0.7013152837753296, "adv/std_step_conf": 0.9362316131591797, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.49717675421900775, "calib/avg_num_step_conf": 5.109375, "calib/ece": 0.4048181818181818, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": 0.008815569090216968, "calib/mean_conf": 0.9660830039525691, "calib/mu_c": 0.9699507042253519, "calib/mu_w": 0.9611351351351349, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4048181818181818, "calib/std_conf": 0.062200976598366195, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5695566433566432, "calib/step_q_c_n": 715.0, "calib/step_q_gap": 0.029053551732135086, "calib/step_q_w": 0.5405030916245082, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2899.0, "completions/max_terminated_length": 2899.0, "completions/mean_length": 564.66015625, "completions/mean_terminated_length": 566.8745727539062, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.037333333333333336, "grad_norm": 0.02963513694703579, "kl": 0.0429534912109375, "learning_rate": 4.583333333333333e-06, "loss": 0.0041, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02836042456328869, "mask/share_reasoning": 0.8690199851989746, "mask/share_step_conf": 0.09871330857276917, "num_tokens": 8394615.0, "reward": 1.2391767501831055, "reward_std": 0.2501593828201294, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5834301710128784, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.793164849281311, "step": 35 }, { "adv/mean_abs_final_conf": 0.6682970523834229, "adv/mean_abs_reasoning": 0.3160165548324585, "adv/mean_abs_step_conf": 0.7482949495315552, "adv/ratio_final_to_reasoning": 2.1147533006229113, "adv/ratio_step_to_reasoning": 2.3678979410691836, "adv/std_final_conf": 0.8350497484207153, "adv/std_reasoning": 0.6184996962547302, "adv/std_step_conf": 0.9360457062721252, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5037674037674038, "calib/avg_num_step_conf": 5.6328125, "calib/ece": 0.23240239043824695, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0005545454545455852, "calib/mean_conf": 0.9694541832669322, "calib/mu_c": 0.9696, "calib/mu_w": 0.9690454545454544, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23240239043824695, "calib/std_conf": 0.012041135190817554, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5679854555125725, "calib/step_q_c_n": 1034.0, "calib/step_q_gap": -0.0008944464482117942, "calib/step_q_w": 0.5688799019607843, "calib/step_q_w_n": 408.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2349.0, "completions/max_terminated_length": 2349.0, "completions/mean_length": 509.48828125, "completions/mean_terminated_length": 511.4862976074219, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.0384, "grad_norm": 0.035114437341690063, "kl": 0.054393768310546875, "learning_rate": 4.555555555555556e-06, "loss": -0.0151, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033349648118019104, "mask/share_reasoning": 0.8350939750671387, "mask/share_step_conf": 0.12765014171600342, "num_tokens": 8627756.0, "reward": 1.355540156364441, "reward_std": 0.21335530281066895, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7375601530075073, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8164474964141846, "step": 36 }, { "adv/mean_abs_final_conf": 0.6995407938957214, "adv/mean_abs_reasoning": 0.44902467727661133, "adv/mean_abs_step_conf": 0.7672286629676819, "adv/ratio_final_to_reasoning": 1.5579116901514645, "adv/ratio_step_to_reasoning": 1.7086558975355564, "adv/std_final_conf": 0.8746408224105835, "adv/std_reasoning": 0.7393237352371216, "adv/std_step_conf": 0.9361535310745239, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5347666666666667, "calib/avg_num_step_conf": 5.80859375, "calib/ece": 0.4689551020408164, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9755102040816327, "calib/gap": 0.02292133333333346, "calib/mean_conf": 0.9561387755102042, "calib/mu_c": 0.9678333333333334, "calib/mu_w": 0.944912, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4676489795918368, "calib/std_conf": 0.1099609073694876, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5881790808240887, "calib/step_q_c_n": 631.0, "calib/step_q_gap": 0.11911482848764005, "calib/step_q_w": 0.4690642523364486, "calib/step_q_w_n": 856.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 540.04296875, "completions/mean_terminated_length": 548.6151123046875, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.039466666666666664, "grad_norm": 0.03515337035059929, "kl": 0.049747467041015625, "learning_rate": 4.527777777777778e-06, "loss": -0.0503, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.030141158029437065, "mask/share_reasoning": 0.837505578994751, "mask/share_step_conf": 0.11672825366258621, "num_tokens": 8873103.0, "reward": 1.1600000858306885, "reward_std": 0.2818068265914917, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.5091338753700256, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.762855052947998, "step": 37 }, { "adv/mean_abs_final_conf": 0.7247108221054077, "adv/mean_abs_reasoning": 0.5161527991294861, "adv/mean_abs_step_conf": 0.7817589044570923, "adv/ratio_final_to_reasoning": 1.4040625631163168, "adv/ratio_step_to_reasoning": 1.5145881331566202, "adv/std_final_conf": 0.8897721767425537, "adv/std_reasoning": 0.7754086852073669, "adv/std_step_conf": 0.9362268447875977, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5271099744245524, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.4236932270916334, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.010359015345268507, "calib/mean_conf": 0.9655258964143426, "calib/mu_c": 0.9702720588235294, "calib/mu_w": 0.9599130434782609, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4236932270916334, "calib/std_conf": 0.06272537315110564, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.58784, "calib/step_q_c_n": 750.0, "calib/step_q_gap": 0.06180626324786331, "calib/step_q_w": 0.5260337367521367, "calib/step_q_w_n": 585.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2594.0, "completions/max_terminated_length": 2594.0, "completions/mean_length": 538.31640625, "completions/mean_terminated_length": 538.31640625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.04053333333333333, "grad_norm": 0.028706612065434456, "kl": 0.106964111328125, "learning_rate": 4.5e-06, "loss": -0.0183, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030747022479772568, "mask/share_reasoning": 0.8551157116889954, "mask/share_step_conf": 0.11413724720478058, "num_tokens": 9117800.0, "reward": 1.2067832946777344, "reward_std": 0.3105279803276062, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5579559803009033, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7774146795272827, "step": 38 }, { "adv/mean_abs_final_conf": 0.7692740559577942, "adv/mean_abs_reasoning": 0.41159647703170776, "adv/mean_abs_step_conf": 0.7705304622650146, "adv/ratio_final_to_reasoning": 1.8690005840320454, "adv/ratio_step_to_reasoning": 1.8720531036169583, "adv/std_final_conf": 0.868905246257782, "adv/std_reasoning": 0.6404502391815186, "adv/std_step_conf": 0.9361722469329834, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5618107769423559, "calib/avg_num_step_conf": 5.52734375, "calib/ece": 0.4203137795275591, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.004658521303257945, "calib/mean_conf": 0.9714948818897639, "calib/mu_c": 0.973585714285714, "calib/mu_w": 0.9689271929824561, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4203137795275591, "calib/std_conf": 0.015466978203573886, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5696733285094067, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.01542487547073268, "calib/step_q_w": 0.554248453038674, "calib/step_q_w_n": 724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 531.41796875, "completions/mean_terminated_length": 531.41796875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.0416, "grad_norm": 0.034416694194078445, "kl": 0.0525054931640625, "learning_rate": 4.472222222222223e-06, "loss": -0.0403, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031083395704627037, "mask/share_reasoning": 0.8509421348571777, "mask/share_step_conf": 0.11797446012496948, "num_tokens": 9359931.0, "reward": 1.2292721271514893, "reward_std": 0.2425519824028015, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5735056400299072, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7886128425598145, "step": 39 }, { "adv/mean_abs_final_conf": 0.7616095542907715, "adv/mean_abs_reasoning": 0.533380389213562, "adv/mean_abs_step_conf": 0.7608983516693115, "adv/ratio_final_to_reasoning": 1.427891931710725, "adv/ratio_step_to_reasoning": 1.4265585444399471, "adv/std_final_conf": 0.8854411840438843, "adv/std_reasoning": 0.7577327489852905, "adv/std_step_conf": 0.9363117218017578, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.49098298196596385, "calib/avg_num_step_conf": 5.08203125, "calib/ece": 0.4785019920318725, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0010485775971551536, "calib/mean_conf": 0.9725258964143426, "calib/mu_c": 0.9730564516129031, "calib/mu_w": 0.972007874015748, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4785019920318725, "calib/std_conf": 0.014743299061754455, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5764660347551344, "calib/step_q_c_n": 633.0, "calib/step_q_gap": 0.024854657509625344, "calib/step_q_w": 0.551611377245509, "calib/step_q_w_n": 668.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2952.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 598.98828125, "completions/mean_terminated_length": 598.98828125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.042666666666666665, "grad_norm": 0.02472682110965252, "kl": 0.052127838134765625, "learning_rate": 4.444444444444444e-06, "loss": -0.0055, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.029470203444361687, "mask/share_reasoning": 0.8670581579208374, "mask/share_step_conf": 0.10347166657447815, "num_tokens": 9620032.0, "reward": 1.1567916870117188, "reward_std": 0.3067583441734314, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5111952424049377, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7543189525604248, "step": 40 }, { "adv/mean_abs_final_conf": 0.6801141500473022, "adv/mean_abs_reasoning": 0.46634307503700256, "adv/mean_abs_step_conf": 0.7382851839065552, "adv/ratio_final_to_reasoning": 1.4583987335790025, "adv/ratio_step_to_reasoning": 1.583137444140186, "adv/std_final_conf": 0.8588567972183228, "adv/std_reasoning": 0.7393189668655396, "adv/std_step_conf": 0.9362447261810303, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.526853428261879, "calib/avg_num_step_conf": 5.00390625, "calib/ece": 0.2572332015810277, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": -0.001999071351183934, "calib/mean_conf": 0.9718577075098815, "calib/mu_c": 0.9712967032967031, "calib/mu_w": 0.9732957746478871, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2548616600790514, "calib/std_conf": 0.03903909449631632, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5804437837837838, "calib/step_q_c_n": 925.0, "calib/step_q_gap": -0.0006741937443061063, "calib/step_q_w": 0.5811179775280899, "calib/step_q_w_n": 356.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2352.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 476.0390625, "completions/mean_terminated_length": 477.9059143066406, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.04373333333333333, "grad_norm": 0.03796948492527008, "kl": 0.066650390625, "learning_rate": 4.416666666666667e-06, "loss": -0.0581, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034183815121650696, "mask/share_reasoning": 0.8409860730171204, "mask/share_step_conf": 0.12092389166355133, "num_tokens": 9849146.0, "reward": 1.3258066177368164, "reward_std": 0.2873002290725708, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7234610915184021, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7941542863845825, "step": 41 }, { "adv/mean_abs_final_conf": 0.6904580593109131, "adv/mean_abs_reasoning": 0.3798826038837433, "adv/mean_abs_step_conf": 0.7600710391998291, "adv/ratio_final_to_reasoning": 1.8175564036151974, "adv/ratio_step_to_reasoning": 2.0008050682742926, "adv/std_final_conf": 0.8443465232849121, "adv/std_reasoning": 0.68148273229599, "adv/std_step_conf": 0.9358599185943604, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5600777624482628, "calib/avg_num_step_conf": 5.75390625, "calib/ece": 0.44073517786561267, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.013663113006396332, "calib/mean_conf": 0.9703794466403161, "calib/mu_c": 0.9768059701492535, "calib/mu_w": 0.9631428571428572, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44073517786561267, "calib/std_conf": 0.06500713165870504, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5658237017310254, "calib/step_q_c_n": 751.0, "calib/step_q_gap": 0.026537552146537857, "calib/step_q_w": 0.5392861495844875, "calib/step_q_w_n": 722.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2876.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 450.90625, "completions/mean_terminated_length": 452.6745300292969, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.0448, "grad_norm": 0.03713899478316307, "kl": 0.0667572021484375, "learning_rate": 4.388888888888889e-06, "loss": 0.0299, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034279078245162964, "mask/share_reasoning": 0.8265150785446167, "mask/share_step_conf": 0.13529960811138153, "num_tokens": 10068946.0, "reward": 1.2144601345062256, "reward_std": 0.20866739749908447, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5526596307754517, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7869584560394287, "step": 42 }, { "adv/mean_abs_final_conf": 0.7522804737091064, "adv/mean_abs_reasoning": 0.5812476873397827, "adv/mean_abs_step_conf": 0.7755796909332275, "adv/ratio_final_to_reasoning": 1.294251125801628, "adv/ratio_step_to_reasoning": 1.3343359600841616, "adv/std_final_conf": 0.8822755813598633, "adv/std_reasoning": 0.7929232716560364, "adv/std_step_conf": 0.9361656904220581, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.51890756302521, "calib/avg_num_step_conf": 4.81640625, "calib/ece": 0.4436106798418973, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": 0.010437271541452353, "calib/mean_conf": 0.9732549486166008, "calib/mu_c": 0.9781641791044775, "calib/mu_w": 0.9677269075630252, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4436106798418973, "calib/std_conf": 0.0633822962934615, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5934626291793313, "calib/step_q_c_n": 658.0, "calib/step_q_gap": 0.036869930048896604, "calib/step_q_w": 0.5565926991304347, "calib/step_q_w_n": 575.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2872.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 522.05078125, "completions/mean_terminated_length": 522.05078125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.04586666666666667, "grad_norm": 0.028802556917071342, "kl": 0.06124114990234375, "learning_rate": 4.361111111111112e-06, "loss": 0.0244, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032978661358356476, "mask/share_reasoning": 0.8547524213790894, "mask/share_step_conf": 0.11226895451545715, "num_tokens": 10307815.0, "reward": 1.203322410583496, "reward_std": 0.3031473457813263, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5487642288208008, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7777683734893799, "step": 43 }, { "adv/mean_abs_final_conf": 0.7487004399299622, "adv/mean_abs_reasoning": 0.46431854367256165, "adv/mean_abs_step_conf": 0.7619017362594604, "adv/ratio_final_to_reasoning": 1.6124715459521841, "adv/ratio_step_to_reasoning": 1.6409030968979672, "adv/std_final_conf": 0.897011935710907, "adv/std_reasoning": 0.7206284403800964, "adv/std_step_conf": 0.9361453056335449, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5503915451709429, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.5321837301587303, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9880952380952381, "calib/gap": 0.0023173171197556908, "calib/mean_conf": 0.9789297619047619, "calib/mu_c": 0.98020796460177, "calib/mu_w": 0.9778906474820143, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.531350396825397, "calib/std_conf": 0.01960602502812919, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.571471961102107, "calib/step_q_c_n": 617.0, "calib/step_q_gap": 0.04919096819430546, "calib/step_q_w": 0.5222809929078015, "calib/step_q_w_n": 705.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2365.0, "completions/max_terminated_length": 2365.0, "completions/mean_length": 521.46484375, "completions/mean_terminated_length": 523.5098266601562, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.046933333333333334, "grad_norm": 0.03608904406428337, "kl": 0.0600738525390625, "learning_rate": 4.333333333333334e-06, "loss": -0.0058, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030586857348680496, "mask/share_reasoning": 0.8524494171142578, "mask/share_step_conf": 0.1130574494600296, "num_tokens": 10547630.0, "reward": 1.150404453277588, "reward_std": 0.27872079610824585, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.46460020542144775, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7751356363296509, "step": 44 }, { "adv/mean_abs_final_conf": 0.7562808990478516, "adv/mean_abs_reasoning": 0.6538466215133667, "adv/mean_abs_step_conf": 0.7737356424331665, "adv/ratio_final_to_reasoning": 1.1566640771155086, "adv/ratio_step_to_reasoning": 1.1833595479048429, "adv/std_final_conf": 0.9265098571777344, "adv/std_reasoning": 0.8747087121009827, "adv/std_step_conf": 0.9363182783126831, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6100305110602593, "calib/avg_num_step_conf": 5.6015625, "calib/ece": 0.4179729761904761, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9642857142857143, "calib/gap": 0.03228314645308916, "calib/mean_conf": 0.9655920238095238, "calib/mu_c": 0.980196304347826, "calib/mu_w": 0.9479131578947368, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4179729761904761, "calib/std_conf": 0.09740204758261063, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5701288936548913, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.06116639604266594, "calib/step_q_w": 0.5089624976122253, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 539.61328125, "completions/mean_terminated_length": 539.61328125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.048, "grad_norm": 0.034370165318250656, "kl": 0.1003265380859375, "learning_rate": 4.305555555555556e-06, "loss": -0.0024, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032215941697359085, "mask/share_reasoning": 0.8426299095153809, "mask/share_step_conf": 0.12515416741371155, "num_tokens": 10790819.0, "reward": 1.2384848594665527, "reward_std": 0.3294350504875183, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5748702883720398, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7990965843200684, "step": 45 }, { "adv/mean_abs_final_conf": 0.69017094373703, "adv/mean_abs_reasoning": 0.5205807685852051, "adv/mean_abs_step_conf": 0.7614392638206482, "adv/ratio_final_to_reasoning": 1.3257711106246284, "adv/ratio_step_to_reasoning": 1.4626726720812797, "adv/std_final_conf": 0.859300971031189, "adv/std_reasoning": 0.775497555732727, "adv/std_step_conf": 0.9362326860427856, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5912034009156311, "calib/avg_num_step_conf": 5.80078125, "calib/ece": 0.4221120481927712, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9919678714859438, "calib/gap": 0.0053081098757360445, "calib/mean_conf": 0.9803449799196788, "calib/mu_c": 0.9826899280575541, "calib/mu_w": 0.977381818181818, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4221120481927712, "calib/std_conf": 0.015544703797893853, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5270673518742442, "calib/step_q_c_n": 827.0, "calib/step_q_gap": 0.011641060080931243, "calib/step_q_w": 0.515426291793313, "calib/step_q_w_n": 658.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2959.0, "completions/max_terminated_length": 2959.0, "completions/mean_length": 570.2265625, "completions/mean_terminated_length": 572.4627685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.04906666666666667, "grad_norm": 0.023864952847361565, "kl": 0.0557861328125, "learning_rate": 4.277777777777778e-06, "loss": 0.0192, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032648518681526184, "mask/share_reasoning": 0.8368463516235352, "mask/share_step_conf": 0.12659892439842224, "num_tokens": 11041565.0, "reward": 1.2223206758499146, "reward_std": 0.29786819219589233, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5617954730987549, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7898604273796082, "step": 46 }, { "adv/mean_abs_final_conf": 0.6998475193977356, "adv/mean_abs_reasoning": 0.4415852427482605, "adv/mean_abs_step_conf": 0.7632977962493896, "adv/ratio_final_to_reasoning": 1.5848525984295756, "adv/ratio_step_to_reasoning": 1.7285400922796044, "adv/std_final_conf": 0.8608697056770325, "adv/std_reasoning": 0.7013309001922607, "adv/std_step_conf": 0.9359551072120667, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6020815156425278, "calib/avg_num_step_conf": 5.69921875, "calib/ece": 0.4211152941176471, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.012229041505670946, "calib/mean_conf": 0.9779780392156863, "calib/mu_c": 0.9833971830985914, "calib/mu_w": 0.9711681415929204, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4211152941176471, "calib/std_conf": 0.06264071647787252, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5504315794191919, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.039325589913944525, "calib/step_q_w": 0.5111059895052473, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 528.3515625, "completions/mean_terminated_length": 530.423583984375, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.050133333333333335, "grad_norm": 0.02478746324777603, "kl": 0.06172943115234375, "learning_rate": 4.25e-06, "loss": 0.0007, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.030939877033233643, "mask/share_reasoning": 0.8441919684410095, "mask/share_step_conf": 0.12096185982227325, "num_tokens": 11282799.0, "reward": 1.256608486175537, "reward_std": 0.22781646251678467, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5757490396499634, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8136557936668396, "step": 47 }, { "adv/mean_abs_final_conf": 0.7483422756195068, "adv/mean_abs_reasoning": 0.5428138971328735, "adv/mean_abs_step_conf": 0.7645820379257202, "adv/ratio_final_to_reasoning": 1.378635071010945, "adv/ratio_step_to_reasoning": 1.4085528059694479, "adv/std_final_conf": 0.8885745406150818, "adv/std_reasoning": 0.7753939628601074, "adv/std_step_conf": 0.9362524151802063, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.546958006577283, "calib/avg_num_step_conf": 5.3359375, "calib/ece": 0.5160222222222224, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": -0.0037177333670629364, "calib/mean_conf": 0.976815873015873, "calib/mu_c": 0.9748389830508475, "calib/mu_w": 0.9785567164179104, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.5122920634920637, "calib/std_conf": 0.06378458341465201, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5602868462757529, "calib/step_q_c_n": 631.0, "calib/step_q_gap": 0.02064861498323589, "calib/step_q_w": 0.539638231292517, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2522.0, "completions/max_terminated_length": 2522.0, "completions/mean_length": 504.9140625, "completions/mean_terminated_length": 504.9140625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.0512, "grad_norm": 0.026672683656215668, "kl": 0.064849853515625, "learning_rate": 4.222222222222223e-06, "loss": 0.0202, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0343143455684185, "mask/share_reasoning": 0.838284969329834, "mask/share_step_conf": 0.1274007260799408, "num_tokens": 11515745.0, "reward": 1.1532585620880127, "reward_std": 0.2807466983795166, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.478743314743042, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7697461843490601, "step": 48 }, { "adv/mean_abs_final_conf": 0.692025899887085, "adv/mean_abs_reasoning": 0.4786022901535034, "adv/mean_abs_step_conf": 0.7627895474433899, "adv/ratio_final_to_reasoning": 1.44593102482884, "adv/ratio_step_to_reasoning": 1.5937858283100532, "adv/std_final_conf": 0.8504143953323364, "adv/std_reasoning": 0.7393408417701721, "adv/std_step_conf": 0.9362682104110718, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5810846386731605, "calib/avg_num_step_conf": 5.66015625, "calib/ece": 0.4124707228915664, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.002727752402263861, "calib/mean_conf": 0.9827518473895583, "calib/mu_c": 0.9839240140845068, "calib/mu_w": 0.9811962616822429, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4124707228915664, "calib/std_conf": 0.012489955924533542, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5635376092544987, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.05391768873780722, "calib/step_q_w": 0.5096199205166915, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2386.0, "completions/max_terminated_length": 2386.0, "completions/mean_length": 520.98046875, "completions/mean_terminated_length": 520.98046875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.05226666666666667, "grad_norm": 0.023565007373690605, "kl": 0.06116485595703125, "learning_rate": 4.194444444444445e-06, "loss": 0.0253, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03131241351366043, "mask/share_reasoning": 0.8423235416412354, "mask/share_step_conf": 0.12636405229568481, "num_tokens": 11753652.0, "reward": 1.2214174270629883, "reward_std": 0.2773900628089905, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5696588754653931, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.784244179725647, "step": 49 }, { "adv/mean_abs_final_conf": 0.7077445983886719, "adv/mean_abs_reasoning": 0.46105533838272095, "adv/mean_abs_step_conf": 0.7690777778625488, "adv/ratio_final_to_reasoning": 1.53505347291126, "adv/ratio_step_to_reasoning": 1.6680812775323277, "adv/std_final_conf": 0.882941484451294, "adv/std_reasoning": 0.7205641269683838, "adv/std_step_conf": 0.9361612200737, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5750131027253668, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.35825490196078436, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9921568627450981, "calib/gap": 0.008864976415094206, "calib/mean_conf": 0.9800588235294118, "calib/mu_c": 0.9833962264150943, "calib/mu_w": 0.9745312500000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35739215686274517, "calib/std_conf": 0.0451282320073157, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5687125153374233, "calib/step_q_c_n": 815.0, "calib/step_q_gap": 0.042891514232450856, "calib/step_q_w": 0.5258210011049724, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 502.16796875, "completions/mean_terminated_length": 504.1372985839844, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.05333333333333334, "grad_norm": 0.027782263234257698, "kl": 0.0827178955078125, "learning_rate": 4.166666666666667e-06, "loss": -0.0097, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03265595808625221, "mask/share_reasoning": 0.8410335183143616, "mask/share_step_conf": 0.12240426242351532, "num_tokens": 11987567.0, "reward": 1.2991454601287842, "reward_std": 0.26816582679748535, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6338658928871155, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8208842873573303, "step": 50 }, { "adv/mean_abs_final_conf": 0.6769541501998901, "adv/mean_abs_reasoning": 0.4819958209991455, "adv/mean_abs_step_conf": 0.7450014352798462, "adv/ratio_final_to_reasoning": 1.404481368316864, "adv/ratio_step_to_reasoning": 1.5456595323492792, "adv/std_final_conf": 0.8652604222297668, "adv/std_reasoning": 0.7575814723968506, "adv/std_step_conf": 0.9360484480857849, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6032391318602058, "calib/avg_num_step_conf": 5.18359375, "calib/ece": 0.3813896825396826, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": 0.00809154809520718, "calib/mean_conf": 0.9805960317460318, "calib/mu_c": 0.9838390728476822, "calib/mu_w": 0.9757475247524751, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3813896825396826, "calib/std_conf": 0.016892831675913256, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5683220778061225, "calib/step_q_c_n": 784.0, "calib/step_q_gap": 0.02109250322048717, "calib/step_q_w": 0.5472295745856354, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2846.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 536.4375, "completions/mean_terminated_length": 536.4375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.0544, "grad_norm": 0.02513463795185089, "kl": 0.06470489501953125, "learning_rate": 4.138888888888889e-06, "loss": 0.025, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032385122030973434, "mask/share_reasoning": 0.8494609594345093, "mask/share_step_conf": 0.11815392971038818, "num_tokens": 12234191.0, "reward": 1.2722687721252441, "reward_std": 0.27277636528015137, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6083289384841919, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8106822967529297, "step": 51 }, { "adv/mean_abs_final_conf": 0.6640352606773376, "adv/mean_abs_reasoning": 0.4049059748649597, "adv/mean_abs_step_conf": 0.7678852081298828, "adv/ratio_final_to_reasoning": 1.639973973954818, "adv/ratio_step_to_reasoning": 1.8964531417101969, "adv/std_final_conf": 0.8312138915061951, "adv/std_reasoning": 0.6816580891609192, "adv/std_step_conf": 0.935936689376831, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6188590116279069, "calib/avg_num_step_conf": 5.21875, "calib/ece": 0.295332142857143, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9880952380952381, "calib/gap": 0.015278343023255814, "calib/mean_conf": 0.9778718253968255, "calib/mu_c": 0.9827220930232557, "calib/mu_w": 0.9674437499999999, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.295332142857143, "calib/std_conf": 0.039204568791510054, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5293833658008659, "calib/step_q_c_n": 924.0, "calib/step_q_gap": 0.015291933762030863, "calib/step_q_w": 0.514091432038835, "calib/step_q_w_n": 412.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 505.328125, "completions/mean_terminated_length": 505.328125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.055466666666666664, "grad_norm": 0.03392941132187843, "kl": 0.07430267333984375, "learning_rate": 4.111111111111111e-06, "loss": 0.0163, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033117808401584625, "mask/share_reasoning": 0.8461878299713135, "mask/share_step_conf": 0.12069441378116608, "num_tokens": 12471507.0, "reward": 1.3340625762939453, "reward_std": 0.2229156792163849, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6863217949867249, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8256672024726868, "step": 52 }, { "adv/mean_abs_final_conf": 0.6531897783279419, "adv/mean_abs_reasoning": 0.4806157946586609, "adv/mean_abs_step_conf": 0.746953547000885, "adv/ratio_final_to_reasoning": 1.359068481700326, "adv/ratio_step_to_reasoning": 1.5541593832374576, "adv/std_final_conf": 0.8576918244361877, "adv/std_reasoning": 0.7575007081031799, "adv/std_step_conf": 0.9360480904579163, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5677368086458995, "calib/avg_num_step_conf": 5.58203125, "calib/ece": 0.41577096234644195, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.005495318071715105, "calib/mean_conf": 0.9809883536507897, "calib/mu_c": 0.9833776223776223, "calib/mu_w": 0.9778823043059072, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41577096234644195, "calib/std_conf": 0.01716129647237015, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5705359947643979, "calib/step_q_c_n": 764.0, "calib/step_q_gap": 0.061252509893888196, "calib/step_q_w": 0.5092834848705097, "calib/step_q_w_n": 665.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2682.0, "completions/max_terminated_length": 2682.0, "completions/mean_length": 497.57421875, "completions/mean_terminated_length": 497.57421875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.05653333333333333, "grad_norm": 0.02414063923060894, "kl": 0.0670623779296875, "learning_rate": 4.083333333333334e-06, "loss": -0.0015, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03233794867992401, "mask/share_reasoning": 0.843818187713623, "mask/share_step_conf": 0.12384383380413055, "num_tokens": 12704710.0, "reward": 1.2482346296310425, "reward_std": 0.2573202848434448, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5768751502037048, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8055001497268677, "step": 53 }, { "adv/mean_abs_final_conf": 0.593941330909729, "adv/mean_abs_reasoning": 0.34782564640045166, "adv/mean_abs_step_conf": 0.7776609063148499, "adv/ratio_final_to_reasoning": 1.707583489188501, "adv/ratio_step_to_reasoning": 2.2357779374886255, "adv/std_final_conf": 0.7841689586639404, "adv/std_reasoning": 0.640178918838501, "adv/std_step_conf": 0.9358230829238892, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6308143910883637, "calib/avg_num_step_conf": 5.14453125, "calib/ece": 0.2601203921568628, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9764705882352941, "calib/gap": 0.049350526870389766, "calib/mean_conf": 0.9674145098039215, "calib/mu_c": 0.9815423076923078, "calib/mu_w": 0.932191780821918, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.256904705882353, "calib/std_conf": 0.11191986062013434, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5804673228346456, "calib/step_q_c_n": 889.0, "calib/step_q_gap": 0.06184279012436533, "calib/step_q_w": 0.5186245327102803, "calib/step_q_w_n": 428.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 450.76171875, "completions/mean_terminated_length": 452.5294494628906, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.0576, "grad_norm": 0.027992311865091324, "kl": 0.06845855712890625, "learning_rate": 4.055555555555556e-06, "loss": -0.0377, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.036233823746442795, "mask/share_reasoning": 0.8332265615463257, "mask/share_step_conf": 0.12663336098194122, "num_tokens": 12926337.0, "reward": 1.3820880651474, "reward_std": 0.20170414447784424, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7360745668411255, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8433475494384766, "step": 54 }, { "adv/mean_abs_final_conf": 0.7017855048179626, "adv/mean_abs_reasoning": 0.5232642889022827, "adv/mean_abs_step_conf": 0.7613639235496521, "adv/ratio_final_to_reasoning": 1.3411683535488086, "adv/ratio_step_to_reasoning": 1.4550274874420743, "adv/std_final_conf": 0.8780732154846191, "adv/std_reasoning": 0.7754185795783997, "adv/std_step_conf": 0.9361396431922913, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6526717557251909, "calib/avg_num_step_conf": 4.9140625, "calib/ece": 0.4446177165354331, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.952755905511811, "calib/gap": 0.04878835102091483, "calib/mean_conf": 0.9603657480314961, "calib/mu_c": 0.983991603053435, "calib/mu_w": 0.9352032520325202, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4446177165354331, "calib/std_conf": 0.12634521874310717, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5507576452599389, "calib/step_q_c_n": 654.0, "calib/step_q_gap": 0.030742413471859353, "calib/step_q_w": 0.5200152317880795, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 456.1875, "completions/mean_terminated_length": 457.97650146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.058666666666666666, "grad_norm": 0.03539075702428818, "kl": 0.076690673828125, "learning_rate": 4.027777777777779e-06, "loss": -0.0211, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03572503477334976, "mask/share_reasoning": 0.8378803730010986, "mask/share_step_conf": 0.12248837947845459, "num_tokens": 13150945.0, "reward": 1.2235780954360962, "reward_std": 0.28690779209136963, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5558318495750427, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7964434027671814, "step": 55 }, { "adv/mean_abs_final_conf": 0.7039684057235718, "adv/mean_abs_reasoning": 0.49995529651641846, "adv/mean_abs_step_conf": 0.7258338928222656, "adv/ratio_final_to_reasoning": 1.4080627020629104, "adv/ratio_step_to_reasoning": 1.4517975864636716, "adv/std_final_conf": 0.8813683986663818, "adv/std_reasoning": 0.7575625777244568, "adv/std_step_conf": 0.9361122846603394, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5774436090225565, "calib/avg_num_step_conf": 5.57421875, "calib/ece": 0.5015957599712543, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9762845849802372, "calib/gap": 0.010814957849168527, "calib/mean_conf": 0.975904060366511, "calib/mu_c": 0.981589393939394, "calib/mu_w": 0.9707744360902255, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5015957599712543, "calib/std_conf": 0.06488835931336258, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5589189078338228, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.04176423764912618, "calib/step_q_w": 0.5171546701846966, "calib/step_q_w_n": 758.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 486.2578125, "completions/mean_terminated_length": 488.16473388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.05973333333333333, "grad_norm": 0.03635377436876297, "kl": 0.07538604736328125, "learning_rate": 4.000000000000001e-06, "loss": -0.034, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03347313404083252, "mask/share_reasoning": 0.8359185457229614, "mask/share_step_conf": 0.12670202553272247, "num_tokens": 13382267.0, "reward": 1.2034459114074707, "reward_std": 0.26581820845603943, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.49430450797080994, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8109811544418335, "step": 56 }, { "adv/mean_abs_final_conf": 0.724047064781189, "adv/mean_abs_reasoning": 0.6356273293495178, "adv/mean_abs_step_conf": 0.7706122398376465, "adv/ratio_final_to_reasoning": 1.139106251963957, "adv/ratio_step_to_reasoning": 1.2123648626409256, "adv/std_final_conf": 0.89552241563797, "adv/std_reasoning": 0.8428938388824463, "adv/std_step_conf": 0.936430037021637, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6060245611779079, "calib/avg_num_step_conf": 4.97265625, "calib/ece": 0.38820039370078746, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": 0.00575520478364322, "calib/mean_conf": 0.9794602362204724, "calib/mu_c": 0.9817940397350994, "calib/mu_w": 0.9760388349514562, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.386586220472441, "calib/std_conf": 0.031379416016721846, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5748436361185983, "calib/step_q_c_n": 742.0, "calib/step_q_gap": 0.03467443459768571, "calib/step_q_w": 0.5401692015209126, "calib/step_q_w_n": 526.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 479.8671875, "completions/mean_terminated_length": 479.8671875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.0608, "grad_norm": 0.030560489743947983, "kl": 0.06566619873046875, "learning_rate": 3.972222222222223e-06, "loss": -0.014, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03349709138274193, "mask/share_reasoning": 0.8486497402191162, "mask/share_step_conf": 0.11785320192575455, "num_tokens": 13611905.0, "reward": 1.2591137886047363, "reward_std": 0.3345072567462921, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.5999172925949097, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8017332553863525, "step": 57 }, { "adv/mean_abs_final_conf": 0.7447444200515747, "adv/mean_abs_reasoning": 0.6445914506912231, "adv/mean_abs_step_conf": 0.7659875154495239, "adv/ratio_final_to_reasoning": 1.155374337113769, "adv/ratio_step_to_reasoning": 1.1883302433318383, "adv/std_final_conf": 0.8873608112335205, "adv/std_reasoning": 0.8429160118103027, "adv/std_step_conf": 0.9360173940658569, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4945436507936508, "calib/avg_num_step_conf": 5.984375, "calib/ece": 0.46773477690288734, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.968503937007874, "calib/gap": 0.0022207919973545875, "calib/mean_conf": 0.971671784776903, "calib/mu_c": 0.9727734375000001, "calib/mu_w": 0.9705526455026455, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.46773477690288734, "calib/std_conf": 0.04788829024024461, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5344919556171983, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.057724954877371004, "calib/step_q_w": 0.4767670007398273, "calib/step_q_w_n": 811.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2174.0, "completions/max_terminated_length": 2174.0, "completions/mean_length": 536.26953125, "completions/mean_terminated_length": 538.37255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.06186666666666667, "grad_norm": 0.03641160577535629, "kl": 0.090789794921875, "learning_rate": 3.944444444444445e-06, "loss": -0.0299, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.030999979004263878, "mask/share_reasoning": 0.8410400152206421, "mask/share_step_conf": 0.12405376881361008, "num_tokens": 13855510.0, "reward": 1.2054097652435303, "reward_std": 0.2900436520576477, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5219812989234924, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7959815263748169, "step": 58 }, { "adv/mean_abs_final_conf": 0.7536922097206116, "adv/mean_abs_reasoning": 0.5401763916015625, "adv/mean_abs_step_conf": 0.7711378335952759, "adv/ratio_final_to_reasoning": 1.3952705476187113, "adv/ratio_step_to_reasoning": 1.4275667089206519, "adv/std_final_conf": 0.8953720927238464, "adv/std_reasoning": 0.7576656937599182, "adv/std_step_conf": 0.9361147284507751, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5574965439235893, "calib/avg_num_step_conf": 5.01171875, "calib/ece": 0.4041764705882354, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9764705882352941, "calib/gap": 0.0017679401784591597, "calib/mean_conf": 0.9743333333333333, "calib/mu_c": 0.9750890410958905, "calib/mu_w": 0.9733211009174313, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4029803921568628, "calib/std_conf": 0.029456559352526033, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5775435356200528, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.01650467847719561, "calib/step_q_w": 0.5610388571428572, "calib/step_q_w_n": 525.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 475.5859375, "completions/mean_terminated_length": 477.4510192871094, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.06293333333333333, "grad_norm": 0.03324250504374504, "kl": 0.063262939453125, "learning_rate": 3.916666666666667e-06, "loss": -0.0566, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034234900027513504, "mask/share_reasoning": 0.8443710207939148, "mask/share_step_conf": 0.1174878254532814, "num_tokens": 14083508.0, "reward": 1.2565596103668213, "reward_std": 0.2741585969924927, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5915107727050781, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8041635751724243, "step": 59 }, { "adv/mean_abs_final_conf": 0.7119778394699097, "adv/mean_abs_reasoning": 0.5205633640289307, "adv/mean_abs_step_conf": 0.7670281529426575, "adv/ratio_final_to_reasoning": 1.3677063901683657, "adv/ratio_step_to_reasoning": 1.4734578073381848, "adv/std_final_conf": 0.8901903033256531, "adv/std_reasoning": 0.7752787470817566, "adv/std_step_conf": 0.9362204074859619, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6234294161123428, "calib/avg_num_step_conf": 4.75, "calib/ece": 0.4560941176470589, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9686274509803922, "calib/gap": 0.006737620103473785, "calib/mean_conf": 0.9717803921568628, "calib/mu_c": 0.9750303030303031, "calib/mu_w": 0.9682926829268294, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4551137254901962, "calib/std_conf": 0.02803832824593159, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5701520826666667, "calib/step_q_c_n": 612.0, "calib/step_q_gap": 0.04724744690507732, "calib/step_q_w": 0.5229046357615894, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 462.03515625, "completions/mean_terminated_length": 463.8470764160156, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.064, "grad_norm": 0.038515787571668625, "kl": 0.0582733154296875, "learning_rate": 3.88888888888889e-06, "loss": -0.0229, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0341600701212883, "mask/share_reasoning": 0.8478833436965942, "mask/share_step_conf": 0.11405040323734283, "num_tokens": 14310645.0, "reward": 1.2330572605133057, "reward_std": 0.2662425935268402, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.544211208820343, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8097797632217407, "step": 60 }, { "adv/mean_abs_final_conf": 0.7021837830543518, "adv/mean_abs_reasoning": 0.49892711639404297, "adv/mean_abs_step_conf": 0.7648937702178955, "adv/ratio_final_to_reasoning": 1.4073874920435887, "adv/ratio_step_to_reasoning": 1.533077167154405, "adv/std_final_conf": 0.8997812867164612, "adv/std_reasoning": 0.757564902305603, "adv/std_step_conf": 0.9359845519065857, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5786557855626326, "calib/avg_num_step_conf": 4.72265625, "calib/ece": 0.3507233201581028, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": 0.011121815286624037, "calib/mean_conf": 0.9712766798418973, "calib/mu_c": 0.975496815286624, "calib/mu_w": 0.964375, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3507233201581028, "calib/std_conf": 0.029815283266292767, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5859243101182655, "calib/step_q_c_n": 761.0, "calib/step_q_gap": 0.030174310118265546, "calib/step_q_w": 0.55575, "calib/step_q_w_n": 448.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 426.35546875, "completions/mean_terminated_length": 426.35546875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.06506666666666666, "grad_norm": 0.026272548362612724, "kl": 0.078460693359375, "learning_rate": 3.861111111111112e-06, "loss": -0.0135, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03931958228349686, "mask/share_reasoning": 0.834287166595459, "mask/share_step_conf": 0.12639322876930237, "num_tokens": 14523856.0, "reward": 1.2799744606018066, "reward_std": 0.28759801387786865, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6383060812950134, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8002746105194092, "step": 61 }, { "adv/mean_abs_final_conf": 0.7885167002677917, "adv/mean_abs_reasoning": 0.6170656085014343, "adv/mean_abs_step_conf": 0.778965950012207, "adv/ratio_final_to_reasoning": 1.2778490478228604, "adv/ratio_step_to_reasoning": 1.2623713577296156, "adv/std_final_conf": 0.9230625629425049, "adv/std_reasoning": 0.8100219964981079, "adv/std_step_conf": 0.9363936185836792, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.552384, "calib/avg_num_step_conf": 5.3125, "calib/ece": 0.45596000000000014, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.928, "calib/gap": 0.0196160000000003, "calib/mean_conf": 0.95396, "calib/mu_c": 0.9637680000000002, "calib/mu_w": 0.9441519999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45496000000000014, "calib/std_conf": 0.0837418318404846, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5657848739495798, "calib/step_q_c_n": 595.0, "calib/step_q_gap": 0.0682291092436974, "calib/step_q_w": 0.49755576470588236, "calib/step_q_w_n": 765.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 490.96875, "completions/mean_terminated_length": 494.83465576171875, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.06613333333333334, "grad_norm": 0.036133233457803726, "kl": 0.0560760498046875, "learning_rate": 3.833333333333334e-06, "loss": 0.0055, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033206403255462646, "mask/share_reasoning": 0.8455889225006104, "mask/share_step_conf": 0.11339214444160461, "num_tokens": 14756624.0, "reward": 1.1832115650177002, "reward_std": 0.3248516917228699, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5339019894599915, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7697761058807373, "step": 62 }, { "adv/mean_abs_final_conf": 0.7545422911643982, "adv/mean_abs_reasoning": 0.5572085380554199, "adv/mean_abs_step_conf": 0.7710795998573303, "adv/ratio_final_to_reasoning": 1.3541470376560372, "adv/ratio_step_to_reasoning": 1.383825887787525, "adv/std_final_conf": 0.9127007722854614, "adv/std_reasoning": 0.7927935123443604, "adv/std_step_conf": 0.936107873916626, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6145327858653726, "calib/avg_num_step_conf": 4.68359375, "calib/ece": 0.40417254901960775, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9137254901960784, "calib/gap": 0.01083184023889483, "calib/mean_conf": 0.9480156862745098, "calib/mu_c": 0.9528581560283687, "calib/mu_w": 0.9420263157894738, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39962352941176466, "calib/std_conf": 0.05407873480271483, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5841763826606875, "calib/step_q_c_n": 669.0, "calib/step_q_gap": 0.05211977888710262, "calib/step_q_w": 0.5320566037735849, "calib/step_q_w_n": 530.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 488.4609375, "completions/mean_terminated_length": 490.3764953613281, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.0672, "grad_norm": 0.025932716205716133, "kl": 0.0595855712890625, "learning_rate": 3.8055555555555556e-06, "loss": -0.0468, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0346166267991066, "mask/share_reasoning": 0.8508518934249878, "mask/share_step_conf": 0.1106252670288086, "num_tokens": 14990310.0, "reward": 1.2687819004058838, "reward_std": 0.27470365166664124, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5966545343399048, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8161576986312866, "step": 63 }, { "adv/mean_abs_final_conf": 0.7520545125007629, "adv/mean_abs_reasoning": 0.521262526512146, "adv/mean_abs_step_conf": 0.7765277624130249, "adv/ratio_final_to_reasoning": 1.4427557598143192, "adv/ratio_step_to_reasoning": 1.4897057104965916, "adv/std_final_conf": 0.9201980233192444, "adv/std_reasoning": 0.7753161787986755, "adv/std_step_conf": 0.9359229803085327, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5721966205837173, "calib/avg_num_step_conf": 4.26953125, "calib/ece": 0.3141129921259842, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.905511811023622, "calib/gap": 0.01033412809724188, "calib/mean_conf": 0.9356901574803149, "calib/mu_c": 0.9394739130434783, "calib/mu_w": 0.9291397849462364, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3079724409448818, "calib/std_conf": 0.10479543450027383, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5752903790087464, "calib/step_q_c_n": 686.0, "calib/step_q_gap": 0.017683499401866754, "calib/step_q_w": 0.5576068796068796, "calib/step_q_w_n": 407.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2185.0, "completions/max_terminated_length": 2185.0, "completions/mean_length": 434.5546875, "completions/mean_terminated_length": 436.25885009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.06826666666666667, "grad_norm": 0.035325441509485245, "kl": 0.06046295166015625, "learning_rate": 3.777777777777778e-06, "loss": 0.0079, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0373111218214035, "mask/share_reasoning": 0.8508127927780151, "mask/share_step_conf": 0.10796987265348434, "num_tokens": 15205332.0, "reward": 1.3001196384429932, "reward_std": 0.26134374737739563, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6653909087181091, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8053147792816162, "step": 64 }, { "adv/mean_abs_final_conf": 0.7708741426467896, "adv/mean_abs_reasoning": 0.4588015675544739, "adv/mean_abs_step_conf": 0.7731654047966003, "adv/ratio_final_to_reasoning": 1.6801907342116111, "adv/ratio_step_to_reasoning": 1.6851847497334493, "adv/std_final_conf": 0.9213289618492126, "adv/std_reasoning": 0.720514178276062, "adv/std_step_conf": 0.936091959476471, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5939365671641791, "calib/avg_num_step_conf": 4.12109375, "calib/ece": 0.4219133858267716, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9251968503937008, "calib/gap": 0.02082661691542287, "calib/mean_conf": 0.947503937007874, "calib/mu_c": 0.9573432835820895, "calib/mu_w": 0.9365166666666667, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4209291338582677, "calib/std_conf": 0.07623801044405916, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5993211678832117, "calib/step_q_c_n": 548.0, "calib/step_q_gap": 0.03390381088123928, "calib/step_q_w": 0.5654173570019724, "calib/step_q_w_n": 507.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 407.1328125, "completions/mean_terminated_length": 408.72943115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.06933333333333333, "grad_norm": 0.03840414434671402, "kl": 0.06768035888671875, "learning_rate": 3.7500000000000005e-06, "loss": -0.0382, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.039253175258636475, "mask/share_reasoning": 0.8444764614105225, "mask/share_step_conf": 0.11236413568258286, "num_tokens": 15414582.0, "reward": 1.2306933403015137, "reward_std": 0.23277725279331207, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5744519829750061, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7915141582489014, "step": 65 }, { "adv/mean_abs_final_conf": 0.7484292984008789, "adv/mean_abs_reasoning": 0.489560604095459, "adv/mean_abs_step_conf": 0.7633772492408752, "adv/ratio_final_to_reasoning": 1.5287776265897068, "adv/ratio_step_to_reasoning": 1.559311028818048, "adv/std_final_conf": 0.9219995141029358, "adv/std_reasoning": 0.7574658989906311, "adv/std_step_conf": 0.935945451259613, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6697958669354838, "calib/avg_num_step_conf": 4.7109375, "calib/ece": 0.4382460317460317, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7976190476190477, "calib/gap": 0.03244002016129033, "calib/mean_conf": 0.9261031746031746, "calib/mu_c": 0.9425806451612903, "calib/mu_w": 0.9101406249999999, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43614285714285717, "calib/std_conf": 0.07387616508522672, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5608768670309654, "calib/step_q_c_n": 549.0, "calib/step_q_gap": 0.04943090051650567, "calib/step_q_w": 0.5114459665144597, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2763.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 499.89453125, "completions/mean_terminated_length": 501.85491943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.0704, "grad_norm": 0.0336519218981266, "kl": 0.060970306396484375, "learning_rate": 3.7222222222222225e-06, "loss": 0.0229, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035479508340358734, "mask/share_reasoning": 0.8528700470924377, "mask/share_step_conf": 0.10774420201778412, "num_tokens": 15648907.0, "reward": 1.227863073348999, "reward_std": 0.23462948203086853, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.5634865760803223, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7992448806762695, "step": 66 }, { "adv/mean_abs_final_conf": 0.7395097613334656, "adv/mean_abs_reasoning": 0.4111238718032837, "adv/mean_abs_step_conf": 0.7691671848297119, "adv/ratio_final_to_reasoning": 1.798751695175972, "adv/ratio_step_to_reasoning": 1.870889134839016, "adv/std_final_conf": 0.9214187264442444, "adv/std_reasoning": 0.7012759447097778, "adv/std_step_conf": 0.936107337474823, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6463777285669092, "calib/avg_num_step_conf": 4.25390625, "calib/ece": 0.3530314960629921, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8188976377952756, "calib/gap": 0.03838721923441957, "calib/mean_conf": 0.9203543307086614, "calib/mu_c": 0.9368275862068965, "calib/mu_w": 0.898440366972477, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.351259842519685, "calib/std_conf": 0.1045672768825448, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6015963963963964, "calib/step_q_c_n": 555.0, "calib/step_q_gap": 0.09310605868532695, "calib/step_q_w": 0.5084903377110694, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 457.13671875, "completions/mean_terminated_length": 458.929443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.07146666666666666, "grad_norm": 0.030173419043421745, "kl": 0.05756378173828125, "learning_rate": 3.694444444444445e-06, "loss": -0.0259, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03676186501979828, "mask/share_reasoning": 0.8619298934936523, "mask/share_step_conf": 0.0974019393324852, "num_tokens": 15870942.0, "reward": 1.2942092418670654, "reward_std": 0.24686667323112488, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6318415999412537, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.822819709777832, "step": 67 }, { "adv/mean_abs_final_conf": 0.7641202211380005, "adv/mean_abs_reasoning": 0.40942007303237915, "adv/mean_abs_step_conf": 0.7585666179656982, "adv/ratio_final_to_reasoning": 1.8663477232040104, "adv/ratio_step_to_reasoning": 1.8527831631394065, "adv/std_final_conf": 0.9237704873085022, "adv/std_reasoning": 0.6815453171730042, "adv/std_step_conf": 0.9358956217765808, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6691687344913152, "calib/avg_num_step_conf": 4.2421875, "calib/ece": 0.40511811023622046, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7598425196850394, "calib/gap": 0.053322580645161155, "calib/mean_conf": 0.9119685039370078, "calib/mu_c": 0.9380000000000001, "calib/mu_w": 0.8846774193548389, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4026377952755905, "calib/std_conf": 0.11687896269809996, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5686440162271805, "calib/step_q_c_n": 493.0, "calib/step_q_gap": 0.05511568570441483, "calib/step_q_w": 0.5135283305227657, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2353.0, "completions/max_terminated_length": 2353.0, "completions/mean_length": 437.6640625, "completions/mean_terminated_length": 437.6640625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.07253333333333334, "grad_norm": 0.03539825975894928, "kl": 0.06148529052734375, "learning_rate": 3.6666666666666666e-06, "loss": -0.0165, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.040313176810741425, "mask/share_reasoning": 0.8507459759712219, "mask/share_step_conf": 0.10894083976745605, "num_tokens": 16087072.0, "reward": 1.240929365158081, "reward_std": 0.21346575021743774, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5982882976531982, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7917851805686951, "step": 68 }, { "adv/mean_abs_final_conf": 0.7670649886131287, "adv/mean_abs_reasoning": 0.5342639088630676, "adv/mean_abs_step_conf": 0.7311890721321106, "adv/ratio_final_to_reasoning": 1.435741729673393, "adv/ratio_step_to_reasoning": 1.3685915518570337, "adv/std_final_conf": 0.9319685101509094, "adv/std_reasoning": 0.7928503155708313, "adv/std_step_conf": 0.9363734126091003, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6099412182675538, "calib/avg_num_step_conf": 4.140625, "calib/ece": 0.44009600000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.62, "calib/gap": 0.04333621859053027, "calib/mean_conf": 0.891296, "calib/mu_c": 0.9150442477876106, "calib/mu_w": 0.8717080291970803, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.4396960000000001, "calib/std_conf": 0.11289492629874914, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5273900862068965, "calib/step_q_c_n": 464.0, "calib/step_q_gap": 0.033377313097652905, "calib/step_q_w": 0.49401277310924363, "calib/step_q_w_n": 595.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 507.69921875, "completions/mean_terminated_length": 513.7193603515625, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.0736, "grad_norm": 0.03430643677711487, "kl": 0.05869293212890625, "learning_rate": 3.638888888888889e-06, "loss": -0.1387, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03412723168730736, "mask/share_reasoning": 0.8615804314613342, "mask/share_step_conf": 0.0925736129283905, "num_tokens": 16321539.0, "reward": 1.1938366889953613, "reward_std": 0.3026812672615051, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.5487052202224731, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7792496681213379, "step": 69 }, { "adv/mean_abs_final_conf": 0.7933931350708008, "adv/mean_abs_reasoning": 0.4543910026550293, "adv/mean_abs_step_conf": 0.7425044775009155, "adv/ratio_final_to_reasoning": 1.7460581975324447, "adv/ratio_step_to_reasoning": 1.6340650962770495, "adv/std_final_conf": 0.925523579120636, "adv/std_reasoning": 0.7205556035041809, "adv/std_step_conf": 0.9361271858215332, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7422693750808642, "calib/avg_num_step_conf": 4.26953125, "calib/ece": 0.4013895582329316, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6104417670682731, "calib/gap": 0.1102944753525682, "calib/mean_conf": 0.8707871485943776, "calib/mu_c": 0.9288135593220338, "calib/mu_w": 0.8185190839694656, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39914056224899586, "calib/std_conf": 0.16316327987356224, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5501617107942974, "calib/step_q_c_n": 491.0, "calib/step_q_gap": 0.09954543172453006, "calib/step_q_w": 0.45061627906976737, "calib/step_q_w_n": 602.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2272.0, "completions/max_terminated_length": 2272.0, "completions/mean_length": 477.78125, "completions/mean_terminated_length": 479.6549377441406, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.07466666666666667, "grad_norm": 0.04866304621100426, "kl": 0.065460205078125, "learning_rate": 3.6111111111111115e-06, "loss": -0.0177, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0366334542632103, "mask/share_reasoning": 0.8547972440719604, "mask/share_step_conf": 0.10466301441192627, "num_tokens": 16550843.0, "reward": 1.241321325302124, "reward_std": 0.2550710141658783, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.6034541130065918, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7966254353523254, "step": 70 }, { "adv/mean_abs_final_conf": 0.7791687250137329, "adv/mean_abs_reasoning": 0.6073622703552246, "adv/mean_abs_step_conf": 0.7619677186012268, "adv/ratio_final_to_reasoning": 1.2828731105704423, "adv/ratio_step_to_reasoning": 1.2545522759515157, "adv/std_final_conf": 0.9324753284454346, "adv/std_reasoning": 0.8266918659210205, "adv/std_step_conf": 0.9363065361976624, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5827046035805626, "calib/avg_num_step_conf": 4.828125, "calib/ece": 0.35780478087649403, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6215139442231076, "calib/gap": 0.012531713554987278, "calib/mean_conf": 0.8823466135458167, "calib/mu_c": 0.8880882352941176, "calib/mu_w": 0.8755565217391303, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3491593625498008, "calib/std_conf": 0.12245338725525283, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4922910743801653, "calib/step_q_c_n": 605.0, "calib/step_q_gap": 0.04487443412659947, "calib/step_q_w": 0.4474166402535658, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 443.1484375, "completions/mean_terminated_length": 448.4031677246094, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.07573333333333333, "grad_norm": 0.03445587679743767, "kl": 0.0860137939453125, "learning_rate": 3.5833333333333335e-06, "loss": -0.1582, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03762516379356384, "mask/share_reasoning": 0.8341405987739563, "mask/share_step_conf": 0.11651550978422165, "num_tokens": 16768697.0, "reward": 1.251643180847168, "reward_std": 0.3011441230773926, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6108843088150024, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7954198122024536, "step": 71 }, { "adv/mean_abs_final_conf": 0.7744049429893494, "adv/mean_abs_reasoning": 0.5842318534851074, "adv/mean_abs_step_conf": 0.7654081583023071, "adv/ratio_final_to_reasoning": 1.3255096215137978, "adv/ratio_step_to_reasoning": 1.3101102819649975, "adv/std_final_conf": 0.9310498833656311, "adv/std_reasoning": 0.8099108338356018, "adv/std_step_conf": 0.9359899163246155, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6381246490735542, "calib/avg_num_step_conf": 4.42578125, "calib/ece": 0.34905511811023626, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6141732283464567, "calib/gap": 0.06327905670971357, "calib/mean_conf": 0.8756692913385827, "calib/mu_c": 0.9048175182481751, "calib/mu_w": 0.8415384615384616, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3426771653543308, "calib/std_conf": 0.1406961911028027, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5198737322893363, "calib/step_q_c_n": 596.0, "calib/step_q_gap": 0.06832252186103088, "calib/step_q_w": 0.4515512104283054, "calib/step_q_w_n": 537.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2565.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 444.05078125, "completions/mean_terminated_length": 444.05078125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.0768, "grad_norm": 0.04978213459253311, "kl": 0.06729888916015625, "learning_rate": 3.555555555555556e-06, "loss": -0.0508, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03684322535991669, "mask/share_reasoning": 0.8544652462005615, "mask/share_step_conf": 0.10869147628545761, "num_tokens": 16986782.0, "reward": 1.319218635559082, "reward_std": 0.25763240456581116, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6421718597412109, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8457888960838318, "step": 72 }, { "adv/mean_abs_final_conf": 0.7550079226493835, "adv/mean_abs_reasoning": 0.4658733606338501, "adv/mean_abs_step_conf": 0.7642512321472168, "adv/ratio_final_to_reasoning": 1.620629094615214, "adv/ratio_step_to_reasoning": 1.6404699146295998, "adv/std_final_conf": 0.9236007928848267, "adv/std_reasoning": 0.7205851674079895, "adv/std_step_conf": 0.9357803463935852, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7105079365079365, "calib/avg_num_step_conf": 4.265625, "calib/ece": 0.2807015686274509, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.596078431372549, "calib/gap": 0.10843876190476176, "calib/mean_conf": 0.8623494117647059, "calib/mu_c": 0.9070006666666666, "calib/mu_w": 0.7985619047619048, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2774078431372548, "calib/std_conf": 0.16312133673941612, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5269798657718121, "calib/step_q_c_n": 596.0, "calib/step_q_gap": 0.059091156094392694, "calib/step_q_w": 0.4678887096774194, "calib/step_q_w_n": 496.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2474.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 429.98046875, "completions/mean_terminated_length": 429.98046875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.07786666666666667, "grad_norm": 0.047100406140089035, "kl": 0.06757354736328125, "learning_rate": 3.5277777777777784e-06, "loss": -0.0143, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03696557879447937, "mask/share_reasoning": 0.855215311050415, "mask/share_step_conf": 0.10781913995742798, "num_tokens": 17203889.0, "reward": 1.321640968322754, "reward_std": 0.21188318729400635, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7058013677597046, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8105372190475464, "step": 73 }, { "adv/mean_abs_final_conf": 0.7607376575469971, "adv/mean_abs_reasoning": 0.482050359249115, "adv/mean_abs_step_conf": 0.7662625908851624, "adv/ratio_final_to_reasoning": 1.5781290127695176, "adv/ratio_step_to_reasoning": 1.589590332592557, "adv/std_final_conf": 0.930891215801239, "adv/std_reasoning": 0.7394124269485474, "adv/std_step_conf": 0.9361252784729004, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6418861788617887, "calib/avg_num_step_conf": 3.8671875, "calib/ece": 0.3153104838709677, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.41935483870967744, "calib/gap": 0.07578308943089429, "calib/mean_conf": 0.7997459677419354, "calib/mu_c": 0.8379430894308944, "calib/mu_w": 0.7621600000000001, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3095443548387097, "calib/std_conf": 0.19638397415858946, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.48399766355140184, "calib/step_q_c_n": 428.0, "calib/step_q_gap": 0.05058823294641962, "calib/step_q_w": 0.4334094306049822, "calib/step_q_w_n": 562.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 415.4609375, "completions/mean_terminated_length": 420.3873596191406, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.07893333333333333, "grad_norm": 0.039140306413173676, "kl": 0.0781707763671875, "learning_rate": 3.5e-06, "loss": -0.1344, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.04051174968481064, "mask/share_reasoning": 0.8400492668151855, "mask/share_step_conf": 0.10772022604942322, "num_tokens": 17414175.0, "reward": 1.267834186553955, "reward_std": 0.26047077775001526, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6346431970596313, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8059812784194946, "step": 74 }, { "adv/mean_abs_final_conf": 0.7445878982543945, "adv/mean_abs_reasoning": 0.4395519495010376, "adv/mean_abs_step_conf": 0.7424792051315308, "adv/ratio_final_to_reasoning": 1.6939701873683461, "adv/ratio_step_to_reasoning": 1.68917281785319, "adv/std_final_conf": 0.9286428093910217, "adv/std_reasoning": 0.7013537883758545, "adv/std_step_conf": 0.9361192584037781, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.77823029796714, "calib/avg_num_step_conf": 3.9296875, "calib/ece": 0.17942352941176468, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6, "calib/gap": 0.19322034252297393, "calib/mean_conf": 0.840678431372549, "calib/mu_c": 0.9043274853801169, "calib/mu_w": 0.7111071428571429, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.174756862745098, "calib/std_conf": 0.19853079606392227, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5372239263803681, "calib/step_q_c_n": 652.0, "calib/step_q_gap": 0.14038211847076365, "calib/step_q_w": 0.39684180790960444, "calib/step_q_w_n": 354.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 386.0234375, "completions/mean_terminated_length": 387.53729248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.08, "grad_norm": 0.03852907568216324, "kl": 0.08367156982421875, "learning_rate": 3.4722222222222224e-06, "loss": -0.057, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04084809869527817, "mask/share_reasoning": 0.8442814350128174, "mask/share_step_conf": 0.11096422374248505, "num_tokens": 17617749.0, "reward": 1.4051578044891357, "reward_std": 0.22026976943016052, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7908572554588318, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8437135219573975, "step": 75 }, { "adv/mean_abs_final_conf": 0.7429937124252319, "adv/mean_abs_reasoning": 0.48358088731765747, "adv/mean_abs_step_conf": 0.7725791931152344, "adv/ratio_final_to_reasoning": 1.5364414349510258, "adv/ratio_step_to_reasoning": 1.5976214390949202, "adv/std_final_conf": 0.912419319152832, "adv/std_reasoning": 0.7574656009674072, "adv/std_step_conf": 0.9359506368637085, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7136080148619958, "calib/avg_num_step_conf": 4.09375, "calib/ece": 0.17936758893280635, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4031620553359684, "calib/gap": 0.14742104564755842, "calib/mean_conf": 0.7776284584980238, "calib/mu_c": 0.8335668789808918, "calib/mu_w": 0.6861458333333333, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16822134387351778, "calib/std_conf": 0.22366668696544437, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45767772511848337, "calib/step_q_c_n": 633.0, "calib/step_q_gap": 0.06006808656426649, "calib/step_q_w": 0.3976096385542169, "calib/step_q_w_n": 415.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1855.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 425.8515625, "completions/mean_terminated_length": 425.8515625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.08106666666666666, "grad_norm": 0.042207323014736176, "kl": 0.0843963623046875, "learning_rate": 3.444444444444445e-06, "loss": -0.0015, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.040179964154958725, "mask/share_reasoning": 0.8500825762748718, "mask/share_step_conf": 0.10973748564720154, "num_tokens": 17829823.0, "reward": 1.3595410585403442, "reward_std": 0.20747733116149902, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7503616809844971, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8242039680480957, "step": 76 }, { "adv/mean_abs_final_conf": 0.7458382844924927, "adv/mean_abs_reasoning": 0.41478484869003296, "adv/mean_abs_step_conf": 0.7500584125518799, "adv/ratio_final_to_reasoning": 1.7981329039572866, "adv/ratio_step_to_reasoning": 1.808307161943602, "adv/std_final_conf": 0.935461699962616, "adv/std_reasoning": 0.6613828539848328, "adv/std_step_conf": 0.9361003637313843, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5812613312613313, "calib/avg_num_step_conf": 4.06640625, "calib/ece": 0.192113725490196, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3803921568627451, "calib/gap": 0.06619327894327887, "calib/mean_conf": 0.7518078431372549, "calib/mu_c": 0.7775064102564102, "calib/mu_w": 0.7113131313131313, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16607843137254896, "calib/std_conf": 0.2309809226000101, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4324588414634146, "calib/step_q_c_n": 656.0, "calib/step_q_gap": -0.027330768926195792, "calib/step_q_w": 0.4597896103896104, "calib/step_q_w_n": 385.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 401.2734375, "completions/mean_terminated_length": 402.8470764160156, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.08213333333333334, "grad_norm": 0.0602923147380352, "kl": 0.096343994140625, "learning_rate": 3.416666666666667e-06, "loss": -0.0796, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04241577535867691, "mask/share_reasoning": 0.836897611618042, "mask/share_step_conf": 0.1167803704738617, "num_tokens": 18037213.0, "reward": 1.3238272666931152, "reward_std": 0.21843001246452332, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7147822380065918, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8062798976898193, "step": 77 }, { "adv/mean_abs_final_conf": 0.7273539900779724, "adv/mean_abs_reasoning": 0.5718014240264893, "adv/mean_abs_step_conf": 0.7482078075408936, "adv/ratio_final_to_reasoning": 1.2720394869885407, "adv/ratio_step_to_reasoning": 1.3085098709132144, "adv/std_final_conf": 0.9153119921684265, "adv/std_reasoning": 0.792800784111023, "adv/std_step_conf": 0.935762345790863, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7362155388471179, "calib/avg_num_step_conf": 4.5390625, "calib/ece": 0.20717559055118112, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3937007874015748, "calib/gap": 0.1877954887218044, "calib/mean_conf": 0.7583566929133858, "calib/mu_c": 0.8426428571428571, "calib/mu_w": 0.6548473684210527, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20717559055118112, "calib/std_conf": 0.2278602352011284, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42668166409861324, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.06647562121362294, "calib/step_q_w": 0.3602060428849903, "calib/step_q_w_n": 513.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 461.93359375, "completions/mean_terminated_length": 463.7451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.0832, "grad_norm": 0.05932401493191719, "kl": 0.0837554931640625, "learning_rate": 3.3888888888888893e-06, "loss": -0.0694, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03584552928805351, "mask/share_reasoning": 0.8557673096656799, "mask/share_step_conf": 0.10448087751865387, "num_tokens": 18263492.0, "reward": 1.3682516813278198, "reward_std": 0.18205977976322174, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7448265552520752, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8419320583343506, "step": 78 }, { "adv/mean_abs_final_conf": 0.7352473735809326, "adv/mean_abs_reasoning": 0.4633867144584656, "adv/mean_abs_step_conf": 0.7579057216644287, "adv/ratio_final_to_reasoning": 1.5866820317457213, "adv/ratio_step_to_reasoning": 1.6355793077713745, "adv/std_final_conf": 0.9174332022666931, "adv/std_reasoning": 0.7206430435180664, "adv/std_step_conf": 0.9358471632003784, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5985389610389611, "calib/avg_num_step_conf": 4.421875, "calib/ece": 0.2583897637795275, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.515748031496063, "calib/gap": 0.07205493506493521, "calib/mean_conf": 0.8216968503937009, "calib/mu_c": 0.8500649350649352, "calib/mu_w": 0.77801, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23689370078740152, "calib/std_conf": 0.21910718674068766, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43175787965616047, "calib/step_q_c_n": 698.0, "calib/step_q_gap": 0.03660234970224341, "calib/step_q_w": 0.39515552995391706, "calib/step_q_w_n": 434.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 435.79296875, "completions/mean_terminated_length": 437.5019836425781, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.08426666666666667, "grad_norm": 0.037078507244586945, "kl": 0.08056640625, "learning_rate": 3.3611111111111117e-06, "loss": -0.0305, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036293767392635345, "mask/share_reasoning": 0.8540657758712769, "mask/share_step_conf": 0.10573424398899078, "num_tokens": 18481431.0, "reward": 1.331152081489563, "reward_std": 0.23628367483615875, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6958156824111938, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.824259877204895, "step": 79 }, { "adv/mean_abs_final_conf": 0.6860350370407104, "adv/mean_abs_reasoning": 0.47583746910095215, "adv/mean_abs_step_conf": 0.7645847797393799, "adv/ratio_final_to_reasoning": 1.4417423628637438, "adv/ratio_step_to_reasoning": 1.6068191964453478, "adv/std_final_conf": 0.8816706538200378, "adv/std_reasoning": 0.7206332087516785, "adv/std_step_conf": 0.9358549118041992, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6925, "calib/avg_num_step_conf": 4.5703125, "calib/ece": 0.24196078431372542, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6196078431372549, "calib/gap": 0.14407236842105242, "calib/mean_conf": 0.8594509803921568, "calib/mu_c": 0.913125, "calib/mu_w": 0.7690526315789475, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23698039215686267, "calib/std_conf": 0.21613469962361845, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4521757345491389, "calib/step_q_c_n": 658.0, "calib/step_q_gap": 0.08987951059080557, "calib/step_q_w": 0.36229622395833333, "calib/step_q_w_n": 512.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2606.0, "completions/max_terminated_length": 2606.0, "completions/mean_length": 415.578125, "completions/mean_terminated_length": 415.578125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.08533333333333333, "grad_norm": 10.169572830200195, "kl": 62.33721923828125, "learning_rate": 3.3333333333333333e-06, "loss": 0.3504, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04010055959224701, "mask/share_reasoning": 0.8437327742576599, "mask/share_step_conf": 0.11616663634777069, "num_tokens": 18689979.0, "reward": 1.3634624481201172, "reward_std": 0.21763771772384644, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7301976680755615, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8362541794776917, "step": 80 }, { "adv/mean_abs_final_conf": 0.6958885788917542, "adv/mean_abs_reasoning": 0.4900428056716919, "adv/mean_abs_step_conf": 0.7568225860595703, "adv/ratio_final_to_reasoning": 1.4200567192041798, "adv/ratio_step_to_reasoning": 1.5444009733439688, "adv/std_final_conf": 0.8862650394439697, "adv/std_reasoning": 0.7392630577087402, "adv/std_step_conf": 0.9359573721885681, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6334706959706959, "calib/avg_num_step_conf": 4.328125, "calib/ece": 0.2452191601049869, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5669291338582677, "calib/gap": 0.11300292168149328, "calib/mean_conf": 0.8070564304461942, "calib/mu_c": 0.850655982905983, "calib/mu_w": 0.7376530612244897, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2190511811023622, "calib/std_conf": 0.25818957283509064, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44979445407279034, "calib/step_q_c_n": 577.0, "calib/step_q_gap": 0.10520500021215001, "calib/step_q_w": 0.3445894538606403, "calib/step_q_w_n": 531.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2342.0, "completions/max_terminated_length": 2342.0, "completions/mean_length": 438.79296875, "completions/mean_terminated_length": 440.5137634277344, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.0864, "grad_norm": 0.04086478427052498, "kl": 0.099609375, "learning_rate": 3.3055555555555558e-06, "loss": -0.0251, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04027840867638588, "mask/share_reasoning": 0.8512026071548462, "mask/share_step_conf": 0.10461273789405823, "num_tokens": 18908558.0, "reward": 1.3497854471206665, "reward_std": 0.2123834192752838, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7071569561958313, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8360507488250732, "step": 81 }, { "adv/mean_abs_final_conf": 0.7277892827987671, "adv/mean_abs_reasoning": 0.5591062903404236, "adv/mean_abs_step_conf": 0.7847660779953003, "adv/ratio_final_to_reasoning": 1.3017011172520296, "adv/ratio_step_to_reasoning": 1.4036080286585204, "adv/std_final_conf": 0.9133651852607727, "adv/std_reasoning": 0.7927923798561096, "adv/std_step_conf": 0.9358444809913635, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.668380376344086, "calib/avg_num_step_conf": 4.47265625, "calib/ece": 0.2806442687747036, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7351778656126482, "calib/gap": 0.1358327284946237, "calib/mean_conf": 0.8799881422924901, "calib/mu_c": 0.92991875, "calib/mu_w": 0.7940860215053763, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26411067193675897, "calib/std_conf": 0.22479581393380432, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4877527047913447, "calib/step_q_c_n": 647.0, "calib/step_q_gap": 0.13897298591584273, "calib/step_q_w": 0.34877971887550197, "calib/step_q_w_n": 498.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 377.25, "completions/mean_terminated_length": 380.220458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 99.0, "epoch": 0.08746666666666666, "grad_norm": 0.04228287935256958, "kl": 0.1171722412109375, "learning_rate": 3.277777777777778e-06, "loss": -0.0091, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04324019327759743, "mask/share_reasoning": 0.8272990584373474, "mask/share_step_conf": 0.12164826691150665, "num_tokens": 19110686.0, "reward": 1.3558624982833862, "reward_std": 0.23602716624736786, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7104343771934509, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8393171429634094, "step": 82 }, { "adv/mean_abs_final_conf": 0.6465597152709961, "adv/mean_abs_reasoning": 0.3706515431404114, "adv/mean_abs_step_conf": 0.7619996070861816, "adv/ratio_final_to_reasoning": 1.744386951131792, "adv/ratio_step_to_reasoning": 2.055838215672877, "adv/std_final_conf": 0.8661852478981018, "adv/std_reasoning": 0.640232264995575, "adv/std_step_conf": 0.9360112547874451, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5799475753604194, "calib/avg_num_step_conf": 3.99609375, "calib/ece": 0.3164062500000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.6484375, "calib/gap": 0.08320414404293819, "calib/mean_conf": 0.841171875, "calib/mu_c": 0.8765986394557824, "calib/mu_w": 0.7933944954128442, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29167968750000006, "calib/std_conf": 0.24750985174126783, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4914368453481207, "calib/step_q_c_n": 541.0, "calib/step_q_gap": 0.10662564202861863, "calib/step_q_w": 0.3848112033195021, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 425.54296875, "completions/mean_terminated_length": 427.2117919921875, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.08853333333333334, "grad_norm": 0.038990918546915054, "kl": 0.1039276123046875, "learning_rate": 3.2500000000000002e-06, "loss": -0.1183, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.040513087064027786, "mask/share_reasoning": 0.8545513153076172, "mask/share_step_conf": 0.10102932155132294, "num_tokens": 19326889.0, "reward": 1.333298683166504, "reward_std": 0.19381004571914673, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6636687517166138, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8440423607826233, "step": 83 }, { "adv/mean_abs_final_conf": 0.7489799857139587, "adv/mean_abs_reasoning": 0.6053099632263184, "adv/mean_abs_step_conf": 0.7614016532897949, "adv/ratio_final_to_reasoning": 1.237349508872901, "adv/ratio_step_to_reasoning": 1.2578706770850154, "adv/std_final_conf": 0.9016205072402954, "adv/std_reasoning": 0.8264942169189453, "adv/std_step_conf": 0.9361007213592529, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.713712686567164, "calib/avg_num_step_conf": 4.140625, "calib/ece": 0.33639107611548535, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6889763779527559, "calib/gap": 0.1905932835820896, "calib/mean_conf": 0.8579658792650918, "calib/mu_c": 0.9480099502487562, "calib/mu_w": 0.7574166666666666, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3333989501312334, "calib/std_conf": 0.24192491526825316, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5059726666666666, "calib/step_q_c_n": 500.0, "calib/step_q_gap": 0.11430445238095233, "calib/step_q_w": 0.3916682142857143, "calib/step_q_w_n": 560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 406.8671875, "completions/mean_terminated_length": 406.8671875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.0896, "grad_norm": 0.03670133650302887, "kl": 0.12044525146484375, "learning_rate": 3.2222222222222227e-06, "loss": 0.0093, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04312140494585037, "mask/share_reasoning": 0.8456448912620544, "mask/share_step_conf": 0.11123368889093399, "num_tokens": 19536967.0, "reward": 1.3182759284973145, "reward_std": 0.2619035840034485, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6727728843688965, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8303269147872925, "step": 84 }, { "adv/mean_abs_final_conf": 0.6797654628753662, "adv/mean_abs_reasoning": 0.46334075927734375, "adv/mean_abs_step_conf": 0.7665224671363831, "adv/ratio_final_to_reasoning": 1.4670961905781232, "adv/ratio_step_to_reasoning": 1.6543385225420295, "adv/std_final_conf": 0.8873353004455566, "adv/std_reasoning": 0.7391656041145325, "adv/std_step_conf": 0.9362779259681702, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6875468867216804, "calib/avg_num_step_conf": 3.89453125, "calib/ece": 0.3937022397891964, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7233201581027668, "calib/gap": 0.17096878386263237, "calib/mean_conf": 0.8838208168642951, "calib/mu_c": 0.9709946236559142, "calib/mu_w": 0.8000258397932818, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3937022397891964, "calib/std_conf": 0.21735560772120124, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5151022727272727, "calib/step_q_c_n": 440.0, "calib/step_q_gap": 0.10504302676676996, "calib/step_q_w": 0.41005924596050275, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2424.0, "completions/max_terminated_length": 2424.0, "completions/mean_length": 429.99609375, "completions/mean_terminated_length": 431.682373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.09066666666666667, "grad_norm": 0.04405707120895386, "kl": 0.113250732421875, "learning_rate": 3.1944444444444443e-06, "loss": -0.0879, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04101819917559624, "mask/share_reasoning": 0.850917637348175, "mask/share_step_conf": 0.10415787994861603, "num_tokens": 19754870.0, "reward": 1.2764089107513428, "reward_std": 0.2609490156173706, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6228024959564209, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8181326389312744, "step": 85 }, { "adv/mean_abs_final_conf": 0.7362145185470581, "adv/mean_abs_reasoning": 0.5997448563575745, "adv/mean_abs_step_conf": 0.7307212948799133, "adv/ratio_final_to_reasoning": 1.2275461985923544, "adv/ratio_step_to_reasoning": 1.2183869309322584, "adv/std_final_conf": 0.9218765497207642, "adv/std_reasoning": 0.8427642583847046, "adv/std_step_conf": 0.9363045692443848, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6512355333124804, "calib/avg_num_step_conf": 3.84375, "calib/ece": 0.3785074803149607, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6811023622047244, "calib/gap": 0.1717838411010323, "calib/mean_conf": 0.8218862204724409, "calib/mu_c": 0.9158939130434782, "calib/mu_w": 0.7441100719424459, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.37381889763779536, "calib/std_conf": 0.28613991430444674, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5133126168224299, "calib/step_q_c_n": 428.0, "calib/step_q_gap": 0.09592358804545142, "calib/step_q_w": 0.4173890287769785, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1843.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 413.6953125, "completions/mean_terminated_length": 415.3176574707031, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.09173333333333333, "grad_norm": 0.059901315718889236, "kl": 0.1295318603515625, "learning_rate": 3.1666666666666667e-06, "loss": -0.0866, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04301995784044266, "mask/share_reasoning": 0.8469830751419067, "mask/share_step_conf": 0.1060907393693924, "num_tokens": 19966288.0, "reward": 1.2757947444915771, "reward_std": 0.3126189708709717, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6065736413002014, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8291484117507935, "step": 86 }, { "adv/mean_abs_final_conf": 0.7256042957305908, "adv/mean_abs_reasoning": 0.48823797702789307, "adv/mean_abs_step_conf": 0.7334685921669006, "adv/ratio_final_to_reasoning": 1.486169306508365, "adv/ratio_step_to_reasoning": 1.5022768131062396, "adv/std_final_conf": 0.9046579003334045, "adv/std_reasoning": 0.7576124668121338, "adv/std_step_conf": 0.9361890554428101, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6175992192582953, "calib/avg_num_step_conf": 3.4921875, "calib/ece": 0.33095617529880467, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7529880478087649, "calib/gap": 0.10410865322055973, "calib/mean_conf": 0.8994820717131474, "calib/mu_c": 0.943448275862069, "calib/mu_w": 0.8393396226415093, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3263745019920318, "calib/std_conf": 0.19804455079826683, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5461728155339806, "calib/step_q_c_n": 515.0, "calib/step_q_gap": 0.0419955068268566, "calib/step_q_w": 0.504177308707124, "calib/step_q_w_n": 379.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2846.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 374.390625, "completions/mean_terminated_length": 377.3385925292969, "completions/min_length": 0.0, "completions/min_terminated_length": 87.0, "epoch": 0.0928, "grad_norm": 0.0383741669356823, "kl": 0.1442108154296875, "learning_rate": 3.138888888888889e-06, "loss": -0.0843, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04641494154930115, "mask/share_reasoning": 0.8423793315887451, "mask/share_step_conf": 0.10339324176311493, "num_tokens": 20167628.0, "reward": 1.2959322929382324, "reward_std": 0.28941088914871216, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6510910391807556, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8156992197036743, "step": 87 }, { "adv/mean_abs_final_conf": 0.6736510396003723, "adv/mean_abs_reasoning": 0.5277516841888428, "adv/mean_abs_step_conf": 0.7695959806442261, "adv/ratio_final_to_reasoning": 1.2764545519845714, "adv/ratio_step_to_reasoning": 1.458253954844501, "adv/std_final_conf": 0.8730496168136597, "adv/std_reasoning": 0.75759357213974, "adv/std_step_conf": 0.9361079335212708, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7334702907711756, "calib/avg_num_step_conf": 3.828125, "calib/ece": 0.2642371541501976, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.616600790513834, "calib/gap": 0.21584070796460175, "calib/mean_conf": 0.8175968379446641, "calib/mu_c": 0.9139999999999999, "calib/mu_w": 0.6981592920353982, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2642371541501976, "calib/std_conf": 0.2569745137196758, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49471593090211136, "calib/step_q_c_n": 521.0, "calib/step_q_gap": 0.06141222719840761, "calib/step_q_w": 0.43330370370370375, "calib/step_q_w_n": 459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1968.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 426.65625, "completions/mean_terminated_length": 428.3294372558594, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.09386666666666667, "grad_norm": 0.028476638719439507, "kl": 0.191864013671875, "learning_rate": 3.1111111111111116e-06, "loss": 0.0005, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.039456646889448166, "mask/share_reasoning": 0.8581996560096741, "mask/share_step_conf": 0.09843742847442627, "num_tokens": 20386700.0, "reward": 1.3378771543502808, "reward_std": 0.25673359632492065, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7152007222175598, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8271518349647522, "step": 88 }, { "adv/mean_abs_final_conf": 0.6247692704200745, "adv/mean_abs_reasoning": 0.36448341608047485, "adv/mean_abs_step_conf": 0.7542611956596375, "adv/ratio_final_to_reasoning": 1.7141226263148572, "adv/ratio_step_to_reasoning": 2.0693978446830155, "adv/std_final_conf": 0.8439804315567017, "adv/std_reasoning": 0.6612530946731567, "adv/std_step_conf": 0.9361677169799805, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7321204122674712, "calib/avg_num_step_conf": 3.484375, "calib/ece": 0.33624505928853765, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5454545454545454, "calib/gap": 0.21677369281045744, "calib/mean_conf": 0.786695652173913, "calib/mu_c": 0.9032222222222221, "calib/mu_w": 0.6864485294117647, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.33024505928853765, "calib/std_conf": 0.26675508422771166, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5419005376344086, "calib/step_q_c_n": 372.0, "calib/step_q_gap": 0.10425822994210088, "calib/step_q_w": 0.4376423076923077, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2029.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 409.45703125, "completions/mean_terminated_length": 409.45703125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.09493333333333333, "grad_norm": 0.06513398885726929, "kl": 0.1344451904296875, "learning_rate": 3.0833333333333336e-06, "loss": -0.0754, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04523952305316925, "mask/share_reasoning": 0.8543978929519653, "mask/share_step_conf": 0.10036254674196243, "num_tokens": 20600409.0, "reward": 1.3104181289672852, "reward_std": 0.24107292294502258, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.6709932088851929, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8307808637619019, "step": 89 }, { "adv/mean_abs_final_conf": 0.7081605195999146, "adv/mean_abs_reasoning": 0.5528960227966309, "adv/mean_abs_step_conf": 0.7596926689147949, "adv/ratio_final_to_reasoning": 1.2808204262673706, "adv/ratio_step_to_reasoning": 1.3740244776443782, "adv/std_final_conf": 0.9066979885101318, "adv/std_reasoning": 0.7927749752998352, "adv/std_step_conf": 0.9362542033195496, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6580491722263404, "calib/avg_num_step_conf": 3.84375, "calib/ece": 0.2879296875, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.58984375, "calib/gap": 0.1431813689152459, "calib/mean_conf": 0.8079296874999999, "calib/mu_c": 0.8716901408450705, "calib/mu_w": 0.7285087719298246, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2705859375, "calib/std_conf": 0.26329494759898514, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5204888268156426, "calib/step_q_c_n": 537.0, "calib/step_q_gap": 0.0747061646232488, "calib/step_q_w": 0.44578266219239376, "calib/step_q_w_n": 447.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 385.65234375, "completions/mean_terminated_length": 387.16473388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.096, "grad_norm": 0.030344147235155106, "kl": 0.154571533203125, "learning_rate": 3.055555555555556e-06, "loss": -0.0262, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04306305944919586, "mask/share_reasoning": 0.8437785506248474, "mask/share_step_conf": 0.10925211012363434, "num_tokens": 20802456.0, "reward": 1.3113610744476318, "reward_std": 0.24878989160060883, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6869570016860962, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8128043413162231, "step": 90 }, { "adv/mean_abs_final_conf": 0.7299604415893555, "adv/mean_abs_reasoning": 0.520104169845581, "adv/mean_abs_step_conf": 0.7571775913238525, "adv/ratio_final_to_reasoning": 1.403488923778636, "adv/ratio_step_to_reasoning": 1.4558191132146827, "adv/std_final_conf": 0.9214328527450562, "adv/std_reasoning": 0.7926962375640869, "adv/std_step_conf": 0.9360818266868591, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6639589704383282, "calib/avg_num_step_conf": 3.62109375, "calib/ece": 0.22901185770750992, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4505928853754941, "calib/gap": 0.15232415902140672, "calib/mean_conf": 0.7577075098814229, "calib/mu_c": 0.8233333333333334, "calib/mu_w": 0.6710091743119266, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20877470355731229, "calib/std_conf": 0.25543012128681636, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48703594080338264, "calib/step_q_c_n": 473.0, "calib/step_q_gap": 0.046277791023646986, "calib/step_q_w": 0.44075814977973565, "calib/step_q_w_n": 454.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 377.328125, "completions/mean_terminated_length": 378.807861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.09706666666666666, "grad_norm": 0.03573990985751152, "kl": 0.1767578125, "learning_rate": 3.0277777777777776e-06, "loss": -0.0513, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04190374165773392, "mask/share_reasoning": 0.8539397716522217, "mask/share_step_conf": 0.100250244140625, "num_tokens": 21006764.0, "reward": 1.3354003429412842, "reward_std": 0.236099511384964, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.720158576965332, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.820242702960968, "step": 91 }, { "adv/mean_abs_final_conf": 0.7487197518348694, "adv/mean_abs_reasoning": 0.6103894114494324, "adv/mean_abs_step_conf": 0.7408183813095093, "adv/ratio_final_to_reasoning": 1.226626376196398, "adv/ratio_step_to_reasoning": 1.2136815734571147, "adv/std_final_conf": 0.9200137853622437, "adv/std_reasoning": 0.8265608549118042, "adv/std_step_conf": 0.9363206028938293, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6584266169154229, "calib/avg_num_step_conf": 3.48046875, "calib/ece": 0.23157086614173222, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.39763779527559057, "calib/gap": 0.14667537313432855, "calib/mean_conf": 0.7591299212598425, "calib/mu_c": 0.8284253731343285, "calib/mu_w": 0.68175, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23157086614173222, "calib/std_conf": 0.23715097613687144, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5830169491525424, "calib/step_q_c_n": 413.0, "calib/step_q_gap": 0.09549602865045037, "calib/step_q_w": 0.48752092050209206, "calib/step_q_w_n": 478.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 364.46484375, "completions/mean_terminated_length": 365.8941345214844, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.09813333333333334, "grad_norm": 0.043616894632577896, "kl": 0.164886474609375, "learning_rate": 3e-06, "loss": -0.0171, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04630187898874283, "mask/share_reasoning": 0.8456723690032959, "mask/share_step_conf": 0.10411947965621948, "num_tokens": 21206787.0, "reward": 1.3230915069580078, "reward_std": 0.2727872133255005, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7084305286407471, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.817704439163208, "step": 92 }, { "adv/mean_abs_final_conf": 0.7456616163253784, "adv/mean_abs_reasoning": 0.6048349142074585, "adv/mean_abs_step_conf": 0.7437169551849365, "adv/ratio_final_to_reasoning": 1.2328349419154336, "adv/ratio_step_to_reasoning": 1.2296197486539964, "adv/std_final_conf": 0.9214170575141907, "adv/std_reasoning": 0.8430580496788025, "adv/std_step_conf": 0.9363595843315125, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6939876670092497, "calib/avg_num_step_conf": 3.70703125, "calib/ece": 0.2401366533864542, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.26294820717131473, "calib/gap": 0.17083301002055495, "calib/mean_conf": 0.6627318725099602, "calib/mu_c": 0.7573366071428571, "calib/mu_w": 0.5865035971223022, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.22832669322709165, "calib/std_conf": 0.2609951507313089, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5360590260285474, "calib/step_q_c_n": 397.0, "calib/step_q_gap": 0.08241591008651838, "calib/step_q_w": 0.453643115942029, "calib/step_q_w_n": 552.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 419.4453125, "completions/mean_terminated_length": 419.4453125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.0992, "grad_norm": 0.043325275182724, "kl": 0.145263671875, "learning_rate": 2.9722222222222225e-06, "loss": -0.0145, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04405222460627556, "mask/share_reasoning": 0.8505038619041443, "mask/share_step_conf": 0.10544390976428986, "num_tokens": 21419941.0, "reward": 1.288401484489441, "reward_std": 0.2714729905128479, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.7044404745101929, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7951655983924866, "step": 93 }, { "adv/mean_abs_final_conf": 0.7081672549247742, "adv/mean_abs_reasoning": 0.5124787092208862, "adv/mean_abs_step_conf": 0.7548666000366211, "adv/ratio_final_to_reasoning": 1.3818471717613212, "adv/ratio_step_to_reasoning": 1.4729716307321987, "adv/std_final_conf": 0.9206733703613281, "adv/std_reasoning": 0.7575570344924927, "adv/std_step_conf": 0.936098575592041, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7496691245982227, "calib/avg_num_step_conf": 3.39453125, "calib/ece": 0.15350000000000003, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2222222222222222, "calib/gap": 0.21518283229343915, "calib/mean_conf": 0.6507222222222223, "calib/mu_c": 0.755751937984496, "calib/mu_w": 0.5405691056910569, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14615873015873018, "calib/std_conf": 0.2485085076915109, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.540849765258216, "calib/step_q_c_n": 426.0, "calib/step_q_gap": 0.10967595035979621, "calib/step_q_w": 0.4311738148984198, "calib/step_q_w_n": 443.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 373.30859375, "completions/mean_terminated_length": 374.7725830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.10026666666666667, "grad_norm": 0.03856610134243965, "kl": 0.167999267578125, "learning_rate": 2.944444444444445e-06, "loss": -0.0001, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04606093466281891, "mask/share_reasoning": 0.8470206260681152, "mask/share_step_conf": 0.10301218926906586, "num_tokens": 21624188.0, "reward": 1.3532507419586182, "reward_std": 0.20355263352394104, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7645102739334106, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8213862180709839, "step": 94 }, { "adv/mean_abs_final_conf": 0.7450008392333984, "adv/mean_abs_reasoning": 0.4954552948474884, "adv/mean_abs_step_conf": 0.755539059638977, "adv/ratio_final_to_reasoning": 1.5036691442821808, "adv/ratio_step_to_reasoning": 1.524938914764344, "adv/std_final_conf": 0.9205859303474426, "adv/std_reasoning": 0.7393431067466736, "adv/std_step_conf": 0.9360067844390869, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7414733542319748, "calib/avg_num_step_conf": 3.37109375, "calib/ece": 0.1439254901960784, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.24313725490196078, "calib/gap": 0.20583134796238245, "calib/mean_conf": 0.6557686274509804, "calib/mu_c": 0.7445586206896552, "calib/mu_w": 0.5387272727272727, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1155333333333333, "calib/std_conf": 0.2674706166674557, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5160469083155651, "calib/step_q_c_n": 469.0, "calib/step_q_gap": 0.08721132455921993, "calib/step_q_w": 0.4288355837563452, "calib/step_q_w_n": 394.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 361.86328125, "completions/mean_terminated_length": 363.2823791503906, "completions/min_length": 0.0, "completions/min_terminated_length": 70.0, "epoch": 0.10133333333333333, "grad_norm": 0.07308512181043625, "kl": 0.1591339111328125, "learning_rate": 2.916666666666667e-06, "loss": -0.0879, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.048363469541072845, "mask/share_reasoning": 0.8412728309631348, "mask/share_step_conf": 0.10645744204521179, "num_tokens": 21822953.0, "reward": 1.375627875328064, "reward_std": 0.1974714696407318, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7704393863677979, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8345487117767334, "step": 95 }, { "adv/mean_abs_final_conf": 0.7327808141708374, "adv/mean_abs_reasoning": 0.4727819859981537, "adv/mean_abs_step_conf": 0.7735196352005005, "adv/ratio_final_to_reasoning": 1.5499338720018387, "adv/ratio_step_to_reasoning": 1.6361021741710804, "adv/std_final_conf": 0.933215856552124, "adv/std_reasoning": 0.7206127047538757, "adv/std_step_conf": 0.9363253116607666, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7275184599156118, "calib/avg_num_step_conf": 3.38671875, "calib/ece": 0.08228346456692916, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2283464566929134, "calib/gap": 0.2073114451476793, "calib/mean_conf": 0.6432283464566929, "calib/mu_c": 0.7215822784810126, "calib/mu_w": 0.5142708333333333, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.05173228346456696, "calib/std_conf": 0.26140860672311333, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5292725409836065, "calib/step_q_c_n": 488.0, "calib/step_q_gap": 0.10668177581210259, "calib/step_q_w": 0.42259076517150396, "calib/step_q_w_n": 379.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2468.0, "completions/max_terminated_length": 2468.0, "completions/mean_length": 369.91015625, "completions/mean_terminated_length": 369.91015625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.1024, "grad_norm": 0.058021366596221924, "kl": 0.1710662841796875, "learning_rate": 2.888888888888889e-06, "loss": 0.0321, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.047193411737680435, "mask/share_reasoning": 0.8507916927337646, "mask/share_step_conf": 0.10201486945152283, "num_tokens": 22023466.0, "reward": 1.3637802600860596, "reward_std": 0.21904903650283813, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7867922186851501, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8098371624946594, "step": 96 }, { "adv/mean_abs_final_conf": 0.7595298290252686, "adv/mean_abs_reasoning": 0.5360906720161438, "adv/mean_abs_step_conf": 0.7516697645187378, "adv/ratio_final_to_reasoning": 1.4167935923391632, "adv/ratio_step_to_reasoning": 1.4021317731417309, "adv/std_final_conf": 0.935832679271698, "adv/std_reasoning": 0.792716920375824, "adv/std_step_conf": 0.9358479976654053, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7143928035982009, "calib/avg_num_step_conf": 3.69140625, "calib/ece": 0.10116929133858268, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.14960629921259844, "calib/gap": 0.18254960019990008, "calib/mean_conf": 0.5960511811023622, "calib/mu_c": 0.6794202898550724, "calib/mu_w": 0.49687068965517234, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07695669291338585, "calib/std_conf": 0.24954816124762283, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4945020746887966, "calib/step_q_c_n": 482.0, "calib/step_q_gap": 0.07403490406244662, "calib/step_q_w": 0.42046717062635, "calib/step_q_w_n": 463.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1702.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 368.3046875, "completions/mean_terminated_length": 368.3046875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.10346666666666667, "grad_norm": 0.03864547610282898, "kl": 0.1785888671875, "learning_rate": 2.861111111111111e-06, "loss": 0.0053, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.044990718364715576, "mask/share_reasoning": 0.8436180353164673, "mask/share_step_conf": 0.11139123886823654, "num_tokens": 22222824.0, "reward": 1.3819646835327148, "reward_std": 0.19518443942070007, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7713358402252197, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8431717157363892, "step": 97 }, { "adv/mean_abs_final_conf": 0.7767525911331177, "adv/mean_abs_reasoning": 0.5913317203521729, "adv/mean_abs_step_conf": 0.7650718688964844, "adv/ratio_final_to_reasoning": 1.3135648983459163, "adv/ratio_step_to_reasoning": 1.293811650152708, "adv/std_final_conf": 0.9360088109970093, "adv/std_reasoning": 0.8265880346298218, "adv/std_step_conf": 0.9362597465515137, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6197610654132394, "calib/avg_num_step_conf": 3.4453125, "calib/ece": 0.1542570281124498, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.21285140562248997, "calib/gap": 0.1063337250293771, "calib/mean_conf": 0.6276706827309237, "calib/mu_c": 0.6750724637681159, "calib/mu_w": 0.5687387387387388, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11385542168674696, "calib/std_conf": 0.25834232912942434, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5326319444444445, "calib/step_q_c_n": 480.0, "calib/step_q_gap": 0.08416677031509129, "calib/step_q_w": 0.4484651741293532, "calib/step_q_w_n": 402.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 389.60546875, "completions/mean_terminated_length": 391.13336181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 83.0, "epoch": 0.10453333333333334, "grad_norm": 0.03718774393200874, "kl": 0.1533050537109375, "learning_rate": 2.8333333333333335e-06, "loss": -0.0184, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.04543950408697128, "mask/share_reasoning": 0.8515037298202515, "mask/share_step_conf": 0.09915057569742203, "num_tokens": 22428747.0, "reward": 1.3166905641555786, "reward_std": 0.25483328104019165, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7132925987243652, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8077004551887512, "step": 98 }, { "adv/mean_abs_final_conf": 0.7712830305099487, "adv/mean_abs_reasoning": 0.6145671010017395, "adv/mean_abs_step_conf": 0.76088547706604, "adv/ratio_final_to_reasoning": 1.2550021458238874, "adv/ratio_step_to_reasoning": 1.2380836459123874, "adv/std_final_conf": 0.9343674182891846, "adv/std_reasoning": 0.8267272710800171, "adv/std_step_conf": 0.9363002181053162, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7107643758765779, "calib/avg_num_step_conf": 3.86328125, "calib/ece": 0.18951821862348178, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.1902834008097166, "calib/gap": 0.21728204768583437, "calib/mean_conf": 0.5402793522267206, "calib/mu_c": 0.6766304347826085, "calib/mu_w": 0.45934838709677417, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.17866396761133604, "calib/std_conf": 0.28560527193146484, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4695117493472586, "calib/step_q_c_n": 383.0, "calib/step_q_gap": 0.08147379555187906, "calib/step_q_w": 0.38803795379537953, "calib/step_q_w_n": 606.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 477.06640625, "completions/mean_terminated_length": 478.9372863769531, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.1056, "grad_norm": 0.07817687094211578, "kl": 0.1505889892578125, "learning_rate": 2.805555555555556e-06, "loss": -0.0125, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.039287276566028595, "mask/share_reasoning": 0.8576635122299194, "mask/share_step_conf": 0.09914298355579376, "num_tokens": 22656676.0, "reward": 1.302371621131897, "reward_std": 0.27057546377182007, "rewards/accuracy_reward_step": 0.359375, "rewards/final_brier_reward_step": 0.7279237508773804, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8063784837722778, "step": 99 }, { "adv/mean_abs_final_conf": 0.7553995847702026, "adv/mean_abs_reasoning": 0.5235493183135986, "adv/mean_abs_step_conf": 0.7469733953475952, "adv/ratio_final_to_reasoning": 1.4428432209661077, "adv/ratio_step_to_reasoning": 1.4267488643737833, "adv/std_final_conf": 0.9213042855262756, "adv/std_reasoning": 0.7754214406013489, "adv/std_step_conf": 0.9361969232559204, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8015468901063486, "calib/avg_num_step_conf": 3.734375, "calib/ece": 0.2004761904761905, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2619047619047619, "calib/gap": 0.3163764099258783, "calib/mean_conf": 0.619920634920635, "calib/mu_c": 0.80196261682243, "calib/mu_w": 0.4855862068965517, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19789682539682543, "calib/std_conf": 0.3019526296069981, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5622406779661017, "calib/step_q_c_n": 413.0, "calib/step_q_gap": 0.15584448398206235, "calib/step_q_w": 0.4063961939840393, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2323.0, "completions/max_terminated_length": 2323.0, "completions/mean_length": 410.984375, "completions/mean_terminated_length": 412.5960998535156, "completions/min_length": 0.0, "completions/min_terminated_length": 91.0, "epoch": 0.10666666666666667, "grad_norm": 0.033727023750543594, "kl": 0.1587677001953125, "learning_rate": 2.7777777777777783e-06, "loss": -0.0441, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0431080237030983, "mask/share_reasoning": 0.8496963977813721, "mask/share_step_conf": 0.10328933596611023, "num_tokens": 22869296.0, "reward": 1.3796181678771973, "reward_std": 0.23965272307395935, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.7687491774559021, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.855009138584137, "step": 100 }, { "adv/mean_abs_final_conf": 0.7755560874938965, "adv/mean_abs_reasoning": 0.5660301446914673, "adv/mean_abs_step_conf": 0.7645591497421265, "adv/ratio_final_to_reasoning": 1.3701674632834935, "adv/ratio_step_to_reasoning": 1.350739279369783, "adv/std_final_conf": 0.9355933666229248, "adv/std_reasoning": 0.8098204135894775, "adv/std_step_conf": 0.936090886592865, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7312936481543065, "calib/avg_num_step_conf": 3.8515625, "calib/ece": 0.19984523809523808, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.1865079365079365, "calib/gap": 0.22335370801463256, "calib/mean_conf": 0.5784960317460318, "calib/mu_c": 0.7158762886597938, "calib/mu_w": 0.49252258064516125, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.19671031746031745, "calib/std_conf": 0.2734869269098656, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5004893617021277, "calib/step_q_c_n": 376.0, "calib/step_q_gap": 0.07177788629229159, "calib/step_q_w": 0.42871147540983606, "calib/step_q_w_n": 610.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 426.7890625, "completions/mean_terminated_length": 430.14959716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 99.0, "epoch": 0.10773333333333333, "grad_norm": 0.03631528094410896, "kl": 0.172576904296875, "learning_rate": 2.7500000000000004e-06, "loss": -0.1333, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03998691961169243, "mask/share_reasoning": 0.8527641296386719, "mask/share_step_conf": 0.0994364470243454, "num_tokens": 23085546.0, "reward": 1.312831163406372, "reward_std": 0.25012314319610596, "rewards/accuracy_reward_step": 0.37890625, "rewards/final_brier_reward_step": 0.7324117422103882, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.811859667301178, "step": 101 }, { "adv/mean_abs_final_conf": 0.7160638570785522, "adv/mean_abs_reasoning": 0.44066131114959717, "adv/mean_abs_step_conf": 0.7667325139045715, "adv/ratio_final_to_reasoning": 1.6249755514285675, "adv/ratio_step_to_reasoning": 1.7399587722015346, "adv/std_final_conf": 0.9027671813964844, "adv/std_reasoning": 0.7013367414474487, "adv/std_step_conf": 0.9360345602035522, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8072357981593145, "calib/avg_num_step_conf": 3.8984375, "calib/ece": 0.14190476190476184, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.31746031746031744, "calib/gap": 0.30249444620755317, "calib/mean_conf": 0.6661904761904762, "calib/mu_c": 0.8042335766423359, "calib/mu_w": 0.5017391304347827, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13222222222222219, "calib/std_conf": 0.2820428796141521, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5499742, "calib/step_q_c_n": 500.0, "calib/step_q_gap": 0.13110873815261043, "calib/step_q_w": 0.41886546184738954, "calib/step_q_w_n": 498.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2643.0, "completions/max_terminated_length": 2643.0, "completions/mean_length": 366.84375, "completions/mean_terminated_length": 368.2823791503906, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.1088, "grad_norm": 0.05497967079281807, "kl": 0.1703948974609375, "learning_rate": 2.7222222222222224e-06, "loss": -0.0168, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04940368980169296, "mask/share_reasoning": 0.826754093170166, "mask/share_step_conf": 0.11993592977523804, "num_tokens": 23286154.0, "reward": 1.3887107372283936, "reward_std": 0.19226235151290894, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7948195338249207, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8393477201461792, "step": 102 }, { "adv/mean_abs_final_conf": 0.7382954359054565, "adv/mean_abs_reasoning": 0.49512580037117004, "adv/mean_abs_step_conf": 0.7493655681610107, "adv/ratio_final_to_reasoning": 1.4911269728864762, "adv/ratio_step_to_reasoning": 1.5134851942662861, "adv/std_final_conf": 0.914150595664978, "adv/std_reasoning": 0.7393211126327515, "adv/std_step_conf": 0.9361206293106079, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7724887556221889, "calib/avg_num_step_conf": 3.5859375, "calib/ece": 0.13929133858267717, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2795275590551181, "calib/gap": 0.2634657671164419, "calib/mean_conf": 0.6550393700787401, "calib/mu_c": 0.7753623188405797, "calib/mu_w": 0.5118965517241378, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12551181102362202, "calib/std_conf": 0.28208412777558095, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5474346895074946, "calib/step_q_c_n": 467.0, "calib/step_q_gap": 0.11523069837667421, "calib/step_q_w": 0.4322039911308204, "calib/step_q_w_n": 451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 431.74609375, "completions/mean_terminated_length": 433.4392395019531, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.10986666666666667, "grad_norm": 0.04851434379816055, "kl": 0.2113494873046875, "learning_rate": 2.6944444444444444e-06, "loss": -0.0098, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04223305359482765, "mask/share_reasoning": 0.8557662963867188, "mask/share_step_conf": 0.09809436649084091, "num_tokens": 23501233.0, "reward": 1.3910441398620605, "reward_std": 0.189153790473938, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.780501127243042, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8480590581893921, "step": 103 }, { "adv/mean_abs_final_conf": 0.7384414076805115, "adv/mean_abs_reasoning": 0.4301934540271759, "adv/mean_abs_step_conf": 0.7679357528686523, "adv/ratio_final_to_reasoning": 1.716533342773419, "adv/ratio_step_to_reasoning": 1.785093998246056, "adv/std_final_conf": 0.9183678030967712, "adv/std_reasoning": 0.7013409733772278, "adv/std_step_conf": 0.9360405206680298, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7520451889365017, "calib/avg_num_step_conf": 3.78515625, "calib/ece": 0.17993280632411068, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.233201581027668, "calib/gap": 0.2726345279833789, "calib/mean_conf": 0.5656245059288538, "calib/mu_c": 0.728343137254902, "calib/mu_w": 0.45570860927152307, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17119762845849804, "calib/std_conf": 0.30125627166524066, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5265366972704715, "calib/step_q_c_n": 403.0, "calib/step_q_gap": 0.10257733331287433, "calib/step_q_w": 0.42395936395759715, "calib/step_q_w_n": 566.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2147.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 404.36328125, "completions/mean_terminated_length": 404.36328125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.11093333333333333, "grad_norm": 0.04931806027889252, "kl": 0.2790069580078125, "learning_rate": 2.666666666666667e-06, "loss": -0.0546, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.042280539870262146, "mask/share_reasoning": 0.8500114679336548, "mask/share_step_conf": 0.10770799219608307, "num_tokens": 23711430.0, "reward": 1.3486987352371216, "reward_std": 0.19806914031505585, "rewards/accuracy_reward_step": 0.3984375, "rewards/final_brier_reward_step": 0.76436847448349, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8278425931930542, "step": 104 }, { "adv/mean_abs_final_conf": 0.7237093448638916, "adv/mean_abs_reasoning": 0.6229598522186279, "adv/mean_abs_step_conf": 0.768699049949646, "adv/ratio_final_to_reasoning": 1.1617271037394326, "adv/ratio_step_to_reasoning": 1.2339463726466129, "adv/std_final_conf": 0.9051206707954407, "adv/std_reasoning": 0.8266770243644714, "adv/std_step_conf": 0.9361938238143921, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7808970317835566, "calib/avg_num_step_conf": 3.6796875, "calib/ece": 0.22570281124497996, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.36947791164658633, "calib/gap": 0.3384968479117414, "calib/mean_conf": 0.6267469879518073, "calib/mu_c": 0.8184259259259259, "calib/mu_w": 0.4799290780141845, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20935742971887553, "calib/std_conf": 0.3418157578142417, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5456369426751593, "calib/step_q_c_n": 314.0, "calib/step_q_gap": 0.16186624203821665, "calib/step_q_w": 0.38377070063694263, "calib/step_q_w_n": 628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2602.0, "completions/max_terminated_length": 2602.0, "completions/mean_length": 443.2578125, "completions/mean_terminated_length": 443.2578125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.112, "grad_norm": 0.036655791103839874, "kl": 0.152374267578125, "learning_rate": 2.6388888888888893e-06, "loss": 0.0456, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.043667759746313095, "mask/share_reasoning": 0.8556262254714966, "mask/share_step_conf": 0.1007060557603836, "num_tokens": 23930664.0, "reward": 1.3181248903274536, "reward_std": 0.29982638359069824, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.7417078018188477, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8082084059715271, "step": 105 }, { "adv/mean_abs_final_conf": 0.6950488090515137, "adv/mean_abs_reasoning": 0.45608022809028625, "adv/mean_abs_step_conf": 0.7821485996246338, "adv/ratio_final_to_reasoning": 1.523961720423278, "adv/ratio_step_to_reasoning": 1.7149364332228816, "adv/std_final_conf": 0.8908172249794006, "adv/std_reasoning": 0.7205820679664612, "adv/std_step_conf": 0.9362524747848511, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7576413732504864, "calib/avg_num_step_conf": 3.62109375, "calib/ece": 0.26952755905511816, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.43700787401574803, "calib/gap": 0.2638636791564676, "calib/mean_conf": 0.7144094488188976, "calib/mu_c": 0.8608849557522124, "calib/mu_w": 0.5970212765957448, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26952755905511816, "calib/std_conf": 0.2975057824973848, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5658398058252427, "calib/step_q_c_n": 412.0, "calib/step_q_gap": 0.10588252427184469, "calib/step_q_w": 0.45995728155339805, "calib/step_q_w_n": 515.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 374.99609375, "completions/mean_terminated_length": 376.4666748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.11306666666666666, "grad_norm": 0.06290726363658905, "kl": 0.1614990234375, "learning_rate": 2.6111111111111113e-06, "loss": -0.0864, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.042815275490283966, "mask/share_reasoning": 0.8521543741226196, "mask/share_step_conf": 0.10112406313419342, "num_tokens": 24131247.0, "reward": 1.3216792345046997, "reward_std": 0.24554114043712616, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.7165695428848267, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8200350403785706, "step": 106 }, { "adv/mean_abs_final_conf": 0.6705832481384277, "adv/mean_abs_reasoning": 0.55158531665802, "adv/mean_abs_step_conf": 0.746010422706604, "adv/ratio_final_to_reasoning": 1.2157380334222816, "adv/ratio_step_to_reasoning": 1.352484194515146, "adv/std_final_conf": 0.8601038455963135, "adv/std_reasoning": 0.7928495407104492, "adv/std_step_conf": 0.9361860752105713, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6844349680170576, "calib/avg_num_step_conf": 3.8671875, "calib/ece": 0.2385375494071146, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.45849802371541504, "calib/gap": 0.189776119402985, "calib/mean_conf": 0.7405138339920949, "calib/mu_c": 0.829776119402985, "calib/mu_w": 0.64, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22470355731225294, "calib/std_conf": 0.2901071984069233, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5264349442379181, "calib/step_q_c_n": 538.0, "calib/step_q_gap": 0.0806870506680733, "calib/step_q_w": 0.44574789356984484, "calib/step_q_w_n": 451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 412.48046875, "completions/mean_terminated_length": 412.48046875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.11413333333333334, "grad_norm": 0.04678349569439888, "kl": 0.163360595703125, "learning_rate": 2.5833333333333337e-06, "loss": 0.0671, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04249659553170204, "mask/share_reasoning": 0.8509604930877686, "mask/share_step_conf": 0.10654290020465851, "num_tokens": 24341458.0, "reward": 1.3123687505722046, "reward_std": 0.2601582407951355, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7044988870620728, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8093380928039551, "step": 107 }, { "adv/mean_abs_final_conf": 0.6462497711181641, "adv/mean_abs_reasoning": 0.5497677326202393, "adv/mean_abs_step_conf": 0.7494109869003296, "adv/ratio_final_to_reasoning": 1.1754960008985673, "adv/ratio_step_to_reasoning": 1.3631410911087374, "adv/std_final_conf": 0.8757672309875488, "adv/std_reasoning": 0.7927690744400024, "adv/std_step_conf": 0.9362542033195496, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6808732486151841, "calib/avg_num_step_conf": 4.328125, "calib/ece": 0.17977165354330718, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5787401574803149, "calib/gap": 0.21701987618116647, "calib/mean_conf": 0.7645748031496062, "calib/mu_c": 0.8491612903225807, "calib/mu_w": 0.6321414141414142, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1670551181102363, "calib/std_conf": 0.30523495583362753, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.513030959752322, "calib/step_q_c_n": 646.0, "calib/step_q_gap": 0.08474524546660778, "calib/step_q_w": 0.42828571428571427, "calib/step_q_w_n": 462.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 417.625, "completions/mean_terminated_length": 417.625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.1152, "grad_norm": 0.03446570038795471, "kl": 0.147552490234375, "learning_rate": 2.5555555555555557e-06, "loss": -0.027, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04210588335990906, "mask/share_reasoning": 0.8456481695175171, "mask/share_step_conf": 0.11224594712257385, "num_tokens": 24551602.0, "reward": 1.358659029006958, "reward_std": 0.24917784333229065, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7425518035888672, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8272268772125244, "step": 108 }, { "adv/mean_abs_final_conf": 0.6396289467811584, "adv/mean_abs_reasoning": 0.4717791676521301, "adv/mean_abs_step_conf": 0.7523552179336548, "adv/ratio_final_to_reasoning": 1.355780396078857, "adv/ratio_step_to_reasoning": 1.5947190328005527, "adv/std_final_conf": 0.8753486275672913, "adv/std_reasoning": 0.7574488520622253, "adv/std_step_conf": 0.9360347986221313, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8373565492679066, "calib/avg_num_step_conf": 4.4296875, "calib/ece": 0.18340080971659917, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4008097165991903, "calib/gap": 0.4124436090225562, "calib/mean_conf": 0.6397570850202429, "calib/mu_c": 0.8618421052631577, "calib/mu_w": 0.4493984962406015, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18080971659919026, "calib/std_conf": 0.34836800208824037, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.530151966873706, "calib/step_q_c_n": 483.0, "calib/step_q_gap": 0.18351003138983502, "calib/step_q_w": 0.346641935483871, "calib/step_q_w_n": 651.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 461.74609375, "completions/mean_terminated_length": 461.74609375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.11626666666666667, "grad_norm": 0.04146302118897438, "kl": 0.151702880859375, "learning_rate": 2.5277777777777778e-06, "loss": -0.0039, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03965022414922714, "mask/share_reasoning": 0.8478109836578369, "mask/share_step_conf": 0.11253875494003296, "num_tokens": 24774409.0, "reward": 1.3690853118896484, "reward_std": 0.26065385341644287, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7751156091690063, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8405119180679321, "step": 109 }, { "adv/mean_abs_final_conf": 0.7240575551986694, "adv/mean_abs_reasoning": 0.5572279691696167, "adv/mean_abs_step_conf": 0.7480405569076538, "adv/ratio_final_to_reasoning": 1.2993919818448145, "adv/ratio_step_to_reasoning": 1.3424318201801442, "adv/std_final_conf": 0.9065413475036621, "adv/std_reasoning": 0.7928216457366943, "adv/std_step_conf": 0.9361799955368042, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7071007811499552, "calib/avg_num_step_conf": 3.66796875, "calib/ece": 0.23066533864541833, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4262948207171315, "calib/gap": 0.23882622614931492, "calib/mean_conf": 0.6706095617529879, "calib/mu_c": 0.8009649122807018, "calib/mu_w": 0.5621386861313868, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22354581673306773, "calib/std_conf": 0.318844875651609, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5426075949367088, "calib/step_q_c_n": 395.0, "calib/step_q_gap": 0.13413296258376772, "calib/step_q_w": 0.4084746323529411, "calib/step_q_w_n": 544.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 385.34375, "completions/mean_terminated_length": 388.3779602050781, "completions/min_length": 0.0, "completions/min_terminated_length": 56.0, "epoch": 0.11733333333333333, "grad_norm": 0.06782060861587524, "kl": 0.172576904296875, "learning_rate": 2.5e-06, "loss": -0.0693, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04428033158183098, "mask/share_reasoning": 0.8394521474838257, "mask/share_step_conf": 0.10845498740673065, "num_tokens": 24977977.0, "reward": 1.3049430847167969, "reward_std": 0.24617999792099, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7079055309295654, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8084121942520142, "step": 110 }, { "adv/mean_abs_final_conf": 0.6536703109741211, "adv/mean_abs_reasoning": 0.5757176876068115, "adv/mean_abs_step_conf": 0.776113748550415, "adv/ratio_final_to_reasoning": 1.1354007789674643, "adv/ratio_step_to_reasoning": 1.3480804311860308, "adv/std_final_conf": 0.8580630421638489, "adv/std_reasoning": 0.7928107976913452, "adv/std_step_conf": 0.9360227584838867, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7613636363636365, "calib/avg_num_step_conf": 3.640625, "calib/ece": 0.23164705882352937, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5254901960784314, "calib/gap": 0.3368699186991869, "calib/mean_conf": 0.7008235294117646, "calib/mu_c": 0.8752032520325203, "calib/mu_w": 0.5383333333333333, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2250588235294117, "calib/std_conf": 0.3455335576269934, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.572342857142857, "calib/step_q_c_n": 385.0, "calib/step_q_gap": 0.19892055366936529, "calib/step_q_w": 0.37342230347349176, "calib/step_q_w_n": 547.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 406.5625, "completions/mean_terminated_length": 406.5625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.1184, "grad_norm": 0.04310350865125656, "kl": 0.1740875244140625, "learning_rate": 2.4722222222222226e-06, "loss": -0.0381, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04492507129907608, "mask/share_reasoning": 0.8509223461151123, "mask/share_step_conf": 0.10415257513523102, "num_tokens": 25189465.0, "reward": 1.3606820106506348, "reward_std": 0.22834959626197815, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7484785318374634, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8387864828109741, "step": 111 }, { "adv/mean_abs_final_conf": 0.6845964193344116, "adv/mean_abs_reasoning": 0.5019567012786865, "adv/mean_abs_step_conf": 0.7608423233032227, "adv/ratio_final_to_reasoning": 1.3638555229773164, "adv/ratio_step_to_reasoning": 1.5157528953494392, "adv/std_final_conf": 0.8892654180526733, "adv/std_reasoning": 0.757592499256134, "adv/std_step_conf": 0.935874879360199, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7720385674931128, "calib/avg_num_step_conf": 3.66015625, "calib/ece": 0.12848484848484854, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3359683794466403, "calib/gap": 0.37050045913682284, "calib/mean_conf": 0.533965744400527, "calib/mu_c": 0.7111616161616162, "calib/mu_w": 0.3406611570247934, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.07035573122529648, "calib/std_conf": 0.3735869229363475, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4601897233201581, "calib/step_q_c_n": 506.0, "calib/step_q_gap": 0.1314147813247985, "calib/step_q_w": 0.3287749419953596, "calib/step_q_w_n": 431.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 439.5234375, "completions/mean_terminated_length": 439.5234375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.11946666666666667, "grad_norm": 0.041784606873989105, "kl": 0.1550445556640625, "learning_rate": 2.4444444444444447e-06, "loss": -0.117, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03895264118909836, "mask/share_reasoning": 0.8648859262466431, "mask/share_step_conf": 0.09616147726774216, "num_tokens": 25409903.0, "reward": 1.3797184228897095, "reward_std": 0.21610228717327118, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7841790914535522, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8376288414001465, "step": 112 }, { "adv/mean_abs_final_conf": 0.7069253921508789, "adv/mean_abs_reasoning": 0.6055176258087158, "adv/mean_abs_step_conf": 0.7630271315574646, "adv/ratio_final_to_reasoning": 1.1674728563131835, "adv/ratio_step_to_reasoning": 1.2601237338688245, "adv/std_final_conf": 0.8900803923606873, "adv/std_reasoning": 0.8265026211738586, "adv/std_step_conf": 0.9358813166618347, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7211304510574585, "calib/avg_num_step_conf": 4.26953125, "calib/ece": 0.21665354330708658, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4645669291338583, "calib/gap": 0.2851325722128642, "calib/mean_conf": 0.6341338582677165, "calib/mu_c": 0.7654744525547446, "calib/mu_w": 0.48034188034188036, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1557086614173228, "calib/std_conf": 0.37022635898206085, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45904504504504506, "calib/step_q_c_n": 555.0, "calib/step_q_gap": 0.13076623463612314, "calib/step_q_w": 0.32827881040892193, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 374.02734375, "completions/mean_terminated_length": 375.494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.12053333333333334, "grad_norm": 0.0556560680270195, "kl": 0.211578369140625, "learning_rate": 2.4166666666666667e-06, "loss": -0.008, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.044888682663440704, "mask/share_reasoning": 0.8306836485862732, "mask/share_step_conf": 0.1205214336514473, "num_tokens": 25610854.0, "reward": 1.3818920850753784, "reward_std": 0.2296406775712967, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7413472533226013, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8584840297698975, "step": 113 }, { "adv/mean_abs_final_conf": 0.7044678926467896, "adv/mean_abs_reasoning": 0.5276539325714111, "adv/mean_abs_step_conf": 0.7533060312271118, "adv/ratio_final_to_reasoning": 1.3350945556564178, "adv/ratio_step_to_reasoning": 1.4276516950343425, "adv/std_final_conf": 0.9050940275192261, "adv/std_reasoning": 0.7926760315895081, "adv/std_step_conf": 0.9359117746353149, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7912844036697247, "calib/avg_num_step_conf": 3.875, "calib/ece": 0.13740784313725485, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.4823529411764706, "calib/gap": 0.4216498680407189, "calib/mean_conf": 0.6370941176470588, "calib/mu_c": 0.8173287671232876, "calib/mu_w": 0.3956788990825687, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10097647058823525, "calib/std_conf": 0.3869714877330616, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4834704830053667, "calib/step_q_c_n": 559.0, "calib/step_q_gap": 0.14740350840952376, "calib/step_q_w": 0.33606697459584295, "calib/step_q_w_n": 433.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1745.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 386.40234375, "completions/mean_terminated_length": 386.40234375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1216, "grad_norm": 0.04214748367667198, "kl": 0.2071685791015625, "learning_rate": 2.388888888888889e-06, "loss": 0.025, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04536930471658707, "mask/share_reasoning": 0.8384172916412354, "mask/share_step_conf": 0.1162133663892746, "num_tokens": 25814797.0, "reward": 1.4204580783843994, "reward_std": 0.22163823246955872, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.8045815229415894, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8615266680717468, "step": 114 }, { "adv/mean_abs_final_conf": 0.6950163841247559, "adv/mean_abs_reasoning": 0.5868062973022461, "adv/mean_abs_step_conf": 0.7511539459228516, "adv/ratio_final_to_reasoning": 1.1844051219627147, "adv/ratio_step_to_reasoning": 1.2800713785386577, "adv/std_final_conf": 0.8760892748832703, "adv/std_reasoning": 0.8098748326301575, "adv/std_step_conf": 0.9364647269248962, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6829407294832828, "calib/avg_num_step_conf": 3.95703125, "calib/ece": 0.2199565217391305, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.33992094861660077, "calib/gap": 0.24440400202634244, "calib/mean_conf": 0.5407549407114625, "calib/mu_c": 0.6769642857142857, "calib/mu_w": 0.43256028368794325, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15901185770750992, "calib/std_conf": 0.3805655080979492, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4071922246220302, "calib/step_q_c_n": 463.0, "calib/step_q_gap": 0.07122858825839379, "calib/step_q_w": 0.3359636363636364, "calib/step_q_w_n": 550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 417.3046875, "completions/mean_terminated_length": 417.3046875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.12266666666666666, "grad_norm": 0.041259538382291794, "kl": 0.1705780029296875, "learning_rate": 2.361111111111111e-06, "loss": -0.0094, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04276685416698456, "mask/share_reasoning": 0.8451552987098694, "mask/share_step_conf": 0.11207783222198486, "num_tokens": 26026891.0, "reward": 1.3043255805969238, "reward_std": 0.2694549262523651, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.7081127166748047, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8080816864967346, "step": 115 }, { "adv/mean_abs_final_conf": 0.7447359561920166, "adv/mean_abs_reasoning": 0.5281481742858887, "adv/mean_abs_step_conf": 0.7545117139816284, "adv/ratio_final_to_reasoning": 1.410089047830142, "adv/ratio_step_to_reasoning": 1.4285985462352622, "adv/std_final_conf": 0.9046410918235779, "adv/std_reasoning": 0.7576063871383667, "adv/std_step_conf": 0.9354843497276306, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.681582238899312, "calib/avg_num_step_conf": 4.23828125, "calib/ece": 0.22569169960474306, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3438735177865613, "calib/gap": 0.26624640400250144, "calib/mean_conf": 0.5020553359683795, "calib/mu_c": 0.638861788617886, "calib/mu_w": 0.37261538461538457, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12079051383399207, "calib/std_conf": 0.39732470580007523, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4007942477876107, "calib/step_q_c_n": 452.0, "calib/step_q_gap": 0.12556107243216047, "calib/step_q_w": 0.2752331753554502, "calib/step_q_w_n": 633.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2622.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 443.2265625, "completions/mean_terminated_length": 444.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 55.0, "epoch": 0.12373333333333333, "grad_norm": 0.06766902655363083, "kl": 0.162445068359375, "learning_rate": 2.3333333333333336e-06, "loss": -0.0187, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04233179986476898, "mask/share_reasoning": 0.8447202444076538, "mask/share_step_conf": 0.1090417355298996, "num_tokens": 26244877.0, "reward": 1.3489094972610474, "reward_std": 0.23857152462005615, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7165961265563965, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8437364101409912, "step": 116 }, { "adv/mean_abs_final_conf": 0.6993064880371094, "adv/mean_abs_reasoning": 0.5325876474380493, "adv/mean_abs_step_conf": 0.7361884117126465, "adv/ratio_final_to_reasoning": 1.3130355001679848, "adv/ratio_step_to_reasoning": 1.382285930313996, "adv/std_final_conf": 0.8909405469894409, "adv/std_reasoning": 0.7577109336853027, "adv/std_step_conf": 0.9362501502037048, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6957074175824175, "calib/avg_num_step_conf": 4.17578125, "calib/ece": 0.20776892430278887, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.32669322709163345, "calib/gap": 0.27219711538461544, "calib/mean_conf": 0.5018725099601594, "calib/mu_c": 0.6753846153846154, "calib/mu_w": 0.4031874999999999, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17354581673306774, "calib/std_conf": 0.3831293869864129, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.39309, "calib/step_q_c_n": 380.0, "calib/step_q_gap": 0.07801031930333818, "calib/step_q_w": 0.3150796806966618, "calib/step_q_w_n": 689.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 427.484375, "completions/mean_terminated_length": 429.1607971191406, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.1248, "grad_norm": 0.06574005633592606, "kl": 0.1655731201171875, "learning_rate": 2.305555555555556e-06, "loss": -0.0788, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03895164281129837, "mask/share_reasoning": 0.8477646708488464, "mask/share_step_conf": 0.10937744379043579, "num_tokens": 26460913.0, "reward": 1.3173385858535767, "reward_std": 0.2695554792881012, "rewards/accuracy_reward_step": 0.35546875, "rewards/final_brier_reward_step": 0.714278519153595, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8266055583953857, "step": 117 }, { "adv/mean_abs_final_conf": 0.7023355960845947, "adv/mean_abs_reasoning": 0.5506028532981873, "adv/mean_abs_step_conf": 0.7614545226097107, "adv/ratio_final_to_reasoning": 1.2755756565326666, "adv/ratio_step_to_reasoning": 1.382946924536429, "adv/std_final_conf": 0.9201270937919617, "adv/std_reasoning": 0.8097826242446899, "adv/std_step_conf": 0.9361098408699036, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.725543652575534, "calib/avg_num_step_conf": 4.921875, "calib/ece": 0.18125199999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.352, "calib/gap": 0.3181899416255052, "calib/mean_conf": 0.520908, "calib/mu_c": 0.6723664122137405, "calib/mu_w": 0.3541764705882353, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08908, "calib/std_conf": 0.38569740929386603, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3985378787878788, "calib/step_q_c_n": 528.0, "calib/step_q_gap": 0.1381376055638351, "calib/step_q_w": 0.2604002732240437, "calib/step_q_w_n": 732.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 419.71875, "completions/mean_terminated_length": 421.3647155761719, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.12586666666666665, "grad_norm": 0.04007981717586517, "kl": 0.1764984130859375, "learning_rate": 2.277777777777778e-06, "loss": 0.0011, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.041666384786367416, "mask/share_reasoning": 0.8308029174804688, "mask/share_step_conf": 0.12362450361251831, "num_tokens": 26672369.0, "reward": 1.3235828876495361, "reward_std": 0.2537011206150055, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.742707371711731, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.803400993347168, "step": 118 }, { "adv/mean_abs_final_conf": 0.7023715972900391, "adv/mean_abs_reasoning": 0.6352405548095703, "adv/mean_abs_step_conf": 0.7528685331344604, "adv/ratio_final_to_reasoning": 1.1056781434563683, "adv/ratio_step_to_reasoning": 1.1851707631609762, "adv/std_final_conf": 0.8904647827148438, "adv/std_reasoning": 0.8429591655731201, "adv/std_step_conf": 0.9359723329544067, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.762542330364982, "calib/avg_num_step_conf": 4.2578125, "calib/ece": 0.15938735177865615, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.28063241106719367, "calib/gap": 0.3792264517747398, "calib/mean_conf": 0.419901185770751, "calib/mu_c": 0.6207563025210084, "calib/mu_w": 0.24152985074626868, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05446640316205535, "calib/std_conf": 0.3928200592273774, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.372380658436214, "calib/step_q_c_n": 486.0, "calib/step_q_gap": 0.13757193327065104, "calib/step_q_w": 0.23480872516556295, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2203.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 449.1640625, "completions/mean_terminated_length": 450.9255065917969, "completions/min_length": 0.0, "completions/min_terminated_length": 67.0, "epoch": 0.12693333333333334, "grad_norm": 0.050807878375053406, "kl": 0.172210693359375, "learning_rate": 2.25e-06, "loss": -0.0699, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04051949828863144, "mask/share_reasoning": 0.8514037132263184, "mask/share_step_conf": 0.10417056083679199, "num_tokens": 26892419.0, "reward": 1.3865572214126587, "reward_std": 0.23146747052669525, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.773796796798706, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8543463945388794, "step": 119 }, { "adv/mean_abs_final_conf": 0.7236967086791992, "adv/mean_abs_reasoning": 0.5814656019210815, "adv/mean_abs_step_conf": 0.7500573396682739, "adv/ratio_final_to_reasoning": 1.2446079463483408, "adv/ratio_step_to_reasoning": 1.289942753604321, "adv/std_final_conf": 0.9180964231491089, "adv/std_reasoning": 0.8266046643257141, "adv/std_step_conf": 0.936331570148468, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7371337827326938, "calib/avg_num_step_conf": 4.2109375, "calib/ece": 0.2342570281124498, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.24497991967871485, "calib/gap": 0.31392792325641694, "calib/mean_conf": 0.38871485943775097, "calib/mu_c": 0.5349624060150376, "calib/mu_w": 0.22103448275862067, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.04441767068273092, "calib/std_conf": 0.3788589333586088, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3636018691588785, "calib/step_q_c_n": 535.0, "calib/step_q_gap": 0.1408436739470921, "calib/step_q_w": 0.2227581952117864, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 413.84765625, "completions/mean_terminated_length": 417.1062927246094, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.128, "grad_norm": 0.09071393311023712, "kl": 0.1795806884765625, "learning_rate": 2.222222222222222e-06, "loss": -0.0262, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.040145181119441986, "mask/share_reasoning": 0.8437585234642029, "mask/share_step_conf": 0.10828377306461334, "num_tokens": 27105052.0, "reward": 1.322953224182129, "reward_std": 0.26806381344795227, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7185671925544739, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8148413896560669, "step": 120 }, { "adv/mean_abs_final_conf": 0.7919062376022339, "adv/mean_abs_reasoning": 0.5846736431121826, "adv/mean_abs_step_conf": 0.7712915539741516, "adv/ratio_final_to_reasoning": 1.3544414853164317, "adv/ratio_step_to_reasoning": 1.3191830400779023, "adv/std_final_conf": 0.934366762638092, "adv/std_reasoning": 0.7929185628890991, "adv/std_step_conf": 0.9359788298606873, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6865609880315762, "calib/avg_num_step_conf": 4.87890625, "calib/ece": 0.25900398406374503, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.26294820717131473, "calib/gap": 0.22867583396995161, "calib/mean_conf": 0.4162948207171315, "calib/mu_c": 0.5365546218487395, "calib/mu_w": 0.30787878787878786, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.100597609561753, "calib/std_conf": 0.3855396909784887, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34495750487329435, "calib/step_q_c_n": 513.0, "calib/step_q_gap": 0.09719921682981605, "calib/step_q_w": 0.2477582880434783, "calib/step_q_w_n": 736.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2834.0, "completions/max_terminated_length": 2834.0, "completions/mean_length": 469.578125, "completions/mean_terminated_length": 473.27557373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.12906666666666666, "grad_norm": 0.044260066002607346, "kl": 0.1622772216796875, "learning_rate": 2.1944444444444445e-06, "loss": -0.0574, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.037188708782196045, "mask/share_reasoning": 0.8464273810386658, "mask/share_step_conf": 0.10857141017913818, "num_tokens": 27330320.0, "reward": 1.314887523651123, "reward_std": 0.2514931559562683, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6987988352775574, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.820956826210022, "step": 121 }, { "adv/mean_abs_final_conf": 0.6651676297187805, "adv/mean_abs_reasoning": 0.4964384436607361, "adv/mean_abs_step_conf": 0.7753804922103882, "adv/ratio_final_to_reasoning": 1.3398793711740689, "adv/ratio_step_to_reasoning": 1.561886477793166, "adv/std_final_conf": 0.868554413318634, "adv/std_reasoning": 0.7753000855445862, "adv/std_step_conf": 0.9360112547874451, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.810948191593353, "calib/avg_num_step_conf": 4.41015625, "calib/ece": 0.15142913385826773, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.4917824046920821, "calib/mean_conf": 0.5998307086614173, "calib/mu_c": 0.7915096774193549, "calib/mu_w": 0.29972727272727273, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07051181102362206, "calib/std_conf": 0.42493590864806063, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41932664756446997, "calib/step_q_c_n": 698.0, "calib/step_q_gap": 0.17259439698442355, "calib/step_q_w": 0.2467322505800464, "calib/step_q_w_n": 431.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1969.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 394.44921875, "completions/mean_terminated_length": 395.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 105.0, "epoch": 0.13013333333333332, "grad_norm": 0.04509063810110092, "kl": 0.1987762451171875, "learning_rate": 2.166666666666667e-06, "loss": -0.0169, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04127652570605278, "mask/share_reasoning": 0.8373570442199707, "mask/share_step_conf": 0.11746013164520264, "num_tokens": 27538643.0, "reward": 1.4087672233581543, "reward_std": 0.2185821235179901, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.8090417385101318, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8444807529449463, "step": 122 }, { "adv/mean_abs_final_conf": 0.7057344913482666, "adv/mean_abs_reasoning": 0.5242763757705688, "adv/mean_abs_step_conf": 0.7717443108558655, "adv/ratio_final_to_reasoning": 1.346111562457864, "adv/ratio_step_to_reasoning": 1.4720180929792501, "adv/std_final_conf": 0.8740495443344116, "adv/std_reasoning": 0.7575638890266418, "adv/std_step_conf": 0.9360249042510986, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.682103952692188, "calib/avg_num_step_conf": 4.42578125, "calib/ece": 0.23219763779527566, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2992125984251969, "calib/gap": 0.2641871023965142, "calib/mean_conf": 0.42252677165354335, "calib/mu_c": 0.5629411764705883, "calib/mu_w": 0.29875407407407406, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09311023622047249, "calib/std_conf": 0.39652975758870523, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3490301075268817, "calib/step_q_c_n": 465.0, "calib/step_q_gap": 0.12006304165861822, "calib/step_q_w": 0.22896706586826346, "calib/step_q_w_n": 668.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2044.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 460.00390625, "completions/mean_terminated_length": 461.807861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.1312, "grad_norm": 0.04096907004714012, "kl": 0.1746368408203125, "learning_rate": 2.138888888888889e-06, "loss": 0.012, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03718689829111099, "mask/share_reasoning": 0.8577108979225159, "mask/share_step_conf": 0.10119592398405075, "num_tokens": 27761692.0, "reward": 1.3273255825042725, "reward_std": 0.21498894691467285, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7175615429878235, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8228417634963989, "step": 123 }, { "adv/mean_abs_final_conf": 0.698214054107666, "adv/mean_abs_reasoning": 0.4900742769241333, "adv/mean_abs_step_conf": 0.7525066137313843, "adv/ratio_final_to_reasoning": 1.4247106754712493, "adv/ratio_step_to_reasoning": 1.5354950242529812, "adv/std_final_conf": 0.9064407348632812, "adv/std_reasoning": 0.739323079586029, "adv/std_step_conf": 0.9363665580749512, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7348370927318297, "calib/avg_num_step_conf": 4.68359375, "calib/ece": 0.21602362204724407, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4015748031496063, "calib/gap": 0.3618107769423559, "calib/mean_conf": 0.5108267716535433, "calib/mu_c": 0.6732142857142858, "calib/mu_w": 0.31140350877192985, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08783464566929136, "calib/std_conf": 0.42134919392385983, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3774491803278689, "calib/step_q_c_n": 610.0, "calib/step_q_gap": 0.11135206657574664, "calib/step_q_w": 0.26609711375212225, "calib/step_q_w_n": 589.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2324.0, "completions/max_terminated_length": 2324.0, "completions/mean_length": 432.83203125, "completions/mean_terminated_length": 432.83203125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.13226666666666667, "grad_norm": 0.12796330451965332, "kl": 0.193145751953125, "learning_rate": 2.1111111111111114e-06, "loss": 0.0172, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03850656747817993, "mask/share_reasoning": 0.8427542448043823, "mask/share_step_conf": 0.11873914301395416, "num_tokens": 27979313.0, "reward": 1.3393280506134033, "reward_std": 0.24536257982254028, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7465871572494507, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8121283054351807, "step": 124 }, { "adv/mean_abs_final_conf": 0.6791626214981079, "adv/mean_abs_reasoning": 0.5428112149238586, "adv/mean_abs_step_conf": 0.7677067518234253, "adv/ratio_final_to_reasoning": 1.251194895804383, "adv/ratio_step_to_reasoning": 1.4143163050363896, "adv/std_final_conf": 0.8601999878883362, "adv/std_reasoning": 0.7754709720611572, "adv/std_step_conf": 0.9360963106155396, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.669949494949495, "calib/avg_num_step_conf": 4.05859375, "calib/ece": 0.2620238095238096, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.43253968253968256, "calib/gap": 0.2544166666666667, "calib/mean_conf": 0.5611507936507937, "calib/mu_c": 0.6944166666666667, "calib/mu_w": 0.44, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17349206349206356, "calib/std_conf": 0.41145196960644254, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.39113207547169815, "calib/step_q_c_n": 477.0, "calib/step_q_gap": 0.0631391929094206, "calib/step_q_w": 0.32799288256227754, "calib/step_q_w_n": 562.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 462.08984375, "completions/mean_terminated_length": 462.08984375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.13333333333333333, "grad_norm": 0.04287952929735184, "kl": 0.1557159423828125, "learning_rate": 2.0833333333333334e-06, "loss": 0.0341, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.040689751505851746, "mask/share_reasoning": 0.8555436730384827, "mask/share_step_conf": 0.1037665456533432, "num_tokens": 28202416.0, "reward": 1.2918962240219116, "reward_std": 0.27244269847869873, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6861358880996704, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8039063215255737, "step": 125 }, { "adv/mean_abs_final_conf": 0.6152515411376953, "adv/mean_abs_reasoning": 0.4486311078071594, "adv/mean_abs_step_conf": 0.7678678035736084, "adv/ratio_final_to_reasoning": 1.3713974141136829, "adv/ratio_step_to_reasoning": 1.7115794919501444, "adv/std_final_conf": 0.8269786834716797, "adv/std_reasoning": 0.7206025123596191, "adv/std_step_conf": 0.9352603554725647, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7698147195059187, "calib/avg_num_step_conf": 4.8359375, "calib/ece": 0.195924, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.528, "calib/gap": 0.4379417138445702, "calib/mean_conf": 0.6099959999999999, "calib/mu_c": 0.8447327586206896, "calib/mu_w": 0.40679104477611944, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17096, "calib/std_conf": 0.4277949087869093, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47163960396039606, "calib/step_q_c_n": 505.0, "calib/step_q_gap": 0.21580058622506182, "calib/step_q_w": 0.25583901773533424, "calib/step_q_w_n": 733.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 471.8671875, "completions/mean_terminated_length": 473.7176818847656, "completions/min_length": 0.0, "completions/min_terminated_length": 68.0, "epoch": 0.1344, "grad_norm": 0.0364367812871933, "kl": 0.1450653076171875, "learning_rate": 2.0555555555555555e-06, "loss": 0.0102, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0397508405148983, "mask/share_reasoning": 0.846636176109314, "mask/share_step_conf": 0.10970672965049744, "num_tokens": 28428678.0, "reward": 1.342126488685608, "reward_std": 0.258208304643631, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.74688321352005, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8261067271232605, "step": 126 }, { "adv/mean_abs_final_conf": 0.6154482364654541, "adv/mean_abs_reasoning": 0.41630616784095764, "adv/mean_abs_step_conf": 0.7662066221237183, "adv/ratio_final_to_reasoning": 1.478354835954713, "adv/ratio_step_to_reasoning": 1.840488278368323, "adv/std_final_conf": 0.825966477394104, "adv/std_reasoning": 0.7014350295066833, "adv/std_step_conf": 0.9359336495399475, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7410403103833761, "calib/avg_num_step_conf": 4.83203125, "calib/ece": 0.2454435483870968, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5645161290322581, "calib/gap": 0.3928749917801011, "calib/mean_conf": 0.6350403225806452, "calib/mu_c": 0.8520720720720719, "calib/mu_w": 0.4591970802919708, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21645161290322584, "calib/std_conf": 0.43111726380218535, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4743393574297189, "calib/step_q_c_n": 498.0, "calib/step_q_gap": 0.1798232463336431, "calib/step_q_w": 0.2945161110960758, "calib/step_q_w_n": 739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 450.75, "completions/mean_terminated_length": 454.2992248535156, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.13546666666666668, "grad_norm": 0.06047916039824486, "kl": 0.1569366455078125, "learning_rate": 2.027777777777778e-06, "loss": -0.0308, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.040085144340991974, "mask/share_reasoning": 0.8332340717315674, "mask/share_step_conf": 0.11886821687221527, "num_tokens": 28647742.0, "reward": 1.3079798221588135, "reward_std": 0.2906861901283264, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.6997348070144653, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8182685375213623, "step": 127 }, { "adv/mean_abs_final_conf": 0.7099267244338989, "adv/mean_abs_reasoning": 0.5674612522125244, "adv/mean_abs_step_conf": 0.738547682762146, "adv/ratio_final_to_reasoning": 1.251057621407459, "adv/ratio_step_to_reasoning": 1.301494471882543, "adv/std_final_conf": 0.8756265640258789, "adv/std_reasoning": 0.7929064035415649, "adv/std_step_conf": 0.9359824061393738, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7090269712220932, "calib/avg_num_step_conf": 4.0859375, "calib/ece": 0.2501204819277109, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5742971887550201, "calib/gap": 0.35529229578010063, "calib/mean_conf": 0.6424899598393575, "calib/mu_c": 0.8222764227642276, "calib/mu_w": 0.466984126984127, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1993172690763053, "calib/std_conf": 0.42755330922413926, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45157582417582415, "calib/step_q_c_n": 455.0, "calib/step_q_gap": 0.15030002045331992, "calib/step_q_w": 0.30127580372250423, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 444.80078125, "completions/mean_terminated_length": 451.86114501953125, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.13653333333333334, "grad_norm": 0.047435879707336426, "kl": 0.160675048828125, "learning_rate": 2.0000000000000003e-06, "loss": -0.0491, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03938092291355133, "mask/share_reasoning": 0.8430782556533813, "mask/share_step_conf": 0.10191580653190613, "num_tokens": 28868275.0, "reward": 1.3003993034362793, "reward_std": 0.2986387610435486, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7030344009399414, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8035696744918823, "step": 128 }, { "adv/mean_abs_final_conf": 0.6892992258071899, "adv/mean_abs_reasoning": 0.6112114191055298, "adv/mean_abs_step_conf": 0.7431253790855408, "adv/ratio_final_to_reasoning": 1.1277590769098143, "adv/ratio_step_to_reasoning": 1.2158237818479551, "adv/std_final_conf": 0.8707001209259033, "adv/std_reasoning": 0.8428522348403931, "adv/std_step_conf": 0.9362640976905823, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6386879837584063, "calib/avg_num_step_conf": 4.4375, "calib/ece": 0.2934387351778656, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6758893280632411, "calib/gap": 0.22017954574292598, "calib/mean_conf": 0.7472727272727273, "calib/mu_c": 0.8438732394366197, "calib/mu_w": 0.6236936936936938, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23972332015810277, "calib/std_conf": 0.3805302010619262, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4970400843881857, "calib/step_q_c_n": 632.0, "calib/step_q_gap": 0.09559167168977295, "calib/step_q_w": 0.40144841269841275, "calib/step_q_w_n": 504.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 407.484375, "completions/mean_terminated_length": 409.0823669433594, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.1376, "grad_norm": 0.06957102566957474, "kl": 0.1775360107421875, "learning_rate": 1.9722222222222224e-06, "loss": 0.004, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.040567055344581604, "mask/share_reasoning": 0.8331802487373352, "mask/share_step_conf": 0.12234644591808319, "num_tokens": 29074975.0, "reward": 1.3078129291534424, "reward_std": 0.2828935980796814, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6708804368972778, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8184664249420166, "step": 129 }, { "adv/mean_abs_final_conf": 0.576531708240509, "adv/mean_abs_reasoning": 0.4125916063785553, "adv/mean_abs_step_conf": 0.7666875123977661, "adv/ratio_final_to_reasoning": 1.3973423097500866, "adv/ratio_step_to_reasoning": 1.8582237266705945, "adv/std_final_conf": 0.7933287620544434, "adv/std_reasoning": 0.6816454529762268, "adv/std_step_conf": 0.9361461400985718, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7252351097178683, "calib/avg_num_step_conf": 4.16015625, "calib/ece": 0.23817647058823524, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6745098039215687, "calib/gap": 0.3667382445141065, "calib/mean_conf": 0.7327647058823529, "calib/mu_c": 0.8909655172413793, "calib/mu_w": 0.5242272727272728, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20115686274509798, "calib/std_conf": 0.39607969034938717, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5254081128747796, "calib/step_q_c_n": 567.0, "calib/step_q_gap": 0.14184272052065489, "calib/step_q_w": 0.38356539235412473, "calib/step_q_w_n": 497.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 386.44140625, "completions/mean_terminated_length": 386.44140625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.13866666666666666, "grad_norm": 0.049915507435798645, "kl": 0.1814117431640625, "learning_rate": 1.944444444444445e-06, "loss": 0.0389, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.043779339641332626, "mask/share_reasoning": 0.8376109600067139, "mask/share_step_conf": 0.1186097040772438, "num_tokens": 29279192.0, "reward": 1.3503212928771973, "reward_std": 0.24141067266464233, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7408499121665955, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8244274854660034, "step": 130 }, { "adv/mean_abs_final_conf": 0.6712453365325928, "adv/mean_abs_reasoning": 0.35178640484809875, "adv/mean_abs_step_conf": 0.7708390951156616, "adv/ratio_final_to_reasoning": 1.9081048252061823, "adv/ratio_step_to_reasoning": 2.1912134309127427, "adv/std_final_conf": 0.8527726531028748, "adv/std_reasoning": 0.6401533484458923, "adv/std_step_conf": 0.935929000377655, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7221153846153847, "calib/avg_num_step_conf": 4.1484375, "calib/ece": 0.25347656249999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.51953125, "calib/gap": 0.393525641025641, "calib/mean_conf": 0.5926953125, "calib/mu_c": 0.8325, "calib/mu_w": 0.438974358974359, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22777343749999995, "calib/std_conf": 0.435085481159194, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5067487046632124, "calib/step_q_c_n": 386.0, "calib/step_q_gap": 0.15886926679338992, "calib/step_q_w": 0.3478794378698225, "calib/step_q_w_n": 676.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 390.640625, "completions/mean_terminated_length": 392.1725769042969, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.13973333333333332, "grad_norm": 0.056483618915081024, "kl": 0.177459716796875, "learning_rate": 1.916666666666667e-06, "loss": -0.0283, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.041544727981090546, "mask/share_reasoning": 0.8425483703613281, "mask/share_step_conf": 0.11200062930583954, "num_tokens": 29485404.0, "reward": 1.3231122493743896, "reward_std": 0.23826926946640015, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.7114789485931396, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.829091489315033, "step": 131 }, { "adv/mean_abs_final_conf": 0.6433945894241333, "adv/mean_abs_reasoning": 0.5586982369422913, "adv/mean_abs_step_conf": 0.7547528743743896, "adv/ratio_final_to_reasoning": 1.1515958828604473, "adv/ratio_step_to_reasoning": 1.3509132917710445, "adv/std_final_conf": 0.8603805899620056, "adv/std_reasoning": 0.8097853064537048, "adv/std_step_conf": 0.9360271096229553, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7476470588235294, "calib/avg_num_step_conf": 4.27734375, "calib/ece": 0.20990777338603434, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6324110671936759, "calib/gap": 0.3873812636165578, "calib/mean_conf": 0.7022661396574441, "calib/mu_c": 0.8553812636165578, "calib/mu_w": 0.46799999999999997, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15371541501976294, "calib/std_conf": 0.40523718322501656, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5632342657342657, "calib/step_q_c_n": 572.0, "calib/step_q_gap": 0.2445589311262351, "calib/step_q_w": 0.3186753346080306, "calib/step_q_w_n": 523.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2175.0, "completions/max_terminated_length": 2175.0, "completions/mean_length": 409.47265625, "completions/mean_terminated_length": 411.0784606933594, "completions/min_length": 0.0, "completions/min_terminated_length": 87.0, "epoch": 0.1408, "grad_norm": 0.035640228539705276, "kl": 0.1730194091796875, "learning_rate": 1.888888888888889e-06, "loss": 0.0443, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0445302277803421, "mask/share_reasoning": 0.8328328132629395, "mask/share_step_conf": 0.11873072385787964, "num_tokens": 29695821.0, "reward": 1.4004900455474854, "reward_std": 0.28041917085647583, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.763381838798523, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8602052927017212, "step": 132 }, { "adv/mean_abs_final_conf": 0.7644220590591431, "adv/mean_abs_reasoning": 0.5505396723747253, "adv/mean_abs_step_conf": 0.7615689039230347, "adv/ratio_final_to_reasoning": 1.3884958658144413, "adv/ratio_step_to_reasoning": 1.3833133961773278, "adv/std_final_conf": 0.9061704874038696, "adv/std_reasoning": 0.7754989862442017, "adv/std_step_conf": 0.9363312125205994, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6629651860744298, "calib/avg_num_step_conf": 4.7890625, "calib/ece": 0.29396812749003975, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4342629482071713, "calib/gap": 0.27101127117513646, "calib/mean_conf": 0.5375059760956175, "calib/mu_c": 0.7027040816326529, "calib/mu_w": 0.4316928104575164, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22051792828685254, "calib/std_conf": 0.43009438758858803, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42600412637008384, "calib/step_q_c_n": 517.0, "calib/step_q_gap": 0.10147463778417776, "calib/step_q_w": 0.3245294885859061, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2483.0, "completions/max_terminated_length": 2483.0, "completions/mean_length": 501.0, "completions/mean_terminated_length": 502.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.14186666666666667, "grad_norm": 0.050603996962308884, "kl": 0.15277099609375, "learning_rate": 1.8611111111111113e-06, "loss": -0.1137, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034230709075927734, "mask/share_reasoning": 0.8573011159896851, "mask/share_step_conf": 0.1045619398355484, "num_tokens": 29930421.0, "reward": 1.2970893383026123, "reward_std": 0.3156052231788635, "rewards/accuracy_reward_step": 0.3828125, "rewards/final_brier_reward_step": 0.6671197414398193, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8275918960571289, "step": 133 }, { "adv/mean_abs_final_conf": 0.7715365290641785, "adv/mean_abs_reasoning": 0.6042397618293762, "adv/mean_abs_step_conf": 0.7806218862533569, "adv/ratio_final_to_reasoning": 1.2768714967189516, "adv/ratio_step_to_reasoning": 1.2919075101743918, "adv/std_final_conf": 0.9153911471366882, "adv/std_reasoning": 0.8099719285964966, "adv/std_step_conf": 0.9359931945800781, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.714391405043747, "calib/avg_num_step_conf": 4.09375, "calib/ece": 0.25439999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.44, "calib/gap": 0.3235370560988163, "calib/mean_conf": 0.5473600000000001, "calib/mu_c": 0.7207758620689655, "calib/mu_w": 0.3972388059701492, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.16887999999999995, "calib/std_conf": 0.4274260525517835, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.44735641547861504, "calib/step_q_c_n": 491.0, "calib/step_q_gap": 0.12831512283947677, "calib/step_q_w": 0.3190412926391383, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 491.48828125, "completions/mean_terminated_length": 493.41571044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.14293333333333333, "grad_norm": 0.052062638103961945, "kl": 0.14556884765625, "learning_rate": 1.8333333333333333e-06, "loss": 0.0223, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03546704351902008, "mask/share_reasoning": 0.8622174263000488, "mask/share_step_conf": 0.09840920567512512, "num_tokens": 30165194.0, "reward": 1.3123855590820312, "reward_std": 0.32143712043762207, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6979237794876099, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8212360143661499, "step": 134 }, { "adv/mean_abs_final_conf": 0.7274664640426636, "adv/mean_abs_reasoning": 0.6027958393096924, "adv/mean_abs_step_conf": 0.7395835518836975, "adv/ratio_final_to_reasoning": 1.2068206457359445, "adv/ratio_step_to_reasoning": 1.226922124629545, "adv/std_final_conf": 0.8909457325935364, "adv/std_reasoning": 0.8100106716156006, "adv/std_step_conf": 0.9361552000045776, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6833225178455549, "calib/avg_num_step_conf": 4.6015625, "calib/ece": 0.27297188755020074, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.42971887550200805, "calib/gap": 0.2680350421804024, "calib/mean_conf": 0.5455823293172691, "calib/mu_c": 0.6898260869565218, "calib/mu_w": 0.4217910447761194, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.17835341365461843, "calib/std_conf": 0.4263713550163871, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.44290066225165564, "calib/step_q_c_n": 453.0, "calib/step_q_gap": 0.1375089381137246, "calib/step_q_w": 0.30539172413793103, "calib/step_q_w_n": 725.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2363.0, "completions/max_terminated_length": 2363.0, "completions/mean_length": 482.140625, "completions/mean_terminated_length": 484.0314025878906, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.144, "grad_norm": 0.041505929082632065, "kl": 0.151641845703125, "learning_rate": 1.8055555555555557e-06, "loss": -0.0545, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03815152496099472, "mask/share_reasoning": 0.8518142104148865, "mask/share_step_conf": 0.1061280220746994, "num_tokens": 30394502.0, "reward": 1.3121750354766846, "reward_std": 0.2834535241127014, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6768605709075928, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8315571546554565, "step": 135 }, { "adv/mean_abs_final_conf": 0.6537525653839111, "adv/mean_abs_reasoning": 0.42274731397628784, "adv/mean_abs_step_conf": 0.7515841722488403, "adv/ratio_final_to_reasoning": 1.5464381292806524, "adv/ratio_step_to_reasoning": 1.7778567654979758, "adv/std_final_conf": 0.8735095262527466, "adv/std_reasoning": 0.701313853263855, "adv/std_step_conf": 0.9356544613838196, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7858732277957912, "calib/avg_num_step_conf": 5.02734375, "calib/ece": 0.1786220472440945, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3188976377952756, "calib/gap": 0.4211068726556043, "calib/mean_conf": 0.4246062992125984, "calib/mu_c": 0.6683177570093458, "calib/mu_w": 0.24721088435374153, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09098425196850395, "calib/std_conf": 0.42097239622749233, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45806739130434787, "calib/step_q_c_n": 460.0, "calib/step_q_gap": 0.22600596446033333, "calib/step_q_w": 0.23206142684401454, "calib/step_q_w_n": 827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 457.07421875, "completions/mean_terminated_length": 457.07421875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.14506666666666668, "grad_norm": 0.05797513574361801, "kl": 0.1590728759765625, "learning_rate": 1.777777777777778e-06, "loss": 0.0713, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03888920694589615, "mask/share_reasoning": 0.8414955139160156, "mask/share_step_conf": 0.11961531639099121, "num_tokens": 30620001.0, "reward": 1.4119293689727783, "reward_std": 0.20032727718353271, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.7781753540039062, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8818260431289673, "step": 136 }, { "adv/mean_abs_final_conf": 0.7331190705299377, "adv/mean_abs_reasoning": 0.6162766814231873, "adv/mean_abs_step_conf": 0.7503564953804016, "adv/ratio_final_to_reasoning": 1.1895940453838407, "adv/ratio_step_to_reasoning": 1.2175643148586761, "adv/std_final_conf": 0.9069944620132446, "adv/std_reasoning": 0.858950138092041, "adv/std_step_conf": 0.9364016056060791, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6668528517346244, "calib/avg_num_step_conf": 4.76171875, "calib/ece": 0.2797456692913386, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4015748031496063, "calib/gap": 0.24683139080245753, "calib/mean_conf": 0.5051362204724409, "calib/mu_c": 0.6324390243902438, "calib/mu_w": 0.38560763358778627, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15031496062992125, "calib/std_conf": 0.4260757041840703, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.39144387755102045, "calib/step_q_c_n": 588.0, "calib/step_q_gap": 0.09878999482518841, "calib/step_q_w": 0.29265388272583204, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 418.046875, "completions/mean_terminated_length": 419.6863098144531, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.14613333333333334, "grad_norm": 0.03714418783783913, "kl": 0.16357421875, "learning_rate": 1.75e-06, "loss": -0.0739, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03954150900244713, "mask/share_reasoning": 0.8390306234359741, "mask/share_step_conf": 0.11752159893512726, "num_tokens": 30834005.0, "reward": 1.3112924098968506, "reward_std": 0.3159603476524353, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6747555136680603, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.827820897102356, "step": 137 }, { "adv/mean_abs_final_conf": 0.7053368091583252, "adv/mean_abs_reasoning": 0.6283121109008789, "adv/mean_abs_step_conf": 0.7511765956878662, "adv/ratio_final_to_reasoning": 1.1225898672349441, "adv/ratio_step_to_reasoning": 1.1955468988347577, "adv/std_final_conf": 0.8911412358283997, "adv/std_reasoning": 0.8100284934043884, "adv/std_step_conf": 0.9361639022827148, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7114058703788824, "calib/avg_num_step_conf": 4.328125, "calib/ece": 0.26853018372703413, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4251968503937008, "calib/gap": 0.3181723466193885, "calib/mean_conf": 0.5242257217847768, "calib/mu_c": 0.6457324840764331, "calib/mu_w": 0.32756013745704465, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08732283464566928, "calib/std_conf": 0.4289380259018184, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41454853620955323, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.09478295886750532, "calib/step_q_w": 0.3197655773420479, "calib/step_q_w_n": 459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2229.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 434.875, "completions/mean_terminated_length": 436.5804138183594, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.1472, "grad_norm": 0.06317199766635895, "kl": 0.158172607421875, "learning_rate": 1.7222222222222224e-06, "loss": -0.0748, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04075439274311066, "mask/share_reasoning": 0.8472160696983337, "mask/share_step_conf": 0.1081232875585556, "num_tokens": 31049669.0, "reward": 1.3243135213851929, "reward_std": 0.27699100971221924, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7157214879989624, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8059059381484985, "step": 138 }, { "adv/mean_abs_final_conf": 0.7152689695358276, "adv/mean_abs_reasoning": 0.5647158622741699, "adv/mean_abs_step_conf": 0.7506711483001709, "adv/ratio_final_to_reasoning": 1.266599749217889, "adv/ratio_step_to_reasoning": 1.3292899995355887, "adv/std_final_conf": 0.8902831077575684, "adv/std_reasoning": 0.7928001880645752, "adv/std_step_conf": 0.9358660578727722, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.722875451038945, "calib/avg_num_step_conf": 4.515625, "calib/ece": 0.24751372549019607, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.3411764705882353, "calib/gap": 0.3153568495707354, "calib/mean_conf": 0.46291764705882354, "calib/mu_c": 0.6039007092198582, "calib/mu_w": 0.28854385964912277, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07874509803921569, "calib/std_conf": 0.41096522094434734, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43868451612903225, "calib/step_q_c_n": 620.0, "calib/step_q_gap": 0.1349643668753009, "calib/step_q_w": 0.30372014925373136, "calib/step_q_w_n": 536.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 402.203125, "completions/mean_terminated_length": 403.7804260253906, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.14826666666666666, "grad_norm": 0.057961318641901016, "kl": 0.1760101318359375, "learning_rate": 1.6944444444444446e-06, "loss": -0.0217, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0406876876950264, "mask/share_reasoning": 0.837753415107727, "mask/share_step_conf": 0.11765265464782715, "num_tokens": 31255729.0, "reward": 1.3755717277526855, "reward_std": 0.22052976489067078, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7288585305213928, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8564548492431641, "step": 139 }, { "adv/mean_abs_final_conf": 0.5774595737457275, "adv/mean_abs_reasoning": 0.5195547342300415, "adv/mean_abs_step_conf": 0.7685931921005249, "adv/ratio_final_to_reasoning": 1.1114508938148713, "adv/ratio_step_to_reasoning": 1.4793305526116474, "adv/std_final_conf": 0.8099300265312195, "adv/std_reasoning": 0.7394070625305176, "adv/std_step_conf": 0.9359972476959229, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.781879630833225, "calib/avg_num_step_conf": 4.16796875, "calib/ece": 0.19671764705882355, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5254901960784314, "calib/gap": 0.43335928766411, "calib/mean_conf": 0.6247725490196079, "calib/mu_c": 0.7913184713375795, "calib/mu_w": 0.35795918367346946, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10290196078431375, "calib/std_conf": 0.4220057466751322, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49226213592233015, "calib/step_q_c_n": 618.0, "calib/step_q_gap": 0.15976547667956847, "calib/step_q_w": 0.33249665924276167, "calib/step_q_w_n": 449.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1566.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 425.296875, "completions/mean_terminated_length": 426.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 80.0, "epoch": 0.14933333333333335, "grad_norm": 0.051009099930524826, "kl": 0.157012939453125, "learning_rate": 1.6666666666666667e-06, "loss": 0.0003, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.040174562484025955, "mask/share_reasoning": 0.8454655408859253, "mask/share_step_conf": 0.11045361310243607, "num_tokens": 31469621.0, "reward": 1.408362627029419, "reward_std": 0.21559756994247437, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7836502194404602, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8559907674789429, "step": 140 }, { "adv/mean_abs_final_conf": 0.6383939981460571, "adv/mean_abs_reasoning": 0.46379637718200684, "adv/mean_abs_step_conf": 0.7589306831359863, "adv/ratio_final_to_reasoning": 1.3764531797874162, "adv/ratio_step_to_reasoning": 1.6363445694578171, "adv/std_final_conf": 0.8568813800811768, "adv/std_reasoning": 0.7392438054084778, "adv/std_step_conf": 0.9359174370765686, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.834562868051511, "calib/avg_num_step_conf": 4.44921875, "calib/ece": 0.14496062992125985, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.468503937007874, "calib/gap": 0.5326189089497185, "calib/mean_conf": 0.5637007874015748, "calib/mu_c": 0.7754901960784314, "calib/mu_w": 0.2428712871287129, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05314960629921259, "calib/std_conf": 0.431495847462444, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44982689335394127, "calib/step_q_c_n": 647.0, "calib/step_q_gap": 0.1612394949799575, "calib/step_q_w": 0.2885873983739838, "calib/step_q_w_n": 492.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 461.86328125, "completions/mean_terminated_length": 461.86328125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.1504, "grad_norm": 0.04535761475563049, "kl": 0.1490936279296875, "learning_rate": 1.638888888888889e-06, "loss": 0.0648, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03858054429292679, "mask/share_reasoning": 0.8557083606719971, "mask/share_step_conf": 0.10571112483739853, "num_tokens": 31694954.0, "reward": 1.435590147972107, "reward_std": 0.21061460673809052, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.8199445009231567, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8670241236686707, "step": 141 }, { "adv/mean_abs_final_conf": 0.696391224861145, "adv/mean_abs_reasoning": 0.43528157472610474, "adv/mean_abs_step_conf": 0.7312085032463074, "adv/ratio_final_to_reasoning": 1.5998637785193186, "adv/ratio_step_to_reasoning": 1.6798517228908916, "adv/std_final_conf": 0.8764922618865967, "adv/std_reasoning": 0.7204663753509521, "adv/std_step_conf": 0.9357934594154358, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7650992234685073, "calib/avg_num_step_conf": 4.515625, "calib/ece": 0.20349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.403921568627451, "calib/gap": 0.41327640823369904, "calib/mean_conf": 0.5069372549019608, "calib/mu_c": 0.7046616541353383, "calib/mu_w": 0.2913852459016393, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09443137254901962, "calib/std_conf": 0.4355757469799832, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47131958762886594, "calib/step_q_c_n": 582.0, "calib/step_q_gap": 0.1915607026114443, "calib/step_q_w": 0.27975888501742163, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 435.87109375, "completions/mean_terminated_length": 437.5804138183594, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.15146666666666667, "grad_norm": 0.03959168866276741, "kl": 0.1544036865234375, "learning_rate": 1.6111111111111113e-06, "loss": -0.0305, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03837796300649643, "mask/share_reasoning": 0.8447139263153076, "mask/share_step_conf": 0.11300183832645416, "num_tokens": 31911697.0, "reward": 1.386702299118042, "reward_std": 0.22725032269954681, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.76378333568573, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8532482385635376, "step": 142 }, { "adv/mean_abs_final_conf": 0.6530799865722656, "adv/mean_abs_reasoning": 0.49441465735435486, "adv/mean_abs_step_conf": 0.7745805978775024, "adv/ratio_final_to_reasoning": 1.3209155045421577, "adv/ratio_step_to_reasoning": 1.5666618826034282, "adv/std_final_conf": 0.8723351955413818, "adv/std_reasoning": 0.7576212286949158, "adv/std_step_conf": 0.9359222054481506, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8299148606811146, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.17129200000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.452, "calib/gap": 0.5069794891640866, "calib/mean_conf": 0.5489400000000001, "calib/mu_c": 0.8247368421052632, "calib/mu_w": 0.3177573529411765, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13211600000000004, "calib/std_conf": 0.4300814625161145, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.46212589928057557, "calib/step_q_c_n": 556.0, "calib/step_q_gap": 0.20496793608831437, "calib/step_q_w": 0.2571579631922612, "calib/step_q_w_n": 827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 459.6484375, "completions/mean_terminated_length": 459.6484375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.15253333333333333, "grad_norm": 0.053450778126716614, "kl": 0.155853271484375, "learning_rate": 1.5833333333333333e-06, "loss": -0.0157, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03666718304157257, "mask/share_reasoning": 0.8401566743850708, "mask/share_step_conf": 0.12317609786987305, "num_tokens": 32136703.0, "reward": 1.3771334886550903, "reward_std": 0.2412530779838562, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7908738255500793, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8395090103149414, "step": 143 }, { "adv/mean_abs_final_conf": 0.6271746754646301, "adv/mean_abs_reasoning": 0.499098002910614, "adv/mean_abs_step_conf": 0.7657457590103149, "adv/ratio_final_to_reasoning": 1.2566162793822158, "adv/ratio_step_to_reasoning": 1.5342593128898099, "adv/std_final_conf": 0.8462645411491394, "adv/std_reasoning": 0.7575715184211731, "adv/std_step_conf": 0.9356332421302795, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.692873892652423, "calib/avg_num_step_conf": 4.41796875, "calib/ece": 0.2912648221343873, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5059288537549407, "calib/gap": 0.3031103439291297, "calib/mean_conf": 0.5765612648221344, "calib/mu_c": 0.6975657894736842, "calib/mu_w": 0.3944554455445545, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13351778656126478, "calib/std_conf": 0.44622664042621685, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47541877579530867, "calib/step_q_c_n": 648.0, "calib/step_q_gap": 0.1618338896669443, "calib/step_q_w": 0.3135848861283644, "calib/step_q_w_n": 483.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2372.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 421.4921875, "completions/mean_terminated_length": 423.1451110839844, "completions/min_length": 0.0, "completions/min_terminated_length": 78.0, "epoch": 0.1536, "grad_norm": 0.06617000699043274, "kl": 0.21453857421875, "learning_rate": 1.5555555555555558e-06, "loss": 0.0071, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.040411271154880524, "mask/share_reasoning": 0.840194821357727, "mask/share_step_conf": 0.11548765003681183, "num_tokens": 32348733.0, "reward": 1.3366563320159912, "reward_std": 0.23944231867790222, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.697578489780426, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.829663872718811, "step": 144 }, { "adv/mean_abs_final_conf": 0.7166906595230103, "adv/mean_abs_reasoning": 0.5873551368713379, "adv/mean_abs_step_conf": 0.7748841047286987, "adv/ratio_final_to_reasoning": 1.2201998663714824, "adv/ratio_step_to_reasoning": 1.3192769690519275, "adv/std_final_conf": 0.8760490417480469, "adv/std_reasoning": 0.8265672326087952, "adv/std_step_conf": 0.9361008405685425, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7301093514328808, "calib/avg_num_step_conf": 4.65625, "calib/ece": 0.2123162055335969, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.49407114624505927, "calib/gap": 0.3609771241830065, "calib/mean_conf": 0.6051541501976284, "calib/mu_c": 0.7720882352941176, "calib/mu_w": 0.41111111111111115, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13996047430830044, "calib/std_conf": 0.41960309209280505, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45034983922829586, "calib/step_q_c_n": 622.0, "calib/step_q_gap": 0.09043054098268183, "calib/step_q_w": 0.359919298245614, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 411.8671875, "completions/mean_terminated_length": 413.4823913574219, "completions/min_length": 0.0, "completions/min_terminated_length": 5.0, "epoch": 0.15466666666666667, "grad_norm": 0.050251394510269165, "kl": 0.156768798828125, "learning_rate": 1.527777777777778e-06, "loss": 0.001, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04229770600795746, "mask/share_reasoning": 0.8285897970199585, "mask/share_step_conf": 0.12520626187324524, "num_tokens": 32556875.0, "reward": 1.3466042280197144, "reward_std": 0.2541048228740692, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7375452518463135, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8262690305709839, "step": 145 }, { "adv/mean_abs_final_conf": 0.7096195220947266, "adv/mean_abs_reasoning": 0.569200873374939, "adv/mean_abs_step_conf": 0.7494736909866333, "adv/ratio_final_to_reasoning": 1.2466943662387746, "adv/ratio_step_to_reasoning": 1.316712124039463, "adv/std_final_conf": 0.8874281048774719, "adv/std_reasoning": 0.7928953170776367, "adv/std_step_conf": 0.9355780482292175, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7482400268186389, "calib/avg_num_step_conf": 4.4609375, "calib/ece": 0.22743650793650794, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4523809523809524, "calib/gap": 0.413036540395575, "calib/mean_conf": 0.5476190476190477, "calib/mu_c": 0.8049473684210527, "calib/mu_w": 0.3919108280254777, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1990357142857143, "calib/std_conf": 0.43866439348706965, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5529746835443038, "calib/step_q_c_n": 395.0, "calib/step_q_gap": 0.21941967685086333, "calib/step_q_w": 0.33355500669344046, "calib/step_q_w_n": 747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1983.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 436.28515625, "completions/mean_terminated_length": 437.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.15573333333333333, "grad_norm": 0.04094357788562775, "kl": 0.159423828125, "learning_rate": 1.5e-06, "loss": -0.0788, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03925366699695587, "mask/share_reasoning": 0.8443082571029663, "mask/share_step_conf": 0.11253180354833603, "num_tokens": 32775780.0, "reward": 1.3252230882644653, "reward_std": 0.2993590831756592, "rewards/accuracy_reward_step": 0.37109375, "rewards/final_brier_reward_step": 0.726082444190979, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.82663494348526, "step": 146 }, { "adv/mean_abs_final_conf": 0.705093502998352, "adv/mean_abs_reasoning": 0.47931668162345886, "adv/mean_abs_step_conf": 0.775804877281189, "adv/ratio_final_to_reasoning": 1.471038939454769, "adv/ratio_step_to_reasoning": 1.6185643167133603, "adv/std_final_conf": 0.8910297155380249, "adv/std_reasoning": 0.7393829226493835, "adv/std_step_conf": 0.9361888766288757, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7350313693398799, "calib/avg_num_step_conf": 4.5703125, "calib/ece": 0.29897333333333337, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.56, "calib/gap": 0.40094426259319893, "calib/mean_conf": 0.6338533333333333, "calib/mu_c": 0.8840425531914895, "calib/mu_w": 0.48309829059829057, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2784133333333334, "calib/std_conf": 0.4330953662493593, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5375308641975309, "calib/step_q_c_n": 405.0, "calib/step_q_gap": 0.18174968772694272, "calib/step_q_w": 0.3557811764705882, "calib/step_q_w_n": 765.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 450.05078125, "completions/mean_terminated_length": 450.05078125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.1568, "grad_norm": 0.04447196424007416, "kl": 0.1577301025390625, "learning_rate": 1.4722222222222225e-06, "loss": -0.0047, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.037693917751312256, "mask/share_reasoning": 0.8485679626464844, "mask/share_step_conf": 0.11373814940452576, "num_tokens": 32994673.0, "reward": 1.3036630153656006, "reward_std": 0.31476935744285583, "rewards/accuracy_reward_step": 0.3671875, "rewards/final_brier_reward_step": 0.6830648183822632, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8277555704116821, "step": 147 }, { "adv/mean_abs_final_conf": 0.5962350368499756, "adv/mean_abs_reasoning": 0.5363343954086304, "adv/mean_abs_step_conf": 0.7551847100257874, "adv/ratio_final_to_reasoning": 1.111685250757985, "adv/ratio_step_to_reasoning": 1.4080482558841225, "adv/std_final_conf": 0.8239307403564453, "adv/std_reasoning": 0.7576537728309631, "adv/std_step_conf": 0.9358545541763306, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7532467532467533, "calib/avg_num_step_conf": 4.51171875, "calib/ece": 0.21653968253968264, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5515873015873016, "calib/gap": 0.420508348794063, "calib/mean_conf": 0.6239365079365078, "calib/mu_c": 0.7874675324675324, "calib/mu_w": 0.3669591836734694, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11468253968253977, "calib/std_conf": 0.43631919555894205, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5317462462462463, "calib/step_q_c_n": 666.0, "calib/step_q_gap": 0.224292258516185, "calib/step_q_w": 0.30745398773006133, "calib/step_q_w_n": 489.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1958.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 420.18359375, "completions/mean_terminated_length": 421.8313903808594, "completions/min_length": 0.0, "completions/min_terminated_length": 74.0, "epoch": 0.15786666666666666, "grad_norm": 0.038488779217004776, "kl": 0.1638641357421875, "learning_rate": 1.4444444444444445e-06, "loss": 0.0168, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04403883218765259, "mask/share_reasoning": 0.8297275304794312, "mask/share_step_conf": 0.12232742458581924, "num_tokens": 33207352.0, "reward": 1.3929460048675537, "reward_std": 0.2436758577823639, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7596205472946167, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8541513085365295, "step": 148 }, { "adv/mean_abs_final_conf": 0.6414650678634644, "adv/mean_abs_reasoning": 0.5006125569343567, "adv/mean_abs_step_conf": 0.7623882293701172, "adv/ratio_final_to_reasoning": 1.281360323423883, "adv/ratio_step_to_reasoning": 1.5229107196967215, "adv/std_final_conf": 0.8594561219215393, "adv/std_reasoning": 0.7575691342353821, "adv/std_step_conf": 0.9285663366317749, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7966791979949874, "calib/avg_num_step_conf": 4.88671875, "calib/ece": 0.18441897233201587, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5098814229249012, "calib/gap": 0.5147541353383458, "calib/mean_conf": 0.587501976284585, "calib/mu_c": 0.8316541353383459, "calib/mu_w": 0.3169, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12311462450592892, "calib/std_conf": 0.4446132806874793, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4966707530647986, "calib/step_q_c_n": 571.0, "calib/step_q_gap": 0.20092957659421035, "calib/step_q_w": 0.29574117647058823, "calib/step_q_w_n": 680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 485.91796875, "completions/mean_terminated_length": 487.82354736328125, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.15893333333333334, "grad_norm": 0.07420997321605682, "kl": 0.135040283203125, "learning_rate": 1.4166666666666667e-06, "loss": 0.0254, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.038157302886247635, "mask/share_reasoning": 0.843529462814331, "mask/share_step_conf": 0.11440698057413101, "num_tokens": 33436203.0, "reward": 1.431693434715271, "reward_std": 0.2294609248638153, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.796412467956543, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8827059268951416, "step": 149 }, { "adv/mean_abs_final_conf": 0.7208597660064697, "adv/mean_abs_reasoning": 0.5321922302246094, "adv/mean_abs_step_conf": 0.7479966282844543, "adv/ratio_final_to_reasoning": 1.354510128233616, "adv/ratio_step_to_reasoning": 1.4055008431234812, "adv/std_final_conf": 0.8971462845802307, "adv/std_reasoning": 0.7754176259040833, "adv/std_step_conf": 0.9362186193466187, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7468057366362449, "calib/avg_num_step_conf": 4.484375, "calib/ece": 0.22217580645161294, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6048387096774194, "calib/gap": 0.3900070404172099, "calib/mean_conf": 0.6926629032258064, "calib/mu_c": 0.8782307692307693, "calib/mu_w": 0.48822372881355935, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19532258064516134, "calib/std_conf": 0.4085760559094708, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5420354477611941, "calib/step_q_c_n": 536.0, "calib/step_q_gap": 0.19873261551718535, "calib/step_q_w": 0.3433028322440087, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2437.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 406.93359375, "completions/mean_terminated_length": 413.39288330078125, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.16, "grad_norm": 0.059314560145139694, "kl": 0.1519622802734375, "learning_rate": 1.3888888888888892e-06, "loss": -0.0439, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.044302038848400116, "mask/share_reasoning": 0.8175007104873657, "mask/share_step_conf": 0.12257222831249237, "num_tokens": 33645338.0, "reward": 1.3415672779083252, "reward_std": 0.32355982065200806, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.726384162902832, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.830718994140625, "step": 150 }, { "adv/mean_abs_final_conf": 0.7061728835105896, "adv/mean_abs_reasoning": 0.4900563955307007, "adv/mean_abs_step_conf": 0.7467561960220337, "adv/ratio_final_to_reasoning": 1.4410033007442913, "adv/ratio_step_to_reasoning": 1.5238168562484387, "adv/std_final_conf": 0.8989728689193726, "adv/std_reasoning": 0.7393714785575867, "adv/std_step_conf": 0.9360334873199463, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7790909090909091, "calib/avg_num_step_conf": 4.35546875, "calib/ece": 0.2163453815261044, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.44176706827309237, "calib/gap": 0.4591393939393939, "calib/mean_conf": 0.5140160642570282, "calib/mu_c": 0.7906060606060605, "calib/mu_w": 0.33146666666666663, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16638554216867468, "calib/std_conf": 0.45318765644131903, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5401400491400491, "calib/step_q_c_n": 407.0, "calib/step_q_gap": 0.22112027512874965, "calib/step_q_w": 0.31901977401129944, "calib/step_q_w_n": 708.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2787.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 482.5234375, "completions/mean_terminated_length": 484.41571044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.16106666666666666, "grad_norm": 0.04952695593237877, "kl": 0.127716064453125, "learning_rate": 1.3611111111111112e-06, "loss": 0.0168, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0359514057636261, "mask/share_reasoning": 0.8559660911560059, "mask/share_step_conf": 0.10417624562978745, "num_tokens": 33875888.0, "reward": 1.3257105350494385, "reward_std": 0.28718993067741394, "rewards/accuracy_reward_step": 0.38671875, "rewards/final_brier_reward_step": 0.7406706809997559, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8194376826286316, "step": 151 }, { "adv/mean_abs_final_conf": 0.7186774611473083, "adv/mean_abs_reasoning": 0.582579493522644, "adv/mean_abs_step_conf": 0.7576056718826294, "adv/ratio_final_to_reasoning": 1.233612698589389, "adv/ratio_step_to_reasoning": 1.3004331259613455, "adv/std_final_conf": 0.9069194197654724, "adv/std_reasoning": 0.8265808820724487, "adv/std_step_conf": 0.9362088441848755, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7519455868396077, "calib/avg_num_step_conf": 4.4375, "calib/ece": 0.22714566929133856, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.452755905511811, "calib/gap": 0.4167751344511232, "calib/mean_conf": 0.5478937007874015, "calib/mu_c": 0.785816513761468, "calib/mu_w": 0.3690413793103448, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1729527559055118, "calib/std_conf": 0.44473848871684885, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5037935483870968, "calib/step_q_c_n": 465.0, "calib/step_q_gap": 0.1702492854809679, "calib/step_q_w": 0.3335442629061289, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 448.5703125, "completions/mean_terminated_length": 450.3294372558594, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.16213333333333332, "grad_norm": 0.053616829216480255, "kl": 0.1596527099609375, "learning_rate": 1.3333333333333334e-06, "loss": -0.0532, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.039214253425598145, "mask/share_reasoning": 0.8418387770652771, "mask/share_step_conf": 0.11504073441028595, "num_tokens": 34096114.0, "reward": 1.3434820175170898, "reward_std": 0.26486706733703613, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.7340766191482544, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8354280591011047, "step": 152 }, { "adv/mean_abs_final_conf": 0.6649273037910461, "adv/mean_abs_reasoning": 0.48268380761146545, "adv/mean_abs_step_conf": 0.7543078660964966, "adv/ratio_final_to_reasoning": 1.3775628958456319, "adv/ratio_step_to_reasoning": 1.5627370427633527, "adv/std_final_conf": 0.8761218786239624, "adv/std_reasoning": 0.7393322587013245, "adv/std_step_conf": 0.9358770847320557, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6802699433373655, "calib/avg_num_step_conf": 4.64453125, "calib/ece": 0.31198412698412703, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.46825396825396826, "calib/gap": 0.28423505443432867, "calib/mean_conf": 0.5495238095238096, "calib/mu_c": 0.676978417266187, "calib/mu_w": 0.39274336283185834, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1549603174603175, "calib/std_conf": 0.449749237261425, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4632719412724307, "calib/step_q_c_n": 613.0, "calib/step_q_gap": 0.15083617738354183, "calib/step_q_w": 0.3124357638888889, "calib/step_q_w_n": 576.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2646.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 453.34765625, "completions/mean_terminated_length": 455.1255187988281, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.1632, "grad_norm": 0.04672946408390999, "kl": 0.1456146240234375, "learning_rate": 1.3055555555555556e-06, "loss": -0.0194, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.037633031606674194, "mask/share_reasoning": 0.8455663919448853, "mask/share_step_conf": 0.11289433389902115, "num_tokens": 34319491.0, "reward": 1.3227565288543701, "reward_std": 0.26125621795654297, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6762843132019043, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.832270622253418, "step": 153 }, { "adv/mean_abs_final_conf": 0.6391319036483765, "adv/mean_abs_reasoning": 0.5661917924880981, "adv/mean_abs_step_conf": 0.76639723777771, "adv/ratio_final_to_reasoning": 1.1288258009529724, "adv/ratio_step_to_reasoning": 1.3536000485097466, "adv/std_final_conf": 0.8594533205032349, "adv/std_reasoning": 0.8097780346870422, "adv/std_step_conf": 0.9359622597694397, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7422602567329475, "calib/avg_num_step_conf": 3.99609375, "calib/ece": 0.2400395256916996, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5138339920948617, "calib/gap": 0.4294601057135666, "calib/mean_conf": 0.5810671936758893, "calib/mu_c": 0.8136206896551724, "calib/mu_w": 0.38416058394160585, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18130434782608695, "calib/std_conf": 0.45757658534215817, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5460468384074941, "calib/step_q_c_n": 427.0, "calib/step_q_gap": 0.19163912028668878, "calib/step_q_w": 0.35440771812080535, "calib/step_q_w_n": 596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2351.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 413.70703125, "completions/mean_terminated_length": 415.3294372558594, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.16426666666666667, "grad_norm": 0.047010958194732666, "kl": 0.16094970703125, "learning_rate": 1.2777777777777779e-06, "loss": 0.0255, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0411284863948822, "mask/share_reasoning": 0.8461230993270874, "mask/share_step_conf": 0.10884217172861099, "num_tokens": 34529840.0, "reward": 1.3319199085235596, "reward_std": 0.2809707522392273, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.7279886603355408, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8241755366325378, "step": 154 }, { "adv/mean_abs_final_conf": 0.697257399559021, "adv/mean_abs_reasoning": 0.39795076847076416, "adv/mean_abs_step_conf": 0.7744704484939575, "adv/ratio_final_to_reasoning": 1.7521197464661906, "adv/ratio_step_to_reasoning": 1.9461463825539886, "adv/std_final_conf": 0.8761575222015381, "adv/std_reasoning": 0.6815242171287537, "adv/std_step_conf": 0.9362286329269409, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7283950617283951, "calib/avg_num_step_conf": 4.15234375, "calib/ece": 0.26471764705882356, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.43137254901960786, "calib/gap": 0.357225925925926, "calib/mean_conf": 0.49904705882352945, "calib/mu_c": 0.6881666666666667, "calib/mu_w": 0.3309407407407407, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14658823529411769, "calib/std_conf": 0.458563816993944, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47445479999999995, "calib/step_q_c_n": 500.0, "calib/step_q_gap": 0.13788641634103016, "calib/step_q_w": 0.3365683836589698, "calib/step_q_w_n": 563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 396.61328125, "completions/mean_terminated_length": 396.61328125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.16533333333333333, "grad_norm": 0.06110754236578941, "kl": 0.1593475341796875, "learning_rate": 1.25e-06, "loss": -0.0123, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.041939422488212585, "mask/share_reasoning": 0.8429463505744934, "mask/share_step_conf": 0.11511427164077759, "num_tokens": 34738589.0, "reward": 1.3121984004974365, "reward_std": 0.22575721144676208, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7149654626846313, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8082312345504761, "step": 155 }, { "adv/mean_abs_final_conf": 0.6453638076782227, "adv/mean_abs_reasoning": 0.533676028251648, "adv/mean_abs_step_conf": 0.7646842002868652, "adv/ratio_final_to_reasoning": 1.2092801128663584, "adv/ratio_step_to_reasoning": 1.4328621856822252, "adv/std_final_conf": 0.8599355816841125, "adv/std_reasoning": 0.7576324343681335, "adv/std_step_conf": 0.9362171292304993, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6508128016256033, "calib/avg_num_step_conf": 4.83203125, "calib/ece": 0.3132270916334661, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4860557768924303, "calib/gap": 0.28091503683007363, "calib/mean_conf": 0.5432669322709163, "calib/mu_c": 0.6854032258064516, "calib/mu_w": 0.404488188976378, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18123505976095616, "calib/std_conf": 0.4599130400895085, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4531105263157894, "calib/step_q_c_n": 570.0, "calib/step_q_gap": 0.11236488913438014, "calib/step_q_w": 0.34074563718140927, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 468.01171875, "completions/mean_terminated_length": 469.8470764160156, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.1664, "grad_norm": 0.06579665839672089, "kl": 0.148345947265625, "learning_rate": 1.2222222222222223e-06, "loss": -0.0088, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.038678042590618134, "mask/share_reasoning": 0.8419922590255737, "mask/share_step_conf": 0.11542340368032455, "num_tokens": 34963160.0, "reward": 1.3079309463500977, "reward_std": 0.27311986684799194, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6633148193359375, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8297890424728394, "step": 156 }, { "adv/mean_abs_final_conf": 0.6082822680473328, "adv/mean_abs_reasoning": 0.5066467523574829, "adv/mean_abs_step_conf": 0.7343960404396057, "adv/ratio_final_to_reasoning": 1.2006042972089106, "adv/ratio_step_to_reasoning": 1.4495228421427364, "adv/std_final_conf": 0.8255444169044495, "adv/std_reasoning": 0.7575724720954895, "adv/std_step_conf": 0.9362667798995972, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.776601206520344, "calib/avg_num_step_conf": 5.015625, "calib/ece": 0.22458498023715412, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5691699604743083, "calib/gap": 0.49837889872930285, "calib/mean_conf": 0.602213438735178, "calib/mu_c": 0.8110204081632651, "calib/mu_w": 0.3126415094339623, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12288537549407111, "calib/std_conf": 0.4677376400274536, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5158425925925926, "calib/step_q_c_n": 756.0, "calib/step_q_gap": 0.1860958122895623, "calib/step_q_w": 0.3297467803030303, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 470.8671875, "completions/mean_terminated_length": 470.8671875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.16746666666666668, "grad_norm": 0.04144881293177605, "kl": 0.132781982421875, "learning_rate": 1.1944444444444446e-06, "loss": -0.045, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03978681564331055, "mask/share_reasoning": 0.8324368000030518, "mask/share_step_conf": 0.1277763694524765, "num_tokens": 35187430.0, "reward": 1.367426872253418, "reward_std": 0.2844254970550537, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.769758939743042, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8266879916191101, "step": 157 }, { "adv/mean_abs_final_conf": 0.687097430229187, "adv/mean_abs_reasoning": 0.5489099621772766, "adv/mean_abs_step_conf": 0.7720958590507507, "adv/ratio_final_to_reasoning": 1.2517488797320848, "adv/ratio_step_to_reasoning": 1.4065983717770343, "adv/std_final_conf": 0.8751606941223145, "adv/std_reasoning": 0.7576378583908081, "adv/std_step_conf": 0.9359956979751587, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6282976684765251, "calib/avg_num_step_conf": 4.50390625, "calib/ece": 0.33855468749999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.59765625, "calib/gap": 0.22169977642925587, "calib/mean_conf": 0.6432421875, "calib/mu_c": 0.7307096774193549, "calib/mu_w": 0.509009900990099, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18816406249999995, "calib/std_conf": 0.4489902255007505, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5108185628742515, "calib/step_q_c_n": 668.0, "calib/step_q_gap": 0.1338536144206433, "calib/step_q_w": 0.37696494845360823, "calib/step_q_w_n": 485.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 405.16796875, "completions/mean_terminated_length": 406.75689697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.16853333333333334, "grad_norm": 0.0779474750161171, "kl": 0.1684417724609375, "learning_rate": 1.1666666666666668e-06, "loss": 0.0508, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04238620772957802, "mask/share_reasoning": 0.8300025463104248, "mask/share_step_conf": 0.12370499968528748, "num_tokens": 35396393.0, "reward": 1.3117878437042236, "reward_std": 0.2755977511405945, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6640223264694214, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8192297220230103, "step": 158 }, { "adv/mean_abs_final_conf": 0.6682296395301819, "adv/mean_abs_reasoning": 0.5617543458938599, "adv/mean_abs_step_conf": 0.7508109211921692, "adv/ratio_final_to_reasoning": 1.189540667401334, "adv/ratio_step_to_reasoning": 1.3365467070797357, "adv/std_final_conf": 0.8761033415794373, "adv/std_reasoning": 0.7928183078765869, "adv/std_step_conf": 0.9362561702728271, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7296401515151515, "calib/avg_num_step_conf": 4.28515625, "calib/ece": 0.25996031746031745, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5515873015873016, "calib/gap": 0.411590909090909, "calib/mean_conf": 0.5977380952380952, "calib/mu_c": 0.8133333333333332, "calib/mu_w": 0.40174242424242423, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1907539682539682, "calib/std_conf": 0.4621413698118899, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.571676659528908, "calib/step_q_c_n": 467.0, "calib/step_q_gap": 0.21274173889398734, "calib/step_q_w": 0.35893492063492066, "calib/step_q_w_n": 630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 417.32421875, "completions/mean_terminated_length": 418.9608154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.1696, "grad_norm": 0.05492505058646202, "kl": 0.1497039794921875, "learning_rate": 1.138888888888889e-06, "loss": -0.0204, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04090462625026703, "mask/share_reasoning": 0.8396684527397156, "mask/share_step_conf": 0.11552062630653381, "num_tokens": 35608012.0, "reward": 1.3436503410339355, "reward_std": 0.2901962697505951, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7161792516708374, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8406387567520142, "step": 159 }, { "adv/mean_abs_final_conf": 0.6749417185783386, "adv/mean_abs_reasoning": 0.6084262132644653, "adv/mean_abs_step_conf": 0.766987681388855, "adv/ratio_final_to_reasoning": 1.109323865184883, "adv/ratio_step_to_reasoning": 1.2606091990574173, "adv/std_final_conf": 0.8604558706283569, "adv/std_reasoning": 0.8266105651855469, "adv/std_step_conf": 0.936232328414917, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7114664082687339, "calib/avg_num_step_conf": 4.7421875, "calib/ece": 0.2897710843373495, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5421686746987951, "calib/gap": 0.36726356589147285, "calib/mean_conf": 0.5802690763052208, "calib/mu_c": 0.7572635658914728, "calib/mu_w": 0.38999999999999996, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17598393574297197, "calib/std_conf": 0.4717101714303749, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5031260550458716, "calib/step_q_c_n": 545.0, "calib/step_q_gap": 0.1928853973478148, "calib/step_q_w": 0.3102406576980568, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 446.85546875, "completions/mean_terminated_length": 448.6078796386719, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.17066666666666666, "grad_norm": 0.0584404356777668, "kl": 0.150634765625, "learning_rate": 1.111111111111111e-06, "loss": 0.0536, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03910226374864578, "mask/share_reasoning": 0.8406857252120972, "mask/share_step_conf": 0.11630573123693466, "num_tokens": 35827247.0, "reward": 1.294600009918213, "reward_std": 0.3159686028957367, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6879982352256775, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.802944540977478, "step": 160 }, { "adv/mean_abs_final_conf": 0.5800888538360596, "adv/mean_abs_reasoning": 0.4550928473472595, "adv/mean_abs_step_conf": 0.7868247032165527, "adv/ratio_final_to_reasoning": 1.2746604505375176, "adv/ratio_step_to_reasoning": 1.7289322559186797, "adv/std_final_conf": 0.8195415139198303, "adv/std_reasoning": 0.7391253709793091, "adv/std_step_conf": 0.9356487989425659, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7142903645833334, "calib/avg_num_step_conf": 4.609375, "calib/ece": 0.2741796874999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.5234375, "calib/gap": 0.37472916666666667, "calib/mean_conf": 0.5803515625, "calib/mu_c": 0.720875, "calib/mu_w": 0.3461458333333334, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11476562499999993, "calib/std_conf": 0.46237442703809717, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48515708274894814, "calib/step_q_c_n": 713.0, "calib/step_q_gap": 0.15993010202089675, "calib/step_q_w": 0.3252269807280514, "calib/step_q_w_n": 467.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 380.1796875, "completions/mean_terminated_length": 381.6706237792969, "completions/min_length": 0.0, "completions/min_terminated_length": 67.0, "epoch": 0.17173333333333332, "grad_norm": 0.04697565361857414, "kl": 0.16094970703125, "learning_rate": 1.0833333333333335e-06, "loss": 0.0567, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.045005638152360916, "mask/share_reasoning": 0.8235310912132263, "mask/share_step_conf": 0.12755702435970306, "num_tokens": 36028493.0, "reward": 1.370754361152649, "reward_std": 0.22566889226436615, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7254956960678101, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8455064296722412, "step": 161 }, { "adv/mean_abs_final_conf": 0.683541476726532, "adv/mean_abs_reasoning": 0.5417003035545349, "adv/mean_abs_step_conf": 0.7814807891845703, "adv/ratio_final_to_reasoning": 1.2618443671551633, "adv/ratio_step_to_reasoning": 1.4426441780752959, "adv/std_final_conf": 0.8726164102554321, "adv/std_reasoning": 0.7753661870956421, "adv/std_step_conf": 0.9360226392745972, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6877846659919028, "calib/avg_num_step_conf": 4.34765625, "calib/ece": 0.29037890625, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.61328125, "calib/gap": 0.3148466599190283, "calib/mean_conf": 0.65248828125, "calib/mu_c": 0.7803947368421053, "calib/mu_w": 0.4655480769230769, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17455859375000002, "calib/std_conf": 0.44877169152328544, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.512444099378882, "calib/step_q_c_n": 644.0, "calib/step_q_gap": 0.10413066654306113, "calib/step_q_w": 0.4083134328358209, "calib/step_q_w_n": 469.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 377.95703125, "completions/mean_terminated_length": 379.4392395019531, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.1728, "grad_norm": 0.06931810081005096, "kl": 0.1662750244140625, "learning_rate": 1.0555555555555557e-06, "loss": -0.0313, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.043304286897182465, "mask/share_reasoning": 0.8308358788490295, "mask/share_step_conf": 0.12195360660552979, "num_tokens": 36229394.0, "reward": 1.334316372871399, "reward_std": 0.26881474256515503, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.70583176612854, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8220254182815552, "step": 162 }, { "adv/mean_abs_final_conf": 0.5971531271934509, "adv/mean_abs_reasoning": 0.5205508470535278, "adv/mean_abs_step_conf": 0.7498682737350464, "adv/ratio_final_to_reasoning": 1.1471561915104254, "adv/ratio_step_to_reasoning": 1.440528390222633, "adv/std_final_conf": 0.8285160064697266, "adv/std_reasoning": 0.7576987743377686, "adv/std_step_conf": 0.9356043338775635, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7737785218253969, "calib/avg_num_step_conf": 4.609375, "calib/ece": 0.23511811023622053, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.515748031496063, "calib/gap": 0.4542906746031746, "calib/mean_conf": 0.5743307086614173, "calib/mu_c": 0.7996875, "calib/mu_w": 0.34539682539682537, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15275590551181112, "calib/std_conf": 0.460342376096538, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5099199288256229, "calib/step_q_c_n": 562.0, "calib/step_q_gap": 0.1882223560100889, "calib/step_q_w": 0.32169757281553396, "calib/step_q_w_n": 618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 449.578125, "completions/mean_terminated_length": 449.578125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.17386666666666667, "grad_norm": 0.047132719308137894, "kl": 0.14910888671875, "learning_rate": 1.0277777777777777e-06, "loss": 0.052, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04032721370458603, "mask/share_reasoning": 0.8429751396179199, "mask/share_step_conf": 0.11669766902923584, "num_tokens": 36449318.0, "reward": 1.3833903074264526, "reward_std": 0.24456867575645447, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7504304647445679, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8597375154495239, "step": 163 }, { "adv/mean_abs_final_conf": 0.7085994482040405, "adv/mean_abs_reasoning": 0.5715576410293579, "adv/mean_abs_step_conf": 0.7559859752655029, "adv/ratio_final_to_reasoning": 1.2397690054985084, "adv/ratio_step_to_reasoning": 1.3226767013454588, "adv/std_final_conf": 0.8809927701950073, "adv/std_reasoning": 0.8098272085189819, "adv/std_step_conf": 0.9359097480773926, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7224260055653933, "calib/avg_num_step_conf": 4.83203125, "calib/ece": 0.28141269841269845, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.503968253968254, "calib/gap": 0.34523956488742724, "calib/mean_conf": 0.5793015873015873, "calib/mu_c": 0.7628813559322034, "calib/mu_w": 0.41764179104477617, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19623015873015875, "calib/std_conf": 0.4523486025748822, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4881397338403042, "calib/step_q_c_n": 526.0, "calib/step_q_gap": 0.1514336860203323, "calib/step_q_w": 0.3367060478199719, "calib/step_q_w_n": 711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1971.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 483.765625, "completions/mean_terminated_length": 485.66278076171875, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.17493333333333333, "grad_norm": 0.05917629599571228, "kl": 0.139862060546875, "learning_rate": 1.0000000000000002e-06, "loss": 0.0461, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03503939509391785, "mask/share_reasoning": 0.8546115159988403, "mask/share_step_conf": 0.10644285380840302, "num_tokens": 36679298.0, "reward": 1.325966238975525, "reward_std": 0.25556206703186035, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.6949499249458313, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8339599370956421, "step": 164 }, { "adv/mean_abs_final_conf": 0.6125674843788147, "adv/mean_abs_reasoning": 0.5041183233261108, "adv/mean_abs_step_conf": 0.7621119022369385, "adv/ratio_final_to_reasoning": 1.2151264019470065, "adv/ratio_step_to_reasoning": 1.511771873731186, "adv/std_final_conf": 0.8267259001731873, "adv/std_reasoning": 0.775274395942688, "adv/std_step_conf": 0.9353328943252563, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7095137943832734, "calib/avg_num_step_conf": 4.76953125, "calib/ece": 0.2916470588235295, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5333333333333333, "calib/gap": 0.3677483607571445, "calib/mean_conf": 0.5851372549019607, "calib/mu_c": 0.7827118644067795, "calib/mu_w": 0.414963503649635, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20701960784313736, "calib/std_conf": 0.4624401188413837, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5051555555555556, "calib/step_q_c_n": 549.0, "calib/step_q_gap": 0.14120972222222228, "calib/step_q_w": 0.3639458333333333, "calib/step_q_w_n": 672.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 456.3671875, "completions/mean_terminated_length": 458.1568908691406, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.176, "grad_norm": 0.05504642054438591, "kl": 0.1468353271484375, "learning_rate": 9.722222222222224e-07, "loss": -0.0093, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03639739751815796, "mask/share_reasoning": 0.8458707332611084, "mask/share_step_conf": 0.11382567137479782, "num_tokens": 36901704.0, "reward": 1.3412384986877441, "reward_std": 0.2222728580236435, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7026550769805908, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8442078232765198, "step": 165 }, { "adv/mean_abs_final_conf": 0.5281888246536255, "adv/mean_abs_reasoning": 0.47246187925338745, "adv/mean_abs_step_conf": 0.7369155287742615, "adv/ratio_final_to_reasoning": 1.1179501412649442, "adv/ratio_step_to_reasoning": 1.5597354223345585, "adv/std_final_conf": 0.7823854684829712, "adv/std_reasoning": 0.7392469048500061, "adv/std_step_conf": 0.9355576634407043, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8042750694619853, "calib/avg_num_step_conf": 4.87890625, "calib/ece": 0.19105882352941178, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5764705882352941, "calib/gap": 0.5152008082849204, "calib/mean_conf": 0.6321960784313726, "calib/mu_c": 0.8483783783783784, "calib/mu_w": 0.33317757009345794, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12143137254901962, "calib/std_conf": 0.44897539212045834, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5243108728943339, "calib/step_q_c_n": 653.0, "calib/step_q_gap": 0.2076749668540654, "calib/step_q_w": 0.3166359060402685, "calib/step_q_w_n": 596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1831.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 464.94140625, "completions/mean_terminated_length": 464.94140625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.17706666666666668, "grad_norm": 0.0417521707713604, "kl": 0.13323974609375, "learning_rate": 9.444444444444445e-07, "loss": 0.0596, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03702709823846817, "mask/share_reasoning": 0.8455966114997864, "mask/share_step_conf": 0.11737628281116486, "num_tokens": 37126913.0, "reward": 1.4266111850738525, "reward_std": 0.20052561163902283, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7997732162475586, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8696931600570679, "step": 166 }, { "adv/mean_abs_final_conf": 0.544503927230835, "adv/mean_abs_reasoning": 0.5453364253044128, "adv/mean_abs_step_conf": 0.758324921131134, "adv/ratio_final_to_reasoning": 0.998473422946004, "adv/ratio_step_to_reasoning": 1.3905634869481323, "adv/std_final_conf": 0.79471755027771, "adv/std_reasoning": 0.7754924893379211, "adv/std_step_conf": 0.9359537363052368, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6856980400026758, "calib/avg_num_step_conf": 4.7578125, "calib/ece": 0.270752, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.684, "calib/gap": 0.32092367382433606, "calib/mean_conf": 0.7328479999999999, "calib/mu_c": 0.859933774834437, "calib/mu_w": 0.5390101010101009, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1998, "calib/std_conf": 0.4121252053636128, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5010471771428572, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.0949255555212356, "calib/step_q_w": 0.4061216216216216, "calib/step_q_w_n": 518.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 435.234375, "completions/mean_terminated_length": 438.6614074707031, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.17813333333333334, "grad_norm": 0.039866652339696884, "kl": 0.1396331787109375, "learning_rate": 9.166666666666666e-07, "loss": -0.0156, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03890000283718109, "mask/share_reasoning": 0.8367317318916321, "mask/share_step_conf": 0.11655577272176743, "num_tokens": 37343941.0, "reward": 1.3247613906860352, "reward_std": 0.2827262878417969, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7072490453720093, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8144960403442383, "step": 167 }, { "adv/mean_abs_final_conf": 0.5947244167327881, "adv/mean_abs_reasoning": 0.5478661060333252, "adv/mean_abs_step_conf": 0.7788569927215576, "adv/ratio_final_to_reasoning": 1.0855287636585655, "adv/ratio_step_to_reasoning": 1.4216192316780807, "adv/std_final_conf": 0.7944765686988831, "adv/std_reasoning": 0.7754152417182922, "adv/std_step_conf": 0.9360412359237671, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7478191911181602, "calib/avg_num_step_conf": 4.9375, "calib/ece": 0.23316205533596834, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6284584980237155, "calib/gap": 0.3987205921226539, "calib/mean_conf": 0.6853359683794465, "calib/mu_c": 0.8382051282051282, "calib/mu_w": 0.43948453608247423, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15094861660079048, "calib/std_conf": 0.43243540696431676, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4778477653631285, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.15568900623904092, "calib/step_q_w": 0.3221587591240876, "calib/step_q_w_n": 548.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2381.0, "completions/max_terminated_length": 2381.0, "completions/mean_length": 465.28515625, "completions/mean_terminated_length": 468.9488220214844, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.1792, "grad_norm": 0.03753136098384857, "kl": 0.140655517578125, "learning_rate": 8.88888888888889e-07, "loss": -0.0185, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037122730165719986, "mask/share_reasoning": 0.8412609696388245, "mask/share_step_conf": 0.11380381882190704, "num_tokens": 37567726.0, "reward": 1.38808012008667, "reward_std": 0.24945136904716492, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7514784932136536, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8525751233100891, "step": 168 }, { "adv/mean_abs_final_conf": 0.6367510557174683, "adv/mean_abs_reasoning": 0.5308976173400879, "adv/mean_abs_step_conf": 0.7567160129547119, "adv/ratio_final_to_reasoning": 1.1993857853567493, "adv/ratio_step_to_reasoning": 1.4253520608098094, "adv/std_final_conf": 0.8464077711105347, "adv/std_reasoning": 0.7754961252212524, "adv/std_step_conf": 0.9359106421470642, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7465494422386084, "calib/avg_num_step_conf": 4.28515625, "calib/ece": 0.24952380952380956, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6031746031746031, "calib/gap": 0.4017167706560787, "calib/mean_conf": 0.6587301587301587, "calib/mu_c": 0.8548062015503877, "calib/mu_w": 0.453089430894309, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19817460317460323, "calib/std_conf": 0.43882867924210656, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5423281690140845, "calib/step_q_c_n": 568.0, "calib/step_q_gap": 0.179225522511249, "calib/step_q_w": 0.3631026465028355, "calib/step_q_w_n": 529.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2718.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 447.34375, "completions/mean_terminated_length": 450.86614990234375, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.18026666666666666, "grad_norm": 0.04010246694087982, "kl": 0.15643310546875, "learning_rate": 8.611111111111112e-07, "loss": -0.0581, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.037801437079906464, "mask/share_reasoning": 0.8434059619903564, "mask/share_step_conf": 0.11098004877567291, "num_tokens": 37786430.0, "reward": 1.3459802865982056, "reward_std": 0.28642362356185913, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7174351215362549, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8392157554626465, "step": 169 }, { "adv/mean_abs_final_conf": 0.6127386093139648, "adv/mean_abs_reasoning": 0.4825683832168579, "adv/mean_abs_step_conf": 0.7408894300460815, "adv/ratio_final_to_reasoning": 1.2697446219525963, "adv/ratio_step_to_reasoning": 1.5353045408968258, "adv/std_final_conf": 0.8355098962783813, "adv/std_reasoning": 0.7394052147865295, "adv/std_step_conf": 0.9359559416770935, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7182885906040268, "calib/avg_num_step_conf": 4.86328125, "calib/ece": 0.23140562248995986, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6626506024096386, "calib/gap": 0.3948080536912752, "calib/mean_conf": 0.7026506024096385, "calib/mu_c": 0.8612080536912752, "calib/mu_w": 0.4664, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16783132530120484, "calib/std_conf": 0.42632006474014184, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5080845481049563, "calib/step_q_c_n": 686.0, "calib/step_q_gap": 0.18208097028742498, "calib/step_q_w": 0.3260035778175313, "calib/step_q_w_n": 559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2739.0, "completions/max_terminated_length": 2739.0, "completions/mean_length": 485.921875, "completions/mean_terminated_length": 487.8274841308594, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.18133333333333335, "grad_norm": 0.04000094532966614, "kl": 0.127960205078125, "learning_rate": 8.333333333333333e-07, "loss": -0.0138, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03573903813958168, "mask/share_reasoning": 0.8482216000556946, "mask/share_step_conf": 0.11213310807943344, "num_tokens": 38014978.0, "reward": 1.3481841087341309, "reward_std": 0.29889148473739624, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7322218418121338, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8269951343536377, "step": 170 }, { "adv/mean_abs_final_conf": 0.675028383731842, "adv/mean_abs_reasoning": 0.5765197277069092, "adv/mean_abs_step_conf": 0.7273029685020447, "adv/ratio_final_to_reasoning": 1.1708677973896022, "adv/ratio_step_to_reasoning": 1.2615404704273894, "adv/std_final_conf": 0.8604573011398315, "adv/std_reasoning": 0.8098583817481995, "adv/std_step_conf": 0.9359089732170105, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6691556395715186, "calib/avg_num_step_conf": 4.59375, "calib/ece": 0.3261739130434782, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5612648221343873, "calib/gap": 0.282, "calib/mean_conf": 0.6241818181818182, "calib/mu_c": 0.778, "calib/mu_w": 0.49600000000000005, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2479051383399209, "calib/std_conf": 0.4454292224793872, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5166305220883535, "calib/step_q_c_n": 498.0, "calib/step_q_gap": 0.16900677577566914, "calib/step_q_w": 0.34762374631268433, "calib/step_q_w_n": 678.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 458.9140625, "completions/mean_terminated_length": 458.9140625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.1824, "grad_norm": 0.04507390782237053, "kl": 0.14056396484375, "learning_rate": 8.055555555555557e-07, "loss": -0.0121, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03748851642012596, "mask/share_reasoning": 0.8515720963478088, "mask/share_step_conf": 0.1109393835067749, "num_tokens": 38239356.0, "reward": 1.3258652687072754, "reward_std": 0.2702135443687439, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6569274663925171, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8532608151435852, "step": 171 }, { "adv/mean_abs_final_conf": 0.6522042751312256, "adv/mean_abs_reasoning": 0.5203927755355835, "adv/mean_abs_step_conf": 0.7619109153747559, "adv/ratio_final_to_reasoning": 1.2532923318544975, "adv/ratio_step_to_reasoning": 1.464107403471549, "adv/std_final_conf": 0.8345689177513123, "adv/std_reasoning": 0.7753735780715942, "adv/std_step_conf": 0.9356265068054199, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7008541600759253, "calib/avg_num_step_conf": 4.453125, "calib/ece": 0.26681102362204734, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6850393700787402, "calib/gap": 0.31271306548560585, "calib/mean_conf": 0.7587007874015748, "calib/mu_c": 0.892896551724138, "calib/mu_w": 0.5801834862385321, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22732283464566938, "calib/std_conf": 0.3834585219171378, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5435008665511265, "calib/step_q_c_n": 577.0, "calib/step_q_gap": 0.1564386995884267, "calib/step_q_w": 0.3870621669626998, "calib/step_q_w_n": 563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 431.22265625, "completions/mean_terminated_length": 432.91375732421875, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.18346666666666667, "grad_norm": 0.0644737184047699, "kl": 0.1463623046875, "learning_rate": 7.777777777777779e-07, "loss": 0.0133, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03934334218502045, "mask/share_reasoning": 0.8426755666732788, "mask/share_step_conf": 0.11407487094402313, "num_tokens": 38453101.0, "reward": 1.3807896375656128, "reward_std": 0.2652018070220947, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7202441692352295, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8648082613945007, "step": 172 }, { "adv/mean_abs_final_conf": 0.6737043857574463, "adv/mean_abs_reasoning": 0.5796314477920532, "adv/mean_abs_step_conf": 0.7727189660072327, "adv/ratio_final_to_reasoning": 1.1622978503387593, "adv/ratio_step_to_reasoning": 1.3331211909752194, "adv/std_final_conf": 0.8583126664161682, "adv/std_reasoning": 0.792940616607666, "adv/std_step_conf": 0.9362257122993469, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6987571762033941, "calib/avg_num_step_conf": 4.921875, "calib/ece": 0.27857142857142864, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6865079365079365, "calib/gap": 0.34207936407797623, "calib/mean_conf": 0.7465873015873016, "calib/mu_c": 0.9108396946564886, "calib/mu_w": 0.5687603305785124, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.25265873015873025, "calib/std_conf": 0.3980610998374869, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5129598755832038, "calib/step_q_c_n": 643.0, "calib/step_q_gap": 0.09447132885170734, "calib/step_q_w": 0.41848854673149644, "calib/step_q_w_n": 617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 484.59765625, "completions/mean_terminated_length": 484.59765625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.18453333333333333, "grad_norm": 0.046825289726257324, "kl": 0.135772705078125, "learning_rate": 7.5e-07, "loss": 0.06, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03792772442102432, "mask/share_reasoning": 0.8422991037368774, "mask/share_step_conf": 0.11977314949035645, "num_tokens": 38680318.0, "reward": 1.3218117952346802, "reward_std": 0.29576268792152405, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7001835703849792, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8221105337142944, "step": 173 }, { "adv/mean_abs_final_conf": 0.7295100092887878, "adv/mean_abs_reasoning": 0.5894637107849121, "adv/mean_abs_step_conf": 0.7436856031417847, "adv/ratio_final_to_reasoning": 1.237582561812659, "adv/ratio_step_to_reasoning": 1.2616308511197667, "adv/std_final_conf": 0.907024621963501, "adv/std_reasoning": 0.8264725208282471, "adv/std_step_conf": 0.9360268115997314, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6277687398323113, "calib/avg_num_step_conf": 4.7890625, "calib/ece": 0.3521739130434783, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5968379446640316, "calib/gap": 0.22327430859717168, "calib/mean_conf": 0.6431620553359684, "calib/mu_c": 0.7587704918032786, "calib/mu_w": 0.5354961832061069, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25656126482213437, "calib/std_conf": 0.4448576196956833, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4367843137254902, "calib/step_q_c_n": 561.0, "calib/step_q_gap": 0.053985316231806024, "calib/step_q_w": 0.3827989974936842, "calib/step_q_w_n": 665.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2366.0, "completions/max_terminated_length": 2366.0, "completions/mean_length": 460.16015625, "completions/mean_terminated_length": 461.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.1856, "grad_norm": 0.04496309533715248, "kl": 0.1407012939453125, "learning_rate": 7.222222222222222e-07, "loss": 0.0457, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035832107067108154, "mask/share_reasoning": 0.8494890332221985, "mask/share_step_conf": 0.11077260971069336, "num_tokens": 38902351.0, "reward": 1.2767741680145264, "reward_std": 0.28168269991874695, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6305328011512756, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8150234222412109, "step": 174 }, { "adv/mean_abs_final_conf": 0.6855437755584717, "adv/mean_abs_reasoning": 0.4921633005142212, "adv/mean_abs_step_conf": 0.7646262049674988, "adv/ratio_final_to_reasoning": 1.3929193315352912, "adv/ratio_step_to_reasoning": 1.5536026440179578, "adv/std_final_conf": 0.8800501823425293, "adv/std_reasoning": 0.7576267719268799, "adv/std_step_conf": 0.9361082315444946, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7554347826086957, "calib/avg_num_step_conf": 5.03125, "calib/ece": 0.2831075697211155, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4860557768924303, "calib/gap": 0.430406070549631, "calib/mean_conf": 0.5675697211155378, "calib/mu_c": 0.8402173913043479, "calib/mu_w": 0.40981132075471693, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24207171314741033, "calib/std_conf": 0.45303499565746624, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5376992665036674, "calib/step_q_c_n": 409.0, "calib/step_q_gap": 0.23311519369365602, "calib/step_q_w": 0.3045840728100114, "calib/step_q_w_n": 879.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2968.0, "completions/max_terminated_length": 2968.0, "completions/mean_length": 467.69921875, "completions/mean_terminated_length": 469.5333557128906, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.18666666666666668, "grad_norm": 0.14519743621349335, "kl": 0.2266845703125, "learning_rate": 6.944444444444446e-07, "loss": -0.0928, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03772320598363876, "mask/share_reasoning": 0.8381674289703369, "mask/share_step_conf": 0.12020306289196014, "num_tokens": 39127906.0, "reward": 1.3145315647125244, "reward_std": 0.2927352488040924, "rewards/accuracy_reward_step": 0.359375, "rewards/final_brier_reward_step": 0.7042621374130249, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8288068175315857, "step": 175 }, { "adv/mean_abs_final_conf": 0.619603157043457, "adv/mean_abs_reasoning": 0.39660730957984924, "adv/mean_abs_step_conf": 0.7484374642372131, "adv/ratio_final_to_reasoning": 1.56225854157817, "adv/ratio_step_to_reasoning": 1.8870995217664532, "adv/std_final_conf": 0.8269156813621521, "adv/std_reasoning": 0.6816341876983643, "adv/std_step_conf": 0.9356382489204407, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7637772675086107, "calib/avg_num_step_conf": 4.60546875, "calib/ece": 0.2599601593625498, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5856573705179283, "calib/gap": 0.4572847301951782, "calib/mean_conf": 0.6374103585657371, "calib/mu_c": 0.8815384615384617, "calib/mu_w": 0.42425373134328354, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21561752988047805, "calib/std_conf": 0.45065166065959406, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.561765784114053, "calib/step_q_c_n": 491.0, "calib/step_q_gap": 0.24466840039312276, "calib/step_q_w": 0.3170973837209302, "calib/step_q_w_n": 688.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2726.0, "completions/max_terminated_length": 2726.0, "completions/mean_length": 465.328125, "completions/mean_terminated_length": 467.1529541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.18773333333333334, "grad_norm": 0.04847950115799904, "kl": 0.132049560546875, "learning_rate": 6.666666666666667e-07, "loss": -0.0013, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0396907776594162, "mask/share_reasoning": 0.8393638134002686, "mask/share_step_conf": 0.11703912168741226, "num_tokens": 39351094.0, "reward": 1.3537293672561646, "reward_std": 0.27692878246307373, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.7317417860031128, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8441084623336792, "step": 176 }, { "adv/mean_abs_final_conf": 0.6589510440826416, "adv/mean_abs_reasoning": 0.3878467082977295, "adv/mean_abs_step_conf": 0.7780240178108215, "adv/ratio_final_to_reasoning": 1.6989986764997875, "adv/ratio_step_to_reasoning": 2.0060090782401936, "adv/std_final_conf": 0.8572041988372803, "adv/std_reasoning": 0.6612958908081055, "adv/std_step_conf": 0.935721755027771, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7402791625124627, "calib/avg_num_step_conf": 4.90625, "calib/ece": 0.25370078740157476, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5354330708661418, "calib/gap": 0.40251246261216345, "calib/mean_conf": 0.5934645669291339, "calib/mu_c": 0.8089830508474576, "calib/mu_w": 0.40647058823529414, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19129921259842517, "calib/std_conf": 0.45309066022408007, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5174404761904763, "calib/step_q_c_n": 504.0, "calib/step_q_gap": 0.21048036980749757, "calib/step_q_w": 0.3069601063829787, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2066.0, "completions/max_terminated_length": 2066.0, "completions/mean_length": 464.4140625, "completions/mean_terminated_length": 464.4140625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.1888, "grad_norm": 0.09088198095560074, "kl": 0.1372833251953125, "learning_rate": 6.388888888888889e-07, "loss": -0.0281, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0373879075050354, "mask/share_reasoning": 0.8458495140075684, "mask/share_step_conf": 0.11676257103681564, "num_tokens": 39573816.0, "reward": 1.3609651327133179, "reward_std": 0.24577787518501282, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7238953113555908, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8537049293518066, "step": 177 }, { "adv/mean_abs_final_conf": 0.6388211250305176, "adv/mean_abs_reasoning": 0.6201913952827454, "adv/mean_abs_step_conf": 0.7540395259857178, "adv/ratio_final_to_reasoning": 1.0300386782039743, "adv/ratio_step_to_reasoning": 1.2158174584830397, "adv/std_final_conf": 0.8600205779075623, "adv/std_reasoning": 0.8266543745994568, "adv/std_step_conf": 0.9360244274139404, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8161231884057971, "calib/avg_num_step_conf": 4.8359375, "calib/ece": 0.180728, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.556, "calib/gap": 0.5187885610766045, "calib/mean_conf": 0.6250319999999999, "calib/mu_c": 0.8574492753623189, "calib/mu_w": 0.33866071428571426, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12688, "calib/std_conf": 0.44586524306790276, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4992134831460674, "calib/step_q_c_n": 623.0, "calib/step_q_gap": 0.19817445875582346, "calib/step_q_w": 0.30103902439024394, "calib/step_q_w_n": 615.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 444.16015625, "completions/mean_terminated_length": 449.4269104003906, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.18986666666666666, "grad_norm": 0.054977044463157654, "kl": 0.1357879638671875, "learning_rate": 6.111111111111112e-07, "loss": -0.0295, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.037683069705963135, "mask/share_reasoning": 0.8318748474121094, "mask/share_step_conf": 0.11872333288192749, "num_tokens": 39793593.0, "reward": 1.3868026733398438, "reward_std": 0.2786443829536438, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7862921953201294, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8420940637588501, "step": 178 }, { "adv/mean_abs_final_conf": 0.6645340919494629, "adv/mean_abs_reasoning": 0.5819056630134583, "adv/mean_abs_step_conf": 0.7427074909210205, "adv/ratio_final_to_reasoning": 1.1419962619165878, "adv/ratio_step_to_reasoning": 1.2763365922146788, "adv/std_final_conf": 0.8761852979660034, "adv/std_reasoning": 0.8265078663825989, "adv/std_step_conf": 0.9360561966896057, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7403410852713178, "calib/avg_num_step_conf": 4.1875, "calib/ece": 0.25804724409448815, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5393700787401575, "calib/gap": 0.4028840930232559, "calib/mean_conf": 0.5851023622047243, "calib/mu_c": 0.7833720930232558, "calib/mu_w": 0.38048799999999994, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16763779527559053, "calib/std_conf": 0.45991964951518216, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5248474903474903, "calib/step_q_c_n": 518.0, "calib/step_q_gap": 0.17961283330777916, "calib/step_q_w": 0.3452346570397112, "calib/step_q_w_n": 554.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 421.703125, "completions/mean_terminated_length": 425.02362060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.19093333333333334, "grad_norm": 0.04853654280304909, "kl": 0.149322509765625, "learning_rate": 5.833333333333334e-07, "loss": -0.0655, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03802802786231041, "mask/share_reasoning": 0.8477430939674377, "mask/share_step_conf": 0.10641638189554214, "num_tokens": 40007813.0, "reward": 1.3584095239639282, "reward_std": 0.2803743779659271, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7246987819671631, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8468413352966309, "step": 179 }, { "adv/mean_abs_final_conf": 0.6793646812438965, "adv/mean_abs_reasoning": 0.5431982278823853, "adv/mean_abs_step_conf": 0.7438191771507263, "adv/ratio_final_to_reasoning": 1.250675437385621, "adv/ratio_step_to_reasoning": 1.3693328493549137, "adv/std_final_conf": 0.8664782643318176, "adv/std_reasoning": 0.7928855419158936, "adv/std_step_conf": 0.9362828135490417, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7301007882155317, "calib/avg_num_step_conf": 4.73828125, "calib/ece": 0.2398446215139441, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.549800796812749, "calib/gap": 0.37958831890425127, "calib/mean_conf": 0.6286095617529881, "calib/mu_c": 0.7934507042253521, "calib/mu_w": 0.41386238532110087, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1513585657370517, "calib/std_conf": 0.4365410753940372, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.46762560777957857, "calib/step_q_c_n": 617.0, "calib/step_q_gap": 0.13475501954428443, "calib/step_q_w": 0.33287058823529414, "calib/step_q_w_n": 595.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 493.0625, "completions/mean_terminated_length": 494.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.192, "grad_norm": 0.04936710745096207, "kl": 0.1364593505859375, "learning_rate": 5.555555555555555e-07, "loss": -0.0353, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035570383071899414, "mask/share_reasoning": 0.8538831472396851, "mask/share_step_conf": 0.10664021968841553, "num_tokens": 40237893.0, "reward": 1.330967903137207, "reward_std": 0.3095991909503937, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7201747894287109, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8185367584228516, "step": 180 }, { "adv/mean_abs_final_conf": 0.6676289439201355, "adv/mean_abs_reasoning": 0.5264783501625061, "adv/mean_abs_step_conf": 0.7492181062698364, "adv/ratio_final_to_reasoning": 1.2681033203246836, "adv/ratio_step_to_reasoning": 1.4230748634555972, "adv/std_final_conf": 0.8744766116142273, "adv/std_reasoning": 0.7928068041801453, "adv/std_step_conf": 0.9359700083732605, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7649448818897637, "calib/avg_num_step_conf": 4.53515625, "calib/ece": 0.23920238095238094, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4444444444444444, "calib/gap": 0.3926706771653545, "calib/mean_conf": 0.5361944444444445, "calib/mu_c": 0.7340880000000001, "calib/mu_w": 0.3414173228346456, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13968253968253969, "calib/std_conf": 0.44734548397477747, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4891968085106383, "calib/step_q_c_n": 564.0, "calib/step_q_gap": 0.17155861755586443, "calib/step_q_w": 0.31763819095477386, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 441.85546875, "completions/mean_terminated_length": 443.5882568359375, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.19306666666666666, "grad_norm": 0.047045283019542694, "kl": 0.1700897216796875, "learning_rate": 5.277777777777779e-07, "loss": 0.051, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.037868089973926544, "mask/share_reasoning": 0.8394981622695923, "mask/share_step_conf": 0.11872752755880356, "num_tokens": 40457272.0, "reward": 1.3582584857940674, "reward_std": 0.2621048092842102, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7291232347488403, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8468218445777893, "step": 181 }, { "adv/mean_abs_final_conf": 0.565575897693634, "adv/mean_abs_reasoning": 0.3998072147369385, "adv/mean_abs_step_conf": 0.7626135945320129, "adv/ratio_final_to_reasoning": 1.4146215396982431, "adv/ratio_step_to_reasoning": 1.907453308549698, "adv/std_final_conf": 0.799468994140625, "adv/std_reasoning": 0.68152916431427, "adv/std_step_conf": 0.9355183243751526, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7179000835851604, "calib/avg_num_step_conf": 4.6953125, "calib/ece": 0.25555511811023623, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.610236220472441, "calib/gap": 0.3693243104224266, "calib/mean_conf": 0.6671220472440945, "calib/mu_c": 0.8168874172185431, "calib/mu_w": 0.4475631067961165, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16409448818897643, "calib/std_conf": 0.4407465803572345, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4907097744360902, "calib/step_q_c_n": 665.0, "calib/step_q_gap": 0.1647898489239859, "calib/step_q_w": 0.3259199255121043, "calib/step_q_w_n": 537.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 456.72265625, "completions/mean_terminated_length": 456.72265625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.19413333333333332, "grad_norm": 0.046485915780067444, "kl": 0.14080810546875, "learning_rate": 5.000000000000001e-07, "loss": 0.0515, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03678284212946892, "mask/share_reasoning": 0.8472731113433838, "mask/share_step_conf": 0.11594408005475998, "num_tokens": 40680353.0, "reward": 1.3809163570404053, "reward_std": 0.21928508579730988, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7317008376121521, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8568628430366516, "step": 182 }, { "adv/mean_abs_final_conf": 0.6731914281845093, "adv/mean_abs_reasoning": 0.6058062314987183, "adv/mean_abs_step_conf": 0.758799135684967, "adv/ratio_final_to_reasoning": 1.1112322607165748, "adv/ratio_step_to_reasoning": 1.2525442892981737, "adv/std_final_conf": 0.8595358729362488, "adv/std_reasoning": 0.8099452257156372, "adv/std_step_conf": 0.9361091256141663, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.637114951164538, "calib/avg_num_step_conf": 4.2109375, "calib/ece": 0.3296047430830039, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5177865612648221, "calib/gap": 0.25068870523415987, "calib/mean_conf": 0.5725296442687747, "calib/mu_c": 0.6924242424242425, "calib/mu_w": 0.4417355371900826, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19019762845849808, "calib/std_conf": 0.46056238618867934, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4695128676470588, "calib/step_q_c_n": 544.0, "calib/step_q_gap": 0.10510462794668429, "calib/step_q_w": 0.3644082397003745, "calib/step_q_w_n": 534.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2079.0, "completions/max_terminated_length": 2079.0, "completions/mean_length": 459.60546875, "completions/mean_terminated_length": 459.60546875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1952, "grad_norm": 0.06715097278356552, "kl": 0.1397247314453125, "learning_rate": 4.7222222222222226e-07, "loss": 0.0486, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037804316729307175, "mask/share_reasoning": 0.859054446220398, "mask/share_step_conf": 0.10314127802848816, "num_tokens": 40904692.0, "reward": 1.300488829612732, "reward_std": 0.2849113345146179, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6523257493972778, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8243258595466614, "step": 183 }, { "adv/mean_abs_final_conf": 0.70212721824646, "adv/mean_abs_reasoning": 0.5660622119903564, "adv/mean_abs_step_conf": 0.7311906218528748, "adv/ratio_final_to_reasoning": 1.240371117120995, "adv/ratio_step_to_reasoning": 1.2917142433548126, "adv/std_final_conf": 0.8754715919494629, "adv/std_reasoning": 0.8265897631645203, "adv/std_step_conf": 0.9362109899520874, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.737949829261886, "calib/avg_num_step_conf": 4.71875, "calib/ece": 0.23835341365461854, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6184738955823293, "calib/gap": 0.4285973207249802, "calib/mean_conf": 0.6523293172690763, "calib/mu_c": 0.8382269503546099, "calib/mu_w": 0.40962962962962973, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16220883534136554, "calib/std_conf": 0.44985606353090074, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5141587561374794, "calib/step_q_c_n": 611.0, "calib/step_q_gap": 0.18701972766176755, "calib/step_q_w": 0.32713902847571186, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2596.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 445.72265625, "completions/mean_terminated_length": 449.2322692871094, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.19626666666666667, "grad_norm": 0.03923555091023445, "kl": 0.149627685546875, "learning_rate": 4.444444444444445e-07, "loss": -0.0426, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03696506842970848, "mask/share_reasoning": 0.8455117344856262, "mask/share_step_conf": 0.1097106784582138, "num_tokens": 41124077.0, "reward": 1.3372976779937744, "reward_std": 0.3383023142814636, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7344995737075806, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8177042007446289, "step": 184 }, { "adv/mean_abs_final_conf": 0.6509227752685547, "adv/mean_abs_reasoning": 0.5465054512023926, "adv/mean_abs_step_conf": 0.7464802265167236, "adv/ratio_final_to_reasoning": 1.1910636460010209, "adv/ratio_step_to_reasoning": 1.3659154265970395, "adv/std_final_conf": 0.8595633506774902, "adv/std_reasoning": 0.7754468321800232, "adv/std_step_conf": 0.9359642267227173, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7657847069478587, "calib/avg_num_step_conf": 4.67578125, "calib/ece": 0.231004016064257, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5502008032128514, "calib/gap": 0.43655065338336146, "calib/mean_conf": 0.6228915662650603, "calib/mu_c": 0.8297709923664123, "calib/mu_w": 0.3932203389830508, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16389558232931722, "calib/std_conf": 0.4465374059457861, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5231329690346084, "calib/step_q_c_n": 549.0, "calib/step_q_gap": 0.233674635701275, "calib/step_q_w": 0.2894583333333334, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 467.9296875, "completions/mean_terminated_length": 473.478271484375, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.19733333333333333, "grad_norm": 0.05877178534865379, "kl": 0.13330078125, "learning_rate": 4.1666666666666667e-07, "loss": 0.0617, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03615127131342888, "mask/share_reasoning": 0.8481602668762207, "mask/share_step_conf": 0.10396970063447952, "num_tokens": 41350787.0, "reward": 1.354670763015747, "reward_std": 0.26534274220466614, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7350628972053528, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.839092493057251, "step": 185 }, { "adv/mean_abs_final_conf": 0.5814087986946106, "adv/mean_abs_reasoning": 0.5024453997612, "adv/mean_abs_step_conf": 0.7515132427215576, "adv/ratio_final_to_reasoning": 1.1571581687700594, "adv/ratio_step_to_reasoning": 1.4957112615196269, "adv/std_final_conf": 0.8138114213943481, "adv/std_reasoning": 0.7575085759162903, "adv/std_step_conf": 0.9355971217155457, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7794045275590551, "calib/avg_num_step_conf": 4.58984375, "calib/ece": 0.22841568627450987, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5176470588235295, "calib/gap": 0.4784271653543307, "calib/mean_conf": 0.5925254901960784, "calib/mu_c": 0.8326771653543307, "calib/mu_w": 0.35424999999999995, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1614509803921569, "calib/std_conf": 0.46045974290967717, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5064904580152672, "calib/step_q_c_n": 524.0, "calib/step_q_gap": 0.18329998182479096, "calib/step_q_w": 0.3231904761904762, "calib/step_q_w_n": 651.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2256.0, "completions/max_terminated_length": 2256.0, "completions/mean_length": 461.1015625, "completions/mean_terminated_length": 461.1015625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.1984, "grad_norm": 0.06877963989973068, "kl": 0.150177001953125, "learning_rate": 3.8888888888888895e-07, "loss": 0.0762, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03738259896636009, "mask/share_reasoning": 0.8527088165283203, "mask/share_step_conf": 0.1099085807800293, "num_tokens": 41573869.0, "reward": 1.4036998748779297, "reward_std": 0.21290750801563263, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7652618885040283, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8718501925468445, "step": 186 }, { "adv/mean_abs_final_conf": 0.700536847114563, "adv/mean_abs_reasoning": 0.603737473487854, "adv/mean_abs_step_conf": 0.7438031435012817, "adv/ratio_final_to_reasoning": 1.1603335520445484, "adv/ratio_step_to_reasoning": 1.2319976416309788, "adv/std_final_conf": 0.8750166893005371, "adv/std_reasoning": 0.8099066019058228, "adv/std_step_conf": 0.9360366463661194, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6577019471756314, "calib/avg_num_step_conf": 5.4609375, "calib/ece": 0.307968, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.2565840241629714, "calib/mean_conf": 0.594352, "calib/mu_c": 0.7308547008547007, "calib/mu_w": 0.47427067669172934, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21716000000000002, "calib/std_conf": 0.43475091270289473, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4549644864847921, "calib/step_q_c_n": 574.0, "calib/step_q_gap": 0.12196934085372407, "calib/step_q_w": 0.332995145631068, "calib/step_q_w_n": 824.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 503.4765625, "completions/mean_terminated_length": 503.4765625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.19946666666666665, "grad_norm": 0.11839123070240021, "kl": 0.1331329345703125, "learning_rate": 3.611111111111111e-07, "loss": 0.0877, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03556034713983536, "mask/share_reasoning": 0.8487687706947327, "mask/share_step_conf": 0.11567091196775436, "num_tokens": 41804303.0, "reward": 1.2941031455993652, "reward_std": 0.2728716731071472, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.6580247282981873, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.821340799331665, "step": 187 }, { "adv/mean_abs_final_conf": 0.7011606693267822, "adv/mean_abs_reasoning": 0.5370633602142334, "adv/mean_abs_step_conf": 0.7369471192359924, "adv/ratio_final_to_reasoning": 1.3055455301346395, "adv/ratio_step_to_reasoning": 1.3721791018140315, "adv/std_final_conf": 0.9064499139785767, "adv/std_reasoning": 0.7926884293556213, "adv/std_step_conf": 0.9360687732696533, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.760989010989011, "calib/avg_num_step_conf": 4.55859375, "calib/ece": 0.24700000000000008, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.56640625, "calib/gap": 0.44895677655677657, "calib/mean_conf": 0.6251093750000001, "calib/mu_c": 0.8530952380952381, "calib/mu_w": 0.40413846153846156, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18996093750000007, "calib/std_conf": 0.4533882034604665, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5160293577981653, "calib/step_q_c_n": 545.0, "calib/step_q_gap": 0.15879784654414603, "calib/step_q_w": 0.35723151125401925, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 437.6328125, "completions/mean_terminated_length": 439.34906005859375, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.20053333333333334, "grad_norm": 0.06301950663328171, "kl": 0.136566162109375, "learning_rate": 3.3333333333333335e-07, "loss": 0.0086, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03884129226207733, "mask/share_reasoning": 0.8422836065292358, "mask/share_step_conf": 0.11496884375810623, "num_tokens": 42020409.0, "reward": 1.3785154819488525, "reward_std": 0.2768915891647339, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7512555122375488, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8536690473556519, "step": 188 }, { "adv/mean_abs_final_conf": 0.6124563217163086, "adv/mean_abs_reasoning": 0.45022422075271606, "adv/mean_abs_step_conf": 0.7598081827163696, "adv/ratio_final_to_reasoning": 1.3603362357812774, "adv/ratio_step_to_reasoning": 1.6876217397768376, "adv/std_final_conf": 0.8141918778419495, "adv/std_reasoning": 0.7205761075019836, "adv/std_step_conf": 0.9360559582710266, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7318624712964688, "calib/avg_num_step_conf": 4.2265625, "calib/ece": 0.24921259842519683, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.44881889763779526, "calib/gap": 0.40485074163718743, "calib/mean_conf": 0.5106299212598425, "calib/mu_c": 0.7194308943089431, "calib/mu_w": 0.3145801526717557, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13779527559055116, "calib/std_conf": 0.4628859284173213, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5091338582677166, "calib/step_q_c_n": 508.0, "calib/step_q_gap": 0.22357636697851796, "calib/step_q_w": 0.28555749128919866, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 424.0234375, "completions/mean_terminated_length": 427.3622131347656, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.2016, "grad_norm": 0.04162989556789398, "kl": 0.1527099609375, "learning_rate": 3.055555555555556e-07, "loss": -0.0255, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03960593044757843, "mask/share_reasoning": 0.8452208042144775, "mask/share_step_conf": 0.10736077278852463, "num_tokens": 42236727.0, "reward": 1.3523257970809937, "reward_std": 0.2419973611831665, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.731751561164856, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8391844034194946, "step": 189 }, { "adv/mean_abs_final_conf": 0.7057432532310486, "adv/mean_abs_reasoning": 0.6029276251792908, "adv/mean_abs_step_conf": 0.7524210214614868, "adv/ratio_final_to_reasoning": 1.1705273133258471, "adv/ratio_step_to_reasoning": 1.2479458396648215, "adv/std_final_conf": 0.8848484754562378, "adv/std_reasoning": 0.8428612947463989, "adv/std_step_conf": 0.9360077381134033, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7822839856305539, "calib/avg_num_step_conf": 4.703125, "calib/ece": 0.19853174603174595, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4801587301587302, "calib/gap": 0.499896010588013, "calib/mean_conf": 0.5331349206349206, "calib/mu_c": 0.7771317829457365, "calib/mu_w": 0.2772357723577235, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1098809523809523, "calib/std_conf": 0.4639368432177326, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49771258503401355, "calib/step_q_c_n": 588.0, "calib/step_q_gap": 0.1850534941249226, "calib/step_q_w": 0.31265909090909094, "calib/step_q_w_n": 616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 486.234375, "completions/mean_terminated_length": 486.234375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.20266666666666666, "grad_norm": 0.06818609684705734, "kl": 0.1410675048828125, "learning_rate": 2.7777777777777776e-07, "loss": 0.033, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034055113792419434, "mask/share_reasoning": 0.8611304759979248, "mask/share_step_conf": 0.10481436550617218, "num_tokens": 42466811.0, "reward": 1.3780144453048706, "reward_std": 0.2664412260055542, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7680996656417847, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8447458744049072, "step": 190 }, { "adv/mean_abs_final_conf": 0.584932267665863, "adv/mean_abs_reasoning": 0.46373453736305237, "adv/mean_abs_step_conf": 0.7547152638435364, "adv/ratio_final_to_reasoning": 1.2613515288121109, "adv/ratio_step_to_reasoning": 1.6274726228826872, "adv/std_final_conf": 0.8164354562759399, "adv/std_reasoning": 0.7392594814300537, "adv/std_step_conf": 0.9352312684059143, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7241357069143446, "calib/avg_num_step_conf": 5.58203125, "calib/ece": 0.2514, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.504, "calib/gap": 0.3930263157894736, "calib/mean_conf": 0.57172, "calib/mu_c": 0.7855263157894736, "calib/mu_w": 0.3925, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18356000000000003, "calib/std_conf": 0.4546654172025843, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49432965517241373, "calib/step_q_c_n": 551.0, "calib/step_q_gap": 0.1917289718011153, "calib/step_q_w": 0.3026006833712984, "calib/step_q_w_n": 878.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 453.77734375, "completions/mean_terminated_length": 457.35040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 87.0, "epoch": 0.20373333333333332, "grad_norm": 0.06090309098362923, "kl": 0.1447601318359375, "learning_rate": 2.5000000000000004e-07, "loss": -0.0163, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.040286559611558914, "mask/share_reasoning": 0.821499228477478, "mask/share_step_conf": 0.13040170073509216, "num_tokens": 42687146.0, "reward": 1.3344109058380127, "reward_std": 0.23764482140541077, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7097808122634888, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8373328447341919, "step": 191 }, { "adv/mean_abs_final_conf": 0.6598813533782959, "adv/mean_abs_reasoning": 0.465440958738327, "adv/mean_abs_step_conf": 0.7431613206863403, "adv/ratio_final_to_reasoning": 1.4177552297224536, "adv/ratio_step_to_reasoning": 1.596682257403455, "adv/std_final_conf": 0.8603542447090149, "adv/std_reasoning": 0.7393536567687988, "adv/std_step_conf": 0.9358422756195068, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8130050505050506, "calib/avg_num_step_conf": 4.421875, "calib/ece": 0.19507936507936507, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4603174603174603, "calib/gap": 0.5058333333333334, "calib/mean_conf": 0.5258730158730158, "calib/mu_c": 0.7908333333333334, "calib/mu_w": 0.285, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12238095238095237, "calib/std_conf": 0.45866398435205064, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5151640316205534, "calib/step_q_c_n": 506.0, "calib/step_q_gap": 0.19083975047039364, "calib/step_q_w": 0.3243242811501597, "calib/step_q_w_n": 626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2099.0, "completions/max_terminated_length": 2099.0, "completions/mean_length": 449.78515625, "completions/mean_terminated_length": 451.5490417480469, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.2048, "grad_norm": 0.04594694823026657, "kl": 0.146728515625, "learning_rate": 2.2222222222222224e-07, "loss": -0.0829, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.038958802819252014, "mask/share_reasoning": 0.8438724875450134, "mask/share_step_conf": 0.11326245963573456, "num_tokens": 42907267.0, "reward": 1.3796112537384033, "reward_std": 0.30651265382766724, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7699777483940125, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8500910997390747, "step": 192 }, { "adv/mean_abs_final_conf": 0.735534131526947, "adv/mean_abs_reasoning": 0.6952435970306396, "adv/mean_abs_step_conf": 0.7746602296829224, "adv/ratio_final_to_reasoning": 1.0579516800562951, "adv/ratio_step_to_reasoning": 1.1142284991785156, "adv/std_final_conf": 0.8841529488563538, "adv/std_reasoning": 0.8748270869255066, "adv/std_step_conf": 0.9360180497169495, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7600585688820982, "calib/avg_num_step_conf": 4.58984375, "calib/ece": 0.23414741035856573, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4063745019920319, "calib/gap": 0.4103379169849759, "calib/mean_conf": 0.48848207171314745, "calib/mu_c": 0.7042773109243698, "calib/mu_w": 0.29393939393939394, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.12426294820717132, "calib/std_conf": 0.45367003941042455, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4586973684210527, "calib/step_q_c_n": 532.0, "calib/step_q_gap": 0.14031006930233836, "calib/step_q_w": 0.31838729911871433, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2159.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 451.5078125, "completions/mean_terminated_length": 455.06298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 66.0, "epoch": 0.20586666666666667, "grad_norm": 0.0538572259247303, "kl": 0.145263671875, "learning_rate": 1.9444444444444447e-07, "loss": -0.0346, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0355381965637207, "mask/share_reasoning": 0.8476927280426025, "mask/share_step_conf": 0.10895660519599915, "num_tokens": 43128565.0, "reward": 1.3470118045806885, "reward_std": 0.28083106875419617, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7346318960189819, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8351645469665527, "step": 193 }, { "adv/mean_abs_final_conf": 0.6366356015205383, "adv/mean_abs_reasoning": 0.5255308747291565, "adv/mean_abs_step_conf": 0.7562828063964844, "adv/ratio_final_to_reasoning": 1.2114142710428613, "adv/ratio_step_to_reasoning": 1.4390834920712332, "adv/std_final_conf": 0.8287248611450195, "adv/std_reasoning": 0.7394238710403442, "adv/std_step_conf": 0.9351856708526611, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7922456575682382, "calib/avg_num_step_conf": 4.4140625, "calib/ece": 0.22039370078740161, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5078740157480315, "calib/gap": 0.5165434243176177, "calib/mean_conf": 0.5574015748031497, "calib/mu_c": 0.821774193548387, "calib/mu_w": 0.30523076923076925, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14480314960629925, "calib/std_conf": 0.4716560080384642, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49373540856031123, "calib/step_q_c_n": 514.0, "calib/step_q_gap": 0.1829614248204739, "calib/step_q_w": 0.31077398373983733, "calib/step_q_w_n": 615.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 412.2265625, "completions/mean_terminated_length": 413.8431701660156, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.20693333333333333, "grad_norm": 0.039248399436473846, "kl": 0.1414642333984375, "learning_rate": 1.6666666666666668e-07, "loss": 0.0308, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04142865911126137, "mask/share_reasoning": 0.8406766653060913, "mask/share_step_conf": 0.11398839950561523, "num_tokens": 43340039.0, "reward": 1.3840529918670654, "reward_std": 0.26664406061172485, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7710093259811401, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8512825965881348, "step": 194 }, { "adv/mean_abs_final_conf": 0.6953327655792236, "adv/mean_abs_reasoning": 0.5130212306976318, "adv/mean_abs_step_conf": 0.7610127925872803, "adv/ratio_final_to_reasoning": 1.3553684018762255, "adv/ratio_step_to_reasoning": 1.4833943452055915, "adv/std_final_conf": 0.8780831098556519, "adv/std_reasoning": 0.7575872540473938, "adv/std_step_conf": 0.9359692931175232, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7829176896239824, "calib/avg_num_step_conf": 4.75, "calib/ece": 0.22101195219123498, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4900398406374502, "calib/gap": 0.49266817418271114, "calib/mean_conf": 0.5397848605577689, "calib/mu_c": 0.7537323943661973, "calib/mu_w": 0.2610642201834862, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09752988047808757, "calib/std_conf": 0.46986335380076294, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5041426307448496, "calib/step_q_c_n": 631.0, "calib/step_q_gap": 0.2346674170696359, "calib/step_q_w": 0.26947521367521365, "calib/step_q_w_n": 585.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 446.8046875, "completions/mean_terminated_length": 452.102783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.208, "grad_norm": 0.06146930903196335, "kl": 0.1854705810546875, "learning_rate": 1.3888888888888888e-07, "loss": -0.0375, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.037648532539606094, "mask/share_reasoning": 0.8388372659683228, "mask/share_step_conf": 0.11179547011852264, "num_tokens": 43560405.0, "reward": 1.3826078176498413, "reward_std": 0.271342396736145, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.759816586971283, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8491839170455933, "step": 195 }, { "adv/mean_abs_final_conf": 0.6164750456809998, "adv/mean_abs_reasoning": 0.39999616146087646, "adv/mean_abs_step_conf": 0.7732023000717163, "adv/ratio_final_to_reasoning": 1.541202404116813, "adv/ratio_step_to_reasoning": 1.933024300152798, "adv/std_final_conf": 0.841810405254364, "adv/std_reasoning": 0.6613016724586487, "adv/std_step_conf": 0.9360251426696777, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7458754001477469, "calib/avg_num_step_conf": 4.1953125, "calib/ece": 0.2534117647058824, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6078431372549019, "calib/gap": 0.3940137897069688, "calib/mean_conf": 0.680156862745098, "calib/mu_c": 0.8717557251908398, "calib/mu_w": 0.47774193548387095, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20992156862745098, "calib/std_conf": 0.43007520421018475, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5337148014440433, "calib/step_q_c_n": 554.0, "calib/step_q_gap": 0.12296095529019707, "calib/step_q_w": 0.4107538461538462, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2019.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 392.58984375, "completions/mean_terminated_length": 392.58984375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.20906666666666668, "grad_norm": 0.05715157836675644, "kl": 0.150390625, "learning_rate": 1.1111111111111112e-07, "loss": 0.035, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.042884960770606995, "mask/share_reasoning": 0.8413792848587036, "mask/share_step_conf": 0.115735724568367, "num_tokens": 43763452.0, "reward": 1.3373997211456299, "reward_std": 0.24315345287322998, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7315140962600708, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8208614587783813, "step": 196 }, { "adv/mean_abs_final_conf": 0.6945164203643799, "adv/mean_abs_reasoning": 0.5376187562942505, "adv/mean_abs_step_conf": 0.7531794905662537, "adv/ratio_final_to_reasoning": 1.2918381515399657, "adv/ratio_step_to_reasoning": 1.400954638855684, "adv/std_final_conf": 0.8717400431632996, "adv/std_reasoning": 0.7928566932678223, "adv/std_step_conf": 0.9359936714172363, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7045021590043179, "calib/avg_num_step_conf": 4.9140625, "calib/ece": 0.2702788844621514, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5059760956175299, "calib/gap": 0.36873571247142484, "calib/mean_conf": 0.5565737051792828, "calib/mu_c": 0.7431451612903225, "calib/mu_w": 0.37440944881889765, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1664143426294821, "calib/std_conf": 0.4634852094600902, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48389640410958906, "calib/step_q_c_n": 584.0, "calib/step_q_gap": 0.16362489075647335, "calib/step_q_w": 0.3202715133531157, "calib/step_q_w_n": 674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2870.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 480.3203125, "completions/mean_terminated_length": 482.2039489746094, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.21013333333333334, "grad_norm": 0.05007997155189514, "kl": 0.13787841796875, "learning_rate": 8.333333333333334e-08, "loss": -0.0201, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03720054030418396, "mask/share_reasoning": 0.8442490100860596, "mask/share_step_conf": 0.11464422941207886, "num_tokens": 43991470.0, "reward": 1.343414545059204, "reward_std": 0.2829974293708801, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7016687393188477, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8460958003997803, "step": 197 }, { "adv/mean_abs_final_conf": 0.6337748765945435, "adv/mean_abs_reasoning": 0.5334087014198303, "adv/mean_abs_step_conf": 0.7811760902404785, "adv/ratio_final_to_reasoning": 1.188159988593283, "adv/ratio_step_to_reasoning": 1.4644982133983557, "adv/std_final_conf": 0.8478825688362122, "adv/std_reasoning": 0.7753887176513672, "adv/std_step_conf": 0.9359259605407715, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8029621297337832, "calib/avg_num_step_conf": 4.62890625, "calib/ece": 0.18976284584980235, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4505928853754941, "calib/gap": 0.5294956880389953, "calib/mean_conf": 0.5101581027667984, "calib/mu_c": 0.7759523809523811, "calib/mu_w": 0.2464566929133858, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10094861660079052, "calib/std_conf": 0.46679291680297397, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5045838926174496, "calib/step_q_c_n": 596.0, "calib/step_q_gap": 0.17904772962933418, "calib/step_q_w": 0.32553616298811544, "calib/step_q_w_n": 589.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 407.10546875, "completions/mean_terminated_length": 408.7019958496094, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.2112, "grad_norm": 0.046563468873500824, "kl": 0.1541595458984375, "learning_rate": 5.555555555555556e-08, "loss": -0.0071, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.041801366955041885, "mask/share_reasoning": 0.8286471962928772, "mask/share_step_conf": 0.12564517557621002, "num_tokens": 44201073.0, "reward": 1.4073433876037598, "reward_std": 0.24024641513824463, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7873682975769043, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8656122088432312, "step": 198 }, { "adv/mean_abs_final_conf": 0.6945219039916992, "adv/mean_abs_reasoning": 0.6268808841705322, "adv/mean_abs_step_conf": 0.7525413036346436, "adv/ratio_final_to_reasoning": 1.1079009131227016, "adv/ratio_step_to_reasoning": 1.2004534236681679, "adv/std_final_conf": 0.8746665716171265, "adv/std_reasoning": 0.8428965210914612, "adv/std_step_conf": 0.9360443353652954, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6880782695673918, "calib/avg_num_step_conf": 4.6953125, "calib/ece": 0.3001581027667984, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5533596837944664, "calib/gap": 0.3234758689672418, "calib/mean_conf": 0.6152569169960476, "calib/mu_c": 0.7737984496124031, "calib/mu_w": 0.4503225806451613, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2027667984189723, "calib/std_conf": 0.4535808703312079, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5238173913043479, "calib/step_q_c_n": 575.0, "calib/step_q_gap": 0.1828824630746828, "calib/step_q_w": 0.3409349282296651, "calib/step_q_w_n": 627.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2129.0, "completions/max_terminated_length": 2129.0, "completions/mean_length": 465.59375, "completions/mean_terminated_length": 467.4196472167969, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.21226666666666666, "grad_norm": 0.07331021130084991, "kl": 0.14813232421875, "learning_rate": 2.777777777777778e-08, "loss": 0.0272, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03809141740202904, "mask/share_reasoning": 0.840973973274231, "mask/share_step_conf": 0.1170283704996109, "num_tokens": 44424465.0, "reward": 1.325409173965454, "reward_std": 0.28530770540237427, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6828891038894653, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8351364731788635, "step": 199 }, { "adv/mean_abs_final_conf": 0.5704379081726074, "adv/mean_abs_reasoning": 0.48373907804489136, "adv/mean_abs_step_conf": 0.7290881872177124, "adv/ratio_final_to_reasoning": 1.1792264343788872, "adv/ratio_step_to_reasoning": 1.5071930722744968, "adv/std_final_conf": 0.820499062538147, "adv/std_reasoning": 0.7574408650398254, "adv/std_step_conf": 0.9358140826225281, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8717167872097449, "calib/avg_num_step_conf": 3.953125, "calib/ece": 0.1338339920948617, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.549407114624506, "calib/gap": 0.647292221799264, "calib/mean_conf": 0.5969960474308301, "calib/mu_c": 0.8809859154929577, "calib/mu_w": 0.2336936936936937, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08478260869565221, "calib/std_conf": 0.4614529253953416, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.586984375, "calib/step_q_c_n": 576.0, "calib/step_q_gap": 0.27550180619266057, "calib/step_q_w": 0.31148256880733943, "calib/step_q_w_n": 436.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2219.0, "completions/max_terminated_length": 2219.0, "completions/mean_length": 456.49609375, "completions/mean_terminated_length": 456.49609375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.21333333333333335, "grad_norm": 0.05067267641425133, "kl": 0.1455535888671875, "learning_rate": 0.0, "loss": 0.0631, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.038834765553474426, "mask/share_reasoning": 0.8590643405914307, "mask/share_step_conf": 0.1021009087562561, "num_tokens": 44649376.0, "reward": 1.457303524017334, "reward_std": 0.21622072160243988, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.8482663631439209, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8788734078407288, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 0.050295659240800886, "train_runtime": 13081.6192, "train_samples_per_second": 3.914, "train_steps_per_second": 0.015 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 44649376, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }