{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7498364448547363, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5715035496705603, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9352971315383911, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04300324618816376, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.0136, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 1.0788748264312744, "reward_std": 0.22853493690490723, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420004606246948, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.770571768283844, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.509578923891962, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9354329705238342, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.040453653782606125, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0158, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 1.016056776046753, "reward_std": 0.2184845209121704, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291916012763977, "step": 2 }, { "adv/mean_abs_final_conf": 0.7699410915374756, "adv/mean_abs_reasoning": 0.4286423921585083, "adv/mean_abs_step_conf": 0.7708143591880798, "adv/ratio_final_to_reasoning": 1.7962317904682603, "adv/ratio_step_to_reasoning": 1.7982690776488557, "adv/std_final_conf": 0.9275014996528625, "adv/std_reasoning": 0.7013915777206421, "adv/std_step_conf": 0.9344233870506287, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.430653197733943, "calib/avg_num_step_conf": 4.953125, "calib/ece": 0.24111111111111116, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.28174603174603174, "calib/gap": -0.010320114667940916, "calib/mean_conf": 0.88, "calib/mu_c": 0.8762732919254658, "calib/mu_w": 0.8865934065934067, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24111111111111116, "calib/std_conf": 0.042323395908998626, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7988338192419825, "calib/step_q_c_n": 686.0, "calib/step_q_gap": 0.053954094156071886, "calib/step_q_w": 0.7448797250859106, "calib/step_q_w_n": 582.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2755.0, "completions/max_terminated_length": 2755.0, "completions/mean_length": 503.89453125, "completions/mean_terminated_length": 505.87060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.07180804014205933, "kl": 0.0010092556476593018, "learning_rate": 7.5e-07, "loss": 0.0958, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033395763486623764, "mask/share_reasoning": 0.8536885976791382, "mask/share_step_conf": 0.10900937020778656, "num_tokens": 692914.0, "reward": 1.0585122108459473, "reward_std": 0.20415569841861725, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6930652260780334, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7347228527069092, "step": 3 }, { "adv/mean_abs_final_conf": 0.7719908952713013, "adv/mean_abs_reasoning": 0.37903717160224915, "adv/mean_abs_step_conf": 0.7598022818565369, "adv/ratio_final_to_reasoning": 2.0367155337508867, "adv/ratio_step_to_reasoning": 2.0045587577723163, "adv/std_final_conf": 0.9269395470619202, "adv/std_reasoning": 0.6612975001335144, "adv/std_step_conf": 0.9352869391441345, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5061005032053492, "calib/avg_num_step_conf": 5.109375, "calib/ece": 0.23214285714285715, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.28174603174603174, "calib/gap": 0.00448473150892692, "calib/mean_conf": 0.878968253968254, "calib/mu_c": 0.880552147239264, "calib/mu_w": 0.8760674157303371, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.23214285714285715, "calib/std_conf": 0.0459266040595583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7920047449584816, "calib/step_q_c_n": 843.0, "calib/step_q_gap": 0.003940228829449333, "calib/step_q_w": 0.7880645161290323, "calib/step_q_w_n": 465.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 515.4375, "completions/mean_terminated_length": 515.4375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.004266666666666667, "grad_norm": 0.041159238666296005, "kl": 0.0002847015857696533, "learning_rate": 1.0000000000000002e-06, "loss": 0.0688, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03243076801300049, "mask/share_reasoning": 0.8518642783164978, "mask/share_step_conf": 0.11570495367050171, "num_tokens": 931034.0, "reward": 1.0581204891204834, "reward_std": 0.20035883784294128, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7063945531845093, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7237518429756165, "step": 4 }, { "adv/mean_abs_final_conf": 0.7795875072479248, "adv/mean_abs_reasoning": 0.41760870814323425, "adv/mean_abs_step_conf": 0.7794197201728821, "adv/ratio_final_to_reasoning": 1.866789394105586, "adv/ratio_step_to_reasoning": 1.8663876135110464, "adv/std_final_conf": 0.9297929406166077, "adv/std_reasoning": 0.6816225647926331, "adv/std_step_conf": 0.9355724453926086, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.43911059700533384, "calib/avg_num_step_conf": 4.703125, "calib/ece": 0.3465999999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.308, "calib/gap": -0.00953987532934919, "calib/mean_conf": 0.8786, "calib/mu_c": 0.8741353383458645, "calib/mu_w": 0.8836752136752137, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3465999999999999, "calib/std_conf": 0.04492705198430006, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7988571428571428, "calib/step_q_c_n": 630.0, "calib/step_q_gap": 0.010703832752613107, "calib/step_q_w": 0.7881533101045297, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2643.0, "completions/max_terminated_length": 2643.0, "completions/mean_length": 528.9921875, "completions/mean_terminated_length": 528.9921875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.005333333333333333, "grad_norm": 0.043477851897478104, "kl": 0.00028324127197265625, "learning_rate": 1.25e-06, "loss": 0.0765, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03482171893119812, "mask/share_reasoning": 0.8547407388687134, "mask/share_step_conf": 0.1104375571012497, "num_tokens": 1173144.0, "reward": 0.9544405341148376, "reward_std": 0.1946556568145752, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6074038743972778, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6697347164154053, "step": 5 }, { "adv/mean_abs_final_conf": 0.7901644706726074, "adv/mean_abs_reasoning": 0.4342483878135681, "adv/mean_abs_step_conf": 0.7639518976211548, "adv/ratio_final_to_reasoning": 1.8196140569480743, "adv/ratio_step_to_reasoning": 1.7592509703205512, "adv/std_final_conf": 0.9303911328315735, "adv/std_reasoning": 0.6817383766174316, "adv/std_step_conf": 0.9353141784667969, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.525006314725941, "calib/avg_num_step_conf": 5.03125, "calib/ece": 0.30039215686274506, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3176470588235294, "calib/gap": 0.004571861581207259, "calib/mean_conf": 0.8807843137254903, "calib/mu_c": 0.8827027027027026, "calib/mu_w": 0.8781308411214953, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.30039215686274506, "calib/std_conf": 0.04316149051810785, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.793986577181208, "calib/step_q_c_n": 745.0, "calib/step_q_gap": -0.0015567009035065693, "calib/step_q_w": 0.7955432780847146, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 445.44921875, "completions/mean_terminated_length": 445.44921875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.0064, "grad_norm": 0.045135434716939926, "kl": 0.0006567239761352539, "learning_rate": 1.5e-06, "loss": -0.004, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.036960337311029434, "mask/share_reasoning": 0.8375744819641113, "mask/share_step_conf": 0.12546522915363312, "num_tokens": 1393131.0, "reward": 1.0118639469146729, "reward_std": 0.21928110718727112, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6584070324897766, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7018805742263794, "step": 6 }, { "adv/mean_abs_final_conf": 0.7620992660522461, "adv/mean_abs_reasoning": 0.5001809597015381, "adv/mean_abs_step_conf": 0.7641808390617371, "adv/ratio_final_to_reasoning": 1.523647094657495, "adv/ratio_step_to_reasoning": 1.52780873449827, "adv/std_final_conf": 0.9296735525131226, "adv/std_reasoning": 0.7575913071632385, "adv/std_step_conf": 0.9349011778831482, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4986007781038837, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2416666666666666, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.32142857142857145, "calib/gap": 0.004653607262303239, "calib/mean_conf": 0.8805555555555555, "calib/mu_c": 0.8822360248447206, "calib/mu_w": 0.8775824175824174, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2416666666666666, "calib/std_conf": 0.05412808964099344, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7983625730994154, "calib/step_q_c_n": 855.0, "calib/step_q_gap": 0.01571794500024193, "calib/step_q_w": 0.7826446280991735, "calib/step_q_w_n": 484.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2218.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 537.640625, "completions/mean_terminated_length": 539.7490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.007466666666666667, "grad_norm": 0.07143445312976837, "kl": 0.014304488897323608, "learning_rate": 1.75e-06, "loss": 0.0537, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030748117715120316, "mask/share_reasoning": 0.858625054359436, "mask/share_step_conf": 0.10672055184841156, "num_tokens": 1638191.0, "reward": 1.0623219013214111, "reward_std": 0.2287452518939972, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.695104718208313, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.738442599773407, "step": 7 }, { "adv/mean_abs_final_conf": 0.7889758944511414, "adv/mean_abs_reasoning": 0.3992607593536377, "adv/mean_abs_step_conf": 0.7824378609657288, "adv/ratio_final_to_reasoning": 1.976091754492509, "adv/ratio_step_to_reasoning": 1.959716407473691, "adv/std_final_conf": 0.9319833517074585, "adv/std_reasoning": 0.661307692527771, "adv/std_step_conf": 0.9356229305267334, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.46211683053788316, "calib/avg_num_step_conf": 4.51171875, "calib/ece": 0.34732000000000013, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.32, "calib/gap": -0.002402801876486005, "calib/mean_conf": 0.87932, "calib/mu_c": 0.8781954887218046, "calib/mu_w": 0.8805982905982906, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.34732000000000013, "calib/std_conf": 0.05104446688917419, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7933065810593901, "calib/step_q_c_n": 623.0, "calib/step_q_gap": 0.01719755850299909, "calib/step_q_w": 0.776109022556391, "calib/step_q_w_n": 532.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 516.56640625, "completions/mean_terminated_length": 516.56640625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.008533333333333334, "grad_norm": 0.03993474319577217, "kl": 0.0004799962043762207, "learning_rate": 2.0000000000000003e-06, "loss": -0.0164, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03236281871795654, "mask/share_reasoning": 0.8639826774597168, "mask/share_step_conf": 0.10365445911884308, "num_tokens": 1876944.0, "reward": 0.9967085123062134, "reward_std": 0.19894912838935852, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.611905038356781, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7215287685394287, "step": 8 }, { "adv/mean_abs_final_conf": 0.7554298639297485, "adv/mean_abs_reasoning": 0.4521360397338867, "adv/mean_abs_step_conf": 0.7652294635772705, "adv/ratio_final_to_reasoning": 1.6708021425904716, "adv/ratio_step_to_reasoning": 1.6924761494962022, "adv/std_final_conf": 0.9297342896461487, "adv/std_reasoning": 0.7205620408058167, "adv/std_step_conf": 0.9354267716407776, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.41982228298017776, "calib/avg_num_step_conf": 5.03515625, "calib/ece": 0.25763052208835335, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.27309236947791166, "calib/gap": -0.00714969241285035, "calib/mean_conf": 0.8761044176706826, "calib/mu_c": 0.8733766233766234, "calib/mu_w": 0.8805263157894737, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.25763052208835335, "calib/std_conf": 0.05478403327402361, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7702870090634442, "calib/step_q_c_n": 662.0, "calib/step_q_gap": 0.07661874750044573, "calib/step_q_w": 0.6936682615629984, "calib/step_q_w_n": 627.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2913.0, "completions/max_terminated_length": 2913.0, "completions/mean_length": 510.15234375, "completions/mean_terminated_length": 514.1693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.0096, "grad_norm": 0.04263272136449814, "kl": 0.0003707706928253174, "learning_rate": 2.25e-06, "loss": -0.034, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033698923885822296, "mask/share_reasoning": 0.8547195196151733, "mask/share_step_conf": 0.10376904904842377, "num_tokens": 2115079.0, "reward": 1.0097324848175049, "reward_std": 0.24089282751083374, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6642941236495972, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6951137781143188, "step": 9 }, { "adv/mean_abs_final_conf": 0.7777752876281738, "adv/mean_abs_reasoning": 0.4451920986175537, "adv/mean_abs_step_conf": 0.7588284015655518, "adv/ratio_final_to_reasoning": 1.7470554622226768, "adv/ratio_step_to_reasoning": 1.7044965620951646, "adv/std_final_conf": 0.9298601150512695, "adv/std_reasoning": 0.7014667391777039, "adv/std_step_conf": 0.9350449442863464, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5539309378185525, "calib/avg_num_step_conf": 5.12890625, "calib/ece": 0.3145849802371541, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.391304347826087, "calib/gap": 0.01739678899082564, "calib/mean_conf": 0.8837549407114624, "calib/mu_c": 0.89125, "calib/mu_w": 0.8738532110091743, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3145849802371541, "calib/std_conf": 0.06500023734599751, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7907023188405797, "calib/step_q_c_n": 690.0, "calib/step_q_gap": 0.012018530718589382, "calib/step_q_w": 0.7786837881219904, "calib/step_q_w_n": 623.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2728.0, "completions/max_terminated_length": 2728.0, "completions/mean_length": 524.91796875, "completions/mean_terminated_length": 524.91796875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.010666666666666666, "grad_norm": 0.04463927820324898, "kl": 0.0003826320171356201, "learning_rate": 2.5e-06, "loss": 0.0955, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03257928788661957, "mask/share_reasoning": 0.8568609952926636, "mask/share_step_conf": 0.11055973172187805, "num_tokens": 2356258.0, "reward": 1.0168341398239136, "reward_std": 0.21721628308296204, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6515105366706848, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7151883840560913, "step": 10 }, { "adv/mean_abs_final_conf": 0.7630910277366638, "adv/mean_abs_reasoning": 0.3175758421421051, "adv/mean_abs_step_conf": 0.7779554128646851, "adv/ratio_final_to_reasoning": 2.402862329166728, "adv/ratio_step_to_reasoning": 2.449668109567902, "adv/std_final_conf": 0.9286644458770752, "adv/std_reasoning": 0.5960689783096313, "adv/std_step_conf": 0.9352787733078003, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.43782581055308323, "calib/avg_num_step_conf": 5.49609375, "calib/ece": 0.3308300395256918, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.38735177865612647, "calib/gap": -0.013174825174825266, "calib/mean_conf": 0.8860079051383399, "calib/mu_c": 0.8802797202797202, "calib/mu_w": 0.8934545454545455, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.325810276679842, "calib/std_conf": 0.058196477359325455, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.769406674907293, "calib/step_q_c_n": 809.0, "calib/step_q_gap": -0.02049299064454646, "calib/step_q_w": 0.7898996655518394, "calib/step_q_w_n": 598.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2209.0, "completions/max_terminated_length": 2209.0, "completions/mean_length": 515.4296875, "completions/mean_terminated_length": 515.4296875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.011733333333333333, "grad_norm": 0.0435059629380703, "kl": 0.0006912946701049805, "learning_rate": 2.7500000000000004e-06, "loss": 0.0438, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03301956132054329, "mask/share_reasoning": 0.8474563360214233, "mask/share_step_conf": 0.11952407658100128, "num_tokens": 2592688.0, "reward": 1.001125454902649, "reward_std": 0.1699179857969284, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6315202713012695, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7086119651794434, "step": 11 }, { "adv/mean_abs_final_conf": 0.757521390914917, "adv/mean_abs_reasoning": 0.48119303584098816, "adv/mean_abs_step_conf": 0.7516059279441833, "adv/ratio_final_to_reasoning": 1.5742567628623005, "adv/ratio_step_to_reasoning": 1.561963436629108, "adv/std_final_conf": 0.928837239742279, "adv/std_reasoning": 0.7393051385879517, "adv/std_step_conf": 0.9350537657737732, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4660745384883316, "calib/avg_num_step_conf": 5.46484375, "calib/ece": 0.2371428571428571, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.39285714285714285, "calib/gap": -0.005341692789968655, "calib/mean_conf": 0.8885714285714287, "calib/mu_c": 0.8867272727272727, "calib/mu_w": 0.8920689655172414, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.23547619047619042, "calib/std_conf": 0.05101687062566043, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7803575685339691, "calib/step_q_c_n": 839.0, "calib/step_q_gap": 0.029553997105397634, "calib/step_q_w": 0.7508035714285715, "calib/step_q_w_n": 560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 468.87890625, "completions/mean_terminated_length": 472.57086181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.0128, "grad_norm": 0.0435611829161644, "kl": 0.001415252685546875, "learning_rate": 3e-06, "loss": -0.0022, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03637603670358658, "mask/share_reasoning": 0.828792929649353, "mask/share_step_conf": 0.1270185112953186, "num_tokens": 2816897.0, "reward": 1.088067889213562, "reward_std": 0.21567848324775696, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6941168308258057, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7718667387962341, "step": 12 }, { "adv/mean_abs_final_conf": 0.7475674152374268, "adv/mean_abs_reasoning": 0.48703324794769287, "adv/mean_abs_step_conf": 0.7698397040367126, "adv/ratio_final_to_reasoning": 1.5349412353008702, "adv/ratio_step_to_reasoning": 1.5806717657998435, "adv/std_final_conf": 0.9303745031356812, "adv/std_reasoning": 0.7574519515037537, "adv/std_step_conf": 0.9354352355003357, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5476190476190477, "calib/avg_num_step_conf": 4.99609375, "calib/ece": 0.27944881889763784, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.421259842519685, "calib/gap": 0.01138016745159598, "calib/mean_conf": 0.8936220472440944, "calib/mu_c": 0.8980128205128205, "calib/mu_w": 0.8866326530612245, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.27944881889763784, "calib/std_conf": 0.05016773897241201, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7593807641633729, "calib/step_q_c_n": 759.0, "calib/step_q_gap": 0.025323071855680612, "calib/step_q_w": 0.7340576923076922, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 468.07421875, "completions/mean_terminated_length": 469.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.013866666666666666, "grad_norm": 0.038371726870536804, "kl": 0.0019371509552001953, "learning_rate": 3.2500000000000002e-06, "loss": -0.0169, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03500467911362648, "mask/share_reasoning": 0.8416920304298401, "mask/share_step_conf": 0.11939701437950134, "num_tokens": 3041316.0, "reward": 1.0731277465820312, "reward_std": 0.2134917676448822, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6806222200393677, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7640678882598877, "step": 13 }, { "adv/mean_abs_final_conf": 0.7789779901504517, "adv/mean_abs_reasoning": 0.5143574476242065, "adv/mean_abs_step_conf": 0.7580201625823975, "adv/ratio_final_to_reasoning": 1.514468185011251, "adv/ratio_step_to_reasoning": 1.4737225368926956, "adv/std_final_conf": 0.9270016551017761, "adv/std_reasoning": 0.7575810551643372, "adv/std_step_conf": 0.9352178573608398, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4321884775808134, "calib/avg_num_step_conf": 5.29296875, "calib/ece": 0.3639357429718876, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6506024096385542, "calib/gap": -0.004977189781021796, "calib/mean_conf": 0.9141365461847389, "calib/mu_c": 0.9118978102189782, "calib/mu_w": 0.916875, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3639357429718876, "calib/std_conf": 0.04368743294022322, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7413020134228188, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.031105292111343275, "calib/step_q_w": 0.7101967213114755, "calib/step_q_w_n": 610.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 525.13671875, "completions/mean_terminated_length": 531.3636474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.014933333333333333, "grad_norm": 0.03572245314717293, "kl": 0.003782510757446289, "learning_rate": 3.5e-06, "loss": 0.0123, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03229822218418121, "mask/share_reasoning": 0.8388671875, "mask/share_step_conf": 0.11711588501930237, "num_tokens": 3281151.0, "reward": 1.0118708610534668, "reward_std": 0.22134985029697418, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5988633036613464, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.748877227306366, "step": 14 }, { "adv/mean_abs_final_conf": 0.7903380393981934, "adv/mean_abs_reasoning": 0.4590962529182434, "adv/mean_abs_step_conf": 0.7628530859947205, "adv/ratio_final_to_reasoning": 1.72150836425785, "adv/ratio_step_to_reasoning": 1.661640845782661, "adv/std_final_conf": 0.9195718765258789, "adv/std_reasoning": 0.701473593711853, "adv/std_step_conf": 0.935431182384491, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.47, "calib/avg_num_step_conf": 4.86328125, "calib/ece": 0.309763779527559, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6968503937007874, "calib/gap": 0.001454545454545153, "calib/mean_conf": 0.9148818897637795, "calib/mu_c": 0.9154545454545452, "calib/mu_w": 0.914, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30917322834645666, "calib/std_conf": 0.05821310486077129, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6812092391304347, "calib/step_q_c_n": 736.0, "calib/step_q_gap": -0.02122691018194245, "calib/step_q_w": 0.7024361493123772, "calib/step_q_w_n": 509.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 454.70703125, "completions/mean_terminated_length": 454.70703125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.016, "grad_norm": 0.03302004188299179, "kl": 0.007363319396972656, "learning_rate": 3.7500000000000005e-06, "loss": 0.0141, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.035681482404470444, "mask/share_reasoning": 0.848019003868103, "mask/share_step_conf": 0.11629950255155563, "num_tokens": 3505436.0, "reward": 1.0653555393218994, "reward_std": 0.22157421708106995, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6581991910934448, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7691745758056641, "step": 15 }, { "adv/mean_abs_final_conf": 0.7543657422065735, "adv/mean_abs_reasoning": 0.43760305643081665, "adv/mean_abs_step_conf": 0.7942132949829102, "adv/ratio_final_to_reasoning": 1.7238584857229757, "adv/ratio_step_to_reasoning": 1.8149171567965778, "adv/std_final_conf": 0.9253305792808533, "adv/std_reasoning": 0.72050940990448, "adv/std_step_conf": 0.9354434013366699, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.48640000000000005, "calib/avg_num_step_conf": 6.37890625, "calib/ece": 0.3292400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.796, "calib/gap": 0.007899999999999796, "calib/mean_conf": 0.92924, "calib/mu_c": 0.9324000000000001, "calib/mu_w": 0.9245000000000003, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3292400000000001, "calib/std_conf": 0.04746179937591914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6588923719958202, "calib/step_q_c_n": 957.0, "calib/step_q_gap": 0.04671781578280232, "calib/step_q_w": 0.6121745562130179, "calib/step_q_w_n": 676.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 636.96875, "completions/mean_terminated_length": 641.9842529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.017066666666666667, "grad_norm": 0.03509129211306572, "kl": 0.008379936218261719, "learning_rate": 4.000000000000001e-06, "loss": 0.0631, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.026487503200769424, "mask/share_reasoning": 0.858582615852356, "mask/share_step_conf": 0.10711735486984253, "num_tokens": 3777348.0, "reward": 1.0628373622894287, "reward_std": 0.21678559482097626, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6378324031829834, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7835614681243896, "step": 16 }, { "adv/mean_abs_final_conf": 0.7382602691650391, "adv/mean_abs_reasoning": 0.39500361680984497, "adv/mean_abs_step_conf": 0.7596269249916077, "adv/ratio_final_to_reasoning": 1.8689962262306021, "adv/ratio_step_to_reasoning": 1.9230885304963996, "adv/std_final_conf": 0.9201560616493225, "adv/std_reasoning": 0.6816303730010986, "adv/std_step_conf": 0.9354602098464966, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5855099956223553, "calib/avg_num_step_conf": 5.71875, "calib/ece": 0.24278431372549014, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7686274509803922, "calib/gap": 0.011470888661899714, "calib/mean_conf": 0.9270980392156863, "calib/mu_c": 0.9305617977528089, "calib/mu_w": 0.9190909090909092, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.23592156862745092, "calib/std_conf": 0.05528012616751612, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6086591123701605, "calib/step_q_c_n": 1059.0, "calib/step_q_gap": -0.0021803938026789815, "calib/step_q_w": 0.6108395061728394, "calib/step_q_w_n": 405.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1437.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 492.609375, "completions/mean_terminated_length": 494.54119873046875, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.018133333333333335, "grad_norm": 0.048337530344724655, "kl": 0.013302803039550781, "learning_rate": 4.25e-06, "loss": 0.0273, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03347424417734146, "mask/share_reasoning": 0.8387426137924194, "mask/share_step_conf": 0.12387684732675552, "num_tokens": 4006984.0, "reward": 1.1489111185073853, "reward_std": 0.20086175203323364, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7343758344650269, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8183392882347107, "step": 17 }, { "adv/mean_abs_final_conf": 0.7539812922477722, "adv/mean_abs_reasoning": 0.4319751560688019, "adv/mean_abs_step_conf": 0.7524322271347046, "adv/ratio_final_to_reasoning": 1.7454274433497363, "adv/ratio_step_to_reasoning": 1.7418414382489686, "adv/std_final_conf": 0.9187299013137817, "adv/std_reasoning": 0.7205332517623901, "adv/std_step_conf": 0.9355179667472839, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5428461142563047, "calib/avg_num_step_conf": 4.62890625, "calib/ece": 0.4055599999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.86, "calib/gap": 0.007656973751930085, "calib/mean_conf": 0.93988, "calib/mu_c": 0.9434328358208955, "calib/mu_w": 0.9357758620689655, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4047199999999999, "calib/std_conf": 0.04093391747683087, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6104299363057325, "calib/step_q_c_n": 628.0, "calib/step_q_gap": 0.02750354492332674, "calib/step_q_w": 0.5829263913824058, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 484.90234375, "completions/mean_terminated_length": 490.6521911621094, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.0192, "grad_norm": 0.02956167608499527, "kl": 0.015323638916015625, "learning_rate": 4.5e-06, "loss": -0.0949, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033106379210948944, "mask/share_reasoning": 0.848029375076294, "mask/share_step_conf": 0.1071455180644989, "num_tokens": 4241839.0, "reward": 1.012449860572815, "reward_std": 0.20409558713436127, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5725746154785156, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7687375545501709, "step": 18 }, { "adv/mean_abs_final_conf": 0.725152313709259, "adv/mean_abs_reasoning": 0.39276736974716187, "adv/mean_abs_step_conf": 0.7688818573951721, "adv/ratio_final_to_reasoning": 1.846264149122329, "adv/ratio_step_to_reasoning": 1.95760115686323, "adv/std_final_conf": 0.9252442717552185, "adv/std_reasoning": 0.6815301775932312, "adv/std_step_conf": 0.9354442358016968, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5053263966307444, "calib/avg_num_step_conf": 4.5546875, "calib/ece": 0.3896470588235294, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8549019607843137, "calib/gap": 0.025685618729096693, "calib/mean_conf": 0.9308235294117647, "calib/mu_c": 0.9426086956521738, "calib/mu_w": 0.9169230769230771, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3896470588235294, "calib/std_conf": 0.09485855432547834, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.565136, "calib/step_q_c_n": 625.0, "calib/step_q_gap": 0.009831009242144084, "calib/step_q_w": 0.5553049907578559, "calib/step_q_w_n": 541.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2276.0, "completions/max_terminated_length": 2276.0, "completions/mean_length": 481.9921875, "completions/mean_terminated_length": 481.9921875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.020266666666666665, "grad_norm": 0.02938079461455345, "kl": 0.022005081176757812, "learning_rate": 4.75e-06, "loss": 0.0076, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033192120492458344, "mask/share_reasoning": 0.8598717451095581, "mask/share_step_conf": 0.10693618655204773, "num_tokens": 4469989.0, "reward": 1.0599896907806396, "reward_std": 0.19520623981952667, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.601270318031311, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8077850341796875, "step": 19 }, { "adv/mean_abs_final_conf": 0.7167485952377319, "adv/mean_abs_reasoning": 0.3509178161621094, "adv/mean_abs_step_conf": 0.7590775489807129, "adv/ratio_final_to_reasoning": 2.0424970241654075, "adv/ratio_step_to_reasoning": 2.1631205770129687, "adv/std_final_conf": 0.9080055356025696, "adv/std_reasoning": 0.661155104637146, "adv/std_step_conf": 0.935492992401123, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.47865067079463364, "calib/avg_num_step_conf": 5.5234375, "calib/ece": 0.41544000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.948, "calib/gap": -0.002799277605779249, "calib/mean_conf": 0.9575999999999999, "calib/mu_c": 0.9563235294117647, "calib/mu_w": 0.9591228070175439, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41452, "calib/std_conf": 0.03253060097815593, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.524655172413793, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.036813946787052076, "calib/step_q_w": 0.4878412256267409, "calib/step_q_w_n": 718.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2542.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 496.65234375, "completions/mean_terminated_length": 496.65234375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.021333333333333333, "grad_norm": 0.024263571947813034, "kl": 0.030767440795898438, "learning_rate": 5e-06, "loss": 0.0385, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035842910408973694, "mask/share_reasoning": 0.8330389261245728, "mask/share_step_conf": 0.13111810386180878, "num_tokens": 4702004.0, "reward": 1.026486873626709, "reward_std": 0.18262585997581482, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5648671984672546, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7905083894729614, "step": 20 }, { "adv/mean_abs_final_conf": 0.7449398040771484, "adv/mean_abs_reasoning": 0.5243001580238342, "adv/mean_abs_step_conf": 0.7196472883224487, "adv/ratio_final_to_reasoning": 1.4208269684391057, "adv/ratio_step_to_reasoning": 1.372586441772032, "adv/std_final_conf": 0.9047093987464905, "adv/std_reasoning": 0.7753930687904358, "adv/std_step_conf": 0.9356669187545776, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.48425430778371953, "calib/avg_num_step_conf": 5.73828125, "calib/ece": 0.3716269841269841, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9603174603174603, "calib/gap": -0.011437908496732097, "calib/mean_conf": 0.9630555555555556, "calib/mu_c": 0.9585620915032681, "calib/mu_w": 0.9700000000000002, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36376984126984124, "calib/std_conf": 0.06451120310673689, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5064848484848485, "calib/step_q_c_n": 825.0, "calib/step_q_gap": 0.01601900997553174, "calib/step_q_w": 0.4904658385093168, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 509.41015625, "completions/mean_terminated_length": 509.41015625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.0224, "grad_norm": 0.023283572867512703, "kl": 0.03099822998046875, "learning_rate": 4.9722222222222224e-06, "loss": 0.0217, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033678602427244186, "mask/share_reasoning": 0.8410661220550537, "mask/share_step_conf": 0.1252552568912506, "num_tokens": 4935373.0, "reward": 1.068263053894043, "reward_std": 0.23257187008857727, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6154191493988037, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8026129007339478, "step": 21 }, { "adv/mean_abs_final_conf": 0.6880729794502258, "adv/mean_abs_reasoning": 0.2956817150115967, "adv/mean_abs_step_conf": 0.7522487640380859, "adv/ratio_final_to_reasoning": 2.327073148311655, "adv/ratio_step_to_reasoning": 2.544116615424064, "adv/std_final_conf": 0.8778355121612549, "adv/std_reasoning": 0.5959526896476746, "adv/std_step_conf": 0.9354989528656006, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4271016650873155, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.32290196078431377, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9764705882352941, "calib/gap": -0.007377149045620923, "calib/mean_conf": 0.9656470588235294, "calib/mu_c": 0.9630722891566263, "calib/mu_w": 0.9704494382022473, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31878431372549026, "calib/std_conf": 0.045411764705882346, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4753470715835141, "calib/step_q_c_n": 922.0, "calib/step_q_gap": 0.011572752517366225, "calib/step_q_w": 0.46377431906614786, "calib/step_q_w_n": 514.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2435.0, "completions/max_terminated_length": 2435.0, "completions/mean_length": 474.9765625, "completions/mean_terminated_length": 474.9765625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.023466666666666667, "grad_norm": 0.029273828491568565, "kl": 0.039241790771484375, "learning_rate": 4.944444444444445e-06, "loss": 0.0079, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03368385136127472, "mask/share_reasoning": 0.8386745452880859, "mask/share_step_conf": 0.12764160335063934, "num_tokens": 5158783.0, "reward": 1.1068484783172607, "reward_std": 0.15006275475025177, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6657546758651733, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.812690794467926, "step": 22 }, { "adv/mean_abs_final_conf": 0.7450026869773865, "adv/mean_abs_reasoning": 0.5139386653900146, "adv/mean_abs_step_conf": 0.757665753364563, "adv/ratio_final_to_reasoning": 1.449594547263774, "adv/ratio_step_to_reasoning": 1.4742338033461448, "adv/std_final_conf": 0.9089747667312622, "adv/std_reasoning": 0.7575487494468689, "adv/std_step_conf": 0.9355910420417786, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.518332713984888, "calib/avg_num_step_conf": 5.36328125, "calib/ece": 0.4283921568627452, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9725490196078431, "calib/gap": 0.0003084355258267113, "calib/mean_conf": 0.9695686274509804, "calib/mu_c": 0.9697101449275362, "calib/mu_w": 0.9694017094017094, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4283921568627452, "calib/std_conf": 0.03272299359595352, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.479627659574468, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.017357128173501768, "calib/step_q_w": 0.46227053140096624, "calib/step_q_w_n": 621.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1287.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 499.3984375, "completions/mean_terminated_length": 501.3569030761719, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.024533333333333334, "grad_norm": 0.03355023264884949, "kl": 0.0365142822265625, "learning_rate": 4.9166666666666665e-06, "loss": -0.0396, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035151075571775436, "mask/share_reasoning": 0.8352031707763672, "mask/share_step_conf": 0.12573951482772827, "num_tokens": 5390565.0, "reward": 1.0469937324523926, "reward_std": 0.2159760296344757, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5650421380996704, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8146092891693115, "step": 23 }, { "adv/mean_abs_final_conf": 0.7351275682449341, "adv/mean_abs_reasoning": 0.5750550627708435, "adv/mean_abs_step_conf": 0.7765597105026245, "adv/ratio_final_to_reasoning": 1.278360309885453, "adv/ratio_step_to_reasoning": 1.3504093099552097, "adv/std_final_conf": 0.9156761169433594, "adv/std_reasoning": 0.8098498582839966, "adv/std_step_conf": 0.9356335997581482, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.536032258064516, "calib/avg_num_step_conf": 6.109375, "calib/ece": 0.47028112449799203, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.963855421686747, "calib/gap": 0.006545161290322676, "calib/mean_conf": 0.965140562248996, "calib/mu_c": 0.9684, "calib/mu_w": 0.9618548387096774, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.466706827309237, "calib/std_conf": 0.07599461195023637, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.490234375, "calib/step_q_c_n": 768.0, "calib/step_q_gap": 0.024204224246231087, "calib/step_q_w": 0.4660301507537689, "calib/step_q_w_n": 796.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 561.50390625, "completions/mean_terminated_length": 563.7059326171875, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.0256, "grad_norm": 0.024830345064401627, "kl": 0.04190635681152344, "learning_rate": 4.888888888888889e-06, "loss": 0.0675, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03164277225732803, "mask/share_reasoning": 0.8389121294021606, "mask/share_step_conf": 0.12553885579109192, "num_tokens": 5638822.0, "reward": 1.0030418634414673, "reward_std": 0.24848011136054993, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5184351205825806, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7969739437103271, "step": 24 }, { "adv/mean_abs_final_conf": 0.7402602434158325, "adv/mean_abs_reasoning": 0.42128893733024597, "adv/mean_abs_step_conf": 0.7585857510566711, "adv/ratio_final_to_reasoning": 1.7571319297082486, "adv/ratio_step_to_reasoning": 1.8006305977648307, "adv/std_final_conf": 0.9028820991516113, "adv/std_reasoning": 0.6817774176597595, "adv/std_step_conf": 0.935698926448822, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5652805280528053, "calib/avg_num_step_conf": 5.62890625, "calib/ece": 0.38023904382470125, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": 0.0013273927392741802, "calib/mean_conf": 0.9701992031872511, "calib/mu_c": 0.9707333333333334, "calib/mu_w": 0.9694059405940593, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37641434262948215, "calib/std_conf": 0.0643908025849377, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5059181141439206, "calib/step_q_c_n": 806.0, "calib/step_q_gap": 0.03667401965573158, "calib/step_q_w": 0.46924409448818905, "calib/step_q_w_n": 635.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2649.0, "completions/max_terminated_length": 2649.0, "completions/mean_length": 488.55078125, "completions/mean_terminated_length": 490.4667053222656, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.02666666666666667, "grad_norm": 0.0347868986427784, "kl": 0.035980224609375, "learning_rate": 4.861111111111111e-06, "loss": 0.0343, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03435831144452095, "mask/share_reasoning": 0.8342337012290955, "mask/share_step_conf": 0.1275017410516739, "num_tokens": 5867115.0, "reward": 1.0532389879226685, "reward_std": 0.2238880842924118, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6051421165466309, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7920362949371338, "step": 25 }, { "adv/mean_abs_final_conf": 0.7297452092170715, "adv/mean_abs_reasoning": 0.36826425790786743, "adv/mean_abs_step_conf": 0.749592661857605, "adv/ratio_final_to_reasoning": 1.9815803286552984, "adv/ratio_step_to_reasoning": 2.0354749225897955, "adv/std_final_conf": 0.882972002029419, "adv/std_reasoning": 0.6402753591537476, "adv/std_step_conf": 0.9351133704185486, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5537442689760571, "calib/avg_num_step_conf": 5.4296875, "calib/ece": 0.3780392156862745, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.984313725490196, "calib/gap": 0.006663907284768289, "calib/mean_conf": 0.9701960784313725, "calib/mu_c": 0.9729139072847683, "calib/mu_w": 0.96625, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3780392156862745, "calib/std_conf": 0.02694149042257921, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5008847184986595, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.02464248247381473, "calib/step_q_w": 0.4762422360248447, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2088.0, "completions/max_terminated_length": 2088.0, "completions/mean_length": 484.125, "completions/mean_terminated_length": 486.0235595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.027733333333333332, "grad_norm": 0.025464007630944252, "kl": 0.036617279052734375, "learning_rate": 4.833333333333333e-06, "loss": -0.0449, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03218268230557442, "mask/share_reasoning": 0.8462972640991211, "mask/share_step_conf": 0.1176137626171112, "num_tokens": 6096291.0, "reward": 1.0802245140075684, "reward_std": 0.17607977986335754, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6156578063964844, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8184024095535278, "step": 26 }, { "adv/mean_abs_final_conf": 0.7717294692993164, "adv/mean_abs_reasoning": 0.4704931378364563, "adv/mean_abs_step_conf": 0.772092878818512, "adv/ratio_final_to_reasoning": 1.6402565887529905, "adv/ratio_step_to_reasoning": 1.6410289900697594, "adv/std_final_conf": 0.9117308259010315, "adv/std_reasoning": 0.7206444144248962, "adv/std_step_conf": 0.9351636171340942, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5027194535795598, "calib/avg_num_step_conf": 6.2734375, "calib/ece": 0.4304761904761905, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9801587301587301, "calib/gap": 0.018547938274728004, "calib/mean_conf": 0.9595238095238096, "calib/mu_c": 0.9682089552238806, "calib/mu_w": 0.9496610169491526, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.42912698412698413, "calib/std_conf": 0.09490477385133608, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4985406698564593, "calib/step_q_c_n": 836.0, "calib/step_q_gap": 0.017735475051264438, "calib/step_q_w": 0.48080519480519485, "calib/step_q_w_n": 770.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 496.59375, "completions/mean_terminated_length": 500.5039367675781, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.0288, "grad_norm": 0.029237594455480576, "kl": 0.035808563232421875, "learning_rate": 4.805555555555556e-06, "loss": -0.0563, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03279014676809311, "mask/share_reasoning": 0.8255751729011536, "mask/share_step_conf": 0.13382220268249512, "num_tokens": 6328635.0, "reward": 1.0464885234832764, "reward_std": 0.22028236091136932, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5554590225219727, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8239703178405762, "step": 27 }, { "adv/mean_abs_final_conf": 0.7000174522399902, "adv/mean_abs_reasoning": 0.3728755712509155, "adv/mean_abs_step_conf": 0.7837008237838745, "adv/ratio_final_to_reasoning": 1.877348655186999, "adv/ratio_step_to_reasoning": 2.1017757241503663, "adv/std_final_conf": 0.8813297152519226, "adv/std_reasoning": 0.6612759828567505, "adv/std_step_conf": 0.9354707598686218, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5263845889232885, "calib/avg_num_step_conf": 5.671875, "calib/ece": 0.35358870967741945, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9838709677419355, "calib/gap": 0.0007285861713107744, "calib/mean_conf": 0.9669758064516131, "calib/mu_c": 0.9672549019607843, "calib/mu_w": 0.9665263157894736, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.35181451612903236, "calib/std_conf": 0.03516274689939519, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.497979094076655, "calib/step_q_c_n": 861.0, "calib/step_q_gap": 0.018402105920986678, "calib/step_q_w": 0.47957698815566835, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2477.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 556.34375, "completions/mean_terminated_length": 560.7244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.029866666666666666, "grad_norm": 0.03544874116778374, "kl": 0.033390045166015625, "learning_rate": 4.777777777777778e-06, "loss": -0.0343, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03131452947854996, "mask/share_reasoning": 0.8450330495834351, "mask/share_step_conf": 0.11583994328975677, "num_tokens": 6578003.0, "reward": 1.0687757730484009, "reward_std": 0.1918598711490631, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6202456951141357, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8026830554008484, "step": 28 }, { "adv/mean_abs_final_conf": 0.7551331520080566, "adv/mean_abs_reasoning": 0.5309878587722778, "adv/mean_abs_step_conf": 0.757839024066925, "adv/ratio_final_to_reasoning": 1.4221288482076329, "adv/ratio_step_to_reasoning": 1.4272247689790132, "adv/std_final_conf": 0.9082273840904236, "adv/std_reasoning": 0.7754151225090027, "adv/std_step_conf": 0.9351565837860107, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6091259802681508, "calib/avg_num_step_conf": 5.9375, "calib/ece": 0.4382936507936509, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9801587301587301, "calib/gap": 0.008840121426764491, "calib/mean_conf": 0.9700396825396826, "calib/mu_c": 0.9741791044776119, "calib/mu_w": 0.9653389830508474, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4382936507936509, "calib/std_conf": 0.021903431924197383, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4950569800569801, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.011951845582652498, "calib/step_q_w": 0.4831051344743276, "calib/step_q_w_n": 818.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2469.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 552.953125, "completions/mean_terminated_length": 555.12158203125, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.030933333333333334, "grad_norm": 0.02542717568576336, "kl": 0.03304290771484375, "learning_rate": 4.75e-06, "loss": -0.0293, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030591856688261032, "mask/share_reasoning": 0.8471544981002808, "mask/share_step_conf": 0.1183474063873291, "num_tokens": 6826687.0, "reward": 1.050892949104309, "reward_std": 0.22539781033992767, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5498980283737183, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8345919251441956, "step": 29 }, { "adv/mean_abs_final_conf": 0.7546311616897583, "adv/mean_abs_reasoning": 0.6352213621139526, "adv/mean_abs_step_conf": 0.7367050051689148, "adv/ratio_final_to_reasoning": 1.1879813978207878, "adv/ratio_step_to_reasoning": 1.1597610677280041, "adv/std_final_conf": 0.9116274118423462, "adv/std_reasoning": 0.8429492712020874, "adv/std_step_conf": 0.9353036880493164, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5105012878937983, "calib/avg_num_step_conf": 6.34765625, "calib/ece": 0.3717199999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.968, "calib/gap": 0.023630539594478583, "calib/mean_conf": 0.95972, "calib/mu_c": 0.9694557823129251, "calib/mu_w": 0.9458252427184465, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3717199999999999, "calib/std_conf": 0.09404850663354522, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4668671963677639, "calib/step_q_c_n": 881.0, "calib/step_q_gap": 0.034877949055935886, "calib/step_q_w": 0.43198924731182803, "calib/step_q_w_n": 744.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2652.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 620.7265625, "completions/mean_terminated_length": 623.1608276367188, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.032, "grad_norm": 0.030656639486551285, "kl": 0.038387298583984375, "learning_rate": 4.722222222222222e-06, "loss": -0.0384, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.027659885585308075, "mask/share_reasoning": 0.8523433208465576, "mask/share_step_conf": 0.11609058082103729, "num_tokens": 7092577.0, "reward": 1.0673344135284424, "reward_std": 0.26539111137390137, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6036843657493591, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8144062757492065, "step": 30 }, { "adv/mean_abs_final_conf": 0.7504103183746338, "adv/mean_abs_reasoning": 0.48929375410079956, "adv/mean_abs_step_conf": 0.7482815384864807, "adv/ratio_final_to_reasoning": 1.5336601215229113, "adv/ratio_step_to_reasoning": 1.529309402000515, "adv/std_final_conf": 0.8871038556098938, "adv/std_reasoning": 0.7393369674682617, "adv/std_step_conf": 0.9349742531776428, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5178012764359905, "calib/avg_num_step_conf": 6.8046875, "calib/ece": 0.4745454545454547, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9565217391304348, "calib/gap": 0.02865473657865092, "calib/mean_conf": 0.9533596837944663, "calib/mu_c": 0.9681967213114755, "calib/mu_w": 0.9395419847328246, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4728458498023717, "calib/std_conf": 0.12559808331621283, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44105744125326374, "calib/step_q_c_n": 766.0, "calib/step_q_gap": 0.00939760518768995, "calib/step_q_w": 0.4316598360655738, "calib/step_q_w_n": 976.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 583.03125, "completions/mean_terminated_length": 583.03125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.03306666666666667, "grad_norm": 0.03860628604888916, "kl": 0.0362091064453125, "learning_rate": 4.694444444444445e-06, "loss": 0.0399, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03051367774605751, "mask/share_reasoning": 0.8445330858230591, "mask/share_step_conf": 0.12495321035385132, "num_tokens": 7347745.0, "reward": 1.033031702041626, "reward_std": 0.19907008111476898, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.5206976532936096, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8349312543869019, "step": 31 }, { "adv/mean_abs_final_conf": 0.7279558777809143, "adv/mean_abs_reasoning": 0.4749346375465393, "adv/mean_abs_step_conf": 0.7677797079086304, "adv/ratio_final_to_reasoning": 1.5327496043275664, "adv/ratio_step_to_reasoning": 1.6166007850572803, "adv/std_final_conf": 0.8810470104217529, "adv/std_reasoning": 0.7392776608467102, "adv/std_step_conf": 0.9347826242446899, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.661967418546366, "calib/avg_num_step_conf": 5.70703125, "calib/ece": 0.4214624505928855, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9565217391304348, "calib/gap": 0.05381265664160384, "calib/mean_conf": 0.9462055335968379, "calib/mu_c": 0.9717293233082706, "calib/mu_w": 0.9179166666666667, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4209881422924902, "calib/std_conf": 0.1326554172743245, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42289875173370317, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.058993346328297835, "calib/step_q_w": 0.36390540540540534, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 518.2421875, "completions/mean_terminated_length": 520.2745361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.034133333333333335, "grad_norm": 0.03688067942857742, "kl": 0.06610107421875, "learning_rate": 4.666666666666667e-06, "loss": -0.0832, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03253469243645668, "mask/share_reasoning": 0.8402843475341797, "mask/share_step_conf": 0.12327471375465393, "num_tokens": 7587119.0, "reward": 1.0722166299819946, "reward_std": 0.19639672338962555, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5762332081794739, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8444249629974365, "step": 32 }, { "adv/mean_abs_final_conf": 0.7129478454589844, "adv/mean_abs_reasoning": 0.39785322546958923, "adv/mean_abs_step_conf": 0.7419391870498657, "adv/ratio_final_to_reasoning": 1.791987094279521, "adv/ratio_step_to_reasoning": 1.864856533899277, "adv/std_final_conf": 0.8591421842575073, "adv/std_reasoning": 0.6613561511039734, "adv/std_step_conf": 0.9346285462379456, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6605691056910569, "calib/avg_num_step_conf": 6.5234375, "calib/ece": 0.4621115537848607, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9760956175298805, "calib/gap": 0.015661839430894386, "calib/mean_conf": 0.9690438247011953, "calib/mu_c": 0.9767187500000002, "calib/mu_w": 0.9610569105691058, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4605976095617531, "calib/std_conf": 0.04849818322995946, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.38405370843989767, "calib/step_q_c_n": 782.0, "calib/step_q_gap": 0.02532623096242026, "calib/step_q_w": 0.3587274774774774, "calib/step_q_w_n": 888.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2521.0, "completions/max_terminated_length": 2521.0, "completions/mean_length": 528.47265625, "completions/mean_terminated_length": 530.5451049804688, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.0352, "grad_norm": 0.03596274182200432, "kl": 0.14304733276367188, "learning_rate": 4.638888888888889e-06, "loss": -0.0427, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03191022202372551, "mask/share_reasoning": 0.836162805557251, "mask/share_step_conf": 0.12802070379257202, "num_tokens": 7829280.0, "reward": 1.0433704853057861, "reward_std": 0.17046688497066498, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.534176230430603, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8376471996307373, "step": 33 }, { "adv/mean_abs_final_conf": 0.7462908029556274, "adv/mean_abs_reasoning": 0.5331032872200012, "adv/mean_abs_step_conf": 0.7383915185928345, "adv/ratio_final_to_reasoning": 1.3998990830601423, "adv/ratio_step_to_reasoning": 1.3850815335305087, "adv/std_final_conf": 0.9107847213745117, "adv/std_reasoning": 0.7754673957824707, "adv/std_step_conf": 0.934866726398468, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5206094364351245, "calib/avg_num_step_conf": 6.18359375, "calib/ece": 0.4085542168674698, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9196787148594378, "calib/gap": -0.011357798165137711, "calib/mean_conf": 0.9529718875502009, "calib/mu_c": 0.9479999999999998, "calib/mu_w": 0.9593577981651376, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.39963855421686734, "calib/std_conf": 0.10265943234329516, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3455625790139064, "calib/step_q_c_n": 791.0, "calib/step_q_gap": 0.023289851741179146, "calib/step_q_w": 0.32227272727272727, "calib/step_q_w_n": 792.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 461.56640625, "completions/mean_terminated_length": 467.03955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.03626666666666667, "grad_norm": 0.04626227170228958, "kl": 0.05771636962890625, "learning_rate": 4.611111111111112e-06, "loss": -0.141, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03449372947216034, "mask/share_reasoning": 0.8127261400222778, "mask/share_step_conf": 0.14106135070323944, "num_tokens": 8052553.0, "reward": 1.0586323738098145, "reward_std": 0.21905234456062317, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5688515901565552, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8296712040901184, "step": 34 }, { "adv/mean_abs_final_conf": 0.768435001373291, "adv/mean_abs_reasoning": 0.5288388729095459, "adv/mean_abs_step_conf": 0.722439706325531, "adv/ratio_final_to_reasoning": 1.4530607350128104, "adv/ratio_step_to_reasoning": 1.3660866160440122, "adv/std_final_conf": 0.9130533337593079, "adv/std_reasoning": 0.7755023837089539, "adv/std_step_conf": 0.935539722442627, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6663034308211473, "calib/avg_num_step_conf": 6.1484375, "calib/ece": 0.3864435146443515, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.8493723849372385, "calib/gap": 0.09799564116985382, "calib/mean_conf": 0.910376569037657, "calib/mu_c": 0.9562992125984252, "calib/mu_w": 0.8583035714285714, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3827196652719666, "calib/std_conf": 0.20581165079173147, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.33267737617135207, "calib/step_q_c_n": 747.0, "calib/step_q_gap": 0.02509575585696272, "calib/step_q_w": 0.30758162031438935, "calib/step_q_w_n": 827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2553.0, "completions/max_terminated_length": 2553.0, "completions/mean_length": 547.1171875, "completions/mean_terminated_length": 551.4251708984375, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.037333333333333336, "grad_norm": 0.07981492578983307, "kl": 0.056163787841796875, "learning_rate": 4.583333333333333e-06, "loss": -0.1449, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.028393005952239037, "mask/share_reasoning": 0.8495216369628906, "mask/share_step_conf": 0.1142728328704834, "num_tokens": 8301871.0, "reward": 1.0419251918792725, "reward_std": 0.27103620767593384, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5730335712432861, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.813461184501648, "step": 35 }, { "adv/mean_abs_final_conf": 0.6559814214706421, "adv/mean_abs_reasoning": 0.40428081154823303, "adv/mean_abs_step_conf": 0.7389465570449829, "adv/ratio_final_to_reasoning": 1.6225885640193924, "adv/ratio_step_to_reasoning": 1.8278051689248238, "adv/std_final_conf": 0.8501717448234558, "adv/std_reasoning": 0.701352059841156, "adv/std_step_conf": 0.9354451894760132, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6510141093474426, "calib/avg_num_step_conf": 6.2421875, "calib/ece": 0.20823293172690757, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9437751004016064, "calib/gap": 0.024994708994708903, "calib/mean_conf": 0.9596385542168674, "calib/mu_c": 0.9656613756613757, "calib/mu_w": 0.9406666666666668, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20441767068273087, "calib/std_conf": 0.07641746131016414, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3287301587301587, "calib/step_q_c_n": 1134.0, "calib/step_q_gap": 0.021812055281882792, "calib/step_q_w": 0.3069181034482759, "calib/step_q_w_n": 464.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2940.0, "completions/max_terminated_length": 2940.0, "completions/mean_length": 504.05859375, "completions/mean_terminated_length": 504.05859375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.0384, "grad_norm": 0.11191720515489578, "kl": 0.09484481811523438, "learning_rate": 4.555555555555556e-06, "loss": 0.0777, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03631111979484558, "mask/share_reasoning": 0.8209631443023682, "mask/share_step_conf": 0.14272576570510864, "num_tokens": 8533622.0, "reward": 1.140600323677063, "reward_std": 0.18435192108154297, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7588292956352234, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7867891788482666, "step": 36 }, { "adv/mean_abs_final_conf": 0.6528252363204956, "adv/mean_abs_reasoning": 0.3660427927970886, "adv/mean_abs_step_conf": 0.7701745629310608, "adv/ratio_final_to_reasoning": 1.783466985736778, "adv/ratio_step_to_reasoning": 2.104056077831309, "adv/std_final_conf": 0.8169144988059998, "adv/std_reasoning": 0.6403437256813049, "adv/std_step_conf": 0.9345270395278931, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7279202279202279, "calib/avg_num_step_conf": 6.16796875, "calib/ece": 0.40871485943775115, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7710843373493976, "calib/gap": 0.14789432789432788, "calib/mean_conf": 0.8770682730923695, "calib/mu_c": 0.9554700854700854, "calib/mu_w": 0.8075757575757575, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40795180722891583, "calib/std_conf": 0.2362011925687789, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3431372549019608, "calib/step_q_c_n": 612.0, "calib/step_q_gap": 0.07033477299916857, "calib/step_q_w": 0.2728024819027922, "calib/step_q_w_n": 967.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3028.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 513.05078125, "completions/mean_terminated_length": 519.1343994140625, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.039466666666666664, "grad_norm": 0.061256974935531616, "kl": 0.050884246826171875, "learning_rate": 4.527777777777778e-06, "loss": -0.0415, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033428847789764404, "mask/share_reasoning": 0.8275771141052246, "mask/share_step_conf": 0.1272752583026886, "num_tokens": 8772059.0, "reward": 1.063904881477356, "reward_std": 0.17878447473049164, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.5865042805671692, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8369119167327881, "step": 37 }, { "adv/mean_abs_final_conf": 0.5637679696083069, "adv/mean_abs_reasoning": 0.3914545178413391, "adv/mean_abs_step_conf": 0.7495837211608887, "adv/ratio_final_to_reasoning": 1.4401876690992959, "adv/ratio_step_to_reasoning": 1.9148679782633224, "adv/std_final_conf": 0.796682596206665, "adv/std_reasoning": 0.6815966367721558, "adv/std_step_conf": 0.9347833395004272, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6690769330988209, "calib/avg_num_step_conf": 6.109375, "calib/ece": 0.39092741935483877, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8185483870967742, "calib/gap": 0.11977916748094608, "calib/mean_conf": 0.9041532258064516, "calib/mu_c": 0.9616279069767443, "calib/mu_w": 0.8418487394957982, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3874596774193549, "calib/std_conf": 0.2034364267963304, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38266757865937073, "calib/step_q_c_n": 731.0, "calib/step_q_gap": 0.09713336497389652, "calib/step_q_w": 0.2855342136854742, "calib/step_q_w_n": 833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2929.0, "completions/max_terminated_length": 2929.0, "completions/mean_length": 510.79296875, "completions/mean_terminated_length": 516.849853515625, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.04053333333333333, "grad_norm": 0.06795842945575714, "kl": 0.051410675048828125, "learning_rate": 4.5e-06, "loss": -0.0345, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.032852061092853546, "mask/share_reasoning": 0.8267712593078613, "mask/share_step_conf": 0.12865795195102692, "num_tokens": 9009710.0, "reward": 1.0843178033828735, "reward_std": 0.1842365264892578, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6019449234008789, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8481062054634094, "step": 38 }, { "adv/mean_abs_final_conf": 0.6071762442588806, "adv/mean_abs_reasoning": 0.30981796979904175, "adv/mean_abs_step_conf": 0.7679427862167358, "adv/ratio_final_to_reasoning": 1.9597838196819743, "adv/ratio_step_to_reasoning": 2.4786902667874595, "adv/std_final_conf": 0.819245457649231, "adv/std_reasoning": 0.5726991891860962, "adv/std_step_conf": 0.9341544508934021, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6572, "calib/avg_num_step_conf": 6.328125, "calib/ece": 0.43850980392156863, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7529411764705882, "calib/gap": 0.05800923076923081, "calib/mean_conf": 0.8866666666666666, "calib/mu_c": 0.9162400000000002, "calib/mu_w": 0.8582307692307694, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41749019607843135, "calib/std_conf": 0.21295738933693426, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36389705882352946, "calib/step_q_c_n": 680.0, "calib/step_q_gap": 0.04426939924906137, "calib/step_q_w": 0.3196276595744681, "calib/step_q_w_n": 940.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2230.0, "completions/max_terminated_length": 2230.0, "completions/mean_length": 486.4375, "completions/mean_terminated_length": 486.4375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.0416, "grad_norm": 0.05202677845954895, "kl": 0.0638427734375, "learning_rate": 4.472222222222223e-06, "loss": -0.0256, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035491589456796646, "mask/share_reasoning": 0.8255130052566528, "mask/share_step_conf": 0.13899537920951843, "num_tokens": 9240326.0, "reward": 1.0818815231323242, "reward_std": 0.14773604273796082, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5742976665496826, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8617267608642578, "step": 39 }, { "adv/mean_abs_final_conf": 0.7261002063751221, "adv/mean_abs_reasoning": 0.4546775817871094, "adv/mean_abs_step_conf": 0.7494188547134399, "adv/ratio_final_to_reasoning": 1.5969562508914263, "adv/ratio_step_to_reasoning": 1.6482423693903063, "adv/std_final_conf": 0.8997302651405334, "adv/std_reasoning": 0.7204935550689697, "adv/std_step_conf": 0.9343372583389282, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5853891698532132, "calib/avg_num_step_conf": 5.7890625, "calib/ece": 0.3858039215686273, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.08098556802763046, "calib/mean_conf": 0.8352941176470589, "calib/mu_c": 0.8778512396694216, "calib/mu_w": 0.7968656716417911, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3732941176470587, "calib/std_conf": 0.25733134923427275, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37328571428571433, "calib/step_q_c_n": 630.0, "calib/step_q_gap": 0.04505801475519794, "calib/step_q_w": 0.3282276995305164, "calib/step_q_w_n": 852.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1949.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 508.890625, "completions/mean_terminated_length": 508.890625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.042666666666666665, "grad_norm": 0.08238881826400757, "kl": 0.08835983276367188, "learning_rate": 4.444444444444444e-06, "loss": -0.0775, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03551477566361427, "mask/share_reasoning": 0.8364818692207336, "mask/share_step_conf": 0.1280033439397812, "num_tokens": 9477362.0, "reward": 1.0873432159423828, "reward_std": 0.21221011877059937, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.5923296809196472, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8590710759162903, "step": 40 }, { "adv/mean_abs_final_conf": 0.6341361999511719, "adv/mean_abs_reasoning": 0.41668474674224854, "adv/mean_abs_step_conf": 0.7611067891120911, "adv/ratio_final_to_reasoning": 1.5218608430210518, "adv/ratio_step_to_reasoning": 1.8265770347069938, "adv/std_final_conf": 0.8549427390098572, "adv/std_reasoning": 0.6816285848617554, "adv/std_step_conf": 0.9348737597465515, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7514878347628218, "calib/avg_num_step_conf": 5.80859375, "calib/ece": 0.1418823529411765, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5058823529411764, "calib/gap": 0.25013127953789593, "calib/mean_conf": 0.7325490196078431, "calib/mu_c": 0.7894416243654822, "calib/mu_w": 0.5393103448275862, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.050941176470588274, "calib/std_conf": 0.30395976989517065, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.375041782729805, "calib/step_q_c_n": 1077.0, "calib/step_q_gap": 0.0862612949249269, "calib/step_q_w": 0.2887804878048781, "calib/step_q_w_n": 410.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 449.3359375, "completions/mean_terminated_length": 449.3359375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.04373333333333333, "grad_norm": 0.116298146545887, "kl": 0.1528167724609375, "learning_rate": 4.416666666666667e-06, "loss": -0.0102, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.039690546691417694, "mask/share_reasoning": 0.8176549077033997, "mask/share_step_conf": 0.14265450835227966, "num_tokens": 9699640.0, "reward": 1.2204053401947021, "reward_std": 0.14665330946445465, "rewards/accuracy_reward_step": 0.76953125, "rewards/final_brier_reward_step": 0.8149999380111694, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8484572172164917, "step": 41 }, { "adv/mean_abs_final_conf": 0.7516371011734009, "adv/mean_abs_reasoning": 0.3156457245349884, "adv/mean_abs_step_conf": 0.7655600309371948, "adv/ratio_final_to_reasoning": 2.3812681203926274, "adv/ratio_step_to_reasoning": 2.425377476805756, "adv/std_final_conf": 0.9281405806541443, "adv/std_reasoning": 0.596068799495697, "adv/std_step_conf": 0.9343998432159424, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6625789669267931, "calib/avg_num_step_conf": 5.7109375, "calib/ece": 0.16611764705882343, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.16887588257153463, "calib/mean_conf": 0.6492549019607843, "calib/mu_c": 0.7267391304347826, "calib/mu_w": 0.5578632478632479, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13709803921568617, "calib/std_conf": 0.30193206891729246, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40450340136054425, "calib/step_q_c_n": 735.0, "calib/step_q_gap": 0.051050856656280175, "calib/step_q_w": 0.35345254470426407, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 404.55078125, "completions/mean_terminated_length": 404.55078125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.0448, "grad_norm": 0.06288475543260574, "kl": 0.09320068359375, "learning_rate": 4.388888888888889e-06, "loss": 0.0136, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04105367884039879, "mask/share_reasoning": 0.8106446266174316, "mask/share_step_conf": 0.14830169081687927, "num_tokens": 9907573.0, "reward": 1.1653730869293213, "reward_std": 0.14880496263504028, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7298547029495239, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.862573504447937, "step": 42 }, { "adv/mean_abs_final_conf": 0.772418737411499, "adv/mean_abs_reasoning": 0.5696216821670532, "adv/mean_abs_step_conf": 0.7576810121536255, "adv/ratio_final_to_reasoning": 1.3560206038382006, "adv/ratio_step_to_reasoning": 1.3301477732924851, "adv/std_final_conf": 0.9291498064994812, "adv/std_reasoning": 0.8097960352897644, "adv/std_step_conf": 0.934786856174469, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6473269660133962, "calib/avg_num_step_conf": 5.76953125, "calib/ece": 0.12780392156862744, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.1450980392156863, "calib/gap": 0.1366335896799803, "calib/mean_conf": 0.5448235294117647, "calib/mu_c": 0.6069784172661871, "calib/mu_w": 0.47034482758620677, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06376470588235292, "calib/std_conf": 0.2741615886685601, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42774018944519626, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.05049086695197136, "calib/step_q_w": 0.3772493224932249, "calib/step_q_w_n": 738.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2090.0, "completions/max_terminated_length": 2090.0, "completions/mean_length": 462.4375, "completions/mean_terminated_length": 464.2510070800781, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.04586666666666667, "grad_norm": 0.03930652514100075, "kl": 0.065765380859375, "learning_rate": 4.361111111111112e-06, "loss": -0.0474, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03815210610628128, "mask/share_reasoning": 0.822625994682312, "mask/share_step_conf": 0.13531562685966492, "num_tokens": 10131181.0, "reward": 1.1641141176223755, "reward_std": 0.1371372938156128, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.741721510887146, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8524627685546875, "step": 43 }, { "adv/mean_abs_final_conf": 0.7673567533493042, "adv/mean_abs_reasoning": 0.38849079608917236, "adv/mean_abs_step_conf": 0.7680708169937134, "adv/ratio_final_to_reasoning": 1.9752250531391449, "adv/ratio_step_to_reasoning": 1.9770630983427828, "adv/std_final_conf": 0.9358692765235901, "adv/std_reasoning": 0.6814560294151306, "adv/std_step_conf": 0.9348883032798767, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6887874676205747, "calib/avg_num_step_conf": 6.54296875, "calib/ece": 0.1507450980392156, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.17254901960784313, "calib/gap": 0.18600037005057357, "calib/mean_conf": 0.5256470588235295, "calib/mu_c": 0.6233884297520661, "calib/mu_w": 0.4373880597014925, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10094117647058819, "calib/std_conf": 0.2883493811829746, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4323913043478261, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.06701324257998792, "calib/step_q_w": 0.36537806176783816, "calib/step_q_w_n": 939.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 491.39453125, "completions/mean_terminated_length": 493.32159423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.046933333333333334, "grad_norm": 0.03840534761548042, "kl": 0.0656280517578125, "learning_rate": 4.333333333333334e-06, "loss": -0.0351, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03497948497533798, "mask/share_reasoning": 0.8223901391029358, "mask/share_step_conf": 0.13872411847114563, "num_tokens": 10363298.0, "reward": 1.1694731712341309, "reward_std": 0.13373121619224548, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7546882629394531, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8603387475013733, "step": 44 }, { "adv/mean_abs_final_conf": 0.7848135828971863, "adv/mean_abs_reasoning": 0.5568833351135254, "adv/mean_abs_step_conf": 0.7529138326644897, "adv/ratio_final_to_reasoning": 1.4092962267172089, "adv/ratio_step_to_reasoning": 1.352013582002776, "adv/std_final_conf": 0.9361516237258911, "adv/std_reasoning": 0.7927802205085754, "adv/std_step_conf": 0.9352006316184998, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6241544094830261, "calib/avg_num_step_conf": 5.9453125, "calib/ece": 0.14712598425196852, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.031496062992125984, "calib/gap": 0.11007075032582392, "calib/mean_conf": 0.4331102362204724, "calib/mu_c": 0.48641221374045807, "calib/mu_w": 0.37634146341463415, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.032244094488188976, "calib/std_conf": 0.26554685006653567, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.424720744680851, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.033590874550980865, "calib/step_q_w": 0.39112987012987016, "calib/step_q_w_n": 770.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 445.77734375, "completions/mean_terminated_length": 445.77734375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.048, "grad_norm": 0.03386177122592926, "kl": 0.07482147216796875, "learning_rate": 4.305555555555556e-06, "loss": -0.0692, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04040045291185379, "mask/share_reasoning": 0.8123257756233215, "mask/share_step_conf": 0.14727374911308289, "num_tokens": 10582465.0, "reward": 1.1441757678985596, "reward_std": 0.16143080592155457, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7184984683990479, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8465688824653625, "step": 45 }, { "adv/mean_abs_final_conf": 0.8008900880813599, "adv/mean_abs_reasoning": 0.49210262298583984, "adv/mean_abs_step_conf": 0.7658165097236633, "adv/ratio_final_to_reasoning": 1.627485915888738, "adv/ratio_step_to_reasoning": 1.5562130213349818, "adv/std_final_conf": 0.9361664652824402, "adv/std_reasoning": 0.7392691969871521, "adv/std_step_conf": 0.934954047203064, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5794730392156863, "calib/avg_num_step_conf": 6.7890625, "calib/ece": 0.1973046875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.125, "calib/gap": 0.08427941176470594, "calib/mean_conf": 0.4615234375, "calib/mu_c": 0.5010294117647058, "calib/mu_w": 0.4167499999999999, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0637890625, "calib/std_conf": 0.2845768916535276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4298466981132075, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.06432647339410641, "calib/step_q_w": 0.3655202247191011, "calib/step_q_w_n": 890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 494.08203125, "completions/mean_terminated_length": 496.0196228027344, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.04906666666666667, "grad_norm": 0.029355766251683235, "kl": 0.07450103759765625, "learning_rate": 4.277777777777778e-06, "loss": -0.0534, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.039339788258075714, "mask/share_reasoning": 0.8077389597892761, "mask/share_step_conf": 0.14901497960090637, "num_tokens": 10813718.0, "reward": 1.141398310661316, "reward_std": 0.1377015858888626, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7071058750152588, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8462938070297241, "step": 46 }, { "adv/mean_abs_final_conf": 0.7672794461250305, "adv/mean_abs_reasoning": 0.3217868208885193, "adv/mean_abs_step_conf": 0.7579846382141113, "adv/ratio_final_to_reasoning": 2.38443402997803, "adv/ratio_step_to_reasoning": 2.35554904368414, "adv/std_final_conf": 0.9359407424926758, "adv/std_reasoning": 0.6402183175086975, "adv/std_step_conf": 0.9345706105232239, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7161493477282951, "calib/avg_num_step_conf": 6.41015625, "calib/ece": 0.11127999999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.076, "calib/gap": 0.19506586980271196, "calib/mean_conf": 0.47103999999999996, "calib/mu_c": 0.5623308270676692, "calib/mu_w": 0.3672649572649573, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.02516, "calib/std_conf": 0.2673434465252515, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4381193255512321, "calib/step_q_c_n": 771.0, "calib/step_q_gap": 0.060544612907553974, "calib/step_q_w": 0.37757471264367815, "calib/step_q_w_n": 870.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 503.60546875, "completions/mean_terminated_length": 507.57086181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.050133333333333335, "grad_norm": 0.04785727709531784, "kl": 0.082183837890625, "learning_rate": 4.25e-06, "loss": -0.0998, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03499086946249008, "mask/share_reasoning": 0.8182421326637268, "mask/share_step_conf": 0.1389545202255249, "num_tokens": 11048617.0, "reward": 1.1591176986694336, "reward_std": 0.15524102747440338, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.754852294921875, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8427762985229492, "step": 47 }, { "adv/mean_abs_final_conf": 0.7918989658355713, "adv/mean_abs_reasoning": 0.4984220564365387, "adv/mean_abs_step_conf": 0.762143611907959, "adv/ratio_final_to_reasoning": 1.5888120431452042, "adv/ratio_step_to_reasoning": 1.5291129316324679, "adv/std_final_conf": 0.9350493550300598, "adv/std_reasoning": 0.7393231987953186, "adv/std_step_conf": 0.9349328279495239, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6886497309473157, "calib/avg_num_step_conf": 6.078125, "calib/ece": 0.09735177865612649, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.08300395256916997, "calib/gap": 0.1758290576899012, "calib/mean_conf": 0.47158102766798415, "calib/mu_c": 0.5626229508196722, "calib/mu_w": 0.38679389312977097, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0433596837944664, "calib/std_conf": 0.2618946442738743, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4343328335832084, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.06582889657533436, "calib/step_q_w": 0.36850393700787404, "calib/step_q_w_n": 889.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2244.0, "completions/max_terminated_length": 2244.0, "completions/mean_length": 452.6953125, "completions/mean_terminated_length": 454.4706115722656, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.0512, "grad_norm": 0.030040910467505455, "kl": 0.095123291015625, "learning_rate": 4.222222222222223e-06, "loss": -0.0465, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.039761632680892944, "mask/share_reasoning": 0.809786319732666, "mask/share_step_conf": 0.14654578268527985, "num_tokens": 11268195.0, "reward": 1.1682615280151367, "reward_std": 0.13624170422554016, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.76040118932724, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8554352521896362, "step": 48 }, { "adv/mean_abs_final_conf": 0.7727680206298828, "adv/mean_abs_reasoning": 0.4392518997192383, "adv/mean_abs_step_conf": 0.7774852514266968, "adv/ratio_final_to_reasoning": 1.759282136568611, "adv/ratio_step_to_reasoning": 1.7700213748048694, "adv/std_final_conf": 0.9354192018508911, "adv/std_reasoning": 0.7013979554176331, "adv/std_step_conf": 0.9352362155914307, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7392572428543651, "calib/avg_num_step_conf": 6.4375, "calib/ece": 0.10184, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.168, "calib/gap": 0.23718063387128147, "calib/mean_conf": 0.55944, "calib/mu_c": 0.6647482014388489, "calib/mu_w": 0.4275675675675675, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05264000000000002, "calib/std_conf": 0.28427818488234374, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4316390588235294, "calib/step_q_c_n": 850.0, "calib/step_q_gap": 0.07549419666814083, "calib/step_q_w": 0.35614486215538854, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2992.0, "completions/max_terminated_length": 2992.0, "completions/mean_length": 503.390625, "completions/mean_terminated_length": 503.390625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.05226666666666667, "grad_norm": 0.03872883692383766, "kl": 0.0874481201171875, "learning_rate": 4.194444444444445e-06, "loss": -0.0188, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03684543818235397, "mask/share_reasoning": 0.8130807876586914, "mask/share_step_conf": 0.15007378160953522, "num_tokens": 11501599.0, "reward": 1.1672701835632324, "reward_std": 0.16787287592887878, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.770910918712616, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8398154377937317, "step": 49 }, { "adv/mean_abs_final_conf": 0.7783648371696472, "adv/mean_abs_reasoning": 0.4970937967300415, "adv/mean_abs_step_conf": 0.7481029629707336, "adv/ratio_final_to_reasoning": 1.5658309202203877, "adv/ratio_step_to_reasoning": 1.504953326498678, "adv/std_final_conf": 0.9354126453399658, "adv/std_reasoning": 0.7394469380378723, "adv/std_step_conf": 0.9351540207862854, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.737333859718384, "calib/avg_num_step_conf": 6.91796875, "calib/ece": 0.10685258964143424, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.199203187250996, "calib/gap": 0.2498032635873142, "calib/mean_conf": 0.5972111553784861, "calib/mu_c": 0.6987248322147651, "calib/mu_w": 0.4489215686274509, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05521912350597606, "calib/std_conf": 0.29781306343418107, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4361684460260973, "calib/step_q_c_n": 843.0, "calib/step_q_gap": 0.11400249775023519, "calib/step_q_w": 0.3221659482758621, "calib/step_q_w_n": 928.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 497.34765625, "completions/mean_terminated_length": 501.2637634277344, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.05333333333333334, "grad_norm": 0.05003435164690018, "kl": 0.08087158203125, "learning_rate": 4.166666666666667e-06, "loss": -0.0706, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036934275180101395, "mask/share_reasoning": 0.8138121962547302, "mask/share_step_conf": 0.14144101738929749, "num_tokens": 11734280.0, "reward": 1.1801552772521973, "reward_std": 0.19399115443229675, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7727367281913757, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8505699038505554, "step": 50 }, { "adv/mean_abs_final_conf": 0.7511352300643921, "adv/mean_abs_reasoning": 0.36320850253105164, "adv/mean_abs_step_conf": 0.7650404572486877, "adv/ratio_final_to_reasoning": 2.06805519372492, "adv/ratio_step_to_reasoning": 2.1063396146219966, "adv/std_final_conf": 0.9327057003974915, "adv/std_reasoning": 0.6813850402832031, "adv/std_step_conf": 0.9352177977561951, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8028094105869102, "calib/avg_num_step_conf": 6.1328125, "calib/ece": 0.15687747035573124, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2766798418972332, "calib/gap": 0.298421349017645, "calib/mean_conf": 0.6408300395256917, "calib/mu_c": 0.784732824427481, "calib/mu_w": 0.48631147540983605, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13996047430830044, "calib/std_conf": 0.30277586442427024, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38387305699481866, "calib/step_q_c_n": 772.0, "calib/step_q_gap": 0.05847205448855297, "calib/step_q_w": 0.3254010025062657, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2652.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 476.37109375, "completions/mean_terminated_length": 478.2392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.0544, "grad_norm": 0.03629736974835396, "kl": 0.08521270751953125, "learning_rate": 4.138888888888889e-06, "loss": -0.0354, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03722125664353371, "mask/share_reasoning": 0.8207862377166748, "mask/share_step_conf": 0.1380862295627594, "num_tokens": 11965527.0, "reward": 1.1790621280670166, "reward_std": 0.16613981127738953, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7832379341125488, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8499242067337036, "step": 51 }, { "adv/mean_abs_final_conf": 0.7443599104881287, "adv/mean_abs_reasoning": 0.4907376766204834, "adv/mean_abs_step_conf": 0.7508261203765869, "adv/ratio_final_to_reasoning": 1.5168183450152868, "adv/ratio_step_to_reasoning": 1.5299948549849074, "adv/std_final_conf": 0.9310631155967712, "adv/std_reasoning": 0.7392945289611816, "adv/std_step_conf": 0.9352546334266663, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7442952527698291, "calib/avg_num_step_conf": 5.84375, "calib/ece": 0.1342913385826772, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.452755905511811, "calib/gap": 0.29940494533714884, "calib/mean_conf": 0.7053937007874016, "calib/mu_c": 0.7961581920903956, "calib/mu_w": 0.4967532467532468, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0714173228346457, "calib/std_conf": 0.32899818484242627, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3782980866062437, "calib/step_q_c_n": 993.0, "calib/step_q_gap": 0.07326826553268506, "calib/step_q_w": 0.30502982107355864, "calib/step_q_w_n": 503.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 450.37109375, "completions/mean_terminated_length": 450.37109375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.055466666666666664, "grad_norm": 0.040435388684272766, "kl": 0.090972900390625, "learning_rate": 4.111111111111111e-06, "loss": -0.0395, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.037771232426166534, "mask/share_reasoning": 0.821386992931366, "mask/share_step_conf": 0.1408417671918869, "num_tokens": 12188774.0, "reward": 1.20395827293396, "reward_std": 0.17617203295230865, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.8006316423416138, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.847044050693512, "step": 52 }, { "adv/mean_abs_final_conf": 0.7123820781707764, "adv/mean_abs_reasoning": 0.3875897526741028, "adv/mean_abs_step_conf": 0.7608627080917358, "adv/ratio_final_to_reasoning": 1.8379796505346953, "adv/ratio_step_to_reasoning": 1.9630619820114086, "adv/std_final_conf": 0.8869374394416809, "adv/std_reasoning": 0.6815102100372314, "adv/std_step_conf": 0.934977114200592, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7330726621049202, "calib/avg_num_step_conf": 6.8515625, "calib/ece": 0.22224409448818894, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6377952755905512, "calib/gap": 0.24778820462691442, "calib/mean_conf": 0.8262598425196851, "calib/mu_c": 0.9228387096774194, "calib/mu_w": 0.675050505050505, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21913385826771647, "calib/std_conf": 0.2570283452795441, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4098411122144985, "calib/step_q_c_n": 1007.0, "calib/step_q_gap": 0.1232012193095453, "calib/step_q_w": 0.2866398929049532, "calib/step_q_w_n": 747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2443.0, "completions/max_terminated_length": 2443.0, "completions/mean_length": 501.37109375, "completions/mean_terminated_length": 501.37109375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.05653333333333333, "grad_norm": 0.032593466341495514, "kl": 0.0872955322265625, "learning_rate": 4.083333333333334e-06, "loss": -0.0127, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03349916636943817, "mask/share_reasoning": 0.8224176168441772, "mask/share_step_conf": 0.1440831869840622, "num_tokens": 12422949.0, "reward": 1.1682567596435547, "reward_std": 0.18513035774230957, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7612996101379395, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8376426100730896, "step": 53 }, { "adv/mean_abs_final_conf": 0.6633133888244629, "adv/mean_abs_reasoning": 0.4657354950904846, "adv/mean_abs_step_conf": 0.7782049179077148, "adv/ratio_final_to_reasoning": 1.4242276910751503, "adv/ratio_step_to_reasoning": 1.670916058816867, "adv/std_final_conf": 0.8949787616729736, "adv/std_reasoning": 0.7574234008789062, "adv/std_step_conf": 0.9352843761444092, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.697829131652661, "calib/avg_num_step_conf": 5.8203125, "calib/ece": 0.20271653543307086, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7165354330708661, "calib/gap": 0.2250952380952379, "calib/mean_conf": 0.8525590551181103, "calib/mu_c": 0.9269999999999998, "calib/mu_w": 0.7019047619047619, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19299212598425197, "calib/std_conf": 0.2707356310971349, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4095027027027027, "calib/step_q_c_n": 925.0, "calib/step_q_gap": 0.08858234872040183, "calib/step_q_w": 0.3209203539823009, "calib/step_q_w_n": 565.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 422.12890625, "completions/mean_terminated_length": 423.7843322753906, "completions/min_length": 0.0, "completions/min_terminated_length": 95.0, "epoch": 0.0576, "grad_norm": 0.03692952170968056, "kl": 0.1038055419921875, "learning_rate": 4.055555555555556e-06, "loss": -0.0597, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03953488916158676, "mask/share_reasoning": 0.8116753101348877, "mask/share_step_conf": 0.14488358795642853, "num_tokens": 12637246.0, "reward": 1.1712415218353271, "reward_std": 0.20544785261154175, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7653933763504028, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8305597901344299, "step": 54 }, { "adv/mean_abs_final_conf": 0.754909098148346, "adv/mean_abs_reasoning": 0.5777817964553833, "adv/mean_abs_step_conf": 0.7564443349838257, "adv/ratio_final_to_reasoning": 1.306564351420581, "adv/ratio_step_to_reasoning": 1.3092214736160155, "adv/std_final_conf": 0.9008998274803162, "adv/std_reasoning": 0.7928237318992615, "adv/std_step_conf": 0.9352696537971497, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7237327755905512, "calib/avg_num_step_conf": 6.09375, "calib/ece": 0.3145882352941177, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6313725490196078, "calib/gap": 0.23964751476377943, "calib/mean_conf": 0.7945882352941177, "calib/mu_c": 0.9148818897637795, "calib/mu_w": 0.675234375, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30556862745098046, "calib/std_conf": 0.3087300474759994, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3964864864864865, "calib/step_q_c_n": 703.0, "calib/step_q_gap": 0.08968368601974203, "calib/step_q_w": 0.30680280046674446, "calib/step_q_w_n": 857.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 473.15234375, "completions/mean_terminated_length": 475.00787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.058666666666666666, "grad_norm": 0.03538893908262253, "kl": 0.0989227294921875, "learning_rate": 4.027777777777779e-06, "loss": -0.0235, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.036335572600364685, "mask/share_reasoning": 0.8202430009841919, "mask/share_step_conf": 0.13951516151428223, "num_tokens": 12866197.0, "reward": 1.1363887786865234, "reward_std": 0.23207849264144897, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6799824237823486, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8634260296821594, "step": 55 }, { "adv/mean_abs_final_conf": 0.6986392140388489, "adv/mean_abs_reasoning": 0.43160390853881836, "adv/mean_abs_step_conf": 0.735781192779541, "adv/ratio_final_to_reasoning": 1.6187045580844488, "adv/ratio_step_to_reasoning": 1.7047602633407686, "adv/std_final_conf": 0.8800408244132996, "adv/std_reasoning": 0.7013711333274841, "adv/std_step_conf": 0.9352052807807922, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7471084040851482, "calib/avg_num_step_conf": 6.62890625, "calib/ece": 0.3129803921568628, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6431372549019608, "calib/gap": 0.24679032853451455, "calib/mean_conf": 0.7961960784313724, "calib/mu_c": 0.9181395348837209, "calib/mu_w": 0.6713492063492064, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30164705882352943, "calib/std_conf": 0.3198603261111253, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3863864491844417, "calib/step_q_c_n": 797.0, "calib/step_q_gap": 0.09199756029555278, "calib/step_q_w": 0.2943888888888889, "calib/step_q_w_n": 900.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2321.0, "completions/max_terminated_length": 2321.0, "completions/mean_length": 500.06640625, "completions/mean_terminated_length": 502.0274658203125, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.05973333333333333, "grad_norm": 0.038583461195230484, "kl": 0.0832672119140625, "learning_rate": 4.000000000000001e-06, "loss": 0.0199, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035010527819395065, "mask/share_reasoning": 0.8196976184844971, "mask/share_step_conf": 0.14138558506965637, "num_tokens": 13101054.0, "reward": 1.1343872547149658, "reward_std": 0.18083718419075012, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6805593967437744, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8593308329582214, "step": 56 }, { "adv/mean_abs_final_conf": 0.646998941898346, "adv/mean_abs_reasoning": 0.4985702335834503, "adv/mean_abs_step_conf": 0.7729321718215942, "adv/ratio_final_to_reasoning": 1.2977087245022054, "adv/ratio_step_to_reasoning": 1.5502974701601824, "adv/std_final_conf": 0.8325883150100708, "adv/std_reasoning": 0.7393158674240112, "adv/std_step_conf": 0.9350900650024414, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7011326860841424, "calib/avg_num_step_conf": 6.5703125, "calib/ece": 0.2856521739130435, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7549407114624506, "calib/gap": 0.17861229773462772, "calib/mean_conf": 0.8607509881422926, "calib/mu_c": 0.9334666666666666, "calib/mu_w": 0.7548543689320388, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2767588932806324, "calib/std_conf": 0.26906922946717476, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38264637002341917, "calib/step_q_c_n": 854.0, "calib/step_q_gap": 0.09376955842921625, "calib/step_q_w": 0.2888768115942029, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2945.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 489.18359375, "completions/mean_terminated_length": 491.10198974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.0608, "grad_norm": 0.03516416251659393, "kl": 0.0896453857421875, "learning_rate": 3.972222222222223e-06, "loss": 0.0234, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03536321222782135, "mask/share_reasoning": 0.8221592903137207, "mask/share_step_conf": 0.13857123255729675, "num_tokens": 13333077.0, "reward": 1.1429297924041748, "reward_std": 0.19141796231269836, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6924902200698853, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8523504734039307, "step": 57 }, { "adv/mean_abs_final_conf": 0.7294912338256836, "adv/mean_abs_reasoning": 0.6277225613594055, "adv/mean_abs_step_conf": 0.7514354586601257, "adv/ratio_final_to_reasoning": 1.162123649412706, "adv/ratio_step_to_reasoning": 1.1970821265891824, "adv/std_final_conf": 0.9116400480270386, "adv/std_reasoning": 0.843001663684845, "adv/std_step_conf": 0.9354155659675598, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6327699891172142, "calib/avg_num_step_conf": 7.890625, "calib/ece": 0.32192, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.54, "calib/gap": 0.14120478842583717, "calib/mean_conf": 0.76152, "calib/mu_c": 0.8332520325203252, "calib/mu_w": 0.692047244094488, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.29572, "calib/std_conf": 0.3228474711067132, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.34553341148886285, "calib/step_q_c_n": 853.0, "calib/step_q_gap": 0.07273992391388429, "calib/step_q_w": 0.27279348757497857, "calib/step_q_w_n": 1167.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2570.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 583.38671875, "completions/mean_terminated_length": 585.674560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.06186666666666667, "grad_norm": 0.027154145762324333, "kl": 0.10162353515625, "learning_rate": 3.944444444444445e-06, "loss": 0.0132, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.031217699870467186, "mask/share_reasoning": 0.8236774206161499, "mask/share_step_conf": 0.14119866490364075, "num_tokens": 13588744.0, "reward": 1.077712059020996, "reward_std": 0.24210575222969055, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6262355446815491, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8257089257240295, "step": 58 }, { "adv/mean_abs_final_conf": 0.5891823172569275, "adv/mean_abs_reasoning": 0.47933292388916016, "adv/mean_abs_step_conf": 0.7589001655578613, "adv/ratio_final_to_reasoning": 1.2291713919346143, "adv/ratio_step_to_reasoning": 1.5832423097507644, "adv/std_final_conf": 0.8096221685409546, "adv/std_reasoning": 0.7207550406455994, "adv/std_step_conf": 0.9351330399513245, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6594375123786889, "calib/avg_num_step_conf": 6.453125, "calib/ece": 0.27825396825396814, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6626984126984127, "calib/gap": 0.15903743315508012, "calib/mean_conf": 0.8048412698412698, "calib/mu_c": 0.8673202614379084, "calib/mu_w": 0.7082828282828283, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23797619047619037, "calib/std_conf": 0.3100940311229322, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3749768844221105, "calib/step_q_c_n": 995.0, "calib/step_q_gap": 0.04001493617249108, "calib/step_q_w": 0.33496194824961945, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 513.859375, "completions/mean_terminated_length": 517.905517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.06293333333333333, "grad_norm": 0.03092842549085617, "kl": 0.10501861572265625, "learning_rate": 3.916666666666667e-06, "loss": -0.0668, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03509576991200447, "mask/share_reasoning": 0.8163177371025085, "mask/share_step_conf": 0.1407739818096161, "num_tokens": 13826540.0, "reward": 1.132265567779541, "reward_std": 0.19192655384540558, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6911335587501526, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8379942774772644, "step": 59 }, { "adv/mean_abs_final_conf": 0.6356889009475708, "adv/mean_abs_reasoning": 0.5132553577423096, "adv/mean_abs_step_conf": 0.7455482482910156, "adv/ratio_final_to_reasoning": 1.238543137170195, "adv/ratio_step_to_reasoning": 1.452587366200147, "adv/std_final_conf": 0.876144528388977, "adv/std_reasoning": 0.7927860021591187, "adv/std_step_conf": 0.9351310729980469, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7157082479508197, "calib/avg_num_step_conf": 6.9609375, "calib/ece": 0.29088, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.644, "calib/gap": 0.2622540983606557, "calib/mean_conf": 0.77952, "calib/mu_c": 0.9075, "calib/mu_w": 0.6452459016393443, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2792, "calib/std_conf": 0.3344616115490685, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.39738062755798087, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.11975431678581688, "calib/step_q_w": 0.277626310772164, "calib/step_q_w_n": 1049.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2950.0, "completions/max_terminated_length": 2950.0, "completions/mean_length": 497.171875, "completions/mean_terminated_length": 501.08660888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.064, "grad_norm": 0.0763402134180069, "kl": 0.0853118896484375, "learning_rate": 3.88888888888889e-06, "loss": 0.0266, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03505894914269447, "mask/share_reasoning": 0.8122283816337585, "mask/share_step_conf": 0.1449001580476761, "num_tokens": 14062672.0, "reward": 1.1305547952651978, "reward_std": 0.24602213501930237, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6814101338386536, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.856257975101471, "step": 60 }, { "adv/mean_abs_final_conf": 0.6465821266174316, "adv/mean_abs_reasoning": 0.5136411190032959, "adv/mean_abs_step_conf": 0.7517024278640747, "adv/ratio_final_to_reasoning": 1.258820804440469, "adv/ratio_step_to_reasoning": 1.463477903254181, "adv/std_final_conf": 0.8604548573493958, "adv/std_reasoning": 0.7753183841705322, "adv/std_step_conf": 0.9348891973495483, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6626819126819127, "calib/avg_num_step_conf": 6.1796875, "calib/ece": 0.26952380952380944, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6706349206349206, "calib/gap": 0.1926559251559249, "calib/mean_conf": 0.821031746031746, "calib/mu_c": 0.9005405405405403, "calib/mu_w": 0.7078846153846154, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25162698412698403, "calib/std_conf": 0.29678657118114243, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4120119760479042, "calib/step_q_c_n": 835.0, "calib/step_q_gap": 0.11402000817641822, "calib/step_q_w": 0.29799196787148596, "calib/step_q_w_n": 747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 431.1953125, "completions/mean_terminated_length": 431.1953125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.06506666666666666, "grad_norm": 0.03931074216961861, "kl": 0.09100341796875, "learning_rate": 3.861111111111112e-06, "loss": 0.0669, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04118172079324722, "mask/share_reasoning": 0.8074016571044922, "mask/share_step_conf": 0.1514166295528412, "num_tokens": 14277122.0, "reward": 1.138131856918335, "reward_std": 0.20521759986877441, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6933664083480835, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8474522829055786, "step": 61 }, { "adv/mean_abs_final_conf": 0.7413532137870789, "adv/mean_abs_reasoning": 0.6082359552383423, "adv/mean_abs_step_conf": 0.7599615454673767, "adv/ratio_final_to_reasoning": 1.218857924136651, "adv/ratio_step_to_reasoning": 1.2494518597960549, "adv/std_final_conf": 0.8915730118751526, "adv/std_reasoning": 0.7930753827095032, "adv/std_step_conf": 0.9355272054672241, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6220703124999999, "calib/avg_num_step_conf": 7.06640625, "calib/ece": 0.24818548387096775, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.41935483870967744, "calib/gap": 0.17193229166666668, "calib/mean_conf": 0.6165725806451613, "calib/mu_c": 0.699765625, "calib/mu_w": 0.5278333333333333, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1743145161290323, "calib/std_conf": 0.37285184018863304, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3638804220398593, "calib/step_q_c_n": 853.0, "calib/step_q_gap": 0.05817958521977562, "calib/step_q_w": 0.3057008368200837, "calib/step_q_w_n": 956.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 519.65234375, "completions/mean_terminated_length": 523.7440795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.06613333333333334, "grad_norm": 0.041193604469299316, "kl": 0.19834136962890625, "learning_rate": 3.833333333333334e-06, "loss": 0.0184, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.033691082149744034, "mask/share_reasoning": 0.8175060749053955, "mask/share_step_conf": 0.14099037647247314, "num_tokens": 14517233.0, "reward": 1.0956416130065918, "reward_std": 0.2342512458562851, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6636785268783569, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8230906128883362, "step": 62 }, { "adv/mean_abs_final_conf": 0.7436953186988831, "adv/mean_abs_reasoning": 0.5573487281799316, "adv/mean_abs_step_conf": 0.7506574392318726, "adv/ratio_final_to_reasoning": 1.334344694976665, "adv/ratio_step_to_reasoning": 1.3468361929939385, "adv/std_final_conf": 0.9340667128562927, "adv/std_reasoning": 0.775478184223175, "adv/std_step_conf": 0.935070276260376, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8093303713106952, "calib/avg_num_step_conf": 6.62109375, "calib/ece": 0.10964285714285713, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2857142857142857, "calib/gap": 0.39582164392256414, "calib/mean_conf": 0.5507539682539684, "calib/mu_c": 0.7313868613138685, "calib/mu_w": 0.3355652173913044, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05837301587301586, "calib/std_conf": 0.3528925639491461, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39803337306317044, "calib/step_q_c_n": 839.0, "calib/step_q_gap": 0.10076701792298354, "calib/step_q_w": 0.2972663551401869, "calib/step_q_w_n": 856.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2231.0, "completions/max_terminated_length": 2231.0, "completions/mean_length": 544.3828125, "completions/mean_terminated_length": 548.6693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.0672, "grad_norm": 0.07931511104106903, "kl": 0.0871429443359375, "learning_rate": 3.8055555555555556e-06, "loss": -0.0398, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03265227749943733, "mask/share_reasoning": 0.8248114585876465, "mask/share_step_conf": 0.1347237527370453, "num_tokens": 14765235.0, "reward": 1.2072186470031738, "reward_std": 0.1769953966140747, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.8108534812927246, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8664516806602478, "step": 63 }, { "adv/mean_abs_final_conf": 0.7594350576400757, "adv/mean_abs_reasoning": 0.6033494472503662, "adv/mean_abs_step_conf": 0.7881118655204773, "adv/ratio_final_to_reasoning": 1.2586985222262748, "adv/ratio_step_to_reasoning": 1.3062278736014852, "adv/std_final_conf": 0.9346402883529663, "adv/std_reasoning": 0.8266748189926147, "adv/std_step_conf": 0.9354366660118103, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6618489037843877, "calib/avg_num_step_conf": 6.3359375, "calib/ece": 0.21761133603238866, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.2591093117408907, "calib/gap": 0.19151096215612334, "calib/mean_conf": 0.5554251012145749, "calib/mu_c": 0.6275324675324675, "calib/mu_w": 0.43602150537634415, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07477732793522268, "calib/std_conf": 0.34017339210741376, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38183770883054896, "calib/step_q_c_n": 838.0, "calib/step_q_gap": 0.0992611782183041, "calib/step_q_w": 0.28257653061224486, "calib/step_q_w_n": 784.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2924.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 498.33203125, "completions/mean_terminated_length": 502.2558898925781, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.06826666666666667, "grad_norm": 0.051069580018520355, "kl": 0.0894927978515625, "learning_rate": 3.777777777777778e-06, "loss": 0.0888, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03600117191672325, "mask/share_reasoning": 0.822626531124115, "mask/share_step_conf": 0.13355976343154907, "num_tokens": 14996584.0, "reward": 1.1186976432800293, "reward_std": 0.21334236860275269, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7089800834655762, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8100893497467041, "step": 64 }, { "adv/mean_abs_final_conf": 0.7145624160766602, "adv/mean_abs_reasoning": 0.4103482961654663, "adv/mean_abs_step_conf": 0.7509387731552124, "adv/ratio_final_to_reasoning": 1.7413558743973057, "adv/ratio_step_to_reasoning": 1.8300033902234323, "adv/std_final_conf": 0.8932875394821167, "adv/std_reasoning": 0.6815477609634399, "adv/std_step_conf": 0.9347946643829346, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8041538461538461, "calib/avg_num_step_conf": 5.64453125, "calib/ece": 0.18450980392156868, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.36470588235294116, "calib/gap": 0.3085692307692308, "calib/mean_conf": 0.6925098039215688, "calib/mu_c": 0.8437692307692307, "calib/mu_w": 0.5351999999999999, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18360784313725498, "calib/std_conf": 0.3136064332916695, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4357640750670241, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.05981271598261778, "calib/step_q_w": 0.3759513590844063, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2419.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 401.5703125, "completions/mean_terminated_length": 401.5703125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.06933333333333333, "grad_norm": 0.03955350071191788, "kl": 0.090484619140625, "learning_rate": 3.7500000000000005e-06, "loss": -0.0301, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.041694652289152145, "mask/share_reasoning": 0.8116481900215149, "mask/share_step_conf": 0.14665712416172028, "num_tokens": 15204410.0, "reward": 1.1743881702423096, "reward_std": 0.1349031776189804, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7695730924606323, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8522812128067017, "step": 65 }, { "adv/mean_abs_final_conf": 0.7148923277854919, "adv/mean_abs_reasoning": 0.46863657236099243, "adv/mean_abs_step_conf": 0.7454225420951843, "adv/ratio_final_to_reasoning": 1.525472764927121, "adv/ratio_step_to_reasoning": 1.590619652964222, "adv/std_final_conf": 0.9109669327735901, "adv/std_reasoning": 0.7575206756591797, "adv/std_step_conf": 0.9346955418586731, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7966229508196722, "calib/avg_num_step_conf": 7.65234375, "calib/ece": 0.08914979757085015, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.242914979757085, "calib/gap": 0.36650950819672135, "calib/mean_conf": 0.534251012145749, "calib/mu_c": 0.71528, "calib/mu_w": 0.3487704918032787, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05866396761133601, "calib/std_conf": 0.33597544515327177, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40850253807106596, "calib/step_q_c_n": 788.0, "calib/step_q_gap": 0.1352019402913905, "calib/step_q_w": 0.27330059777967547, "calib/step_q_w_n": 1171.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 598.10546875, "completions/mean_terminated_length": 598.10546875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.0704, "grad_norm": 0.03529307246208191, "kl": 0.0828857421875, "learning_rate": 3.7222222222222225e-06, "loss": 0.1242, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.032410524785518646, "mask/share_reasoning": 0.8293423652648926, "mask/share_step_conf": 0.13824716210365295, "num_tokens": 15463877.0, "reward": 1.1834068298339844, "reward_std": 0.1921721249818802, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.790777325630188, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8569409847259521, "step": 66 }, { "adv/mean_abs_final_conf": 0.7141172885894775, "adv/mean_abs_reasoning": 0.38203346729278564, "adv/mean_abs_step_conf": 0.7260956764221191, "adv/ratio_final_to_reasoning": 1.869253219226961, "adv/ratio_step_to_reasoning": 1.9006075084663945, "adv/std_final_conf": 0.9237503409385681, "adv/std_reasoning": 0.6815032958984375, "adv/std_step_conf": 0.9345933794975281, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7906249999999999, "calib/avg_num_step_conf": 6.69921875, "calib/ece": 0.09826086956521743, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4031620553359684, "calib/gap": 0.36772513440860205, "calib/mean_conf": 0.6520158102766799, "calib/mu_c": 0.7871874999999999, "calib/mu_w": 0.4194623655913979, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.058932806324110736, "calib/std_conf": 0.3431432935497034, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4334216101694915, "calib/step_q_c_n": 944.0, "calib/step_q_gap": 0.1032270576403086, "calib/step_q_w": 0.3301945525291829, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2119.0, "completions/max_terminated_length": 2119.0, "completions/mean_length": 489.15625, "completions/mean_terminated_length": 491.07452392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.07146666666666666, "grad_norm": 0.061680082231760025, "kl": 0.088104248046875, "learning_rate": 3.694444444444445e-06, "loss": -0.0804, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03602088615298271, "mask/share_reasoning": 0.8216089010238647, "mask/share_step_conf": 0.13846397399902344, "num_tokens": 15694109.0, "reward": 1.2258788347244263, "reward_std": 0.1442015916109085, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.8107554316520691, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8788971900939941, "step": 67 }, { "adv/mean_abs_final_conf": 0.6820115447044373, "adv/mean_abs_reasoning": 0.46427327394485474, "adv/mean_abs_step_conf": 0.7704893350601196, "adv/ratio_final_to_reasoning": 1.4689873033386043, "adv/ratio_step_to_reasoning": 1.6595599581113007, "adv/std_final_conf": 0.8864242434501648, "adv/std_reasoning": 0.7391756772994995, "adv/std_step_conf": 0.9349017143249512, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7717391304347826, "calib/avg_num_step_conf": 6.6484375, "calib/ece": 0.1602777777777778, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.43253968253968256, "calib/gap": 0.3402326468344775, "calib/mean_conf": 0.7078968253968254, "calib/mu_c": 0.8618115942028985, "calib/mu_w": 0.521578947368421, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1602777777777778, "calib/std_conf": 0.32516531398427684, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5014229765013054, "calib/step_q_c_n": 766.0, "calib/step_q_gap": 0.1717328055611344, "calib/step_q_w": 0.329690170940171, "calib/step_q_w_n": 936.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 493.16015625, "completions/mean_terminated_length": 493.16015625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.07253333333333334, "grad_norm": 0.04005345329642296, "kl": 0.0855255126953125, "learning_rate": 3.6666666666666666e-06, "loss": -0.0084, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.036960065364837646, "mask/share_reasoning": 0.8169299364089966, "mask/share_step_conf": 0.14610998332500458, "num_tokens": 15924446.0, "reward": 1.1919140815734863, "reward_std": 0.18502911925315857, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7743504047393799, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8709017038345337, "step": 68 }, { "adv/mean_abs_final_conf": 0.7154502868652344, "adv/mean_abs_reasoning": 0.46667271852493286, "adv/mean_abs_step_conf": 0.7458276748657227, "adv/ratio_final_to_reasoning": 1.5330878760743545, "adv/ratio_step_to_reasoning": 1.598181434781848, "adv/std_final_conf": 0.9064786434173584, "adv/std_reasoning": 0.7393455505371094, "adv/std_step_conf": 0.9349368810653687, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7176014854654885, "calib/avg_num_step_conf": 6.6953125, "calib/ece": 0.22788844621513943, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.36254980079681276, "calib/gap": 0.26841208861569976, "calib/mean_conf": 0.6157768924302789, "calib/mu_c": 0.7622807017543859, "calib/mu_w": 0.49386861313868613, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1947410358565737, "calib/std_conf": 0.3426026673469013, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4538686131386861, "calib/step_q_c_n": 685.0, "calib/step_q_gap": 0.09047697076745187, "calib/step_q_w": 0.3633916423712342, "calib/step_q_w_n": 1029.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 552.44140625, "completions/mean_terminated_length": 554.6078491210938, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.0736, "grad_norm": 0.06692394614219666, "kl": 0.24575042724609375, "learning_rate": 3.638888888888889e-06, "loss": 0.0174, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032987501472234726, "mask/share_reasoning": 0.831560492515564, "mask/share_step_conf": 0.131545752286911, "num_tokens": 16170367.0, "reward": 1.145660638809204, "reward_std": 0.21713489294052124, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7272031307220459, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8526413440704346, "step": 69 }, { "adv/mean_abs_final_conf": 0.7055125832557678, "adv/mean_abs_reasoning": 0.48625022172927856, "adv/mean_abs_step_conf": 0.7672015428543091, "adv/ratio_final_to_reasoning": 1.4509249594719245, "adv/ratio_step_to_reasoning": 1.5777916565793384, "adv/std_final_conf": 0.903593122959137, "adv/std_reasoning": 0.7575156688690186, "adv/std_step_conf": 0.9353858828544617, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8324080267558528, "calib/avg_num_step_conf": 7.1328125, "calib/ece": 0.19293877551020416, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4816326530612245, "calib/gap": 0.42121070234113706, "calib/mean_conf": 0.6623265306122449, "calib/mu_c": 0.8858260869565217, "calib/mu_w": 0.4646153846153846, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19293877551020416, "calib/std_conf": 0.36361099351126486, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5061911764705882, "calib/step_q_c_n": 680.0, "calib/step_q_gap": 0.1676920490709372, "calib/step_q_w": 0.338499127399651, "calib/step_q_w_n": 1146.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2419.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 524.58203125, "completions/mean_terminated_length": 530.8023681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.07466666666666667, "grad_norm": 0.05342242121696472, "kl": 0.12483978271484375, "learning_rate": 3.6111111111111115e-06, "loss": -0.0851, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03515394777059555, "mask/share_reasoning": 0.8060823678970337, "mask/share_step_conf": 0.14704498648643494, "num_tokens": 16411652.0, "reward": 1.1459590196609497, "reward_std": 0.2323385775089264, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7573128938674927, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8355700373649597, "step": 70 }, { "adv/mean_abs_final_conf": 0.7016242742538452, "adv/mean_abs_reasoning": 0.5792092084884644, "adv/mean_abs_step_conf": 0.7449058294296265, "adv/ratio_final_to_reasoning": 1.2113486180318884, "adv/ratio_step_to_reasoning": 1.286073871949607, "adv/std_final_conf": 0.8915162086486816, "adv/std_reasoning": 0.8265349864959717, "adv/std_step_conf": 0.9350942373275757, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6868696727178085, "calib/avg_num_step_conf": 6.8125, "calib/ece": 0.22936000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.448, "calib/gap": 0.21375886524822707, "calib/mean_conf": 0.66056, "calib/mu_c": 0.753758865248227, "calib/mu_w": 0.5399999999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.16296000000000005, "calib/std_conf": 0.3538594161527993, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4909367681498829, "calib/step_q_c_n": 854.0, "calib/step_q_gap": 0.11885811646448968, "calib/step_q_w": 0.3720786516853932, "calib/step_q_w_n": 890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 521.33203125, "completions/mean_terminated_length": 521.33203125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.07573333333333333, "grad_norm": 0.046845778822898865, "kl": 0.15727996826171875, "learning_rate": 3.5833333333333335e-06, "loss": -0.007, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.034949902445077896, "mask/share_reasoning": 0.8204107284545898, "mask/share_step_conf": 0.14463931322097778, "num_tokens": 16649521.0, "reward": 1.1325509548187256, "reward_std": 0.23369169235229492, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.707699179649353, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8346226215362549, "step": 71 }, { "adv/mean_abs_final_conf": 0.6370866894721985, "adv/mean_abs_reasoning": 0.3830593228340149, "adv/mean_abs_step_conf": 0.7415189146995544, "adv/ratio_final_to_reasoning": 1.6631541160747503, "adv/ratio_step_to_reasoning": 1.9357808843119195, "adv/std_final_conf": 0.8679909706115723, "adv/std_reasoning": 0.6815306544303894, "adv/std_step_conf": 0.9352692365646362, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8205185659411013, "calib/avg_num_step_conf": 6.53125, "calib/ece": 0.21690476190476196, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5952380952380952, "calib/gap": 0.3527323943661972, "calib/mean_conf": 0.7747619047619047, "calib/mu_c": 0.9287323943661971, "calib/mu_w": 0.576, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21408730158730163, "calib/std_conf": 0.3167887340812743, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.542945914844649, "calib/step_q_c_n": 869.0, "calib/step_q_gap": 0.1503058152182481, "calib/step_q_w": 0.39264009962640095, "calib/step_q_w_n": 803.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2398.0, "completions/max_terminated_length": 2398.0, "completions/mean_length": 479.93359375, "completions/mean_terminated_length": 479.93359375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.0768, "grad_norm": 0.05887433886528015, "kl": 0.102203369140625, "learning_rate": 3.555555555555556e-06, "loss": 0.0013, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03538592532277107, "mask/share_reasoning": 0.8194608688354492, "mask/share_step_conf": 0.14515320956707, "num_tokens": 16876792.0, "reward": 1.1849489212036133, "reward_std": 0.1923227161169052, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7667582035064697, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8640721440315247, "step": 72 }, { "adv/mean_abs_final_conf": 0.6404528617858887, "adv/mean_abs_reasoning": 0.5191984176635742, "adv/mean_abs_step_conf": 0.7633548378944397, "adv/ratio_final_to_reasoning": 1.2335416287822432, "adv/ratio_step_to_reasoning": 1.47025647984365, "adv/std_final_conf": 0.8272820711135864, "adv/std_reasoning": 0.7753027677536011, "adv/std_step_conf": 0.9352089166641235, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7346065406234031, "calib/avg_num_step_conf": 6.1875, "calib/ece": 0.21533333333333327, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6431372549019608, "calib/gap": 0.2979605263157894, "calib/mean_conf": 0.777607843137255, "calib/mu_c": 0.8979605263157895, "calib/mu_w": 0.6000000000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19843137254901952, "calib/std_conf": 0.33210109956253087, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.54545067264574, "calib/step_q_c_n": 892.0, "calib/step_q_gap": 0.12984807148967065, "calib/step_q_w": 0.41560260115606934, "calib/step_q_w_n": 692.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 468.4609375, "completions/mean_terminated_length": 470.2980651855469, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.07786666666666667, "grad_norm": 0.027524832636117935, "kl": 0.09661865234375, "learning_rate": 3.5277777777777784e-06, "loss": -0.0586, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03430914878845215, "mask/share_reasoning": 0.8275830745697021, "mask/share_step_conf": 0.1342015266418457, "num_tokens": 17103750.0, "reward": 1.1775658130645752, "reward_std": 0.2001640647649765, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7525933384895325, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8569004535675049, "step": 73 }, { "adv/mean_abs_final_conf": 0.6963641047477722, "adv/mean_abs_reasoning": 0.4993099272251129, "adv/mean_abs_step_conf": 0.763095498085022, "adv/ratio_final_to_reasoning": 1.394653033673448, "adv/ratio_step_to_reasoning": 1.5283002729905304, "adv/std_final_conf": 0.8839811682701111, "adv/std_reasoning": 0.7575883269309998, "adv/std_step_conf": 0.9351733326911926, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.733104674796748, "calib/avg_num_step_conf": 6.42578125, "calib/ece": 0.2261354581673307, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.47410358565737054, "calib/gap": 0.27355119410569095, "calib/mean_conf": 0.6674103585657369, "calib/mu_c": 0.806910569105691, "calib/mu_w": 0.5333593750000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20175298804780878, "calib/std_conf": 0.361738920694594, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4805170068027211, "calib/step_q_c_n": 735.0, "calib/step_q_gap": 0.09661590790162217, "calib/step_q_w": 0.3839010989010989, "calib/step_q_w_n": 910.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 495.07421875, "completions/mean_terminated_length": 498.9724426269531, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.07893333333333333, "grad_norm": 0.029221879318356514, "kl": 0.10986328125, "learning_rate": 3.5e-06, "loss": -0.0388, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035621289163827896, "mask/share_reasoning": 0.8160022497177124, "mask/share_step_conf": 0.1405639499425888, "num_tokens": 17334417.0, "reward": 1.1277399063110352, "reward_std": 0.23573684692382812, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7065363526344299, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8383581638336182, "step": 74 }, { "adv/mean_abs_final_conf": 0.5262219309806824, "adv/mean_abs_reasoning": 0.47117334604263306, "adv/mean_abs_step_conf": 0.7681576013565063, "adv/ratio_final_to_reasoning": 1.1168329775026544, "adv/ratio_step_to_reasoning": 1.630307842767917, "adv/std_final_conf": 0.778659999370575, "adv/std_reasoning": 0.7391869425773621, "adv/std_step_conf": 0.9349520206451416, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8094276094276096, "calib/avg_num_step_conf": 5.77734375, "calib/ece": 0.1750980392156863, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7137254901960784, "calib/gap": 0.4265555555555557, "calib/mean_conf": 0.8001176470588235, "calib/mu_c": 0.9506666666666668, "calib/mu_w": 0.5241111111111111, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16407843137254904, "calib/std_conf": 0.34238778880660364, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5480240174672488, "calib/step_q_c_n": 916.0, "calib/step_q_gap": 0.12948050059335897, "calib/step_q_w": 0.41854351687388985, "calib/step_q_w_n": 563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1407.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 413.14453125, "completions/mean_terminated_length": 414.7647399902344, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.08, "grad_norm": 0.035005368292331696, "kl": 0.12042236328125, "learning_rate": 3.4722222222222224e-06, "loss": -0.0406, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.038583215326070786, "mask/share_reasoning": 0.8095848560333252, "mask/share_step_conf": 0.14792568981647491, "num_tokens": 17544934.0, "reward": 1.2190998792648315, "reward_std": 0.19501495361328125, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.8225722312927246, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8583348989486694, "step": 75 }, { "adv/mean_abs_final_conf": 0.5986725091934204, "adv/mean_abs_reasoning": 0.49344608187675476, "adv/mean_abs_step_conf": 0.7562553882598877, "adv/ratio_final_to_reasoning": 1.2132480754866901, "adv/ratio_step_to_reasoning": 1.5325998443103928, "adv/std_final_conf": 0.8273491263389587, "adv/std_reasoning": 0.7574414610862732, "adv/std_step_conf": 0.9353153109550476, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7254597254597255, "calib/avg_num_step_conf": 5.83203125, "calib/ece": 0.19760784313725496, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.611764705882353, "calib/gap": 0.3269153069153068, "calib/mean_conf": 0.7394901960784314, "calib/mu_c": 0.8664102564102564, "calib/mu_w": 0.5394949494949496, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16266666666666674, "calib/std_conf": 0.36759117169992844, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4986483253588517, "calib/step_q_c_n": 836.0, "calib/step_q_gap": 0.13443219141669038, "calib/step_q_w": 0.3642161339421613, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2337.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 445.44921875, "completions/mean_terminated_length": 445.44921875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.08106666666666666, "grad_norm": 0.040073275566101074, "kl": 0.1200408935546875, "learning_rate": 3.444444444444445e-06, "loss": 0.1011, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.038839176297187805, "mask/share_reasoning": 0.8191479444503784, "mask/share_step_conf": 0.14201289415359497, "num_tokens": 17762025.0, "reward": 1.187859296798706, "reward_std": 0.17804092168807983, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7633512020111084, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.860849142074585, "step": 76 }, { "adv/mean_abs_final_conf": 0.694234311580658, "adv/mean_abs_reasoning": 0.4896814823150635, "adv/mean_abs_step_conf": 0.7751322388648987, "adv/ratio_final_to_reasoning": 1.4177262907687087, "adv/ratio_step_to_reasoning": 1.5829314908954935, "adv/std_final_conf": 0.8711609244346619, "adv/std_reasoning": 0.7207184433937073, "adv/std_step_conf": 0.9352930784225464, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6509993337774818, "calib/avg_num_step_conf": 6.76171875, "calib/ece": 0.2668770750988142, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5177865612648221, "calib/gap": 0.2290602398401066, "calib/mean_conf": 0.638103162055336, "calib/mu_c": 0.7241139240506329, "calib/mu_w": 0.4950536842105263, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14023715415019758, "calib/std_conf": 0.40786421303490994, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43599173553719006, "calib/step_q_c_n": 968.0, "calib/step_q_gap": 0.07213852452801572, "calib/step_q_w": 0.36385321100917434, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 443.3359375, "completions/mean_terminated_length": 448.5928955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.08213333333333334, "grad_norm": 0.035977620631456375, "kl": 0.1226959228515625, "learning_rate": 3.416666666666667e-06, "loss": -0.1244, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04012027755379677, "mask/share_reasoning": 0.7930710315704346, "mask/share_step_conf": 0.15508995950222015, "num_tokens": 17980183.0, "reward": 1.145344614982605, "reward_std": 0.19340042769908905, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6981140375137329, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8476542234420776, "step": 77 }, { "adv/mean_abs_final_conf": 0.6861317753791809, "adv/mean_abs_reasoning": 0.42679959535598755, "adv/mean_abs_step_conf": 0.7618663311004639, "adv/ratio_final_to_reasoning": 1.607620491783475, "adv/ratio_step_to_reasoning": 1.785068072674722, "adv/std_final_conf": 0.9054518342018127, "adv/std_reasoning": 0.7013146281242371, "adv/std_step_conf": 0.9350811243057251, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7512456638284454, "calib/avg_num_step_conf": 6.015625, "calib/ece": 0.24960937499999997, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.48828125, "calib/gap": 0.33470829391359197, "calib/mean_conf": 0.6021875000000001, "calib/mu_c": 0.7394701986754967, "calib/mu_w": 0.4047619047619047, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13097656249999995, "calib/std_conf": 0.4208203251908705, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4692647058823529, "calib/step_q_c_n": 884.0, "calib/step_q_gap": 0.1322372668579626, "calib/step_q_w": 0.3370274390243903, "calib/step_q_w_n": 656.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1291.0, "completions/max_terminated_length": 1291.0, "completions/mean_length": 477.24609375, "completions/mean_terminated_length": 479.11767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.0832, "grad_norm": 0.04717475175857544, "kl": 0.1195831298828125, "learning_rate": 3.3888888888888893e-06, "loss": -0.0247, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03434506803750992, "mask/share_reasoning": 0.8292993307113647, "mask/share_step_conf": 0.13244935870170593, "num_tokens": 18210382.0, "reward": 1.1887092590332031, "reward_std": 0.18209871649742126, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7424741983413696, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8785045742988586, "step": 78 }, { "adv/mean_abs_final_conf": 0.6638272404670715, "adv/mean_abs_reasoning": 0.46231958270072937, "adv/mean_abs_step_conf": 0.7449385523796082, "adv/ratio_final_to_reasoning": 1.4358622591524162, "adv/ratio_step_to_reasoning": 1.6113065079958442, "adv/std_final_conf": 0.8744451999664307, "adv/std_reasoning": 0.7206392884254456, "adv/std_step_conf": 0.9351168274879456, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6891389432485323, "calib/avg_num_step_conf": 6.79296875, "calib/ece": 0.24629482071713144, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4820717131474104, "calib/gap": 0.3075264187866928, "calib/mean_conf": 0.6057370517928288, "calib/mu_c": 0.7343835616438356, "calib/mu_w": 0.42685714285714277, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13517928286852587, "calib/std_conf": 0.4147179651512537, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4479030439684329, "calib/step_q_c_n": 887.0, "calib/step_q_gap": 0.14023872471960652, "calib/step_q_w": 0.30766431924882637, "calib/step_q_w_n": 852.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 480.0703125, "completions/mean_terminated_length": 487.69049072265625, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.08426666666666667, "grad_norm": 0.12207309156656265, "kl": 0.6846923828125, "learning_rate": 3.3611111111111117e-06, "loss": -0.0241, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03398709371685982, "mask/share_reasoning": 0.8186401128768921, "mask/share_step_conf": 0.1317477524280548, "num_tokens": 18439656.0, "reward": 1.1509122848510742, "reward_std": 0.21398352086544037, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7164999842643738, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8506331443786621, "step": 79 }, { "adv/mean_abs_final_conf": 0.5320920944213867, "adv/mean_abs_reasoning": 0.3927716910839081, "adv/mean_abs_step_conf": 0.750799298286438, "adv/ratio_final_to_reasoning": 1.3547109083982214, "adv/ratio_step_to_reasoning": 1.9115412727798762, "adv/std_final_conf": 0.7759950757026672, "adv/std_reasoning": 0.6815386414527893, "adv/std_step_conf": 0.9349751472473145, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6903703703703703, "calib/avg_num_step_conf": 6.390625, "calib/ece": 0.2574117647058824, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7098039215686275, "calib/gap": 0.2938585858585858, "calib/mean_conf": 0.7692549019607843, "calib/mu_c": 0.872969696969697, "calib/mu_w": 0.5791111111111111, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18980392156862746, "calib/std_conf": 0.3844199012995899, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45761523046092184, "calib/step_q_c_n": 998.0, "calib/step_q_gap": 0.12296005804712873, "calib/step_q_w": 0.3346551724137931, "calib/step_q_w_n": 638.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2091.0, "completions/max_terminated_length": 2091.0, "completions/mean_length": 427.796875, "completions/mean_terminated_length": 429.47454833984375, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.08533333333333333, "grad_norm": 0.22899942100048065, "kl": 1.1295013427734375, "learning_rate": 3.3333333333333333e-06, "loss": -0.033, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03959141671657562, "mask/share_reasoning": 0.807418704032898, "mask/share_step_conf": 0.14908364415168762, "num_tokens": 18651332.0, "reward": 1.1782162189483643, "reward_std": 0.17925795912742615, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.740231990814209, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8587168455123901, "step": 80 }, { "adv/mean_abs_final_conf": 0.6153532266616821, "adv/mean_abs_reasoning": 0.5657199025154114, "adv/mean_abs_step_conf": 0.7426446676254272, "adv/ratio_final_to_reasoning": 1.087734802904373, "adv/ratio_step_to_reasoning": 1.3127426917867646, "adv/std_final_conf": 0.8280157446861267, "adv/std_reasoning": 0.8098756670951843, "adv/std_step_conf": 0.9354367852210999, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7782327301545371, "calib/avg_num_step_conf": 6.99609375, "calib/ece": 0.21093117408906886, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.582995951417004, "calib/gap": 0.46848236692642986, "calib/mean_conf": 0.6295546558704453, "calib/mu_c": 0.8438805970149255, "calib/mu_w": 0.3753982300884956, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1489878542510122, "calib/std_conf": 0.45019766694245333, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47373198847262254, "calib/step_q_c_n": 694.0, "calib/step_q_gap": 0.19115222548264987, "calib/step_q_w": 0.28257976298997267, "calib/step_q_w_n": 1097.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 523.76171875, "completions/mean_terminated_length": 529.9723510742188, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.0864, "grad_norm": 0.04257092624902725, "kl": 0.1041107177734375, "learning_rate": 3.3055555555555558e-06, "loss": 0.0727, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03545115515589714, "mask/share_reasoning": 0.8166124820709229, "mask/share_step_conf": 0.1362176388502121, "num_tokens": 18891663.0, "reward": 1.1495893001556396, "reward_std": 0.2690851092338562, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7468858957290649, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8364243507385254, "step": 81 }, { "adv/mean_abs_final_conf": 0.6698199510574341, "adv/mean_abs_reasoning": 0.5826090574264526, "adv/mean_abs_step_conf": 0.7445812821388245, "adv/ratio_final_to_reasoning": 1.1496902468633363, "adv/ratio_step_to_reasoning": 1.2780118548582964, "adv/std_final_conf": 0.8605046272277832, "adv/std_reasoning": 0.8098371028900146, "adv/std_step_conf": 0.9354823231697083, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.683967408738051, "calib/avg_num_step_conf": 6.26171875, "calib/ece": 0.31599206349206355, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6388888888888888, "calib/gap": 0.2821293385513568, "calib/mean_conf": 0.7020238095238095, "calib/mu_c": 0.824055944055944, "calib/mu_w": 0.5419266055045872, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22527777777777783, "calib/std_conf": 0.4171723971656276, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.46937583001328026, "calib/step_q_c_n": 753.0, "calib/step_q_gap": 0.16399935942504495, "calib/step_q_w": 0.3053764705882353, "calib/step_q_w_n": 850.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 445.4140625, "completions/mean_terminated_length": 445.4140625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.08746666666666666, "grad_norm": 0.05169864743947983, "kl": 0.134735107421875, "learning_rate": 3.277777777777778e-06, "loss": 0.0822, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.039256948977708817, "mask/share_reasoning": 0.8154367208480835, "mask/share_step_conf": 0.14530633389949799, "num_tokens": 19111241.0, "reward": 1.1285752058029175, "reward_std": 0.26243820786476135, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6821433305740356, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8453171253204346, "step": 82 }, { "adv/mean_abs_final_conf": 0.6228658556938171, "adv/mean_abs_reasoning": 0.4538366198539734, "adv/mean_abs_step_conf": 0.7593032121658325, "adv/ratio_final_to_reasoning": 1.3724451233006068, "adv/ratio_step_to_reasoning": 1.6730761224383923, "adv/std_final_conf": 0.8233391642570496, "adv/std_reasoning": 0.7206999063491821, "adv/std_step_conf": 0.9348890781402588, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6860727471625871, "calib/avg_num_step_conf": 6.4765625, "calib/ece": 0.273705306122449, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5877551020408164, "calib/gap": 0.3269244906331191, "calib/mean_conf": 0.6560089795918368, "calib/mu_c": 0.7934507042253521, "calib/mu_w": 0.466526213592233, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.17506122448979594, "calib/std_conf": 0.43924072903342637, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.44755494505494503, "calib/step_q_c_n": 728.0, "calib/step_q_gap": 0.17971623537752562, "calib/step_q_w": 0.2678387096774194, "calib/step_q_w_n": 930.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2860.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 510.4765625, "completions/mean_terminated_length": 516.5296630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.08853333333333334, "grad_norm": 0.04278864711523056, "kl": 0.11474609375, "learning_rate": 3.2500000000000002e-06, "loss": -0.0214, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03520066291093826, "mask/share_reasoning": 0.8231779932975769, "mask/share_step_conf": 0.12990263104438782, "num_tokens": 19349187.0, "reward": 1.1222634315490723, "reward_std": 0.2234598845243454, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6821732521057129, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8405274748802185, "step": 83 }, { "adv/mean_abs_final_conf": 0.6835577487945557, "adv/mean_abs_reasoning": 0.48697832226753235, "adv/mean_abs_step_conf": 0.7442065477371216, "adv/ratio_final_to_reasoning": 1.4036718218003716, "adv/ratio_step_to_reasoning": 1.528212886914246, "adv/std_final_conf": 0.8857553005218506, "adv/std_reasoning": 0.7393399477005005, "adv/std_step_conf": 0.9358551502227783, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8175134526342136, "calib/avg_num_step_conf": 5.57421875, "calib/ece": 0.17885375494071146, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5454545454545454, "calib/gap": 0.551800775872857, "calib/mean_conf": 0.6150592885375494, "calib/mu_c": 0.881145038167939, "calib/mu_w": 0.32934426229508196, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13806324110671936, "calib/std_conf": 0.4436211221776772, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4896778916544657, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.1805784292888743, "calib/step_q_w": 0.3090994623655914, "calib/step_q_w_n": 744.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 442.5078125, "completions/mean_terminated_length": 442.5078125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.0896, "grad_norm": 0.028044484555721283, "kl": 0.1327667236328125, "learning_rate": 3.2222222222222227e-06, "loss": 0.0029, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03884014114737511, "mask/share_reasoning": 0.826440691947937, "mask/share_step_conf": 0.13471916317939758, "num_tokens": 19568389.0, "reward": 1.1857885122299194, "reward_std": 0.23890095949172974, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.8046886920928955, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.845633864402771, "step": 84 }, { "adv/mean_abs_final_conf": 0.6975332498550415, "adv/mean_abs_reasoning": 0.5464292168617249, "adv/mean_abs_step_conf": 0.7447526454925537, "adv/ratio_final_to_reasoning": 1.276529929825392, "adv/ratio_step_to_reasoning": 1.3629444080055753, "adv/std_final_conf": 0.8820512294769287, "adv/std_reasoning": 0.7927603721618652, "adv/std_step_conf": 0.9354966282844543, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8047684189955051, "calib/avg_num_step_conf": 6.09375, "calib/ece": 0.1956620967741936, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5362903225806451, "calib/gap": 0.5061403296202204, "calib/mean_conf": 0.5976443548387097, "calib/mu_c": 0.8405100775193799, "calib/mu_w": 0.3343697478991595, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13657258064516137, "calib/std_conf": 0.45424748775246127, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.43061111111111117, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.13943253968253977, "calib/step_q_w": 0.2911785714285714, "calib/step_q_w_n": 840.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2364.0, "completions/max_terminated_length": 2364.0, "completions/mean_length": 489.54296875, "completions/mean_terminated_length": 493.39764404296875, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.09066666666666667, "grad_norm": 0.02340533211827278, "kl": 0.11705780029296875, "learning_rate": 3.1944444444444443e-06, "loss": 0.0119, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03687724471092224, "mask/share_reasoning": 0.8172705173492432, "mask/share_step_conf": 0.1380397528409958, "num_tokens": 19801536.0, "reward": 1.1727514266967773, "reward_std": 0.27141836285591125, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7622454762458801, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8596714735031128, "step": 85 }, { "adv/mean_abs_final_conf": 0.718398928642273, "adv/mean_abs_reasoning": 0.5384534001350403, "adv/mean_abs_step_conf": 0.7146424055099487, "adv/ratio_final_to_reasoning": 1.3341896038953485, "adv/ratio_step_to_reasoning": 1.3272130983493122, "adv/std_final_conf": 0.8906943798065186, "adv/std_reasoning": 0.7927916049957275, "adv/std_step_conf": 0.9359204173088074, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7341952165481578, "calib/avg_num_step_conf": 5.578125, "calib/ece": 0.25289156626506026, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4738955823293173, "calib/gap": 0.42104072398190046, "calib/mean_conf": 0.5242971887550201, "calib/mu_c": 0.7441176470588236, "calib/mu_w": 0.3230769230769231, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.14963855421686748, "calib/std_conf": 0.46650155717157316, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.44880690737833595, "calib/step_q_c_n": 637.0, "calib/step_q_gap": 0.15325697058946108, "calib/step_q_w": 0.29554993678887487, "calib/step_q_w_n": 791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 464.55078125, "completions/mean_terminated_length": 466.3725891113281, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.09173333333333333, "grad_norm": 0.03036743961274624, "kl": 0.1243438720703125, "learning_rate": 3.1666666666666667e-06, "loss": -0.1255, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.039164334535598755, "mask/share_reasoning": 0.8249720931053162, "mask/share_step_conf": 0.13195732235908508, "num_tokens": 20025973.0, "reward": 1.1242517232894897, "reward_std": 0.2771415114402771, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7137933969497681, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8330358266830444, "step": 86 }, { "adv/mean_abs_final_conf": 0.6766160726547241, "adv/mean_abs_reasoning": 0.4702734351158142, "adv/mean_abs_step_conf": 0.7363328337669373, "adv/ratio_final_to_reasoning": 1.438771621212442, "adv/ratio_step_to_reasoning": 1.5657546839438221, "adv/std_final_conf": 0.8864827156066895, "adv/std_reasoning": 0.7575428485870361, "adv/std_step_conf": 0.9356470704078674, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6348829201101929, "calib/avg_num_step_conf": 5.29296875, "calib/ece": 0.2603557312252964, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6640316205533597, "calib/gap": 0.26056060606060605, "calib/mean_conf": 0.7392490118577075, "calib/mu_c": 0.8298787878787879, "calib/mu_w": 0.5693181818181818, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1737154150197628, "calib/std_conf": 0.3979252383367354, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45692941176470586, "calib/step_q_c_n": 850.0, "calib/step_q_gap": 0.13245238206173554, "calib/step_q_w": 0.3244770297029703, "calib/step_q_w_n": 505.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2788.0, "completions/max_terminated_length": 2788.0, "completions/mean_length": 395.359375, "completions/mean_terminated_length": 395.359375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.0928, "grad_norm": 0.029617100954055786, "kl": 0.1478118896484375, "learning_rate": 3.138888888888889e-06, "loss": 0.0086, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04414728283882141, "mask/share_reasoning": 0.8172008395195007, "mask/share_step_conf": 0.13865187764167786, "num_tokens": 20232681.0, "reward": 1.141819715499878, "reward_std": 0.2538405954837799, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7091293334960938, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.833006739616394, "step": 87 }, { "adv/mean_abs_final_conf": 0.6007475852966309, "adv/mean_abs_reasoning": 0.44460412859916687, "adv/mean_abs_step_conf": 0.7527111172676086, "adv/ratio_final_to_reasoning": 1.3511965963731192, "adv/ratio_step_to_reasoning": 1.6929917399534897, "adv/std_final_conf": 0.8415180444717407, "adv/std_reasoning": 0.7013773322105408, "adv/std_step_conf": 0.9355311393737793, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.834070796460177, "calib/avg_num_step_conf": 5.0859375, "calib/ece": 0.1724274509803922, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5686274509803921, "calib/gap": 0.596774149320703, "calib/mean_conf": 0.6180823529411765, "calib/mu_c": 0.8825352112676057, "calib/mu_w": 0.2857610619469027, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.11682352941176474, "calib/std_conf": 0.4629612412125359, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.48145161290322575, "calib/step_q_c_n": 682.0, "calib/step_q_gap": 0.16227419354838707, "calib/step_q_w": 0.3191774193548387, "calib/step_q_w_n": 620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1165.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 431.01171875, "completions/mean_terminated_length": 432.7019958496094, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.09386666666666667, "grad_norm": 0.02849281206727028, "kl": 0.132293701171875, "learning_rate": 3.1111111111111116e-06, "loss": -0.0292, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.037658434361219406, "mask/share_reasoning": 0.8327912092208862, "mask/share_step_conf": 0.12564414739608765, "num_tokens": 20452868.0, "reward": 1.18056058883667, "reward_std": 0.21894444525241852, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.8136299848556519, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8269731402397156, "step": 88 }, { "adv/mean_abs_final_conf": 0.6541560292243958, "adv/mean_abs_reasoning": 0.537696123123169, "adv/mean_abs_step_conf": 0.7444639801979065, "adv/ratio_final_to_reasoning": 1.2165905631321607, "adv/ratio_step_to_reasoning": 1.3845440727259506, "adv/std_final_conf": 0.8602299094200134, "adv/std_reasoning": 0.7754936814308167, "adv/std_step_conf": 0.9357773065567017, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7566519546027743, "calib/avg_num_step_conf": 5.01171875, "calib/ece": 0.23392857142857146, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.48412698412698413, "calib/gap": 0.44930517023959654, "calib/mean_conf": 0.5343253968253967, "calib/mu_c": 0.7518461538461539, "calib/mu_w": 0.3025409836065574, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.1261904761904762, "calib/std_conf": 0.46855902593331045, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4928026533996683, "calib/step_q_c_n": 603.0, "calib/step_q_gap": 0.18202324163496242, "calib/step_q_w": 0.31077941176470586, "calib/step_q_w_n": 680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 453.1171875, "completions/mean_terminated_length": 453.1171875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.09493333333333333, "grad_norm": 0.02972985990345478, "kl": 0.13824462890625, "learning_rate": 3.0833333333333336e-06, "loss": -0.1189, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04198278859257698, "mask/share_reasoning": 0.8361014127731323, "mask/share_step_conf": 0.12191580981016159, "num_tokens": 20677754.0, "reward": 1.1438519954681396, "reward_std": 0.2497408092021942, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7326839566230774, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8392842411994934, "step": 89 }, { "adv/mean_abs_final_conf": 0.6622197031974792, "adv/mean_abs_reasoning": 0.5669442415237427, "adv/mean_abs_step_conf": 0.744004487991333, "adv/ratio_final_to_reasoning": 1.1680508499701316, "adv/ratio_step_to_reasoning": 1.3123062789944138, "adv/std_final_conf": 0.8622347116470337, "adv/std_reasoning": 0.792926013469696, "adv/std_step_conf": 0.9356372356414795, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7202957674655788, "calib/avg_num_step_conf": 5.46484375, "calib/ece": 0.27196456692913396, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5748031496062992, "calib/gap": 0.3847572667006629, "calib/mean_conf": 0.6178779527559055, "calib/mu_c": 0.7784459459459458, "calib/mu_w": 0.39368867924528295, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.1535826771653544, "calib/std_conf": 0.4612813257464722, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.443278364116095, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.15647649204121195, "calib/step_q_w": 0.28680187207488306, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2763.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 446.40625, "completions/mean_terminated_length": 446.40625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.096, "grad_norm": 0.032319456338882446, "kl": 0.1483612060546875, "learning_rate": 3.055555555555556e-06, "loss": -0.0047, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03996175155043602, "mask/share_reasoning": 0.8266382813453674, "mask/share_step_conf": 0.13339999318122864, "num_tokens": 20895354.0, "reward": 1.12278413772583, "reward_std": 0.25631049275398254, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7038639783859253, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8215529322624207, "step": 90 }, { "adv/mean_abs_final_conf": 0.6649396419525146, "adv/mean_abs_reasoning": 0.6094751358032227, "adv/mean_abs_step_conf": 0.7547518014907837, "adv/ratio_final_to_reasoning": 1.0910037225328244, "adv/ratio_step_to_reasoning": 1.2383635642430302, "adv/std_final_conf": 0.8605794906616211, "adv/std_reasoning": 0.8588709831237793, "adv/std_step_conf": 0.9356327652931213, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7864431486880467, "calib/avg_num_step_conf": 5.578125, "calib/ece": 0.2057248979591837, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5959183673469388, "calib/gap": 0.4907748299319727, "calib/mean_conf": 0.6532138775510203, "calib/mu_c": 0.8495238095238095, "calib/mu_w": 0.35874897959183677, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.12946938775510203, "calib/std_conf": 0.4481611051105887, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.45719131614654, "calib/step_q_c_n": 737.0, "calib/step_q_gap": 0.1617209832955993, "calib/step_q_w": 0.2954703328509407, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2508.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 447.51953125, "completions/mean_terminated_length": 447.51953125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.09706666666666666, "grad_norm": 0.026164310052990913, "kl": 0.142333984375, "learning_rate": 3.0277777777777776e-06, "loss": 0.0197, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03725254535675049, "mask/share_reasoning": 0.8388819098472595, "mask/share_step_conf": 0.12386555224657059, "num_tokens": 21117631.0, "reward": 1.150740146636963, "reward_std": 0.2667975425720215, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7564589977264404, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8258475065231323, "step": 91 }, { "adv/mean_abs_final_conf": 0.6045271158218384, "adv/mean_abs_reasoning": 0.41747045516967773, "adv/mean_abs_step_conf": 0.7512445449829102, "adv/ratio_final_to_reasoning": 1.4480716140166923, "adv/ratio_step_to_reasoning": 1.7995154763169348, "adv/std_final_conf": 0.8119957447052002, "adv/std_reasoning": 0.7015039324760437, "adv/std_step_conf": 0.9357256889343262, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8115202702702702, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.20149193548387095, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5282258064516129, "calib/gap": 0.5300135135135136, "calib/mean_conf": 0.5797983870967741, "calib/mu_c": 0.7935135135135135, "calib/mu_w": 0.26349999999999996, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.09225806451612903, "calib/std_conf": 0.4660389447910064, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.48583832335329347, "calib/step_q_c_n": 668.0, "calib/step_q_gap": 0.18569791773706884, "calib/step_q_w": 0.3001404056162246, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1893.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 395.828125, "completions/mean_terminated_length": 398.94488525390625, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.09813333333333334, "grad_norm": 0.040191903710365295, "kl": 0.14263916015625, "learning_rate": 3e-06, "loss": -0.0157, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.042106062173843384, "mask/share_reasoning": 0.8261221051216125, "mask/share_step_conf": 0.12395933270454407, "num_tokens": 21325683.0, "reward": 1.1780098676681519, "reward_std": 0.24134156107902527, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7681527137756348, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8523279428482056, "step": 92 }, { "adv/mean_abs_final_conf": 0.7300728559494019, "adv/mean_abs_reasoning": 0.5577922463417053, "adv/mean_abs_step_conf": 0.7381477355957031, "adv/ratio_final_to_reasoning": 1.3088616070546037, "adv/ratio_step_to_reasoning": 1.323338107399061, "adv/std_final_conf": 0.8927043080329895, "adv/std_reasoning": 0.8100207448005676, "adv/std_step_conf": 0.9362002611160278, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7275034293552811, "calib/avg_num_step_conf": 5.76171875, "calib/ece": 0.25652949245541834, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.43621399176954734, "calib/gap": 0.40540123456790117, "calib/mean_conf": 0.4959807956104253, "calib/mu_c": 0.7212037037037036, "calib/mu_w": 0.3158024691358024, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.15403292181069955, "calib/std_conf": 0.46841187055040207, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5340118577075099, "calib/step_q_c_n": 506.0, "calib/step_q_gap": 0.23139059867758216, "calib/step_q_w": 0.30262125902992776, "calib/step_q_w_n": 969.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2944.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 464.828125, "completions/mean_terminated_length": 466.6510009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.0992, "grad_norm": 0.037815142422914505, "kl": 0.1412811279296875, "learning_rate": 2.9722222222222225e-06, "loss": -0.1825, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03980404883623123, "mask/share_reasoning": 0.8261933326721191, "mask/share_step_conf": 0.13009636104106903, "num_tokens": 21550455.0, "reward": 1.0781924724578857, "reward_std": 0.3342801332473755, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.6839195489883423, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8003935217857361, "step": 93 }, { "adv/mean_abs_final_conf": 0.6486687660217285, "adv/mean_abs_reasoning": 0.5233867168426514, "adv/mean_abs_step_conf": 0.7764137387275696, "adv/ratio_final_to_reasoning": 1.2393680335161072, "adv/ratio_step_to_reasoning": 1.4834418103143934, "adv/std_final_conf": 0.8439018130302429, "adv/std_reasoning": 0.7754757404327393, "adv/std_step_conf": 0.9355916976928711, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7897025171624714, "calib/avg_num_step_conf": 5.125, "calib/ece": 0.2174999999999999, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.45161290322580644, "calib/gap": 0.47784897025171624, "calib/mean_conf": 0.5110483870967741, "calib/mu_c": 0.7326315789473684, "calib/mu_w": 0.25478260869565217, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.09612903225806443, "calib/std_conf": 0.4654729099708739, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5159848484848484, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.20066276259527782, "calib/step_q_w": 0.31532208588957056, "calib/step_q_w_n": 652.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 407.4921875, "completions/mean_terminated_length": 410.7007751464844, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.10026666666666667, "grad_norm": 0.03030179999768734, "kl": 0.14703369140625, "learning_rate": 2.944444444444445e-06, "loss": -0.0434, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.04173561558127403, "mask/share_reasoning": 0.8220940232276917, "mask/share_step_conf": 0.12835785746574402, "num_tokens": 21763453.0, "reward": 1.1417219638824463, "reward_std": 0.24963004887104034, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7397593855857849, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8312063217163086, "step": 94 }, { "adv/mean_abs_final_conf": 0.5931450128555298, "adv/mean_abs_reasoning": 0.4570343494415283, "adv/mean_abs_step_conf": 0.7530524730682373, "adv/ratio_final_to_reasoning": 1.297812765233776, "adv/ratio_step_to_reasoning": 1.6476933823211044, "adv/std_final_conf": 0.81406569480896, "adv/std_reasoning": 0.7394242286682129, "adv/std_step_conf": 0.9350123405456543, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8168081494057726, "calib/avg_num_step_conf": 5.34375, "calib/ece": 0.18647999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.596, "calib/gap": 0.5381324278438031, "calib/mean_conf": 0.6447999999999999, "calib/mu_c": 0.8492903225806453, "calib/mu_w": 0.31115789473684213, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.10563999999999997, "calib/std_conf": 0.45376222848535996, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.4892594339622641, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.1607017416545718, "calib/step_q_w": 0.32855769230769233, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 417.03515625, "completions/mean_terminated_length": 418.6706237792969, "completions/min_length": 0.0, "completions/min_terminated_length": 61.0, "epoch": 0.10133333333333333, "grad_norm": 0.023666884750127792, "kl": 0.13555908203125, "learning_rate": 2.916666666666667e-06, "loss": -0.0643, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.04241481423377991, "mask/share_reasoning": 0.8130682706832886, "mask/share_step_conf": 0.14061065018177032, "num_tokens": 21976342.0, "reward": 1.1643306016921997, "reward_std": 0.25867539644241333, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7615358829498291, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.837979257106781, "step": 95 }, { "adv/mean_abs_final_conf": 0.5102696418762207, "adv/mean_abs_reasoning": 0.3827498257160187, "adv/mean_abs_step_conf": 0.7660720944404602, "adv/ratio_final_to_reasoning": 1.3331675355348571, "adv/ratio_step_to_reasoning": 2.0014956061896356, "adv/std_final_conf": 0.7896233797073364, "adv/std_reasoning": 0.6612542271614075, "adv/std_step_conf": 0.934788167476654, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8952425125894514, "calib/avg_num_step_conf": 4.734375, "calib/ece": 0.11535714285714285, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5912698412698413, "calib/gap": 0.7047031539888683, "calib/mean_conf": 0.6286111111111111, "calib/mu_c": 0.9026623376623376, "calib/mu_w": 0.19795918367346935, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.06642857142857142, "calib/std_conf": 0.46566309560314306, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5700742942050521, "calib/step_q_c_n": 673.0, "calib/step_q_gap": 0.2269945168395604, "calib/step_q_w": 0.34307977736549167, "calib/step_q_w_n": 539.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 396.13671875, "completions/mean_terminated_length": 396.13671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.1024, "grad_norm": 0.03511851280927658, "kl": 0.15106201171875, "learning_rate": 2.888888888888889e-06, "loss": 0.0488, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.044577062129974365, "mask/share_reasoning": 0.8310078382492065, "mask/share_step_conf": 0.12441502511501312, "num_tokens": 22183569.0, "reward": 1.2230329513549805, "reward_std": 0.18603476881980896, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.8587906360626221, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8477666974067688, "step": 96 }, { "adv/mean_abs_final_conf": 0.775114893913269, "adv/mean_abs_reasoning": 0.6093622446060181, "adv/mean_abs_step_conf": 0.7242501378059387, "adv/ratio_final_to_reasoning": 1.2720100412758883, "adv/ratio_step_to_reasoning": 1.188537925046868, "adv/std_final_conf": 0.9237008690834045, "adv/std_reasoning": 0.8266138434410095, "adv/std_step_conf": 0.9355844855308533, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6741730279898219, "calib/avg_num_step_conf": 5.48828125, "calib/ece": 0.30406374501992034, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.46613545816733065, "calib/gap": 0.3173810432569975, "calib/mean_conf": 0.5304382470119522, "calib/mu_c": 0.6960833333333334, "calib/mu_w": 0.3787022900763359, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17820717131474106, "calib/std_conf": 0.4694815814810152, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5330985915492958, "calib/step_q_c_n": 568.0, "calib/step_q_gap": 0.21296716980497082, "calib/step_q_w": 0.320131421744325, "calib/step_q_w_n": 837.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 383.9453125, "completions/mean_terminated_length": 385.45098876953125, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.10346666666666667, "grad_norm": 0.03347829729318619, "kl": 0.1522064208984375, "learning_rate": 2.861111111111111e-06, "loss": -0.0896, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04272129386663437, "mask/share_reasoning": 0.812430739402771, "mask/share_step_conf": 0.14094170928001404, "num_tokens": 22386931.0, "reward": 1.104257583618164, "reward_std": 0.2778571546077728, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6669449806213379, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8355258703231812, "step": 97 }, { "adv/mean_abs_final_conf": 0.6549431085586548, "adv/mean_abs_reasoning": 0.5944483876228333, "adv/mean_abs_step_conf": 0.7494335174560547, "adv/ratio_final_to_reasoning": 1.1017661452119276, "adv/ratio_step_to_reasoning": 1.2607209188555435, "adv/std_final_conf": 0.8280261754989624, "adv/std_reasoning": 0.8100293874740601, "adv/std_step_conf": 0.9358698725700378, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7347417840375587, "calib/avg_num_step_conf": 4.546875, "calib/ece": 0.26029045643153526, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5311203319502075, "calib/gap": 0.4239486413430076, "calib/mean_conf": 0.5760580912863071, "calib/mu_c": 0.7502112676056338, "calib/mu_w": 0.32626262626262625, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.12356846473029046, "calib/std_conf": 0.47653641097218263, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5361228070175439, "calib/step_q_c_n": 570.0, "calib/step_q_gap": 0.20119014708488397, "calib/step_q_w": 0.3349326599326599, "calib/step_q_w_n": 594.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 448.4140625, "completions/mean_terminated_length": 448.4140625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.10453333333333334, "grad_norm": 0.03285660967230797, "kl": 0.13463592529296875, "learning_rate": 2.8333333333333335e-06, "loss": -0.067, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.04362473264336586, "mask/share_reasoning": 0.8375781178474426, "mask/share_step_conf": 0.1187971755862236, "num_tokens": 22607909.0, "reward": 1.088001012802124, "reward_std": 0.30376923084259033, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6849917769432068, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.7955691814422607, "step": 98 }, { "adv/mean_abs_final_conf": 0.6276251077651978, "adv/mean_abs_reasoning": 0.4622398614883423, "adv/mean_abs_step_conf": 0.7264397144317627, "adv/ratio_final_to_reasoning": 1.3577909653752924, "adv/ratio_step_to_reasoning": 1.5715644083414548, "adv/std_final_conf": 0.8565012216567993, "adv/std_reasoning": 0.7206854820251465, "adv/std_step_conf": 0.9356908798217773, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.775219298245614, "calib/avg_num_step_conf": 5.3515625, "calib/ece": 0.20413978494623658, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.32661290322580644, "calib/gap": 0.5181816520467837, "calib/mean_conf": 0.3597311827956989, "calib/mu_c": 0.677326388888889, "calib/mu_w": 0.15914473684210526, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.08838709677419354, "calib/std_conf": 0.459931153639939, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.4832096069868996, "calib/step_q_c_n": 458.0, "calib/step_q_gap": 0.17074250172374167, "calib/step_q_w": 0.3124671052631579, "calib/step_q_w_n": 912.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2182.0, "completions/max_terminated_length": 2182.0, "completions/mean_length": 492.59765625, "completions/mean_terminated_length": 494.5294494628906, "completions/min_length": 0.0, "completions/min_terminated_length": 54.0, "epoch": 0.1056, "grad_norm": 0.030274106189608574, "kl": 0.1233978271484375, "learning_rate": 2.805555555555556e-06, "loss": -0.1299, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.038488492369651794, "mask/share_reasoning": 0.8359262943267822, "mask/share_step_conf": 0.12167896330356598, "num_tokens": 22839814.0, "reward": 1.1263723373413086, "reward_std": 0.2581334710121155, "rewards/accuracy_reward_step": 0.375, "rewards/final_brier_reward_step": 0.762831449508667, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8161922693252563, "step": 99 }, { "adv/mean_abs_final_conf": 0.5714178085327148, "adv/mean_abs_reasoning": 0.4815468192100525, "adv/mean_abs_step_conf": 0.7494217157363892, "adv/ratio_final_to_reasoning": 1.186629805737457, "adv/ratio_step_to_reasoning": 1.556280066319966, "adv/std_final_conf": 0.8202148675918579, "adv/std_reasoning": 0.7575972080230713, "adv/std_step_conf": 0.9345163702964783, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8089188488462536, "calib/avg_num_step_conf": 5.44921875, "calib/ece": 0.1883534136546185, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.46987951807228917, "calib/gap": 0.5778376976925071, "calib/mean_conf": 0.5196787148594376, "calib/mu_c": 0.7888721804511278, "calib/mu_w": 0.21103448275862066, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.08694779116465866, "calib/std_conf": 0.48183540615364495, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5101029411764706, "calib/step_q_c_n": 680.0, "calib/step_q_gap": 0.21871832579185518, "calib/step_q_w": 0.2913846153846154, "calib/step_q_w_n": 715.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2196.0, "completions/max_terminated_length": 2196.0, "completions/mean_length": 464.1171875, "completions/mean_terminated_length": 464.1171875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.10666666666666667, "grad_norm": 0.03927968069911003, "kl": 0.1226654052734375, "learning_rate": 2.7777777777777783e-06, "loss": 0.0249, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.040494032204151154, "mask/share_reasoning": 0.8294671773910522, "mask/share_step_conf": 0.13003885746002197, "num_tokens": 23066036.0, "reward": 1.1902015209197998, "reward_std": 0.20735013484954834, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7843140363693237, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8651010394096375, "step": 100 }, { "adv/mean_abs_final_conf": 0.6751835346221924, "adv/mean_abs_reasoning": 0.5666059255599976, "adv/mean_abs_step_conf": 0.7753067016601562, "adv/ratio_final_to_reasoning": 1.1916280860544894, "adv/ratio_step_to_reasoning": 1.3683349691302504, "adv/std_final_conf": 0.8751931190490723, "adv/std_reasoning": 0.8098925948143005, "adv/std_step_conf": 0.9354108572006226, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7181372549019608, "calib/avg_num_step_conf": 5.296875, "calib/ece": 0.2703238866396761, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.31983805668016196, "calib/gap": 0.3784651563328034, "calib/mean_conf": 0.3620647773279352, "calib/mu_c": 0.5704504504504505, "calib/mu_w": 0.19198529411764706, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.09149797570850204, "calib/std_conf": 0.4572796120287642, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4386486486486486, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.10985910256886872, "calib/step_q_w": 0.3287895460797799, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2491.0, "completions/max_terminated_length": 2491.0, "completions/mean_length": 475.5703125, "completions/mean_terminated_length": 475.5703125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.10773333333333333, "grad_norm": 0.028355449438095093, "kl": 0.127655029296875, "learning_rate": 2.7500000000000004e-06, "loss": -0.0626, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.036653559654951096, "mask/share_reasoning": 0.8383820652961731, "mask/share_step_conf": 0.12496437877416611, "num_tokens": 23294774.0, "reward": 1.1122589111328125, "reward_std": 0.24153487384319305, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.6937956809997559, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8345438241958618, "step": 101 }, { "adv/mean_abs_final_conf": 0.6679022312164307, "adv/mean_abs_reasoning": 0.41688475012779236, "adv/mean_abs_step_conf": 0.747225284576416, "adv/ratio_final_to_reasoning": 1.602126801260279, "adv/ratio_step_to_reasoning": 1.792402538944782, "adv/std_final_conf": 0.843447744846344, "adv/std_reasoning": 0.6817389130592346, "adv/std_step_conf": 0.9350944757461548, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7101910828025477, "calib/avg_num_step_conf": 5.30078125, "calib/ece": 0.3211952191235059, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.41434262948207173, "calib/gap": 0.35007318064778425, "calib/mean_conf": 0.47450199203187254, "calib/mu_c": 0.6056050955414013, "calib/mu_w": 0.25553191489361704, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08509960159362544, "calib/std_conf": 0.4715871850502317, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4950946142649199, "calib/step_q_c_n": 687.0, "calib/step_q_gap": 0.19806476351865127, "calib/step_q_w": 0.29702985074626864, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2019.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 388.14453125, "completions/mean_terminated_length": 388.14453125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.1088, "grad_norm": 0.046899620443582535, "kl": 0.139801025390625, "learning_rate": 2.7222222222222224e-06, "loss": -0.0034, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04749134182929993, "mask/share_reasoning": 0.8126708269119263, "mask/share_step_conf": 0.139837846159935, "num_tokens": 23500835.0, "reward": 1.130561113357544, "reward_std": 0.241227924823761, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6594758033752441, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8568267822265625, "step": 102 }, { "adv/mean_abs_final_conf": 0.6553860902786255, "adv/mean_abs_reasoning": 0.46481823921203613, "adv/mean_abs_step_conf": 0.7428863048553467, "adv/ratio_final_to_reasoning": 1.4099835914133696, "adv/ratio_step_to_reasoning": 1.5982296781526772, "adv/std_final_conf": 0.8601471185684204, "adv/std_reasoning": 0.7576319575309753, "adv/std_step_conf": 0.9357208609580994, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7938649080735413, "calib/avg_num_step_conf": 5.4140625, "calib/ece": 0.20838056680161943, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4979757085020243, "calib/gap": 0.5344164668265388, "calib/mean_conf": 0.5435222672064778, "calib/mu_c": 0.7771942446043165, "calib/mu_w": 0.24277777777777776, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.09457489878542506, "calib/std_conf": 0.4767536799716597, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5236781609195402, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.16034482758620683, "calib/step_q_w": 0.3633333333333334, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 472.52734375, "completions/mean_terminated_length": 474.38043212890625, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.10986666666666667, "grad_norm": 0.03498329222202301, "kl": 0.1224365234375, "learning_rate": 2.6944444444444444e-06, "loss": -0.0804, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03836042433977127, "mask/share_reasoning": 0.8347409963607788, "mask/share_step_conf": 0.12299229949712753, "num_tokens": 23726354.0, "reward": 1.1513614654541016, "reward_std": 0.28517740964889526, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7498066425323486, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8357982635498047, "step": 103 }, { "adv/mean_abs_final_conf": 0.6393879055976868, "adv/mean_abs_reasoning": 0.43127331137657166, "adv/mean_abs_step_conf": 0.7517832517623901, "adv/ratio_final_to_reasoning": 1.4825584814345196, "adv/ratio_step_to_reasoning": 1.7431712835714084, "adv/std_final_conf": 0.8423436880111694, "adv/std_reasoning": 0.7014029622077942, "adv/std_step_conf": 0.9348614811897278, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7802152579930357, "calib/avg_num_step_conf": 5.5703125, "calib/ece": 0.2068650793650794, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3611111111111111, "calib/gap": 0.5105584045584045, "calib/mean_conf": 0.40511904761904766, "calib/mu_c": 0.6786324786324786, "calib/mu_w": 0.16807407407407407, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.07384920634920637, "calib/std_conf": 0.46226708604824707, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5198888888888888, "calib/step_q_c_n": 630.0, "calib/step_q_gap": 0.18010496929089892, "calib/step_q_w": 0.3397839195979899, "calib/step_q_w_n": 796.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 463.06640625, "completions/mean_terminated_length": 463.06640625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.11093333333333333, "grad_norm": 0.04268581047654152, "kl": 0.1243896484375, "learning_rate": 2.666666666666667e-06, "loss": 0.0509, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03702666983008385, "mask/share_reasoning": 0.8287439346313477, "mask/share_step_conf": 0.13422942161560059, "num_tokens": 23951579.0, "reward": 1.1680681705474854, "reward_std": 0.1921078860759735, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.7757472991943359, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8485928773880005, "step": 104 }, { "adv/mean_abs_final_conf": 0.715624213218689, "adv/mean_abs_reasoning": 0.6200796365737915, "adv/mean_abs_step_conf": 0.7586425542831421, "adv/ratio_final_to_reasoning": 1.1540843643452356, "adv/ratio_step_to_reasoning": 1.2234598744041503, "adv/std_final_conf": 0.8816226124763489, "adv/std_reasoning": 0.8266752362251282, "adv/std_step_conf": 0.9358975887298584, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7314206932773109, "calib/avg_num_step_conf": 5.65234375, "calib/ece": 0.2634412955465587, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4939271255060729, "calib/gap": 0.42086462710084027, "calib/mean_conf": 0.5375303643724696, "calib/mu_c": 0.7556302521008403, "calib/mu_w": 0.33476562500000007, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1595951417004049, "calib/std_conf": 0.47572826558997283, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4693718166383702, "calib/step_q_c_n": 589.0, "calib/step_q_gap": 0.1266095788761324, "calib/step_q_w": 0.3427622377622378, "calib/step_q_w_n": 858.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 472.3515625, "completions/mean_terminated_length": 472.3515625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.112, "grad_norm": 0.03280618041753769, "kl": 0.11415863037109375, "learning_rate": 2.6388888888888893e-06, "loss": 0.0266, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03885013237595558, "mask/share_reasoning": 0.8337293863296509, "mask/share_step_conf": 0.12742048501968384, "num_tokens": 24178261.0, "reward": 1.1076858043670654, "reward_std": 0.2959200143814087, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6975457072257996, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8223005533218384, "step": 105 }, { "adv/mean_abs_final_conf": 0.6601937413215637, "adv/mean_abs_reasoning": 0.5205706357955933, "adv/mean_abs_step_conf": 0.7354485988616943, "adv/ratio_final_to_reasoning": 1.2682116430032269, "adv/ratio_step_to_reasoning": 1.4127738836780546, "adv/std_final_conf": 0.8471842408180237, "adv/std_reasoning": 0.7753406167030334, "adv/std_step_conf": 0.9351677298545837, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7849028106120304, "calib/avg_num_step_conf": 5.69140625, "calib/ece": 0.25281124497991964, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5140562248995983, "calib/gap": 0.41113278171788814, "calib/mean_conf": 0.5916064257028113, "calib/mu_c": 0.7699290780141844, "calib/mu_w": 0.3587962962962963, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13907630522088352, "calib/std_conf": 0.4523759561933446, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5066219839142091, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.15805658307032727, "calib/step_q_w": 0.3485654008438818, "calib/step_q_w_n": 711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2512.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 449.7421875, "completions/mean_terminated_length": 451.50592041015625, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.11306666666666666, "grad_norm": 0.03570527955889702, "kl": 0.11209869384765625, "learning_rate": 2.6111111111111113e-06, "loss": -0.0621, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03827952593564987, "mask/share_reasoning": 0.8275967836380005, "mask/share_step_conf": 0.13021747767925262, "num_tokens": 24397979.0, "reward": 1.1611725091934204, "reward_std": 0.2587474584579468, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7297816276550293, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8591046929359436, "step": 106 }, { "adv/mean_abs_final_conf": 0.6493844985961914, "adv/mean_abs_reasoning": 0.5660048723220825, "adv/mean_abs_step_conf": 0.7470730543136597, "adv/ratio_final_to_reasoning": 1.1473125592223914, "adv/ratio_step_to_reasoning": 1.3199056948904515, "adv/std_final_conf": 0.8542997241020203, "adv/std_reasoning": 0.7928820848464966, "adv/std_step_conf": 0.9353109002113342, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.708204334365325, "calib/avg_num_step_conf": 5.421875, "calib/ece": 0.26161417322834657, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6496062992125984, "calib/gap": 0.3564912280701755, "calib/mean_conf": 0.705, "calib/mu_c": 0.8481578947368422, "calib/mu_w": 0.4916666666666667, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.18409448818897645, "calib/std_conf": 0.42935288241468145, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5074205378973105, "calib/step_q_c_n": 818.0, "calib/step_q_gap": 0.14568369579204732, "calib/step_q_w": 0.36173684210526313, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2293.0, "completions/max_terminated_length": 2293.0, "completions/mean_length": 446.109375, "completions/mean_terminated_length": 446.109375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.11413333333333334, "grad_norm": 0.031605981290340424, "kl": 0.1169891357421875, "learning_rate": 2.5833333333333337e-06, "loss": 0.0015, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03787776082754135, "mask/share_reasoning": 0.8313525319099426, "mask/share_step_conf": 0.13076971471309662, "num_tokens": 24616799.0, "reward": 1.1570827960968018, "reward_std": 0.24284838140010834, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7178605794906616, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8543074131011963, "step": 107 }, { "adv/mean_abs_final_conf": 0.48324739933013916, "adv/mean_abs_reasoning": 0.4229738414287567, "adv/mean_abs_step_conf": 0.7486779689788818, "adv/ratio_final_to_reasoning": 1.142499492871203, "adv/ratio_step_to_reasoning": 1.7700337364834058, "adv/std_final_conf": 0.7400717735290527, "adv/std_reasoning": 0.7013469338417053, "adv/std_step_conf": 0.9353437423706055, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7492725156409137, "calib/avg_num_step_conf": 5.71875, "calib/ece": 0.20225296442687749, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7707509881422925, "calib/gap": 0.4063814927979049, "calib/mean_conf": 0.797588932806324, "calib/mu_c": 0.9244827586206897, "calib/mu_w": 0.5181012658227848, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.15604743083003955, "calib/std_conf": 0.38190095175662503, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.533143153526971, "calib/step_q_c_n": 964.0, "calib/step_q_gap": 0.13612315352697096, "calib/step_q_w": 0.39702, "calib/step_q_w_n": 500.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 460.95703125, "completions/mean_terminated_length": 462.7647399902344, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.1152, "grad_norm": 0.03979187458753586, "kl": 0.1657867431640625, "learning_rate": 2.5555555555555557e-06, "loss": 0.0304, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03806034475564957, "mask/share_reasoning": 0.8202017545700073, "mask/share_step_conf": 0.1378316879272461, "num_tokens": 24838036.0, "reward": 1.1871888637542725, "reward_std": 0.21590068936347961, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7780038714408875, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8434158563613892, "step": 108 }, { "adv/mean_abs_final_conf": 0.5771458148956299, "adv/mean_abs_reasoning": 0.42229336500167847, "adv/mean_abs_step_conf": 0.717828631401062, "adv/ratio_final_to_reasoning": 1.3666940158847534, "adv/ratio_step_to_reasoning": 1.6998340274614756, "adv/std_final_conf": 0.8250502943992615, "adv/std_reasoning": 0.7206637263298035, "adv/std_step_conf": 0.9353057146072388, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8377148634984832, "calib/avg_num_step_conf": 6.359375, "calib/ece": 0.17438524590163945, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5163934426229508, "calib/gap": 0.5859433771486349, "calib/mean_conf": 0.5756967213114754, "calib/mu_c": 0.8854782608695652, "calib/mu_w": 0.29953488372093023, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13938524590163945, "calib/std_conf": 0.4660418504626948, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5106009244992296, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.238884887727013, "calib/step_q_w": 0.27171603677221656, "calib/step_q_w_n": 979.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2946.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 511.7109375, "completions/mean_terminated_length": 513.7176513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.11626666666666667, "grad_norm": 0.03111584298312664, "kl": 0.10516357421875, "learning_rate": 2.5277777777777778e-06, "loss": 0.0068, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03405262529850006, "mask/share_reasoning": 0.8269233107566833, "mask/share_step_conf": 0.1351177990436554, "num_tokens": 25073634.0, "reward": 1.1639158725738525, "reward_std": 0.26197850704193115, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7765480279922485, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8472098708152771, "step": 109 }, { "adv/mean_abs_final_conf": 0.6765207648277283, "adv/mean_abs_reasoning": 0.579919695854187, "adv/mean_abs_step_conf": 0.7378029227256775, "adv/ratio_final_to_reasoning": 1.166576630633753, "adv/ratio_step_to_reasoning": 1.2722501546338032, "adv/std_final_conf": 0.8702954053878784, "adv/std_reasoning": 0.8099896311759949, "adv/std_step_conf": 0.9350488781929016, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7209169418338837, "calib/avg_num_step_conf": 5.046875, "calib/ece": 0.26960159362549807, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5697211155378487, "calib/gap": 0.39499047498094986, "calib/mean_conf": 0.6247410358565737, "calib/mu_c": 0.8245967741935484, "calib/mu_w": 0.4296062992125985, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.20015936254980088, "calib/std_conf": 0.45552237063477774, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5037122557726466, "calib/step_q_c_n": 563.0, "calib/step_q_gap": 0.15666150131448464, "calib/step_q_w": 0.3470507544581619, "calib/step_q_w_n": 729.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2427.0, "completions/max_terminated_length": 2427.0, "completions/mean_length": 438.20703125, "completions/mean_terminated_length": 441.657470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 55.0, "epoch": 0.11733333333333333, "grad_norm": 0.04666442424058914, "kl": 0.117340087890625, "learning_rate": 2.5e-06, "loss": -0.0124, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.038558922708034515, "mask/share_reasoning": 0.827201783657074, "mask/share_step_conf": 0.12642675638198853, "num_tokens": 25290735.0, "reward": 1.1315009593963623, "reward_std": 0.24479171633720398, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7048894166946411, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8439500331878662, "step": 110 }, { "adv/mean_abs_final_conf": 0.5560250878334045, "adv/mean_abs_reasoning": 0.5363640785217285, "adv/mean_abs_step_conf": 0.7406983375549316, "adv/ratio_final_to_reasoning": 1.0366560888377605, "adv/ratio_step_to_reasoning": 1.380961863807823, "adv/std_final_conf": 0.7945669889450073, "adv/std_reasoning": 0.7754170894622803, "adv/std_step_conf": 0.9349315762519836, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7761217347831522, "calib/avg_num_step_conf": 5.58984375, "calib/ece": 0.2229644268774703, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.549407114624506, "calib/gap": 0.49260529933758285, "calib/mean_conf": 0.603596837944664, "calib/mu_c": 0.8508730158730159, "calib/mu_w": 0.35826771653543305, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.16426877470355727, "calib/std_conf": 0.4633459121415682, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.46158139534883724, "calib/step_q_c_n": 645.0, "calib/step_q_gap": 0.1424465352979466, "calib/step_q_w": 0.31913486005089065, "calib/step_q_w_n": 786.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 475.69140625, "completions/mean_terminated_length": 477.556884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 99.0, "epoch": 0.1184, "grad_norm": 0.03786475956439972, "kl": 0.126617431640625, "learning_rate": 2.4722222222222226e-06, "loss": -0.0007, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03777540475130081, "mask/share_reasoning": 0.826372504234314, "mask/share_step_conf": 0.13194583356380463, "num_tokens": 25519920.0, "reward": 1.154246211051941, "reward_std": 0.21423739194869995, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7497198581695557, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8433482646942139, "step": 111 }, { "adv/mean_abs_final_conf": 0.5513437390327454, "adv/mean_abs_reasoning": 0.4522683918476105, "adv/mean_abs_step_conf": 0.7459721565246582, "adv/ratio_final_to_reasoning": 1.2190631690629352, "adv/ratio_step_to_reasoning": 1.6494014836570974, "adv/std_final_conf": 0.7818516492843628, "adv/std_reasoning": 0.7205687761306763, "adv/std_step_conf": 0.9342420101165771, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8228179143510952, "calib/avg_num_step_conf": 5.85546875, "calib/ece": 0.19661290322580646, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4435483870967742, "calib/gap": 0.5576894409937889, "calib/mean_conf": 0.48282258064516126, "calib/mu_c": 0.7414285714285714, "calib/mu_w": 0.18373913043478257, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07157258064516132, "calib/std_conf": 0.47570077421304313, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4241379310344828, "calib/step_q_c_n": 725.0, "calib/step_q_gap": 0.19787178116368176, "calib/step_q_w": 0.22626614987080101, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2548.0, "completions/max_terminated_length": 2548.0, "completions/mean_length": 511.33203125, "completions/mean_terminated_length": 517.395263671875, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.11946666666666667, "grad_norm": 0.026878803968429565, "kl": 0.09716796875, "learning_rate": 2.4444444444444447e-06, "loss": -0.0867, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.032932404428720474, "mask/share_reasoning": 0.8346401453018188, "mask/share_step_conf": 0.12070866674184799, "num_tokens": 25758741.0, "reward": 1.1662858724594116, "reward_std": 0.1943838894367218, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7745569944381714, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8402389287948608, "step": 112 }, { "adv/mean_abs_final_conf": 0.7061506509780884, "adv/mean_abs_reasoning": 0.544262170791626, "adv/mean_abs_step_conf": 0.7459894418716431, "adv/ratio_final_to_reasoning": 1.2974457694735546, "adv/ratio_step_to_reasoning": 1.3706435646383544, "adv/std_final_conf": 0.8878163695335388, "adv/std_reasoning": 0.8097857236862183, "adv/std_step_conf": 0.9347683787345886, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.723826597131682, "calib/avg_num_step_conf": 6.38671875, "calib/ece": 0.27116935483870963, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4032258064516129, "calib/gap": 0.36347979139504566, "calib/mean_conf": 0.46366935483870964, "calib/mu_c": 0.6366153846153847, "calib/mu_w": 0.27313559322033903, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1053225806451613, "calib/std_conf": 0.4609927964900458, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3957500000000001, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.14008879781420774, "calib/step_q_w": 0.25566120218579236, "calib/step_q_w_n": 915.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 490.6796875, "completions/mean_terminated_length": 490.6796875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.12053333333333334, "grad_norm": 0.038772035390138626, "kl": 0.11591339111328125, "learning_rate": 2.4166666666666667e-06, "loss": 0.1866, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03541328012943268, "mask/share_reasoning": 0.8271290063858032, "mask/share_step_conf": 0.1374577134847641, "num_tokens": 25989555.0, "reward": 1.145193099975586, "reward_std": 0.2337724268436432, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6933558583259583, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8683326244354248, "step": 113 }, { "adv/mean_abs_final_conf": 0.576580286026001, "adv/mean_abs_reasoning": 0.44374608993530273, "adv/mean_abs_step_conf": 0.7638773918151855, "adv/ratio_final_to_reasoning": 1.2993473049194983, "adv/ratio_step_to_reasoning": 1.7214290089329176, "adv/std_final_conf": 0.8202750086784363, "adv/std_reasoning": 0.7392296195030212, "adv/std_step_conf": 0.934648871421814, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.847147626651183, "calib/avg_num_step_conf": 6.0078125, "calib/ece": 0.15614457831325304, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5542168674698795, "calib/gap": 0.6127710843373495, "calib/mean_conf": 0.625863453815261, "calib/mu_c": 0.8301204819277109, "calib/mu_w": 0.21734939759036143, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05767068273092371, "calib/std_conf": 0.45038421508140897, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45877307274701407, "calib/step_q_c_n": 921.0, "calib/step_q_gap": 0.2278654552429622, "calib/step_q_w": 0.23090761750405187, "calib/step_q_w_n": 617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2338.0, "completions/max_terminated_length": 2338.0, "completions/mean_length": 444.98828125, "completions/mean_terminated_length": 450.26483154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.1216, "grad_norm": 0.06637249886989594, "kl": 0.3884124755859375, "learning_rate": 2.388888888888889e-06, "loss": 0.0056, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.037197984755039215, "mask/share_reasoning": 0.8127014636993408, "mask/share_step_conf": 0.13838176429271698, "num_tokens": 26208496.0, "reward": 1.2241928577423096, "reward_std": 0.21004557609558105, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.822487473487854, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8672654628753662, "step": 114 }, { "adv/mean_abs_final_conf": 0.6964755058288574, "adv/mean_abs_reasoning": 0.4984782338142395, "adv/mean_abs_step_conf": 0.7788224220275879, "adv/ratio_final_to_reasoning": 1.3972034455739197, "adv/ratio_step_to_reasoning": 1.562400059212656, "adv/std_final_conf": 0.8562023043632507, "adv/std_reasoning": 0.7393795251846313, "adv/std_step_conf": 0.9353573322296143, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7044401544401544, "calib/avg_num_step_conf": 5.390625, "calib/ece": 0.2933864541832669, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5139442231075697, "calib/gap": 0.3227696267696266, "calib/mean_conf": 0.6018326693227092, "calib/mu_c": 0.7445714285714284, "calib/mu_w": 0.4218018018018018, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.16872509960159363, "calib/std_conf": 0.45197202682942783, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.40772423025435073, "calib/step_q_c_n": 747.0, "calib/step_q_gap": 0.06531822709479307, "calib/step_q_w": 0.34240600315955766, "calib/step_q_w_n": 633.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 441.5, "completions/mean_terminated_length": 441.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.12266666666666666, "grad_norm": 0.0388445183634758, "kl": 0.11220550537109375, "learning_rate": 2.361111111111111e-06, "loss": -0.0193, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.037708356976509094, "mask/share_reasoning": 0.8265572786331177, "mask/share_step_conf": 0.13573437929153442, "num_tokens": 26426784.0, "reward": 1.1016981601715088, "reward_std": 0.24980241060256958, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6808328032493591, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.812958836555481, "step": 115 }, { "adv/mean_abs_final_conf": 0.6575169563293457, "adv/mean_abs_reasoning": 0.46607887744903564, "adv/mean_abs_step_conf": 0.766182541847229, "adv/ratio_final_to_reasoning": 1.4107418038940056, "adv/ratio_step_to_reasoning": 1.6438902917908114, "adv/std_final_conf": 0.8525789380073547, "adv/std_reasoning": 0.7393465042114258, "adv/std_step_conf": 0.9343810081481934, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7432891280671408, "calib/avg_num_step_conf": 6.70703125, "calib/ece": 0.23700000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.496, "calib/gap": 0.42522198731501054, "calib/mean_conf": 0.57396, "calib/mu_c": 0.7797674418604651, "calib/mu_w": 0.35454545454545455, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.14748000000000003, "calib/std_conf": 0.45915914278167214, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.4153658536585365, "calib/step_q_c_n": 738.0, "calib/step_q_gap": 0.14663245222850585, "calib/step_q_w": 0.26873340143003066, "calib/step_q_w_n": 979.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 540.48046875, "completions/mean_terminated_length": 542.6000366210938, "completions/min_length": 0.0, "completions/min_terminated_length": 66.0, "epoch": 0.12373333333333333, "grad_norm": 0.034233979880809784, "kl": 0.1004180908203125, "learning_rate": 2.3333333333333336e-06, "loss": -0.0016, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032839495688676834, "mask/share_reasoning": 0.8337111473083496, "mask/share_step_conf": 0.12954317033290863, "num_tokens": 26669667.0, "reward": 1.1264750957489014, "reward_std": 0.25903597474098206, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.711389422416687, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8334362506866455, "step": 116 }, { "adv/mean_abs_final_conf": 0.6496747732162476, "adv/mean_abs_reasoning": 0.455629825592041, "adv/mean_abs_step_conf": 0.7362050414085388, "adv/ratio_final_to_reasoning": 1.425882891604092, "adv/ratio_step_to_reasoning": 1.615796420815782, "adv/std_final_conf": 0.8587450385093689, "adv/std_reasoning": 0.7206072807312012, "adv/std_step_conf": 0.9353505373001099, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7187174479166667, "calib/avg_num_step_conf": 5.8984375, "calib/ece": 0.28959677419354835, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.38306451612903225, "calib/gap": 0.31717187500000005, "calib/mean_conf": 0.4939516129032258, "calib/mu_c": 0.647421875, "calib/mu_w": 0.33025, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.1337096774193548, "calib/std_conf": 0.4501609363621312, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.42036231884057973, "calib/step_q_c_n": 690.0, "calib/step_q_gap": 0.14292817249911632, "calib/step_q_w": 0.2774341463414634, "calib/step_q_w_n": 820.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2075.0, "completions/max_terminated_length": 2075.0, "completions/mean_length": 478.65234375, "completions/mean_terminated_length": 478.65234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.1248, "grad_norm": 0.059748198837041855, "kl": 0.1706085205078125, "learning_rate": 2.305555555555556e-06, "loss": -0.0748, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03415881469845772, "mask/share_reasoning": 0.8324176073074341, "mask/share_step_conf": 0.1334235817193985, "num_tokens": 26898802.0, "reward": 1.1214919090270996, "reward_std": 0.23342269659042358, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.67963707447052, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8474394083023071, "step": 117 }, { "adv/mean_abs_final_conf": 0.6411893367767334, "adv/mean_abs_reasoning": 0.46815019845962524, "adv/mean_abs_step_conf": 0.755583643913269, "adv/ratio_final_to_reasoning": 1.369623122849176, "adv/ratio_step_to_reasoning": 1.613976980890745, "adv/std_final_conf": 0.8469781875610352, "adv/std_reasoning": 0.7574445009231567, "adv/std_step_conf": 0.9356558322906494, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7250325520833334, "calib/avg_num_step_conf": 7.359375, "calib/ece": 0.2762903225806452, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.3405885416666667, "calib/mean_conf": 0.6088709677419355, "calib/mu_c": 0.773671875, "calib/mu_w": 0.4330833333333333, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18451612903225806, "calib/std_conf": 0.4435027542508566, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.42813802083333335, "calib/step_q_c_n": 768.0, "calib/step_q_gap": 0.13966131832437279, "calib/step_q_w": 0.28847670250896057, "calib/step_q_w_n": 1116.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 519.95703125, "completions/mean_terminated_length": 524.0512084960938, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.12586666666666665, "grad_norm": 0.07028115540742874, "kl": 0.1076812744140625, "learning_rate": 2.277777777777778e-06, "loss": -0.103, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03212461620569229, "mask/share_reasoning": 0.8212298154830933, "mask/share_step_conf": 0.13883310556411743, "num_tokens": 27135919.0, "reward": 1.117949366569519, "reward_std": 0.2524290680885315, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6927351951599121, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8334632515907288, "step": 118 }, { "adv/mean_abs_final_conf": 0.6336255073547363, "adv/mean_abs_reasoning": 0.501274824142456, "adv/mean_abs_step_conf": 0.7393103837966919, "adv/ratio_final_to_reasoning": 1.2640281874093637, "adv/ratio_step_to_reasoning": 1.474860392323611, "adv/std_final_conf": 0.8469235897064209, "adv/std_reasoning": 0.7394473552703857, "adv/std_step_conf": 0.934846818447113, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7766627312081857, "calib/avg_num_step_conf": 6.29296875, "calib/ece": 0.25226720647773276, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.46963562753036436, "calib/gap": 0.4519526433162798, "calib/mean_conf": 0.5153441295546558, "calib/mu_c": 0.7367460317460318, "calib/mu_w": 0.284793388429752, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12874493927125505, "calib/std_conf": 0.47622565394745764, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45553129548762744, "calib/step_q_c_n": 687.0, "calib/step_q_gap": 0.20295553791186988, "calib/step_q_w": 0.25257575757575756, "calib/step_q_w_n": 924.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 532.75, "completions/mean_terminated_length": 534.8392333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.12693333333333334, "grad_norm": 0.03186385706067085, "kl": 0.100189208984375, "learning_rate": 2.25e-06, "loss": 0.0034, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03202106058597565, "mask/share_reasoning": 0.8441102504730225, "mask/share_step_conf": 0.1199624240398407, "num_tokens": 27377367.0, "reward": 1.1394624710083008, "reward_std": 0.24343638122081757, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.718923807144165, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8467713594436646, "step": 119 }, { "adv/mean_abs_final_conf": 0.5528663396835327, "adv/mean_abs_reasoning": 0.4402967095375061, "adv/mean_abs_step_conf": 0.7554687261581421, "adv/ratio_final_to_reasoning": 1.2556676616190736, "adv/ratio_step_to_reasoning": 1.7158173336150913, "adv/std_final_conf": 0.8008996248245239, "adv/std_reasoning": 0.7205032706260681, "adv/std_step_conf": 0.935148298740387, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8480592105263157, "calib/avg_num_step_conf": 5.50390625, "calib/ece": 0.1924603174603174, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4166666666666667, "calib/gap": 0.594978947368421, "calib/mean_conf": 0.4854761904761905, "calib/mu_c": 0.7215789473684211, "calib/mu_w": 0.1266, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.03738095238095231, "calib/std_conf": 0.4695744563623826, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.41179334916864613, "calib/step_q_c_n": 842.0, "calib/step_q_gap": 0.14965931036794067, "calib/step_q_w": 0.26213403880070546, "calib/step_q_w_n": 567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 443.40625, "completions/mean_terminated_length": 448.6640625, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.128, "grad_norm": 0.08257104456424713, "kl": 0.1157684326171875, "learning_rate": 2.222222222222222e-06, "loss": -0.068, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034355901181697845, "mask/share_reasoning": 0.8268024921417236, "mask/share_step_conf": 0.12712284922599792, "num_tokens": 27597567.0, "reward": 1.1937901973724365, "reward_std": 0.19093571603298187, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7867218255996704, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8583849668502808, "step": 120 }, { "adv/mean_abs_final_conf": 0.7695930004119873, "adv/mean_abs_reasoning": 0.6066721677780151, "adv/mean_abs_step_conf": 0.7388530969619751, "adv/ratio_final_to_reasoning": 1.268548387889101, "adv/ratio_step_to_reasoning": 1.217878676828184, "adv/std_final_conf": 0.9217678904533386, "adv/std_reasoning": 0.843062162399292, "adv/std_step_conf": 0.935973048210144, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7125100779360387, "calib/avg_num_step_conf": 6.828125, "calib/ece": 0.27237704918032785, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.36885245901639346, "calib/gap": 0.32524590163934425, "calib/mean_conf": 0.47975409836065575, "calib/mu_c": 0.6423770491803279, "calib/mu_w": 0.31713114754098365, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.12606557377049177, "calib/std_conf": 0.44842431441871206, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4331975867269985, "calib/step_q_c_n": 663.0, "calib/step_q_gap": 0.15232201069013213, "calib/step_q_w": 0.28087557603686636, "calib/step_q_w_n": 1085.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 553.16796875, "completions/mean_terminated_length": 557.5236206054688, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.12906666666666666, "grad_norm": 0.03182586282491684, "kl": 0.09899139404296875, "learning_rate": 2.1944444444444445e-06, "loss": 0.0229, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.030971001833677292, "mask/share_reasoning": 0.8363680839538574, "mask/share_step_conf": 0.12484840303659439, "num_tokens": 27844234.0, "reward": 1.0988376140594482, "reward_std": 0.30374959111213684, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6719839572906494, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8275442123413086, "step": 121 }, { "adv/mean_abs_final_conf": 0.6513844728469849, "adv/mean_abs_reasoning": 0.4658776521682739, "adv/mean_abs_step_conf": 0.7674317955970764, "adv/ratio_final_to_reasoning": 1.3981878499973772, "adv/ratio_step_to_reasoning": 1.6472818389663426, "adv/std_final_conf": 0.8468610048294067, "adv/std_reasoning": 0.7205932140350342, "adv/std_step_conf": 0.9350212812423706, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8093630660120023, "calib/avg_num_step_conf": 6.140625, "calib/ece": 0.18643999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.52, "calib/gap": 0.5671153846153847, "calib/mean_conf": 0.56388, "calib/mu_c": 0.7771153846153847, "calib/mu_w": 0.21, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.06315999999999997, "calib/std_conf": 0.471184619443377, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4641991341991342, "calib/step_q_c_n": 924.0, "calib/step_q_gap": 0.18450777617444286, "calib/step_q_w": 0.27969135802469136, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 484.57421875, "completions/mean_terminated_length": 484.57421875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.13013333333333332, "grad_norm": 0.05260329321026802, "kl": 0.110565185546875, "learning_rate": 2.166666666666667e-06, "loss": -0.0138, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033760882914066315, "mask/share_reasoning": 0.8320001363754272, "mask/share_step_conf": 0.1342390477657318, "num_tokens": 28075629.0, "reward": 1.2003474235534668, "reward_std": 0.19919967651367188, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7869769334793091, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8643536567687988, "step": 122 }, { "adv/mean_abs_final_conf": 0.6745046377182007, "adv/mean_abs_reasoning": 0.540432870388031, "adv/mean_abs_step_conf": 0.7357199788093567, "adv/ratio_final_to_reasoning": 1.248082184997197, "adv/ratio_step_to_reasoning": 1.361353128430381, "adv/std_final_conf": 0.8684989809989929, "adv/std_reasoning": 0.7928730845451355, "adv/std_step_conf": 0.9353545904159546, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7710259373394965, "calib/avg_num_step_conf": 7.02734375, "calib/ece": 0.22268000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.44, "calib/gap": 0.41282614278376994, "calib/mean_conf": 0.55204, "calib/mu_c": 0.7468939393939394, "calib/mu_w": 0.3340677966101695, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.12336000000000005, "calib/std_conf": 0.4447815625675147, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4186330049261084, "calib/step_q_c_n": 812.0, "calib/step_q_gap": 0.15670797959682775, "calib/step_q_w": 0.26192502532928064, "calib/step_q_w_n": 987.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 552.5390625, "completions/mean_terminated_length": 556.8897705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.1312, "grad_norm": 0.0321660153567791, "kl": 0.09477996826171875, "learning_rate": 2.138888888888889e-06, "loss": -0.021, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03134801238775253, "mask/share_reasoning": 0.8331590890884399, "mask/share_step_conf": 0.12768039107322693, "num_tokens": 28322367.0, "reward": 1.1497811079025269, "reward_std": 0.24997322261333466, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7325191497802734, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8472995162010193, "step": 123 }, { "adv/mean_abs_final_conf": 0.6126042008399963, "adv/mean_abs_reasoning": 0.30936306715011597, "adv/mean_abs_step_conf": 0.756053626537323, "adv/ratio_final_to_reasoning": 1.9802111689781476, "adv/ratio_step_to_reasoning": 2.4439039653380927, "adv/std_final_conf": 0.8274469375610352, "adv/std_reasoning": 0.596075713634491, "adv/std_step_conf": 0.9345342516899109, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7629561796228463, "calib/avg_num_step_conf": 6.10546875, "calib/ece": 0.2468774703557313, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4505928853754941, "calib/gap": 0.35856532356532356, "calib/mean_conf": 0.5804743083003953, "calib/mu_c": 0.7094444444444444, "calib/mu_w": 0.3508791208791209, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09351778656126483, "calib/std_conf": 0.43693119131131747, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48198266522210187, "calib/step_q_c_n": 923.0, "calib/step_q_gap": 0.1471076652221019, "calib/step_q_w": 0.334875, "calib/step_q_w_n": 640.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2397.0, "completions/max_terminated_length": 2397.0, "completions/mean_length": 488.70703125, "completions/mean_terminated_length": 488.70703125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.13226666666666667, "grad_norm": 0.042988456785678864, "kl": 0.1159820556640625, "learning_rate": 2.1111111111111114e-06, "loss": 0.049, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032922565937042236, "mask/share_reasoning": 0.8366243839263916, "mask/share_step_conf": 0.13045307993888855, "num_tokens": 28554292.0, "reward": 1.163818120956421, "reward_std": 0.19366100430488586, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7316859364509583, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8478209972381592, "step": 124 }, { "adv/mean_abs_final_conf": 0.7198798060417175, "adv/mean_abs_reasoning": 0.48377013206481934, "adv/mean_abs_step_conf": 0.7534222602844238, "adv/ratio_final_to_reasoning": 1.4880617018854367, "adv/ratio_step_to_reasoning": 1.5573972230750996, "adv/std_final_conf": 0.8781076073646545, "adv/std_reasoning": 0.739425003528595, "adv/std_step_conf": 0.9353218078613281, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7068089430894309, "calib/avg_num_step_conf": 5.94921875, "calib/ece": 0.2947011952191235, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.450199203187251, "calib/gap": 0.28353467987804887, "calib/mean_conf": 0.5841035856573704, "calib/mu_c": 0.7230468750000001, "calib/mu_w": 0.4395121951219512, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18442231075697216, "calib/std_conf": 0.4379773926144206, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47793696275071634, "calib/step_q_c_n": 698.0, "calib/step_q_gap": 0.10005817487192847, "calib/step_q_w": 0.37787878787878787, "calib/step_q_w_n": 825.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 513.16015625, "completions/mean_terminated_length": 515.172607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.13333333333333333, "grad_norm": 0.04248211905360222, "kl": 0.099151611328125, "learning_rate": 2.0833333333333334e-06, "loss": -0.0845, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03320443257689476, "mask/share_reasoning": 0.8370097875595093, "mask/share_step_conf": 0.12587954103946686, "num_tokens": 28790469.0, "reward": 1.1076419353485107, "reward_std": 0.2369730919599533, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6809245944023132, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8255103826522827, "step": 125 }, { "adv/mean_abs_final_conf": 0.5678446292877197, "adv/mean_abs_reasoning": 0.4430251121520996, "adv/mean_abs_step_conf": 0.768993616104126, "adv/ratio_final_to_reasoning": 1.2817436612775281, "adv/ratio_step_to_reasoning": 1.735778841900309, "adv/std_final_conf": 0.7833918333053589, "adv/std_reasoning": 0.7015147805213928, "adv/std_step_conf": 0.9341139197349548, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7835065835065835, "calib/avg_num_step_conf": 6.734375, "calib/ece": 0.23622406639004156, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5352697095435685, "calib/gap": 0.4505980595980596, "calib/mean_conf": 0.5942323651452281, "calib/mu_c": 0.8017692307692308, "calib/mu_w": 0.3511711711711712, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1455186721991702, "calib/std_conf": 0.45836296742522104, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5312717536813922, "calib/step_q_c_n": 747.0, "calib/step_q_gap": 0.25587769022182205, "calib/step_q_w": 0.2753940634595702, "calib/step_q_w_n": 977.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3018.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 561.328125, "completions/mean_terminated_length": 563.5294189453125, "completions/min_length": 0.0, "completions/min_terminated_length": 27.0, "epoch": 0.1344, "grad_norm": 0.038122810423374176, "kl": 0.09313201904296875, "learning_rate": 2.0555555555555555e-06, "loss": 0.0491, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03354516625404358, "mask/share_reasoning": 0.8258533477783203, "mask/share_step_conf": 0.1366952508687973, "num_tokens": 29039633.0, "reward": 1.1189892292022705, "reward_std": 0.22997254133224487, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7145671844482422, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.822378396987915, "step": 126 }, { "adv/mean_abs_final_conf": 0.6837416887283325, "adv/mean_abs_reasoning": 0.5188707113265991, "adv/mean_abs_step_conf": 0.7492235898971558, "adv/ratio_final_to_reasoning": 1.3177496316571946, "adv/ratio_step_to_reasoning": 1.4439504360953663, "adv/std_final_conf": 0.876003623008728, "adv/std_reasoning": 0.7755688428878784, "adv/std_step_conf": 0.9350953698158264, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7494789915966387, "calib/avg_num_step_conf": 7.17578125, "calib/ece": 0.25270491803278683, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4385245901639344, "calib/gap": 0.3674971428571428, "calib/mean_conf": 0.5354098360655737, "calib/mu_c": 0.7146399999999999, "calib/mu_w": 0.34714285714285714, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13790983606557375, "calib/std_conf": 0.44824365362825375, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5143576158940397, "calib/step_q_c_n": 755.0, "calib/step_q_gap": 0.19518016672583266, "calib/step_q_w": 0.31917744916820706, "calib/step_q_w_n": 1082.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2484.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 497.125, "completions/mean_terminated_length": 507.02789306640625, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.13546666666666668, "grad_norm": 0.03446248173713684, "kl": 0.1027679443359375, "learning_rate": 2.027777777777778e-06, "loss": -0.0337, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.032688289880752563, "mask/share_reasoning": 0.8081912994384766, "mask/share_step_conf": 0.13958916068077087, "num_tokens": 29270569.0, "reward": 1.1078226566314697, "reward_std": 0.2610389292240143, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.698003888130188, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8195732831954956, "step": 127 }, { "adv/mean_abs_final_conf": 0.7368889451026917, "adv/mean_abs_reasoning": 0.5929111242294312, "adv/mean_abs_step_conf": 0.763623833656311, "adv/ratio_final_to_reasoning": 1.24283204512376, "adv/ratio_step_to_reasoning": 1.2879229322080006, "adv/std_final_conf": 0.9130525588989258, "adv/std_reasoning": 0.8429921865463257, "adv/std_step_conf": 0.9349132776260376, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7458779191062657, "calib/avg_num_step_conf": 5.62109375, "calib/ece": 0.30168032786885246, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5860655737704918, "calib/gap": 0.3515774951208024, "calib/mean_conf": 0.6850409836065574, "calib/mu_c": 0.8680341880341881, "calib/mu_w": 0.5164566929133857, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2536065573770492, "calib/std_conf": 0.4225600141204308, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5703135888501742, "calib/step_q_c_n": 574.0, "calib/step_q_gap": 0.1935737044571107, "calib/step_q_w": 0.37673988439306355, "calib/step_q_w_n": 865.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 516.453125, "completions/mean_terminated_length": 518.4784545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 83.0, "epoch": 0.13653333333333334, "grad_norm": 0.0671505406498909, "kl": 0.097900390625, "learning_rate": 2.0000000000000003e-06, "loss": 0.025, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.035544052720069885, "mask/share_reasoning": 0.8391733169555664, "mask/share_step_conf": 0.1213764026761055, "num_tokens": 29509445.0, "reward": 1.0835953950881958, "reward_std": 0.3216124475002289, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.6603434085845947, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.817585825920105, "step": 128 }, { "adv/mean_abs_final_conf": 0.6601653099060059, "adv/mean_abs_reasoning": 0.5416503548622131, "adv/mean_abs_step_conf": 0.7622501850128174, "adv/ratio_final_to_reasoning": 1.2188034291493097, "adv/ratio_step_to_reasoning": 1.407273489568231, "adv/std_final_conf": 0.8547391295433044, "adv/std_reasoning": 0.7928169965744019, "adv/std_step_conf": 0.9353224635124207, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7006404810143128, "calib/avg_num_step_conf": 6.42578125, "calib/ece": 0.2701200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.568, "calib/gap": 0.28760930658126915, "calib/mean_conf": 0.6901200000000001, "calib/mu_c": 0.8132167832167831, "calib/mu_w": 0.525607476635514, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1941200000000001, "calib/std_conf": 0.4056850818060728, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5494588500563697, "calib/step_q_c_n": 887.0, "calib/step_q_gap": 0.1163981640405386, "calib/step_q_w": 0.43306068601583114, "calib/step_q_w_n": 758.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2895.0, "completions/max_terminated_length": 2895.0, "completions/mean_length": 478.265625, "completions/mean_terminated_length": 478.265625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1376, "grad_norm": 0.03564068675041199, "kl": 0.1029205322265625, "learning_rate": 1.9722222222222224e-06, "loss": 0.0749, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03495129942893982, "mask/share_reasoning": 0.8219506740570068, "mask/share_step_conf": 0.14309805631637573, "num_tokens": 29734265.0, "reward": 1.1189069747924805, "reward_std": 0.2425338625907898, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7006582021713257, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8200829029083252, "step": 129 }, { "adv/mean_abs_final_conf": 0.5851052403450012, "adv/mean_abs_reasoning": 0.38511407375335693, "adv/mean_abs_step_conf": 0.7399400472640991, "adv/ratio_final_to_reasoning": 1.5193037082299072, "adv/ratio_step_to_reasoning": 1.9213529125346571, "adv/std_final_conf": 0.8160495758056641, "adv/std_reasoning": 0.6612229943275452, "adv/std_step_conf": 0.934004545211792, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7913043478260868, "calib/avg_num_step_conf": 5.41015625, "calib/ece": 0.18928286852589649, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6095617529880478, "calib/gap": 0.46168668046928907, "calib/mean_conf": 0.7056972111553784, "calib/mu_c": 0.8712422360248446, "calib/mu_w": 0.40955555555555556, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12677290836653396, "calib/std_conf": 0.4146959380894869, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5991325898389095, "calib/step_q_c_n": 807.0, "calib/step_q_gap": 0.14627791855863265, "calib/step_q_w": 0.45285467128027684, "calib/step_q_w_n": 578.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 437.25390625, "completions/mean_terminated_length": 438.9686584472656, "completions/min_length": 0.0, "completions/min_terminated_length": 74.0, "epoch": 0.13866666666666666, "grad_norm": 0.03557712584733963, "kl": 0.1061553955078125, "learning_rate": 1.944444444444445e-06, "loss": 0.0019, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03761007636785507, "mask/share_reasoning": 0.8260776996612549, "mask/share_step_conf": 0.13240596652030945, "num_tokens": 29951490.0, "reward": 1.198660135269165, "reward_std": 0.19062399864196777, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7905261516571045, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8571335077285767, "step": 130 }, { "adv/mean_abs_final_conf": 0.620849609375, "adv/mean_abs_reasoning": 0.4095512628555298, "adv/mean_abs_step_conf": 0.7489230632781982, "adv/ratio_final_to_reasoning": 1.5159264924401081, "adv/ratio_step_to_reasoning": 1.8286430325143026, "adv/std_final_conf": 0.8143406510353088, "adv/std_reasoning": 0.7012926340103149, "adv/std_step_conf": 0.9351503849029541, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8083251392985906, "calib/avg_num_step_conf": 6.41015625, "calib/ece": 0.23830645161290326, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4838709677419355, "calib/gap": 0.44567944936086534, "calib/mean_conf": 0.5695161290322581, "calib/mu_c": 0.8121238938053098, "calib/mu_w": 0.36644444444444446, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.17608870967741935, "calib/std_conf": 0.4515865487701963, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5597693574958814, "calib/step_q_c_n": 607.0, "calib/step_q_gap": 0.2432606534339859, "calib/step_q_w": 0.3165087040618955, "calib/step_q_w_n": 1034.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2860.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 485.3984375, "completions/mean_terminated_length": 487.302001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.13973333333333332, "grad_norm": 0.04578558728098869, "kl": 0.099395751953125, "learning_rate": 1.916666666666667e-06, "loss": 0.0238, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034842804074287415, "mask/share_reasoning": 0.8241904377937317, "mask/share_step_conf": 0.1370604932308197, "num_tokens": 30181960.0, "reward": 1.127051591873169, "reward_std": 0.20851066708564758, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.7286202907562256, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8294886350631714, "step": 131 }, { "adv/mean_abs_final_conf": 0.5589631795883179, "adv/mean_abs_reasoning": 0.438191294670105, "adv/mean_abs_step_conf": 0.7336174249649048, "adv/ratio_final_to_reasoning": 1.275614523581845, "adv/ratio_step_to_reasoning": 1.67419443035082, "adv/std_final_conf": 0.8089057803153992, "adv/std_reasoning": 0.7013886570930481, "adv/std_step_conf": 0.9349938631057739, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8533506831489914, "calib/avg_num_step_conf": 6.2578125, "calib/ece": 0.183784860557769, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6055776892430279, "calib/gap": 0.5183786597267406, "calib/mean_conf": 0.6826693227091635, "calib/mu_c": 0.9015862068965519, "calib/mu_w": 0.3832075471698113, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14438247011952196, "calib/std_conf": 0.4299805960311306, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.616969696969697, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.270364758698092, "calib/step_q_w": 0.34660493827160493, "calib/step_q_w_n": 810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 482.35546875, "completions/mean_terminated_length": 484.2471008300781, "completions/min_length": 0.0, "completions/min_terminated_length": 84.0, "epoch": 0.1408, "grad_norm": 0.047956615686416626, "kl": 0.098297119140625, "learning_rate": 1.888888888888889e-06, "loss": -0.0322, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03540698066353798, "mask/share_reasoning": 0.8221828937530518, "mask/share_step_conf": 0.1385037899017334, "num_tokens": 30411035.0, "reward": 1.1885790824890137, "reward_std": 0.2194693386554718, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7971832156181335, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.846545934677124, "step": 132 }, { "adv/mean_abs_final_conf": 0.6864287853240967, "adv/mean_abs_reasoning": 0.5413827896118164, "adv/mean_abs_step_conf": 0.7185360193252563, "adv/ratio_final_to_reasoning": 1.2679176333187125, "adv/ratio_step_to_reasoning": 1.3272236079770152, "adv/std_final_conf": 0.8655786514282227, "adv/std_reasoning": 0.8099425435066223, "adv/std_step_conf": 0.9355461001396179, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7693362193362193, "calib/avg_num_step_conf": 6.8046875, "calib/ece": 0.26643032786885257, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4344262295081967, "calib/gap": 0.45164862914862913, "calib/mean_conf": 0.5078319672131147, "calib/mu_c": 0.7928888888888889, "calib/mu_w": 0.34124025974025973, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20270491803278695, "calib/std_conf": 0.46393697508061343, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.49898576512455517, "calib/step_q_c_n": 562.0, "calib/step_q_gap": 0.1589264430906569, "calib/step_q_w": 0.34005932203389827, "calib/step_q_w_n": 1180.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2861.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 527.63671875, "completions/mean_terminated_length": 544.6572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 40.0, "epoch": 0.14186666666666667, "grad_norm": 0.040555987507104874, "kl": 0.095001220703125, "learning_rate": 1.8611111111111113e-06, "loss": -0.1968, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03197144716978073, "mask/share_reasoning": 0.8133314847946167, "mask/share_step_conf": 0.12344710528850555, "num_tokens": 30652454.0, "reward": 1.107369065284729, "reward_std": 0.289519727230072, "rewards/accuracy_reward_step": 0.3515625, "rewards/final_brier_reward_step": 0.7042034268379211, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8335855603218079, "step": 133 }, { "adv/mean_abs_final_conf": 0.6914292573928833, "adv/mean_abs_reasoning": 0.608074963092804, "adv/mean_abs_step_conf": 0.7426619529724121, "adv/ratio_final_to_reasoning": 1.1370789776907126, "adv/ratio_step_to_reasoning": 1.2213328915814408, "adv/std_final_conf": 0.8574259281158447, "adv/std_reasoning": 0.8431612849235535, "adv/std_step_conf": 0.9358600974082947, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7793327678823863, "calib/avg_num_step_conf": 5.9609375, "calib/ece": 0.2264016736401674, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.4811715481171548, "calib/gap": 0.4349752615210632, "calib/mean_conf": 0.5857322175732217, "calib/mu_c": 0.782290076335878, "calib/mu_w": 0.3473148148148148, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.13200836820083683, "calib/std_conf": 0.4550141239332735, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.48848167539267023, "calib/step_q_c_n": 764.0, "calib/step_q_gap": 0.1779829877286282, "calib/step_q_w": 0.31049868766404204, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 554.38671875, "completions/mean_terminated_length": 558.751953125, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.14293333333333333, "grad_norm": 0.032993435859680176, "kl": 0.0858001708984375, "learning_rate": 1.8333333333333333e-06, "loss": -0.0314, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.03068811446428299, "mask/share_reasoning": 0.8482072949409485, "mask/share_step_conf": 0.11329209804534912, "num_tokens": 30903329.0, "reward": 1.092668056488037, "reward_std": 0.33639949560165405, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6982929706573486, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7996953725814819, "step": 134 }, { "adv/mean_abs_final_conf": 0.6794062852859497, "adv/mean_abs_reasoning": 0.5256780385971069, "adv/mean_abs_step_conf": 0.7526997923851013, "adv/ratio_final_to_reasoning": 1.2924380236600754, "adv/ratio_step_to_reasoning": 1.4318646340902015, "adv/std_final_conf": 0.8573894500732422, "adv/std_reasoning": 0.7755757570266724, "adv/std_step_conf": 0.9348970651626587, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6805479452054795, "calib/avg_num_step_conf": 6.73828125, "calib/ece": 0.28081300813008125, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4146341463414634, "calib/gap": 0.30192739726027396, "calib/mean_conf": 0.5482926829268292, "calib/mu_c": 0.6710273972602739, "calib/mu_w": 0.3691, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.11780487804878045, "calib/std_conf": 0.4409085359810263, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4391374527112232, "calib/step_q_c_n": 793.0, "calib/step_q_gap": 0.14010311794727465, "calib/step_q_w": 0.29903433476394853, "calib/step_q_w_n": 932.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2771.0, "completions/max_terminated_length": 2771.0, "completions/mean_length": 514.8828125, "completions/mean_terminated_length": 525.1394653320312, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.144, "grad_norm": 0.04568664729595184, "kl": 0.090484619140625, "learning_rate": 1.8055555555555557e-06, "loss": -0.0877, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.0343073233962059, "mask/share_reasoning": 0.8224637508392334, "mask/share_step_conf": 0.12369771301746368, "num_tokens": 31141019.0, "reward": 1.1282449960708618, "reward_std": 0.2306542545557022, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6699097752571106, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8551160097122192, "step": 135 }, { "adv/mean_abs_final_conf": 0.5642716884613037, "adv/mean_abs_reasoning": 0.4450831115245819, "adv/mean_abs_step_conf": 0.7536429762840271, "adv/ratio_final_to_reasoning": 1.2677894843694582, "adv/ratio_step_to_reasoning": 1.6932634754495812, "adv/std_final_conf": 0.7953715920448303, "adv/std_reasoning": 0.7014799118041992, "adv/std_step_conf": 0.9343529939651489, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8631752305665348, "calib/avg_num_step_conf": 6.421875, "calib/ece": 0.1445564516129032, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.31048387096774194, "calib/gap": 0.6184361001317522, "calib/mean_conf": 0.3692338709677419, "calib/mu_c": 0.7133636363636363, "calib/mu_w": 0.09492753623188407, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.03512096774193546, "calib/std_conf": 0.44941518645958045, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.45343076923076925, "calib/step_q_c_n": 650.0, "calib/step_q_gap": 0.20295793220863645, "calib/step_q_w": 0.2504728370221328, "calib/step_q_w_n": 994.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 483.77734375, "completions/mean_terminated_length": 483.77734375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.14506666666666668, "grad_norm": 0.04788472130894661, "kl": 0.1029815673828125, "learning_rate": 1.777777777777778e-06, "loss": -0.0129, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03419430926442146, "mask/share_reasoning": 0.8263726234436035, "mask/share_step_conf": 0.13943305611610413, "num_tokens": 31373354.0, "reward": 1.2139787673950195, "reward_std": 0.17367833852767944, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.8243738412857056, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8825975060462952, "step": 136 }, { "adv/mean_abs_final_conf": 0.6119123697280884, "adv/mean_abs_reasoning": 0.45151853561401367, "adv/mean_abs_step_conf": 0.7595447897911072, "adv/ratio_final_to_reasoning": 1.3552320037004846, "adv/ratio_step_to_reasoning": 1.6822006847586288, "adv/std_final_conf": 0.8421707153320312, "adv/std_reasoning": 0.7206130623817444, "adv/std_step_conf": 0.9349687695503235, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.788516129032258, "calib/avg_num_step_conf": 6.859375, "calib/ece": 0.20839357429718883, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3855421686746988, "calib/gap": 0.47698516129032253, "calib/mean_conf": 0.4663052208835341, "calib/mu_c": 0.7038399999999999, "calib/mu_w": 0.2268548387096774, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.0863453815261045, "calib/std_conf": 0.4581734330927749, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4103771131339402, "calib/step_q_c_n": 769.0, "calib/step_q_gap": 0.12516941303262308, "calib/step_q_w": 0.28520770010131713, "calib/step_q_w_n": 987.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2288.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 481.43359375, "completions/mean_terminated_length": 485.2243957519531, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.14613333333333334, "grad_norm": 0.04561835899949074, "kl": 0.1042022705078125, "learning_rate": 1.75e-06, "loss": -0.077, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03354161977767944, "mask/share_reasoning": 0.8171862959861755, "mask/share_step_conf": 0.14145955443382263, "num_tokens": 31603585.0, "reward": 1.181359052658081, "reward_std": 0.20482224225997925, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7560410499572754, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8763262033462524, "step": 137 }, { "adv/mean_abs_final_conf": 0.611071765422821, "adv/mean_abs_reasoning": 0.5744329690933228, "adv/mean_abs_step_conf": 0.7322912216186523, "adv/ratio_final_to_reasoning": 1.063782544353832, "adv/ratio_step_to_reasoning": 1.2748070898063022, "adv/std_final_conf": 0.8569561839103699, "adv/std_reasoning": 0.8266201615333557, "adv/std_step_conf": 0.9349929690361023, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7623496107572542, "calib/avg_num_step_conf": 6.609375, "calib/ece": 0.2608906882591093, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.39271255060728744, "calib/gap": 0.42322717622080674, "calib/mean_conf": 0.48234817813765185, "calib/mu_c": 0.6365605095541401, "calib/mu_w": 0.21333333333333337, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05380566801619439, "calib/std_conf": 0.4529741484641731, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4167561761546724, "calib/step_q_c_n": 931.0, "calib/step_q_gap": 0.1608954665620312, "calib/step_q_w": 0.25586070959264123, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2432.0, "completions/max_terminated_length": 2432.0, "completions/mean_length": 495.27734375, "completions/mean_terminated_length": 499.1771545410156, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.1472, "grad_norm": 0.04322437196969986, "kl": 0.100189208984375, "learning_rate": 1.7222222222222224e-06, "loss": 0.0086, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03365977108478546, "mask/share_reasoning": 0.8261804580688477, "mask/share_step_conf": 0.1323472559452057, "num_tokens": 31834712.0, "reward": 1.139789342880249, "reward_std": 0.22284376621246338, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7059851884841919, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8386456370353699, "step": 138 }, { "adv/mean_abs_final_conf": 0.6286588907241821, "adv/mean_abs_reasoning": 0.40157026052474976, "adv/mean_abs_step_conf": 0.7390480041503906, "adv/ratio_final_to_reasoning": 1.565501613348273, "adv/ratio_step_to_reasoning": 1.8403952603079812, "adv/std_final_conf": 0.8444784879684448, "adv/std_reasoning": 0.681614339351654, "adv/std_step_conf": 0.9344739317893982, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7652531517562191, "calib/avg_num_step_conf": 6.0234375, "calib/ece": 0.2688582677165354, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2755905511811024, "calib/gap": 0.3859131665880132, "calib/mean_conf": 0.4299606299212599, "calib/mu_c": 0.5682208588957055, "calib/mu_w": 0.1823076923076923, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.028543307086614178, "calib/std_conf": 0.42258158419456776, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39593394077448746, "calib/step_q_c_n": 878.0, "calib/step_q_gap": 0.1422441817383429, "calib/step_q_w": 0.25368975903614455, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 468.63671875, "completions/mean_terminated_length": 468.63671875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.14826666666666666, "grad_norm": 0.06251625716686249, "kl": 0.10687255859375, "learning_rate": 1.6944444444444446e-06, "loss": 0.1717, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.037038594484329224, "mask/share_reasoning": 0.8287454843521118, "mask/share_step_conf": 0.134215846657753, "num_tokens": 32057779.0, "reward": 1.1774184703826904, "reward_std": 0.18099024891853333, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7145543098449707, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8768551349639893, "step": 139 }, { "adv/mean_abs_final_conf": 0.5951710939407349, "adv/mean_abs_reasoning": 0.4744147062301636, "adv/mean_abs_step_conf": 0.7358417510986328, "adv/ratio_final_to_reasoning": 1.254537614717167, "adv/ratio_step_to_reasoning": 1.5510517305541478, "adv/std_final_conf": 0.816806972026825, "adv/std_reasoning": 0.7207441329956055, "adv/std_step_conf": 0.93478924036026, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7961909224795822, "calib/avg_num_step_conf": 5.77734375, "calib/ece": 0.20410358565737058, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4940239043824701, "calib/gap": 0.4661393760878296, "calib/mean_conf": 0.5888844621513944, "calib/mu_c": 0.769025974025974, "calib/mu_w": 0.3028865979381443, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.08972111553784864, "calib/std_conf": 0.4461374185301638, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.47370283018867926, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.18343341655951922, "calib/step_q_w": 0.29026941362916003, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 485.96875, "completions/mean_terminated_length": 485.96875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.14933333333333335, "grad_norm": 0.06402932852506638, "kl": 0.098541259765625, "learning_rate": 1.6666666666666667e-06, "loss": 0.0951, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03483884036540985, "mask/share_reasoning": 0.8365387916564941, "mask/share_step_conf": 0.12862235307693481, "num_tokens": 32287203.0, "reward": 1.1847764253616333, "reward_std": 0.2182726263999939, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7572652101516724, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8654833436012268, "step": 140 }, { "adv/mean_abs_final_conf": 0.5650936961174011, "adv/mean_abs_reasoning": 0.4365852475166321, "adv/mean_abs_step_conf": 0.7492316961288452, "adv/ratio_final_to_reasoning": 1.2943490402658955, "adv/ratio_step_to_reasoning": 1.7161177579650182, "adv/std_final_conf": 0.8094077706336975, "adv/std_reasoning": 0.7205508351325989, "adv/std_step_conf": 0.9348444938659668, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8673163418290855, "calib/avg_num_step_conf": 6.29296875, "calib/ece": 0.12342741935483872, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5362903225806451, "calib/gap": 0.6508752766473904, "calib/mean_conf": 0.6141532258064516, "calib/mu_c": 0.8424844720496893, "calib/mu_w": 0.19160919540229887, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.04419354838709679, "calib/std_conf": 0.4500027875246072, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4526805251641138, "calib/step_q_c_n": 914.0, "calib/step_q_gap": 0.19392873176382686, "calib/step_q_w": 0.2587517934002869, "calib/step_q_w_n": 697.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 513.63671875, "completions/mean_terminated_length": 515.6510009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 88.0, "epoch": 0.1504, "grad_norm": 0.04482351988554001, "kl": 0.0861968994140625, "learning_rate": 1.638888888888889e-06, "loss": 0.0588, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.032881513237953186, "mask/share_reasoning": 0.8362753987312317, "mask/share_step_conf": 0.12693683803081512, "num_tokens": 32525790.0, "reward": 1.232285737991333, "reward_std": 0.1995995044708252, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.8379597663879395, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8713870644569397, "step": 141 }, { "adv/mean_abs_final_conf": 0.6428453922271729, "adv/mean_abs_reasoning": 0.5539547801017761, "adv/mean_abs_step_conf": 0.7590426206588745, "adv/ratio_final_to_reasoning": 1.160465466349194, "adv/ratio_step_to_reasoning": 1.3702248774158394, "adv/std_final_conf": 0.8463671803474426, "adv/std_reasoning": 0.8099223971366882, "adv/std_step_conf": 0.9345756769180298, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.804326923076923, "calib/avg_num_step_conf": 6.56640625, "calib/ece": 0.18180000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.38, "calib/gap": 0.4997435897435897, "calib/mean_conf": 0.4782, "calib/mu_c": 0.7180769230769231, "calib/mu_w": 0.21833333333333335, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07000000000000002, "calib/std_conf": 0.4522251209298307, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4578113663845224, "calib/step_q_c_n": 827.0, "calib/step_q_gap": 0.15025867317609148, "calib/step_q_w": 0.3075526932084309, "calib/step_q_w_n": 854.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2665.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 510.1796875, "completions/mean_terminated_length": 512.180419921875, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.15146666666666667, "grad_norm": 0.03714209049940109, "kl": 0.0925140380859375, "learning_rate": 1.6111111111111113e-06, "loss": -0.0994, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03342527151107788, "mask/share_reasoning": 0.8303429484367371, "mask/share_step_conf": 0.13232550024986267, "num_tokens": 32761556.0, "reward": 1.1912291049957275, "reward_std": 0.20463573932647705, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7750167846679688, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8737108707427979, "step": 142 }, { "adv/mean_abs_final_conf": 0.6427560448646545, "adv/mean_abs_reasoning": 0.5893682241439819, "adv/mean_abs_step_conf": 0.7372866868972778, "adv/ratio_final_to_reasoning": 1.090584830558544, "adv/ratio_step_to_reasoning": 1.250978007794936, "adv/std_final_conf": 0.8869020938873291, "adv/std_reasoning": 0.8429052829742432, "adv/std_step_conf": 0.9352031946182251, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.8430587739625026, "calib/avg_num_step_conf": 6.42578125, "calib/ece": 0.16661157024793383, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.4256198347107438, "calib/gap": 0.5487072537040938, "calib/mean_conf": 0.5290082644628099, "calib/mu_c": 0.7580141843971631, "calib/mu_w": 0.20930693069306933, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.056487603305785075, "calib/std_conf": 0.45419150757310883, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.42411154345006485, "calib/step_q_c_n": 771.0, "calib/step_q_gap": 0.16634266473152937, "calib/step_q_w": 0.2577688787185355, "calib/step_q_w_n": 874.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 519.203125, "completions/mean_terminated_length": 523.2913208007812, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.15253333333333333, "grad_norm": 0.047295790165662766, "kl": 0.09334564208984375, "learning_rate": 1.5833333333333333e-06, "loss": -0.0946, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03189526125788689, "mask/share_reasoning": 0.8289496898651123, "mask/share_step_conf": 0.13134250044822693, "num_tokens": 33001808.0, "reward": 1.1662962436676025, "reward_std": 0.2495536059141159, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.769977331161499, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8422641754150391, "step": 143 }, { "adv/mean_abs_final_conf": 0.537711501121521, "adv/mean_abs_reasoning": 0.48254498839378357, "adv/mean_abs_step_conf": 0.7453676462173462, "adv/ratio_final_to_reasoning": 1.1143240818050284, "adv/ratio_step_to_reasoning": 1.5446593875078953, "adv/std_final_conf": 0.7811760902404785, "adv/std_reasoning": 0.7392632365226746, "adv/std_step_conf": 0.9346184730529785, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7303717575253156, "calib/avg_num_step_conf": 5.83984375, "calib/ece": 0.26418326693227084, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5139442231075697, "calib/gap": 0.41801359411846306, "calib/mean_conf": 0.5696812749003984, "calib/mu_c": 0.7179012345679012, "calib/mu_w": 0.29988764044943816, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.09422310756972105, "calib/std_conf": 0.46778436182848504, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.44524625267665957, "calib/step_q_c_n": 934.0, "calib/step_q_gap": 0.1155849336035758, "calib/step_q_w": 0.32966131907308377, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1510.0, "completions/max_terminated_length": 1510.0, "completions/mean_length": 475.97265625, "completions/mean_terminated_length": 477.8392333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 55.0, "epoch": 0.1536, "grad_norm": 0.07732080668210983, "kl": 0.141204833984375, "learning_rate": 1.5555555555555558e-06, "loss": -0.0174, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035442303866147995, "mask/share_reasoning": 0.8312504291534424, "mask/share_step_conf": 0.12940102815628052, "num_tokens": 33227785.0, "reward": 1.1550257205963135, "reward_std": 0.18985150754451752, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7156910300254822, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8488442301750183, "step": 144 }, { "adv/mean_abs_final_conf": 0.6415660977363586, "adv/mean_abs_reasoning": 0.5276660323143005, "adv/mean_abs_step_conf": 0.7385779619216919, "adv/ratio_final_to_reasoning": 1.2158563531605429, "adv/ratio_step_to_reasoning": 1.3997072327781812, "adv/std_final_conf": 0.8449078798294067, "adv/std_reasoning": 0.7753786444664001, "adv/std_step_conf": 0.9349455237388611, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7155692729766804, "calib/avg_num_step_conf": 6.47265625, "calib/ece": 0.24250000000000005, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.47619047619047616, "calib/gap": 0.30662962962962975, "calib/mean_conf": 0.623452380952381, "calib/mu_c": 0.732962962962963, "calib/mu_w": 0.4263333333333333, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11154761904761909, "calib/std_conf": 0.4225334420341048, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4739775051124744, "calib/step_q_c_n": 978.0, "calib/step_q_gap": 0.15094363176932268, "calib/step_q_w": 0.3230338733431517, "calib/step_q_w_n": 679.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3004.0, "completions/max_terminated_length": 3004.0, "completions/mean_length": 456.296875, "completions/mean_terminated_length": 456.296875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.15466666666666667, "grad_norm": 0.03158734366297722, "kl": 0.0939178466796875, "learning_rate": 1.527777777777778e-06, "loss": 0.0975, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.037699561566114426, "mask/share_reasoning": 0.8159480094909668, "mask/share_step_conf": 0.1463523805141449, "num_tokens": 33447301.0, "reward": 1.1692979335784912, "reward_std": 0.21003535389900208, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7208542823791504, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.862869381904602, "step": 145 }, { "adv/mean_abs_final_conf": 0.6607988476753235, "adv/mean_abs_reasoning": 0.46970364451408386, "adv/mean_abs_step_conf": 0.729251503944397, "adv/ratio_final_to_reasoning": 1.406842070299307, "adv/ratio_step_to_reasoning": 1.5525779126087464, "adv/std_final_conf": 0.8592219352722168, "adv/std_reasoning": 0.7394216656684875, "adv/std_step_conf": 0.9346105456352234, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7792182662538699, "calib/avg_num_step_conf": 6.2734375, "calib/ece": 0.24348000000000009, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.512, "calib/gap": 0.46127708978328164, "calib/mean_conf": 0.6119600000000001, "calib/mu_c": 0.8628947368421052, "calib/mu_w": 0.4016176470588235, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.19972000000000006, "calib/std_conf": 0.44000972534706545, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4796870342771982, "calib/step_q_c_n": 671.0, "calib/step_q_gap": 0.15754799684404308, "calib/step_q_w": 0.32213903743315514, "calib/step_q_w_n": 935.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 502.37890625, "completions/mean_terminated_length": 504.34906005859375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.15573333333333333, "grad_norm": 0.04502442106604576, "kl": 0.08422088623046875, "learning_rate": 1.5e-06, "loss": -0.1369, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033101823180913925, "mask/share_reasoning": 0.8348150253295898, "mask/share_step_conf": 0.12817689776420593, "num_tokens": 33683126.0, "reward": 1.1563743352890015, "reward_std": 0.23182733356952667, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7413148283958435, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8585601449012756, "step": 146 }, { "adv/mean_abs_final_conf": 0.6285818219184875, "adv/mean_abs_reasoning": 0.5205052495002747, "adv/mean_abs_step_conf": 0.729662299156189, "adv/ratio_final_to_reasoning": 1.2076378144542725, "adv/ratio_step_to_reasoning": 1.4018346594135627, "adv/std_final_conf": 0.8171048164367676, "adv/std_reasoning": 0.7753753662109375, "adv/std_step_conf": 0.9347905516624451, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7239484396200814, "calib/avg_num_step_conf": 7.15234375, "calib/ece": 0.33013661202185796, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6311475409836066, "calib/gap": 0.3178037087290819, "calib/mean_conf": 0.6843169398907105, "calib/mu_c": 0.8588484848484849, "calib/mu_w": 0.5410447761194029, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2818169398907104, "calib/std_conf": 0.43229103073089625, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5687556904400608, "calib/step_q_c_n": 659.0, "calib/step_q_gap": 0.2364263389042246, "calib/step_q_w": 0.3323293515358362, "calib/step_q_w_n": 1172.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 505.43359375, "completions/mean_terminated_length": 513.4563598632812, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.1568, "grad_norm": 0.03567939251661301, "kl": 0.111602783203125, "learning_rate": 1.4722222222222225e-06, "loss": -0.0141, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03253734111785889, "mask/share_reasoning": 0.8155205845832825, "mask/share_step_conf": 0.13631707429885864, "num_tokens": 33916197.0, "reward": 1.082091212272644, "reward_std": 0.2596330940723419, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.6370561122894287, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8337092399597168, "step": 147 }, { "adv/mean_abs_final_conf": 0.5409537553787231, "adv/mean_abs_reasoning": 0.5042853355407715, "adv/mean_abs_step_conf": 0.7518453598022461, "adv/ratio_final_to_reasoning": 1.072713635026944, "adv/ratio_step_to_reasoning": 1.4909125981147222, "adv/std_final_conf": 0.776544988155365, "adv/std_reasoning": 0.7576196789741516, "adv/std_step_conf": 0.9346625804901123, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8059624673370813, "calib/avg_num_step_conf": 6.38671875, "calib/ece": 0.15252032520325212, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7073170731707317, "calib/gap": 0.43957558001425295, "calib/mean_conf": 0.7866666666666666, "calib/mu_c": 0.9171098265895954, "calib/mu_w": 0.47753424657534244, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.11796747967479684, "calib/std_conf": 0.3661015600943583, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5194146100691016, "calib/step_q_c_n": 1013.0, "calib/step_q_gap": 0.19250142678935883, "calib/step_q_w": 0.3269131832797428, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 467.7109375, "completions/mean_terminated_length": 471.3937072753906, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.15786666666666666, "grad_norm": 0.039492230862379074, "kl": 0.08855438232421875, "learning_rate": 1.4444444444444445e-06, "loss": -0.091, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03712426871061325, "mask/share_reasoning": 0.8118203282356262, "mask/share_step_conf": 0.14324289560317993, "num_tokens": 34141043.0, "reward": 1.1953483819961548, "reward_std": 0.2314508557319641, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.8012219071388245, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8414207696914673, "step": 148 }, { "adv/mean_abs_final_conf": 0.6707220077514648, "adv/mean_abs_reasoning": 0.5681421756744385, "adv/mean_abs_step_conf": 0.7494305372238159, "adv/ratio_final_to_reasoning": 1.180553101792266, "adv/ratio_step_to_reasoning": 1.319089779480235, "adv/std_final_conf": 0.8762696385383606, "adv/std_reasoning": 0.7928650379180908, "adv/std_step_conf": 0.9343019127845764, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.866640153774773, "calib/avg_num_step_conf": 6.67578125, "calib/ece": 0.2108064516129033, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6733870967741935, "calib/gap": 0.46435341684894277, "calib/mean_conf": 0.7583064516129032, "calib/mu_c": 0.9586524822695035, "calib/mu_w": 0.49429906542056073, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2002822580645162, "calib/std_conf": 0.3839937032711931, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5505997552019585, "calib/step_q_c_n": 817.0, "calib/step_q_gap": 0.20829482246653247, "calib/step_q_w": 0.342304932735426, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 536.609375, "completions/mean_terminated_length": 540.8346557617188, "completions/min_length": 0.0, "completions/min_terminated_length": 87.0, "epoch": 0.15893333333333334, "grad_norm": 0.058882106095552444, "kl": 0.0788726806640625, "learning_rate": 1.4166666666666667e-06, "loss": -0.033, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034434471279382706, "mask/share_reasoning": 0.82708740234375, "mask/share_step_conf": 0.1306656002998352, "num_tokens": 34382871.0, "reward": 1.1861321926116943, "reward_std": 0.2548786401748657, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7701757550239563, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8659757375717163, "step": 149 }, { "adv/mean_abs_final_conf": 0.6370327472686768, "adv/mean_abs_reasoning": 0.47151249647140503, "adv/mean_abs_step_conf": 0.742438554763794, "adv/ratio_final_to_reasoning": 1.3510410689768637, "adv/ratio_step_to_reasoning": 1.5745893487868974, "adv/std_final_conf": 0.85017329454422, "adv/std_reasoning": 0.7206061482429504, "adv/std_step_conf": 0.9352177381515503, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7956780538302277, "calib/avg_num_step_conf": 6.5078125, "calib/ece": 0.27388000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.736, "calib/gap": 0.3759510869565218, "calib/mean_conf": 0.7894, "calib/mu_c": 0.9578260869565218, "calib/mu_w": 0.581875, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2556400000000001, "calib/std_conf": 0.3788256063150959, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5579088471849867, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.1947131950110736, "calib/step_q_w": 0.3631956521739131, "calib/step_q_w_n": 920.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2666.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 435.05859375, "completions/mean_terminated_length": 438.4842529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.16, "grad_norm": 0.0466759093105793, "kl": 0.09934234619140625, "learning_rate": 1.3888888888888892e-06, "loss": -0.0764, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.04131307825446129, "mask/share_reasoning": 0.7998567819595337, "mask/share_step_conf": 0.15101763606071472, "num_tokens": 34599206.0, "reward": 1.1474685668945312, "reward_std": 0.21869435906410217, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.721463680267334, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8468990325927734, "step": 150 }, { "adv/mean_abs_final_conf": 0.6580295562744141, "adv/mean_abs_reasoning": 0.5155429244041443, "adv/mean_abs_step_conf": 0.7393295764923096, "adv/ratio_final_to_reasoning": 1.2763817038803382, "adv/ratio_step_to_reasoning": 1.4340795722234265, "adv/std_final_conf": 0.8595101833343506, "adv/std_reasoning": 0.7755288481712341, "adv/std_step_conf": 0.9352730512619019, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.7508652963198418, "calib/avg_num_step_conf": 6.53125, "calib/ece": 0.28424369747899164, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.6092436974789915, "calib/gap": 0.37555484919121285, "calib/mean_conf": 0.6671008403361344, "calib/mu_c": 0.8580341880341881, "calib/mu_w": 0.4824793388429752, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.22987394957983198, "calib/std_conf": 0.43708497519991735, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.514, "calib/step_q_c_n": 640.0, "calib/step_q_gap": 0.16983333333333334, "calib/step_q_w": 0.3441666666666667, "calib/step_q_w_n": 1032.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 556.33203125, "completions/mean_terminated_length": 562.9288940429688, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.16106666666666666, "grad_norm": 0.032359592616558075, "kl": 0.07588958740234375, "learning_rate": 1.3611111111111112e-06, "loss": -0.07, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.03231126070022583, "mask/share_reasoning": 0.835185170173645, "mask/share_step_conf": 0.12078479677438736, "num_tokens": 34848651.0, "reward": 1.0490763187408447, "reward_std": 0.29749518632888794, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.6578387022018433, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": 0.7779176235198975, "step": 151 }, { "adv/mean_abs_final_conf": 0.6492472887039185, "adv/mean_abs_reasoning": 0.5543559193611145, "adv/mean_abs_step_conf": 0.7531231641769409, "adv/ratio_final_to_reasoning": 1.1711740887554056, "adv/ratio_step_to_reasoning": 1.358555285284772, "adv/std_final_conf": 0.8588571548461914, "adv/std_reasoning": 0.7929433584213257, "adv/std_step_conf": 0.9349693655967712, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7679598662207359, "calib/avg_num_step_conf": 6.58984375, "calib/ece": 0.2981999999999998, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6122448979591837, "calib/gap": 0.32719832775919744, "calib/mean_conf": 0.7059632653061224, "calib/mu_c": 0.859546153846154, "calib/mu_w": 0.5323478260869565, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.23677551020408147, "calib/std_conf": 0.4136108230405445, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.47160194174757286, "calib/step_q_c_n": 824.0, "calib/step_q_gap": 0.12535396955753814, "calib/step_q_w": 0.3462479721900347, "calib/step_q_w_n": 863.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 491.62890625, "completions/mean_terminated_length": 491.62890625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.16213333333333332, "grad_norm": 0.04753780737519264, "kl": 0.08827972412109375, "learning_rate": 1.3333333333333334e-06, "loss": -0.0528, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.035858429968357086, "mask/share_reasoning": 0.8217682242393494, "mask/share_step_conf": 0.14237335324287415, "num_tokens": 35079900.0, "reward": 1.095044732093811, "reward_std": 0.28394240140914917, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6688538193702698, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8204070925712585, "step": 152 }, { "adv/mean_abs_final_conf": 0.6669121980667114, "adv/mean_abs_reasoning": 0.4369834363460541, "adv/mean_abs_step_conf": 0.7152504920959473, "adv/ratio_final_to_reasoning": 1.5261727163923282, "adv/ratio_step_to_reasoning": 1.6367908543094734, "adv/std_final_conf": 0.8692089319229126, "adv/std_reasoning": 0.7208417057991028, "adv/std_step_conf": 0.9349717497825623, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6541086541086542, "calib/avg_num_step_conf": 6.44140625, "calib/ece": 0.2792307692307693, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7368421052631579, "calib/gap": 0.2747229047229047, "calib/mean_conf": 0.7925910931174088, "calib/mu_c": 0.9027027027027027, "calib/mu_w": 0.627979797979798, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.23631578947368426, "calib/std_conf": 0.37290489882058686, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4857142857142857, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.1988883471477328, "calib/step_q_w": 0.2868259385665529, "calib/step_q_w_n": 879.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2450.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 494.52734375, "completions/mean_terminated_length": 498.4212646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 89.0, "epoch": 0.1632, "grad_norm": 0.03760277479887009, "kl": 0.0871429443359375, "learning_rate": 1.3055555555555556e-06, "loss": 0.0606, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03430212289094925, "mask/share_reasoning": 0.8316156268119812, "mask/share_step_conf": 0.12626971304416656, "num_tokens": 35313819.0, "reward": 1.112259864807129, "reward_std": 0.2520466148853302, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6784656047821045, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8270569443702698, "step": 153 }, { "adv/mean_abs_final_conf": 0.6259250640869141, "adv/mean_abs_reasoning": 0.4594433605670929, "adv/mean_abs_step_conf": 0.7331865429878235, "adv/ratio_final_to_reasoning": 1.3623552276701356, "adv/ratio_step_to_reasoning": 1.5958148618860184, "adv/std_final_conf": 0.861115038394928, "adv/std_reasoning": 0.739260733127594, "adv/std_step_conf": 0.9349847435951233, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.732061790668348, "calib/avg_num_step_conf": 6.046875, "calib/ece": 0.32396825396825396, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6865079365079365, "calib/gap": 0.3342345523329129, "calib/mean_conf": 0.7538888888888889, "calib/mu_c": 0.9263114754098359, "calib/mu_w": 0.5920769230769231, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.29686507936507933, "calib/std_conf": 0.3991160316273202, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48846938775510207, "calib/step_q_c_n": 686.0, "calib/step_q_gap": 0.1685157914673991, "calib/step_q_w": 0.319953596287703, "calib/step_q_w_n": 862.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2309.0, "completions/max_terminated_length": 2309.0, "completions/mean_length": 468.23046875, "completions/mean_terminated_length": 468.23046875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.16426666666666667, "grad_norm": 0.05206868425011635, "kl": 0.09896087646484375, "learning_rate": 1.2777777777777779e-06, "loss": 0.0221, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03563937172293663, "mask/share_reasoning": 0.8281756043434143, "mask/share_step_conf": 0.13618502020835876, "num_tokens": 35538126.0, "reward": 1.1311614513397217, "reward_std": 0.24756911396980286, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6666179895401001, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8700532913208008, "step": 154 }, { "adv/mean_abs_final_conf": 0.6815347671508789, "adv/mean_abs_reasoning": 0.501927375793457, "adv/mean_abs_step_conf": 0.7255311012268066, "adv/ratio_final_to_reasoning": 1.3578354160768673, "adv/ratio_step_to_reasoning": 1.445490196823539, "adv/std_final_conf": 0.8606975078582764, "adv/std_reasoning": 0.7576583027839661, "adv/std_step_conf": 0.9355096817016602, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7602004294917681, "calib/avg_num_step_conf": 6.38671875, "calib/ece": 0.25439516129032247, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5483870967741935, "calib/gap": 0.3882976508101777, "calib/mean_conf": 0.6551209677419356, "calib/mu_c": 0.8539669421487605, "calib/mu_w": 0.46566929133858276, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.21080645161290312, "calib/std_conf": 0.42929234893097773, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4197167138810199, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 0.1270611702857562, "calib/step_q_w": 0.2926555435952637, "calib/step_q_w_n": 929.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 448.1640625, "completions/mean_terminated_length": 449.9216003417969, "completions/min_length": 0.0, "completions/min_terminated_length": 47.0, "epoch": 0.16533333333333333, "grad_norm": 0.03472757712006569, "kl": 0.105712890625, "learning_rate": 1.25e-06, "loss": 0.0211, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03683391213417053, "mask/share_reasoning": 0.8154951333999634, "mask/share_step_conf": 0.14376471936702728, "num_tokens": 35760072.0, "reward": 1.1191637516021729, "reward_std": 0.2559158205986023, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7044066190719604, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8309472799301147, "step": 155 }, { "adv/mean_abs_final_conf": 0.565130889415741, "adv/mean_abs_reasoning": 0.47859495878219604, "adv/mean_abs_step_conf": 0.7466103434562683, "adv/ratio_final_to_reasoning": 1.180812457477068, "adv/ratio_step_to_reasoning": 1.5600046129947713, "adv/std_final_conf": 0.796569287776947, "adv/std_reasoning": 0.7205866575241089, "adv/std_step_conf": 0.9340900182723999, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7442551246899073, "calib/avg_num_step_conf": 6.62890625, "calib/ece": 0.274417670682731, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6104417670682731, "calib/gap": 0.31649040344692514, "calib/mean_conf": 0.7129718875502009, "calib/mu_c": 0.8540579710144928, "calib/mu_w": 0.5375675675675676, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.21658634538152616, "calib/std_conf": 0.40130710532457237, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4248878923766816, "calib/step_q_c_n": 892.0, "calib/step_q_gap": 0.07880093585494247, "calib/step_q_w": 0.34608695652173915, "calib/step_q_w_n": 805.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2155.0, "completions/max_terminated_length": 2155.0, "completions/mean_length": 489.3046875, "completions/mean_terminated_length": 491.22357177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.1664, "grad_norm": 0.03993004187941551, "kl": 0.091796875, "learning_rate": 1.2222222222222223e-06, "loss": -0.117, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.035405173897743225, "mask/share_reasoning": 0.8169474601745605, "mask/share_step_conf": 0.14374110102653503, "num_tokens": 35990094.0, "reward": 1.1344704627990723, "reward_std": 0.1839873492717743, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7003722190856934, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8441497683525085, "step": 156 }, { "adv/mean_abs_final_conf": 0.6025352478027344, "adv/mean_abs_reasoning": 0.5503590703010559, "adv/mean_abs_step_conf": 0.7544006109237671, "adv/ratio_final_to_reasoning": 1.0948038840772392, "adv/ratio_step_to_reasoning": 1.3707425781337572, "adv/std_final_conf": 0.8273239731788635, "adv/std_reasoning": 0.7928828597068787, "adv/std_step_conf": 0.934908926486969, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8107236842105262, "calib/avg_num_step_conf": 6.55078125, "calib/ece": 0.20416666666666664, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6507936507936508, "calib/gap": 0.4628789473684208, "calib/mean_conf": 0.7078968253968254, "calib/mu_c": 0.8915789473684209, "calib/mu_w": 0.42870000000000014, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1544444444444444, "calib/std_conf": 0.4203399291307785, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.46174124513618675, "calib/step_q_c_n": 1028.0, "calib/step_q_gap": 0.12420657641507737, "calib/step_q_w": 0.3375346687211094, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 470.54296875, "completions/mean_terminated_length": 472.3882751464844, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.16746666666666668, "grad_norm": 0.03731679916381836, "kl": 0.08966064453125, "learning_rate": 1.1944444444444446e-06, "loss": -0.0978, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035209380090236664, "mask/share_reasoning": 0.8131346702575684, "mask/share_step_conf": 0.147749662399292, "num_tokens": 36214281.0, "reward": 1.1939563751220703, "reward_std": 0.21433734893798828, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.778255820274353, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8632087111473083, "step": 157 }, { "adv/mean_abs_final_conf": 0.6070557832717896, "adv/mean_abs_reasoning": 0.530916154384613, "adv/mean_abs_step_conf": 0.7467933893203735, "adv/ratio_final_to_reasoning": 1.1434117765269927, "adv/ratio_step_to_reasoning": 1.4066126697274537, "adv/std_final_conf": 0.8306695818901062, "adv/std_reasoning": 0.7928876280784607, "adv/std_step_conf": 0.9353790283203125, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6861399397388684, "calib/avg_num_step_conf": 7.01953125, "calib/ece": 0.2908064516129033, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6774193548387096, "calib/gap": 0.2564847673250755, "calib/mean_conf": 0.7820967741935483, "calib/mu_c": 0.8886206896551726, "calib/mu_w": 0.6321359223300971, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2441129032258065, "calib/std_conf": 0.3714917057068018, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44382448537378116, "calib/step_q_c_n": 923.0, "calib/step_q_gap": 0.12091830688407862, "calib/step_q_w": 0.32290617848970254, "calib/step_q_w_n": 874.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2531.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 504.9140625, "completions/mean_terminated_length": 506.8941345214844, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.16853333333333334, "grad_norm": 0.039824292063713074, "kl": 0.08563995361328125, "learning_rate": 1.1666666666666668e-06, "loss": 0.0819, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03828030824661255, "mask/share_reasoning": 0.8063184022903442, "mask/share_step_conf": 0.15149502456188202, "num_tokens": 36448779.0, "reward": 1.105135202407837, "reward_std": 0.25119927525520325, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6788246035575867, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.816797137260437, "step": 158 }, { "adv/mean_abs_final_conf": 0.658309817314148, "adv/mean_abs_reasoning": 0.5221865177154541, "adv/mean_abs_step_conf": 0.7737743258476257, "adv/ratio_final_to_reasoning": 1.2606794602706866, "adv/ratio_step_to_reasoning": 1.4817968285218444, "adv/std_final_conf": 0.85653156042099, "adv/std_reasoning": 0.7753108143806458, "adv/std_step_conf": 0.9341161251068115, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7589743589743589, "calib/avg_num_step_conf": 6.56640625, "calib/ece": 0.26162698412698404, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6468253968253969, "calib/gap": 0.3909002849002847, "calib/mean_conf": 0.7204365079365079, "calib/mu_c": 0.9019259259259259, "calib/mu_w": 0.5110256410256412, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22317460317460308, "calib/std_conf": 0.4106588395209512, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4664348925410872, "calib/step_q_c_n": 791.0, "calib/step_q_gap": 0.16261466782198614, "calib/step_q_w": 0.3038202247191011, "calib/step_q_w_n": 890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 442.78125, "completions/mean_terminated_length": 448.0316467285156, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.1696, "grad_norm": 0.05413948372006416, "kl": 0.10001373291015625, "learning_rate": 1.138888888888889e-06, "loss": -0.1049, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.036744873970746994, "mask/share_reasoning": 0.8111945390701294, "mask/share_step_conf": 0.1403418779373169, "num_tokens": 36666915.0, "reward": 1.1794384717941284, "reward_std": 0.21837085485458374, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7313566207885742, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.883450984954834, "step": 159 }, { "adv/mean_abs_final_conf": 0.6163524985313416, "adv/mean_abs_reasoning": 0.5648249983787537, "adv/mean_abs_step_conf": 0.7450392246246338, "adv/ratio_final_to_reasoning": 1.0912273718417032, "adv/ratio_step_to_reasoning": 1.3190620577402883, "adv/std_final_conf": 0.8117612600326538, "adv/std_reasoning": 0.7930551767349243, "adv/std_step_conf": 0.935079038143158, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7678719008264463, "calib/avg_num_step_conf": 6.953125, "calib/ece": 0.2767768595041321, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5826446280991735, "calib/gap": 0.35518181818181827, "calib/mean_conf": 0.6590082644628099, "calib/mu_c": 0.8204545454545455, "calib/mu_w": 0.4652727272727273, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.19516528925619825, "calib/std_conf": 0.4414512223902891, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.42633136094674556, "calib/step_q_c_n": 676.0, "calib/step_q_gap": 0.18621360732355713, "calib/step_q_w": 0.24011775362318843, "calib/step_q_w_n": 1104.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 495.39453125, "completions/mean_terminated_length": 505.2629699707031, "completions/min_length": 0.0, "completions/min_terminated_length": 72.0, "epoch": 0.17066666666666666, "grad_norm": 0.026633771136403084, "kl": 0.09081268310546875, "learning_rate": 1.111111111111111e-06, "loss": 0.0281, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03583589568734169, "mask/share_reasoning": 0.8143898844718933, "mask/share_step_conf": 0.1302429735660553, "num_tokens": 36898576.0, "reward": 1.1015212535858154, "reward_std": 0.26079750061035156, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6732444763183594, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8261153697967529, "step": 160 }, { "adv/mean_abs_final_conf": 0.5825837254524231, "adv/mean_abs_reasoning": 0.42827603220939636, "adv/mean_abs_step_conf": 0.7349774241447449, "adv/ratio_final_to_reasoning": 1.3602996236959188, "adv/ratio_step_to_reasoning": 1.7161301797654496, "adv/std_final_conf": 0.8111425042152405, "adv/std_reasoning": 0.7205584645271301, "adv/std_step_conf": 0.9347352385520935, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6690058479532164, "calib/avg_num_step_conf": 6.078125, "calib/ece": 0.28840597609561747, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6175298804780877, "calib/gap": 0.24872061403508772, "calib/mean_conf": 0.6920721115537849, "calib/mu_c": 0.7713456140350877, "calib/mu_w": 0.522625, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14960159362549796, "calib/std_conf": 0.42532849411659873, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.42097, "calib/step_q_c_n": 1000.0, "calib/step_q_gap": 0.1140275539568345, "calib/step_q_w": 0.3069424460431655, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 450.99609375, "completions/mean_terminated_length": 452.7647399902344, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.17173333333333332, "grad_norm": 0.04227954521775246, "kl": 0.10567474365234375, "learning_rate": 1.0833333333333335e-06, "loss": 0.0908, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03879394754767418, "mask/share_reasoning": 0.8145664930343628, "mask/share_step_conf": 0.14273332059383392, "num_tokens": 37117951.0, "reward": 1.1423712968826294, "reward_std": 0.18462207913398743, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6920831799507141, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8419811725616455, "step": 161 }, { "adv/mean_abs_final_conf": 0.5636827945709229, "adv/mean_abs_reasoning": 0.44984203577041626, "adv/mean_abs_step_conf": 0.7485389709472656, "adv/ratio_final_to_reasoning": 1.2530682989764144, "adv/ratio_step_to_reasoning": 1.6640040534791058, "adv/std_final_conf": 0.7876395583152771, "adv/std_reasoning": 0.7014114260673523, "adv/std_step_conf": 0.9347339272499084, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7785826021900293, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.19414062499999996, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.64453125, "calib/gap": 0.45804121607835147, "calib/mean_conf": 0.706171875, "calib/mu_c": 0.8618343195266273, "calib/mu_w": 0.4037931034482758, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12007812499999997, "calib/std_conf": 0.4198755788432859, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4280067567567568, "calib/step_q_c_n": 888.0, "calib/step_q_gap": 0.08075675675675675, "calib/step_q_w": 0.34725000000000006, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 413.7421875, "completions/mean_terminated_length": 415.3647155761719, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.1728, "grad_norm": 0.047483619302511215, "kl": 0.104461669921875, "learning_rate": 1.0555555555555557e-06, "loss": 0.0835, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03956054151058197, "mask/share_reasoning": 0.8172359466552734, "mask/share_step_conf": 0.1392972469329834, "num_tokens": 37328013.0, "reward": 1.216137409210205, "reward_std": 0.18878695368766785, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7969167828559875, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8699263334274292, "step": 162 }, { "adv/mean_abs_final_conf": 0.5739138126373291, "adv/mean_abs_reasoning": 0.443822979927063, "adv/mean_abs_step_conf": 0.7315582633018494, "adv/ratio_final_to_reasoning": 1.2931142338137718, "adv/ratio_step_to_reasoning": 1.648310917614208, "adv/std_final_conf": 0.8101106286048889, "adv/std_reasoning": 0.7206270098686218, "adv/std_step_conf": 0.9346047639846802, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8156389065479974, "calib/avg_num_step_conf": 6.01953125, "calib/ece": 0.22665338645418318, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.49800796812749004, "calib/gap": 0.46431722822631916, "calib/mean_conf": 0.5770119521912351, "calib/mu_c": 0.8008461538461539, "calib/mu_w": 0.3365289256198347, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14286852589641424, "calib/std_conf": 0.45616388927780055, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.43541944074567246, "calib/step_q_c_n": 751.0, "calib/step_q_gap": 0.16036880783428004, "calib/step_q_w": 0.2750506329113924, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2386.0, "completions/max_terminated_length": 2386.0, "completions/mean_length": 469.5625, "completions/mean_terminated_length": 469.5625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.17386666666666667, "grad_norm": 0.03634432330727577, "kl": 0.09714508056640625, "learning_rate": 1.0277777777777777e-06, "loss": -0.0162, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.037918318063020706, "mask/share_reasoning": 0.8201773166656494, "mask/share_step_conf": 0.14190436899662018, "num_tokens": 37553053.0, "reward": 1.1781021356582642, "reward_std": 0.1947747766971588, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7529621124267578, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8709113001823425, "step": 163 }, { "adv/mean_abs_final_conf": 0.6148437261581421, "adv/mean_abs_reasoning": 0.50270015001297, "adv/mean_abs_step_conf": 0.7660697102546692, "adv/ratio_final_to_reasoning": 1.2230824401828382, "adv/ratio_step_to_reasoning": 1.5239098501062793, "adv/std_final_conf": 0.8307430148124695, "adv/std_reasoning": 0.7575799822807312, "adv/std_step_conf": 0.9345313906669617, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8027272727272727, "calib/avg_num_step_conf": 6.6953125, "calib/ece": 0.23243999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.444, "calib/gap": 0.4427727272727273, "calib/mean_conf": 0.5366799999999999, "calib/mu_c": 0.7315, "calib/mu_w": 0.2887272727272727, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10455999999999999, "calib/std_conf": 0.45835420539141997, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41191158900836317, "calib/step_q_c_n": 837.0, "calib/step_q_gap": 0.15996176004599144, "calib/step_q_w": 0.2519498289623717, "calib/step_q_w_n": 877.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2870.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 526.41015625, "completions/mean_terminated_length": 526.41015625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.17493333333333333, "grad_norm": 0.035710543394088745, "kl": 0.0910186767578125, "learning_rate": 1.0000000000000002e-06, "loss": 0.0663, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032561205327510834, "mask/share_reasoning": 0.8339776992797852, "mask/share_step_conf": 0.1334611475467682, "num_tokens": 37793950.0, "reward": 1.172410488128662, "reward_std": 0.2033061385154724, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7394199371337891, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8676632642745972, "step": 164 }, { "adv/mean_abs_final_conf": 0.5234141945838928, "adv/mean_abs_reasoning": 0.373664915561676, "adv/mean_abs_step_conf": 0.7581987380981445, "adv/ratio_final_to_reasoning": 1.400758200156738, "adv/ratio_step_to_reasoning": 2.0290873093034567, "adv/std_final_conf": 0.7647106051445007, "adv/std_reasoning": 0.6814833879470825, "adv/std_step_conf": 0.9337098598480225, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7628519356460532, "calib/avg_num_step_conf": 6.0, "calib/ece": 0.231501976284585, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5810276679841897, "calib/gap": 0.48271744595274013, "calib/mean_conf": 0.615296442687747, "calib/mu_c": 0.838529411764706, "calib/mu_w": 0.35581196581196584, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1546245059288538, "calib/std_conf": 0.465140136845168, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.40995, "calib/step_q_c_n": 800.0, "calib/step_q_gap": 0.13495000000000001, "calib/step_q_w": 0.27499999999999997, "calib/step_q_w_n": 736.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 467.9765625, "completions/mean_terminated_length": 469.8117980957031, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.176, "grad_norm": 0.033411670476198196, "kl": 0.093292236328125, "learning_rate": 9.722222222222224e-07, "loss": -0.1565, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.036157675087451935, "mask/share_reasoning": 0.8241069316864014, "mask/share_step_conf": 0.1358291506767273, "num_tokens": 38019328.0, "reward": 1.1931571960449219, "reward_std": 0.1731734275817871, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7560117244720459, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8853057622909546, "step": 165 }, { "adv/mean_abs_final_conf": 0.5332635641098022, "adv/mean_abs_reasoning": 0.46829888224601746, "adv/mean_abs_step_conf": 0.752028226852417, "adv/ratio_final_to_reasoning": 1.1387248279393845, "adv/ratio_step_to_reasoning": 1.605872350678267, "adv/std_final_conf": 0.7756200432777405, "adv/std_reasoning": 0.7206501364707947, "adv/std_step_conf": 0.9337971210479736, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8616442085204845, "calib/avg_num_step_conf": 6.8515625, "calib/ece": 0.1613545816733068, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5059760956175299, "calib/gap": 0.6092731727235605, "calib/mean_conf": 0.5751394422310756, "calib/mu_c": 0.800886075949367, "calib/mu_w": 0.19161290322580646, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.05350597609561754, "calib/std_conf": 0.46354974599286264, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.42130822596630324, "calib/step_q_c_n": 1009.0, "calib/step_q_gap": 0.14699950113408844, "calib/step_q_w": 0.2743087248322148, "calib/step_q_w_n": 745.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2288.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 504.9765625, "completions/mean_terminated_length": 504.9765625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.17706666666666668, "grad_norm": 0.05789630860090256, "kl": 0.08490753173828125, "learning_rate": 9.444444444444445e-07, "loss": -0.0184, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03413501754403114, "mask/share_reasoning": 0.8212205171585083, "mask/share_step_conf": 0.14464449882507324, "num_tokens": 38254786.0, "reward": 1.2201125621795654, "reward_std": 0.18350180983543396, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.8095582127571106, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8751322031021118, "step": 166 }, { "adv/mean_abs_final_conf": 0.6072766780853271, "adv/mean_abs_reasoning": 0.45788443088531494, "adv/mean_abs_step_conf": 0.7228624224662781, "adv/ratio_final_to_reasoning": 1.3262662740271902, "adv/ratio_step_to_reasoning": 1.57870059278633, "adv/std_final_conf": 0.833148181438446, "adv/std_reasoning": 0.7394189238548279, "adv/std_step_conf": 0.9349060654640198, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7140316492450639, "calib/avg_num_step_conf": 5.984375, "calib/ece": 0.19868000000000008, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.696, "calib/gap": 0.38988966318234597, "calib/mean_conf": 0.7716400000000001, "calib/mu_c": 0.8995238095238095, "calib/mu_w": 0.5096341463414635, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14916000000000007, "calib/std_conf": 0.3831429372962524, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.43428714859437745, "calib/step_q_c_n": 996.0, "calib/step_q_gap": 0.048335656057064025, "calib/step_q_w": 0.3859514925373134, "calib/step_q_w_n": 536.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2219.0, "completions/max_terminated_length": 2219.0, "completions/mean_length": 453.8203125, "completions/mean_terminated_length": 453.8203125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.17813333333333334, "grad_norm": 0.05314803496003151, "kl": 0.09221649169921875, "learning_rate": 9.166666666666666e-07, "loss": -0.0162, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03694336861371994, "mask/share_reasoning": 0.8237749934196472, "mask/share_step_conf": 0.13928166031837463, "num_tokens": 38476572.0, "reward": 1.1797784566879272, "reward_std": 0.2056841403245926, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.772200345993042, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8410501480102539, "step": 167 }, { "adv/mean_abs_final_conf": 0.5778178572654724, "adv/mean_abs_reasoning": 0.5160121321678162, "adv/mean_abs_step_conf": 0.745305597782135, "adv/ratio_final_to_reasoning": 1.1197757208498267, "adv/ratio_step_to_reasoning": 1.4443567337284011, "adv/std_final_conf": 0.7986822128295898, "adv/std_reasoning": 0.7754129767417908, "adv/std_step_conf": 0.934490442276001, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8028047091412743, "calib/avg_num_step_conf": 7.19921875, "calib/ece": 0.20668016194331984, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5384615384615384, "calib/gap": 0.45569736842105274, "calib/mean_conf": 0.6240080971659919, "calib/mu_c": 0.7992763157894738, "calib/mu_w": 0.34357894736842104, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10765182186234816, "calib/std_conf": 0.44303851462430305, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4125210970464135, "calib/step_q_c_n": 948.0, "calib/step_q_gap": 0.16366076185088277, "calib/step_q_w": 0.24886033519553072, "calib/step_q_w_n": 895.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2641.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 510.34765625, "completions/mean_terminated_length": 518.4484252929688, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.1792, "grad_norm": 0.03475351259112358, "kl": 0.11714935302734375, "learning_rate": 8.88888888888889e-07, "loss": -0.0278, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03463764488697052, "mask/share_reasoning": 0.8162198066711426, "mask/share_step_conf": 0.13351748883724213, "num_tokens": 38711893.0, "reward": 1.1767488718032837, "reward_std": 0.2032414674758911, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.755155086517334, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8582700490951538, "step": 168 }, { "adv/mean_abs_final_conf": 0.6278563737869263, "adv/mean_abs_reasoning": 0.45637246966362, "adv/mean_abs_step_conf": 0.7611607909202576, "adv/ratio_final_to_reasoning": 1.3757542698614194, "adv/ratio_step_to_reasoning": 1.6678499285491277, "adv/std_final_conf": 0.8417154550552368, "adv/std_reasoning": 0.701534628868103, "adv/std_step_conf": 0.9345760345458984, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7518236074270557, "calib/avg_num_step_conf": 5.64453125, "calib/ece": 0.2073092369477912, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5542168674698795, "calib/gap": 0.4210716180371354, "calib/mean_conf": 0.6375100401606426, "calib/mu_c": 0.8133793103448277, "calib/mu_w": 0.3923076923076923, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.13124497991967873, "calib/std_conf": 0.43499183496422156, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.46704738760631836, "calib/step_q_c_n": 823.0, "calib/step_q_gap": 0.16497343262239555, "calib/step_q_w": 0.3020739549839228, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2524.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 458.37890625, "completions/mean_terminated_length": 461.9881896972656, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.18026666666666666, "grad_norm": 0.04523325711488724, "kl": 0.09043121337890625, "learning_rate": 8.611111111111112e-07, "loss": -0.0443, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03630748391151428, "mask/share_reasoning": 0.8291934728622437, "mask/share_step_conf": 0.1266864836215973, "num_tokens": 38933422.0, "reward": 1.1640233993530273, "reward_std": 0.22012245655059814, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7365875244140625, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.856806218624115, "step": 169 }, { "adv/mean_abs_final_conf": 0.6055898666381836, "adv/mean_abs_reasoning": 0.4949433207511902, "adv/mean_abs_step_conf": 0.749203085899353, "adv/ratio_final_to_reasoning": 1.2235539732490215, "adv/ratio_step_to_reasoning": 1.5137149133809205, "adv/std_final_conf": 0.8305554986000061, "adv/std_reasoning": 0.7575010657310486, "adv/std_step_conf": 0.9346402287483215, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8241965369624944, "calib/avg_num_step_conf": 6.16015625, "calib/ece": 0.20337301587301593, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5595238095238095, "calib/gap": 0.47727237876174045, "calib/mean_conf": 0.643531746031746, "calib/mu_c": 0.853758865248227, "calib/mu_w": 0.3764864864864865, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.14369047619047623, "calib/std_conf": 0.44279844543302516, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4413757225433526, "calib/step_q_c_n": 865.0, "calib/step_q_gap": 0.14932516074559982, "calib/step_q_w": 0.2920505617977528, "calib/step_q_w_n": 712.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2439.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 480.2578125, "completions/mean_terminated_length": 482.1412048339844, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.18133333333333335, "grad_norm": 0.05760641396045685, "kl": 0.13443756103515625, "learning_rate": 8.333333333333333e-07, "loss": 0.025, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034499507397413254, "mask/share_reasoning": 0.827689528465271, "mask/share_step_conf": 0.13390469551086426, "num_tokens": 39160520.0, "reward": 1.1697800159454346, "reward_std": 0.209752157330513, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7549155950546265, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8543462753295898, "step": 170 }, { "adv/mean_abs_final_conf": 0.6666871309280396, "adv/mean_abs_reasoning": 0.45934537053108215, "adv/mean_abs_step_conf": 0.7705562114715576, "adv/ratio_final_to_reasoning": 1.4513853272478499, "adv/ratio_step_to_reasoning": 1.67750947523573, "adv/std_final_conf": 0.8589778542518616, "adv/std_reasoning": 0.701428234577179, "adv/std_step_conf": 0.9345902800559998, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7058211382113821, "calib/avg_num_step_conf": 6.2265625, "calib/ece": 0.3018145161290323, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5040322580645161, "calib/gap": 0.3157652032520326, "calib/mean_conf": 0.5811693548387098, "calib/mu_c": 0.7403252032520325, "calib/mu_w": 0.42455999999999994, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19350806451612912, "calib/std_conf": 0.45758641386917104, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4311796246648793, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.13754754919318118, "calib/step_q_w": 0.2936320754716981, "calib/step_q_w_n": 848.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 469.86328125, "completions/mean_terminated_length": 473.56298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.1824, "grad_norm": 0.04498155787587166, "kl": 0.0904083251953125, "learning_rate": 8.055555555555557e-07, "loss": -0.0194, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.036814551800489426, "mask/share_reasoning": 0.820866584777832, "mask/share_step_conf": 0.13450628519058228, "num_tokens": 39387701.0, "reward": 1.1181427240371704, "reward_std": 0.21004648506641388, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6696425676345825, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8511993288993835, "step": 171 }, { "adv/mean_abs_final_conf": 0.6099177002906799, "adv/mean_abs_reasoning": 0.5050039291381836, "adv/mean_abs_step_conf": 0.7756307721138, "adv/ratio_final_to_reasoning": 1.2077484255053963, "adv/ratio_step_to_reasoning": 1.5358905690841964, "adv/std_final_conf": 0.8136227130889893, "adv/std_reasoning": 0.7576004862785339, "adv/std_step_conf": 0.9339342713356018, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7734019886363637, "calib/avg_num_step_conf": 6.08984375, "calib/ece": 0.21342741935483872, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6451612903225806, "calib/gap": 0.29330681818181825, "calib/mean_conf": 0.7747983870967742, "calib/mu_c": 0.8788750000000001, "calib/mu_w": 0.5855681818181818, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17153225806451616, "calib/std_conf": 0.3564655681657362, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44785467128027684, "calib/step_q_c_n": 867.0, "calib/step_q_gap": 0.11626507590455432, "calib/step_q_w": 0.3315895953757225, "calib/step_q_w_n": 692.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2520.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 446.390625, "completions/mean_terminated_length": 449.905517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.18346666666666667, "grad_norm": 0.04795660451054573, "kl": 0.08995819091796875, "learning_rate": 7.777777777777779e-07, "loss": -0.0394, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03795355185866356, "mask/share_reasoning": 0.8114452362060547, "mask/share_step_conf": 0.14278870820999146, "num_tokens": 39605329.0, "reward": 1.1778701543807983, "reward_std": 0.20241671800613403, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7376941442489624, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8661972880363464, "step": 172 }, { "adv/mean_abs_final_conf": 0.622528612613678, "adv/mean_abs_reasoning": 0.5239760279655457, "adv/mean_abs_step_conf": 0.7593715190887451, "adv/ratio_final_to_reasoning": 1.188086056209069, "adv/ratio_step_to_reasoning": 1.4492485887897872, "adv/std_final_conf": 0.8453269004821777, "adv/std_reasoning": 0.7753785848617554, "adv/std_step_conf": 0.9348943829536438, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7443, "calib/avg_num_step_conf": 5.90625, "calib/ece": 0.26796000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7, "calib/gap": 0.28856666666666675, "calib/mean_conf": 0.78724, "calib/mu_c": 0.9026666666666667, "calib/mu_w": 0.6141, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.22760000000000005, "calib/std_conf": 0.36290381976496194, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4717543859649123, "calib/step_q_c_n": 912.0, "calib/step_q_gap": 0.08097105263157905, "calib/step_q_w": 0.39078333333333326, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2472.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 491.66015625, "completions/mean_terminated_length": 491.66015625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.18453333333333333, "grad_norm": 0.03766198456287384, "kl": 0.08562469482421875, "learning_rate": 7.5e-07, "loss": -0.0357, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03844533860683441, "mask/share_reasoning": 0.8244307041168213, "mask/share_step_conf": 0.1371239721775055, "num_tokens": 39834354.0, "reward": 1.1404576301574707, "reward_std": 0.22183360159397125, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7107362747192383, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.838973343372345, "step": 173 }, { "adv/mean_abs_final_conf": 0.657545804977417, "adv/mean_abs_reasoning": 0.57723069190979, "adv/mean_abs_step_conf": 0.7305585741996765, "adv/ratio_final_to_reasoning": 1.1391386740055371, "adv/ratio_step_to_reasoning": 1.2656266973306551, "adv/std_final_conf": 0.844294011592865, "adv/std_reasoning": 0.7929427623748779, "adv/std_step_conf": 0.9203072190284729, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6821343402225755, "calib/avg_num_step_conf": 6.1015625, "calib/ece": 0.31064777327935217, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5384615384615384, "calib/gap": 0.30688857975622674, "calib/mean_conf": 0.6048987854251012, "calib/mu_c": 0.7738738738738739, "calib/mu_w": 0.4669852941176471, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.23307692307692301, "calib/std_conf": 0.4544435244132277, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.40302702702702703, "calib/step_q_c_n": 740.0, "calib/step_q_gap": 0.04370829223383965, "calib/step_q_w": 0.3593187347931874, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 520.83984375, "completions/mean_terminated_length": 520.83984375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.1856, "grad_norm": 0.03625209629535675, "kl": 0.0858917236328125, "learning_rate": 7.222222222222222e-07, "loss": -0.0454, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03234461694955826, "mask/share_reasoning": 0.838646650314331, "mask/share_step_conf": 0.1290086805820465, "num_tokens": 40071921.0, "reward": 1.0493491888046265, "reward_std": 0.23219379782676697, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.6305624842643738, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7949029803276062, "step": 174 }, { "adv/mean_abs_final_conf": 0.727577805519104, "adv/mean_abs_reasoning": 0.4823678731918335, "adv/mean_abs_step_conf": 0.7414121627807617, "adv/ratio_final_to_reasoning": 1.5083463181424452, "adv/ratio_step_to_reasoning": 1.5370264148705206, "adv/std_final_conf": 0.8904538154602051, "adv/std_reasoning": 0.7394604682922363, "adv/std_step_conf": 0.9354656338691711, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7771801250588315, "calib/avg_num_step_conf": 6.73046875, "calib/ece": 0.24191056910569106, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4186991869918699, "calib/gap": 0.4366859409668526, "calib/mean_conf": 0.5297967479674797, "calib/mu_c": 0.7765420560747663, "calib/mu_w": 0.33985611510791375, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.16837398373983742, "calib/std_conf": 0.4564305209746925, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4754432624113476, "calib/step_q_c_n": 564.0, "calib/step_q_gap": 0.20732419424913878, "calib/step_q_w": 0.2681190681622088, "calib/step_q_w_n": 1159.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2853.0, "completions/max_terminated_length": 2853.0, "completions/mean_length": 517.40625, "completions/mean_terminated_length": 523.54150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.18666666666666668, "grad_norm": 0.05229157581925392, "kl": 0.08309173583984375, "learning_rate": 6.944444444444446e-07, "loss": -0.1045, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03500085324048996, "mask/share_reasoning": 0.8173953294754028, "mask/share_step_conf": 0.1358851194381714, "num_tokens": 40310201.0, "reward": 1.1124995946884155, "reward_std": 0.2691600024700165, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.7121738195419312, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.826258659362793, "step": 175 }, { "adv/mean_abs_final_conf": 0.6193594932556152, "adv/mean_abs_reasoning": 0.5067633390426636, "adv/mean_abs_step_conf": 0.7527080178260803, "adv/ratio_final_to_reasoning": 1.2221868583186368, "adv/ratio_step_to_reasoning": 1.4853245288975236, "adv/std_final_conf": 0.8128105998039246, "adv/std_reasoning": 0.7575881481170654, "adv/std_step_conf": 0.9347853064537048, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8053527980535279, "calib/avg_num_step_conf": 6.37109375, "calib/ece": 0.21798387096774188, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5766129032258065, "calib/gap": 0.4824081015321892, "calib/mean_conf": 0.6607258064516128, "calib/mu_c": 0.8766423357664235, "calib/mu_w": 0.39423423423423426, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.16314516129032253, "calib/std_conf": 0.4443692038542129, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4625, "calib/step_q_c_n": 820.0, "calib/step_q_gap": 0.1858415536374846, "calib/step_q_w": 0.2766584463625154, "calib/step_q_w_n": 811.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 475.4765625, "completions/mean_terminated_length": 477.3412170410156, "completions/min_length": 0.0, "completions/min_terminated_length": 84.0, "epoch": 0.18773333333333334, "grad_norm": 0.04875698313117027, "kl": 0.08687591552734375, "learning_rate": 6.666666666666667e-07, "loss": -0.054, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03778859227895737, "mask/share_reasoning": 0.8117580413818359, "mask/share_step_conf": 0.14654704928398132, "num_tokens": 40535987.0, "reward": 1.163760781288147, "reward_std": 0.2515183985233307, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7497754096984863, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8528724312782288, "step": 176 }, { "adv/mean_abs_final_conf": 0.5847810506820679, "adv/mean_abs_reasoning": 0.4223461151123047, "adv/mean_abs_step_conf": 0.7501903772354126, "adv/ratio_final_to_reasoning": 1.38460146727423, "adv/ratio_step_to_reasoning": 1.7762454782753996, "adv/std_final_conf": 0.788453996181488, "adv/std_reasoning": 0.7013620138168335, "adv/std_step_conf": 0.9340070486068726, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7479757085020242, "calib/avg_num_step_conf": 6.36328125, "calib/ece": 0.23091633466135472, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.49800796812749004, "calib/gap": 0.4112827260458839, "calib/mean_conf": 0.5870916334661355, "calib/mu_c": 0.7427564102564103, "calib/mu_w": 0.33147368421052636, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09824701195219138, "calib/std_conf": 0.45215102055416, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4445251396648045, "calib/step_q_c_n": 895.0, "calib/step_q_gap": 0.16945701977379635, "calib/step_q_w": 0.27506811989100816, "calib/step_q_w_n": 734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2808.0, "completions/max_terminated_length": 2808.0, "completions/mean_length": 506.98828125, "completions/mean_terminated_length": 506.98828125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.1888, "grad_norm": 0.048805780708789825, "kl": 0.08119964599609375, "learning_rate": 6.388888888888889e-07, "loss": 0.1032, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034971535205841064, "mask/share_reasoning": 0.8307249546051025, "mask/share_step_conf": 0.1343035101890564, "num_tokens": 40769608.0, "reward": 1.1738266944885254, "reward_std": 0.19798734784126282, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7344551086425781, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.864007294178009, "step": 177 }, { "adv/mean_abs_final_conf": 0.6846010684967041, "adv/mean_abs_reasoning": 0.5379370450973511, "adv/mean_abs_step_conf": 0.7198708057403564, "adv/ratio_final_to_reasoning": 1.2726416125009778, "adv/ratio_step_to_reasoning": 1.3382064170912056, "adv/std_final_conf": 0.8916929364204407, "adv/std_reasoning": 0.7754216194152832, "adv/std_step_conf": 0.9352347254753113, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8307109557109558, "calib/avg_num_step_conf": 6.1796875, "calib/ece": 0.1717928286852589, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5418326693227091, "calib/gap": 0.5239355089355089, "calib/mean_conf": 0.6412749003984064, "calib/mu_c": 0.8667132867132866, "calib/mu_w": 0.34277777777777774, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.12167330677290833, "calib/std_conf": 0.43307660608780046, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4629019607843137, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.1892422300621595, "calib/step_q_w": 0.2736597307221542, "calib/step_q_w_n": 817.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2627.0, "completions/max_terminated_length": 2627.0, "completions/mean_length": 456.59765625, "completions/mean_terminated_length": 458.3882751464844, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.18986666666666666, "grad_norm": 0.044407956302165985, "kl": 0.09165191650390625, "learning_rate": 6.111111111111112e-07, "loss": 0.0158, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.038511939346790314, "mask/share_reasoning": 0.8155031204223633, "mask/share_step_conf": 0.142078697681427, "num_tokens": 40992569.0, "reward": 1.1956806182861328, "reward_std": 0.2603704333305359, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.791344165802002, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8625114560127258, "step": 178 }, { "adv/mean_abs_final_conf": 0.6494489312171936, "adv/mean_abs_reasoning": 0.5641993284225464, "adv/mean_abs_step_conf": 0.7575774192810059, "adv/ratio_final_to_reasoning": 1.15109837693887, "adv/ratio_step_to_reasoning": 1.3427478217656308, "adv/std_final_conf": 0.8725244402885437, "adv/std_reasoning": 0.8097829818725586, "adv/std_step_conf": 0.9347437620162964, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7800624501727346, "calib/avg_num_step_conf": 5.79296875, "calib/ece": 0.22362903225806446, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6048387096774194, "calib/gap": 0.4450199309061919, "calib/mean_conf": 0.6555645161290322, "calib/mu_c": 0.845774647887324, "calib/mu_w": 0.40075471698113213, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.15330645161290318, "calib/std_conf": 0.4468416086903993, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4719305019305019, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.14313446793616763, "calib/step_q_w": 0.32879603399433427, "calib/step_q_w_n": 706.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 466.59375, "completions/mean_terminated_length": 466.59375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.19093333333333334, "grad_norm": 0.047058381140232086, "kl": 0.08917236328125, "learning_rate": 5.833333333333334e-07, "loss": -0.0442, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03583589196205139, "mask/share_reasoning": 0.8318737745285034, "mask/share_step_conf": 0.13229036331176758, "num_tokens": 41218281.0, "reward": 1.1654024124145508, "reward_std": 0.24288588762283325, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7387410402297974, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.858771562576294, "step": 179 }, { "adv/mean_abs_final_conf": 0.5855597853660583, "adv/mean_abs_reasoning": 0.36934980750083923, "adv/mean_abs_step_conf": 0.7651547193527222, "adv/ratio_final_to_reasoning": 1.5853799662931403, "adv/ratio_step_to_reasoning": 2.0716261490158856, "adv/std_final_conf": 0.7945824861526489, "adv/std_reasoning": 0.640407383441925, "adv/std_step_conf": 0.934300422668457, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7083667200854701, "calib/avg_num_step_conf": 6.68359375, "calib/ece": 0.2598412698412698, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5912698412698413, "calib/gap": 0.3162339743589744, "calib/mean_conf": 0.6772222222222222, "calib/mu_c": 0.7976923076923077, "calib/mu_w": 0.4814583333333333, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.15900793650793651, "calib/std_conf": 0.4247971297112714, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4480020181634713, "calib/step_q_c_n": 991.0, "calib/step_q_gap": 0.12418257371902686, "calib/step_q_w": 0.32381944444444444, "calib/step_q_w_n": 720.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2554.0, "completions/max_terminated_length": 2554.0, "completions/mean_length": 540.7421875, "completions/mean_terminated_length": 540.7421875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.192, "grad_norm": 0.04770103842020035, "kl": 0.08750152587890625, "learning_rate": 5.555555555555555e-07, "loss": 0.0112, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03379952535033226, "mask/share_reasoning": 0.8321336507797241, "mask/share_step_conf": 0.13406683504581451, "num_tokens": 41460567.0, "reward": 1.1437525749206543, "reward_std": 0.20699778199195862, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.713333249092102, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8385103940963745, "step": 180 }, { "adv/mean_abs_final_conf": 0.6123881340026855, "adv/mean_abs_reasoning": 0.47117334604263306, "adv/mean_abs_step_conf": 0.7513047456741333, "adv/ratio_final_to_reasoning": 1.2997087784063128, "adv/ratio_step_to_reasoning": 1.5945399967640639, "adv/std_final_conf": 0.8289145231246948, "adv/std_reasoning": 0.7391869425773621, "adv/std_step_conf": 0.9340299963951111, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8090897817460319, "calib/avg_num_step_conf": 5.796875, "calib/ece": 0.2392125984251967, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5236220472440944, "calib/gap": 0.4368229166666666, "calib/mean_conf": 0.6165354330708661, "calib/mu_c": 0.8366666666666667, "calib/mu_w": 0.39984375000000005, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17984251968503923, "calib/std_conf": 0.446855818760644, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5040673211781206, "calib/step_q_c_n": 713.0, "calib/step_q_gap": 0.20187536268266015, "calib/step_q_w": 0.30219195849546043, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2761.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 429.00390625, "completions/mean_terminated_length": 430.6863098144531, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.19306666666666666, "grad_norm": 0.05041556805372238, "kl": 0.1006317138671875, "learning_rate": 5.277777777777779e-07, "loss": -0.0221, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03834264352917671, "mask/share_reasoning": 0.8177053332328796, "mask/share_step_conf": 0.14004576206207275, "num_tokens": 41676656.0, "reward": 1.1766166687011719, "reward_std": 0.21318820118904114, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.740514874458313, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8782706260681152, "step": 181 }, { "adv/mean_abs_final_conf": 0.5687905550003052, "adv/mean_abs_reasoning": 0.3563820719718933, "adv/mean_abs_step_conf": 0.7398570775985718, "adv/ratio_final_to_reasoning": 1.5960133792733655, "adv/ratio_step_to_reasoning": 2.076022156515555, "adv/std_final_conf": 0.7982797622680664, "adv/std_reasoning": 0.6815344095230103, "adv/std_step_conf": 0.9340426325798035, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8294670846394984, "calib/avg_num_step_conf": 6.7890625, "calib/ece": 0.17182539682539683, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6428571428571429, "calib/gap": 0.4777220480668757, "calib/mean_conf": 0.719920634920635, "calib/mu_c": 0.8848484848484849, "calib/mu_w": 0.4071264367816092, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11849206349206347, "calib/std_conf": 0.41007064051102304, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4741460905349794, "calib/step_q_c_n": 972.0, "calib/step_q_gap": 0.17635235685351724, "calib/step_q_w": 0.29779373368146217, "calib/step_q_w_n": 766.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 477.92578125, "completions/mean_terminated_length": 479.8000183105469, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.19413333333333332, "grad_norm": 0.05087687447667122, "kl": 0.08599090576171875, "learning_rate": 5.000000000000001e-07, "loss": -0.0218, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035774778574705124, "mask/share_reasoning": 0.8111280202865601, "mask/share_step_conf": 0.14919093251228333, "num_tokens": 41905165.0, "reward": 1.2298622131347656, "reward_std": 0.19393043220043182, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.8012698888778687, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8889696002006531, "step": 182 }, { "adv/mean_abs_final_conf": 0.6119903922080994, "adv/mean_abs_reasoning": 0.5624978542327881, "adv/mean_abs_step_conf": 0.7621275186538696, "adv/ratio_final_to_reasoning": 1.087987069822366, "adv/ratio_step_to_reasoning": 1.354898535023505, "adv/std_final_conf": 0.8451369404792786, "adv/std_reasoning": 0.8098740577697754, "adv/std_step_conf": 0.9352811574935913, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6952368658835678, "calib/avg_num_step_conf": 6.1875, "calib/ece": 0.335582329317269, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5542168674698795, "calib/gap": 0.2515296243707241, "calib/mean_conf": 0.6394377510040161, "calib/mu_c": 0.7626771653543307, "calib/mu_w": 0.5111475409836066, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.23248995983935736, "calib/std_conf": 0.4430531225691059, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.46950437317784255, "calib/step_q_c_n": 686.0, "calib/step_q_gap": 0.12947096560545945, "calib/step_q_w": 0.3400334075723831, "calib/step_q_w_n": 898.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1922.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 480.74609375, "completions/mean_terminated_length": 484.531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 4.0, "epoch": 0.1952, "grad_norm": 0.034335847944021225, "kl": 0.1324462890625, "learning_rate": 4.7222222222222226e-07, "loss": -0.1044, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03612757846713066, "mask/share_reasoning": 0.8249562978744507, "mask/share_step_conf": 0.13110360503196716, "num_tokens": 42134916.0, "reward": 1.0904525518417358, "reward_std": 0.23203977942466736, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6375433206558228, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8341160416603088, "step": 183 }, { "adv/mean_abs_final_conf": 0.5991448163986206, "adv/mean_abs_reasoning": 0.4674597978591919, "adv/mean_abs_step_conf": 0.7482036352157593, "adv/ratio_final_to_reasoning": 1.2817034088118415, "adv/ratio_step_to_reasoning": 1.600573222857408, "adv/std_final_conf": 0.8104008436203003, "adv/std_reasoning": 0.7207074761390686, "adv/std_step_conf": 0.9341971278190613, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7524910767400357, "calib/avg_num_step_conf": 7.1953125, "calib/ece": 0.2244715447154471, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6747967479674797, "calib/gap": 0.38737804878048776, "calib/mean_conf": 0.7295934959349593, "calib/mu_c": 0.8587195121951219, "calib/mu_w": 0.4713414634146341, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1436991869918699, "calib/std_conf": 0.41417936959921076, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46252849740932644, "calib/step_q_c_n": 965.0, "calib/step_q_gap": 0.1756824312747769, "calib/step_q_w": 0.28684606613454955, "calib/step_q_w_n": 877.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 491.4765625, "completions/mean_terminated_length": 499.2778015136719, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.19626666666666667, "grad_norm": 0.05411386862397194, "kl": 0.22634124755859375, "learning_rate": 4.444444444444445e-07, "loss": 0.0824, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03469008952379227, "mask/share_reasoning": 0.8151426911354065, "mask/share_step_conf": 0.13454222679138184, "num_tokens": 42366014.0, "reward": 1.163267731666565, "reward_std": 0.20311492681503296, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7441898584365845, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8413553237915039, "step": 184 }, { "adv/mean_abs_final_conf": 0.5832977294921875, "adv/mean_abs_reasoning": 0.41579145193099976, "adv/mean_abs_step_conf": 0.7531085014343262, "adv/ratio_final_to_reasoning": 1.4028612824608653, "adv/ratio_step_to_reasoning": 1.8112649933921776, "adv/std_final_conf": 0.7961451411247253, "adv/std_reasoning": 0.6816564798355103, "adv/std_step_conf": 0.9346182346343994, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.844640434192673, "calib/avg_num_step_conf": 6.64453125, "calib/ece": 0.2341803278688525, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6885245901639344, "calib/gap": 0.47905834464043406, "calib/mean_conf": 0.7341803278688525, "calib/mu_c": 0.9501492537313432, "calib/mu_w": 0.4710909090909091, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20959016393442625, "calib/std_conf": 0.41856378201297734, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49303072625698324, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.20566016788134872, "calib/step_q_w": 0.2873705583756345, "calib/step_q_w_n": 985.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3014.0, "completions/max_terminated_length": 3014.0, "completions/mean_length": 494.7421875, "completions/mean_terminated_length": 502.59527587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.19733333333333333, "grad_norm": 0.04859265312552452, "kl": 0.07753753662109375, "learning_rate": 4.1666666666666667e-07, "loss": -0.0234, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03509043902158737, "mask/share_reasoning": 0.8189756274223328, "mask/share_step_conf": 0.13030895590782166, "num_tokens": 42599588.0, "reward": 1.1451895236968994, "reward_std": 0.2622781991958618, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7397312521934509, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8374109268188477, "step": 185 }, { "adv/mean_abs_final_conf": 0.5210508108139038, "adv/mean_abs_reasoning": 0.4355603754520416, "adv/mean_abs_step_conf": 0.7629748582839966, "adv/ratio_final_to_reasoning": 1.1962768887622912, "adv/ratio_step_to_reasoning": 1.751708606395041, "adv/std_final_conf": 0.7428054809570312, "adv/std_reasoning": 0.7013394832611084, "adv/std_step_conf": 0.9342164993286133, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7337121712699881, "calib/avg_num_step_conf": 6.21484375, "calib/ece": 0.2501568627450981, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5843137254901961, "calib/gap": 0.38337319941852777, "calib/mean_conf": 0.6436470588235295, "calib/mu_c": 0.7849689440993789, "calib/mu_w": 0.40159574468085113, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1312156862745098, "calib/std_conf": 0.4509776231799052, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4645993413830955, "calib/step_q_c_n": 911.0, "calib/step_q_gap": 0.15331992961838958, "calib/step_q_w": 0.3112794117647059, "calib/step_q_w_n": 680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2294.0, "completions/max_terminated_length": 2294.0, "completions/mean_length": 479.3515625, "completions/mean_terminated_length": 479.3515625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1984, "grad_norm": 0.042646367102861404, "kl": 0.0824432373046875, "learning_rate": 3.8888888888888895e-07, "loss": 0.0169, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03666163980960846, "mask/share_reasoning": 0.82917320728302, "mask/share_step_conf": 0.13416513800621033, "num_tokens": 42827342.0, "reward": 1.193156123161316, "reward_std": 0.17940464615821838, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7355316281318665, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8843746185302734, "step": 186 }, { "adv/mean_abs_final_conf": 0.6884365081787109, "adv/mean_abs_reasoning": 0.6354086399078369, "adv/mean_abs_step_conf": 0.7570756673812866, "adv/ratio_final_to_reasoning": 1.0834547485513661, "adv/ratio_step_to_reasoning": 1.1914783964711229, "adv/std_final_conf": 0.8458472490310669, "adv/std_reasoning": 0.8267138600349426, "adv/std_step_conf": 0.9348864555358887, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7563992611162422, "calib/avg_num_step_conf": 7.09375, "calib/ece": 0.25024096385542177, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6305220883534136, "calib/gap": 0.35983375115450594, "calib/mean_conf": 0.6989156626506025, "calib/mu_c": 0.8520979020979021, "calib/mu_w": 0.4922641509433962, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.18742971887550208, "calib/std_conf": 0.41745455096831713, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4724045801526718, "calib/step_q_c_n": 786.0, "calib/step_q_gap": 0.17693856073519604, "calib/step_q_w": 0.29546601941747574, "calib/step_q_w_n": 1030.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2754.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 535.2734375, "completions/mean_terminated_length": 537.37255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.19946666666666665, "grad_norm": 0.03536956384778023, "kl": 0.07340240478515625, "learning_rate": 3.611111111111111e-07, "loss": 0.0515, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03275679051876068, "mask/share_reasoning": 0.8340227603912354, "mask/share_step_conf": 0.12931418418884277, "num_tokens": 43065916.0, "reward": 1.146524429321289, "reward_std": 0.2738466262817383, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7100933790206909, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8526995182037354, "step": 187 }, { "adv/mean_abs_final_conf": 0.5937135815620422, "adv/mean_abs_reasoning": 0.4579869210720062, "adv/mean_abs_step_conf": 0.723059892654419, "adv/ratio_final_to_reasoning": 1.296354883175139, "adv/ratio_step_to_reasoning": 1.578778474638443, "adv/std_final_conf": 0.8131174445152283, "adv/std_reasoning": 0.7394041419029236, "adv/std_step_conf": 0.9349931478500366, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8136234961075726, "calib/avg_num_step_conf": 6.890625, "calib/ece": 0.21846153846153854, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6275303643724697, "calib/gap": 0.42784713375796185, "calib/mean_conf": 0.6959514170040485, "calib/mu_c": 0.8518471337579618, "calib/mu_w": 0.424, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1393927125506074, "calib/std_conf": 0.42652418481345883, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4834907597535934, "calib/step_q_c_n": 974.0, "calib/step_q_gap": 0.18722493696878328, "calib/step_q_w": 0.2962658227848101, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 546.60546875, "completions/mean_terminated_length": 550.909423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.20053333333333334, "grad_norm": 0.04171891510486603, "kl": 0.0738525390625, "learning_rate": 3.3333333333333335e-07, "loss": -0.0148, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033632516860961914, "mask/share_reasoning": 0.8178136944770813, "mask/share_step_conf": 0.14074131846427917, "num_tokens": 43309919.0, "reward": 1.171053171157837, "reward_std": 0.25949686765670776, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7496523261070251, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8517402410507202, "step": 188 }, { "adv/mean_abs_final_conf": 0.5931285619735718, "adv/mean_abs_reasoning": 0.42756614089012146, "adv/mean_abs_step_conf": 0.7624196410179138, "adv/ratio_final_to_reasoning": 1.3872206081117109, "adv/ratio_step_to_reasoning": 1.7831618739282844, "adv/std_final_conf": 0.8115628957748413, "adv/std_reasoning": 0.7205988168716431, "adv/std_step_conf": 0.9347566962242126, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7398329933707293, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.2790944881889764, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.547244094488189, "calib/gap": 0.3474655787863335, "calib/mean_conf": 0.6197244094488189, "calib/mu_c": 0.7647297297297297, "calib/mu_w": 0.41726415094339625, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.15807086614173235, "calib/std_conf": 0.45681661540802154, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.45375615763546795, "calib/step_q_c_n": 812.0, "calib/step_q_gap": 0.1683101482457966, "calib/step_q_w": 0.28544600938967135, "calib/step_q_w_n": 639.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1661.0, "completions/max_terminated_length": 1661.0, "completions/mean_length": 459.51953125, "completions/mean_terminated_length": 459.51953125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.2016, "grad_norm": 0.046347636729478836, "kl": 0.0937652587890625, "learning_rate": 3.055555555555556e-07, "loss": -0.0272, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.036958396434783936, "mask/share_reasoning": 0.8306939601898193, "mask/share_step_conf": 0.13234764337539673, "num_tokens": 43535324.0, "reward": 1.1333717107772827, "reward_std": 0.22686219215393066, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6966671347618103, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8404673933982849, "step": 189 }, { "adv/mean_abs_final_conf": 0.610427975654602, "adv/mean_abs_reasoning": 0.4907395839691162, "adv/mean_abs_step_conf": 0.7486152648925781, "adv/ratio_final_to_reasoning": 1.2438939013589296, "adv/ratio_step_to_reasoning": 1.5254837582853125, "adv/std_final_conf": 0.8305718898773193, "adv/std_reasoning": 0.7575461864471436, "adv/std_step_conf": 0.9347136616706848, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.793721386527142, "calib/avg_num_step_conf": 6.2421875, "calib/ece": 0.23028112449799204, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5783132530120482, "calib/gap": 0.4535696533682146, "calib/mean_conf": 0.6357429718875502, "calib/mu_c": 0.8361151079136692, "calib/mu_w": 0.3825454545454546, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15389558232931733, "calib/std_conf": 0.4508371844338888, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4700248756218905, "calib/step_q_c_n": 804.0, "calib/step_q_gap": 0.1532364625236537, "calib/step_q_w": 0.3167884130982368, "calib/step_q_w_n": 794.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 544.6953125, "completions/mean_terminated_length": 544.6953125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.20266666666666666, "grad_norm": 0.05424446985125542, "kl": 0.07772064208984375, "learning_rate": 2.7777777777777776e-07, "loss": 0.0097, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03243604302406311, "mask/share_reasoning": 0.8429285287857056, "mask/share_step_conf": 0.12463542073965073, "num_tokens": 43780374.0, "reward": 1.1627497673034668, "reward_std": 0.25788241624832153, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7429359555244446, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8534798622131348, "step": 190 }, { "adv/mean_abs_final_conf": 0.6643708348274231, "adv/mean_abs_reasoning": 0.514706015586853, "adv/mean_abs_step_conf": 0.7687493562698364, "adv/ratio_final_to_reasoning": 1.2907772878269677, "adv/ratio_step_to_reasoning": 1.4935697912784844, "adv/std_final_conf": 0.8442904949188232, "adv/std_reasoning": 0.7753342986106873, "adv/std_step_conf": 0.9353137016296387, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.727679176263379, "calib/avg_num_step_conf": 6.828125, "calib/ece": 0.2772427983539095, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6460905349794238, "calib/gap": 0.36340739737162975, "calib/mean_conf": 0.7232510288065843, "calib/mu_c": 0.905702479338843, "calib/mu_w": 0.5422950819672132, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.25127572016460914, "calib/std_conf": 0.4076214842634009, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5024079320113314, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 0.1735883542762066, "calib/step_q_w": 0.32881957773512477, "calib/step_q_w_n": 1042.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 492.6484375, "completions/mean_terminated_length": 498.4901428222656, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.20373333333333332, "grad_norm": 0.037491872906684875, "kl": 0.10839080810546875, "learning_rate": 2.5000000000000004e-07, "loss": 0.0836, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03837917000055313, "mask/share_reasoning": 0.8010759353637695, "mask/share_step_conf": 0.14882618188858032, "num_tokens": 44010660.0, "reward": 1.0963408946990967, "reward_std": 0.24359621107578278, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.6784878969192505, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8198792934417725, "step": 191 }, { "adv/mean_abs_final_conf": 0.6128907203674316, "adv/mean_abs_reasoning": 0.4897460341453552, "adv/mean_abs_step_conf": 0.7631070613861084, "adv/ratio_final_to_reasoning": 1.2514460100467653, "adv/ratio_step_to_reasoning": 1.5581689450896512, "adv/std_final_conf": 0.8317660689353943, "adv/std_reasoning": 0.7574921250343323, "adv/std_step_conf": 0.9336950778961182, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8238347042694869, "calib/avg_num_step_conf": 6.01953125, "calib/ece": 0.19076305220883533, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5783132530120482, "calib/gap": 0.5040423031727381, "calib/mean_conf": 0.6564658634538152, "calib/mu_c": 0.8811594202898553, "calib/mu_w": 0.37711711711711715, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.14650602409638552, "calib/std_conf": 0.43538419519899174, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4674747474747475, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.15396339901012795, "calib/step_q_w": 0.31351134846461953, "calib/step_q_w_n": 749.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 487.4140625, "completions/mean_terminated_length": 487.4140625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.2048, "grad_norm": 0.047550417482852936, "kl": 0.0874786376953125, "learning_rate": 2.2222222222222224e-07, "loss": -0.0223, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.038757845759391785, "mask/share_reasoning": 0.8241227865219116, "mask/share_step_conf": 0.137119323015213, "num_tokens": 44240414.0, "reward": 1.1892247200012207, "reward_std": 0.200741708278656, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7761476039886475, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.867159366607666, "step": 192 }, { "adv/mean_abs_final_conf": 0.6267382502555847, "adv/mean_abs_reasoning": 0.558899462223053, "adv/mean_abs_step_conf": 0.7465231418609619, "adv/ratio_final_to_reasoning": 1.1213792329709877, "adv/ratio_step_to_reasoning": 1.3357020221340445, "adv/std_final_conf": 0.8460562229156494, "adv/std_reasoning": 0.8097629547119141, "adv/std_step_conf": 0.9347980618476868, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8055693376333416, "calib/avg_num_step_conf": 6.22265625, "calib/ece": 0.22145098039215677, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5058823529411764, "calib/gap": 0.44473517737534113, "calib/mean_conf": 0.5933725490196079, "calib/mu_c": 0.7956834532374102, "calib/mu_w": 0.35094827586206906, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1348627450980391, "calib/std_conf": 0.4485283107240316, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4387948717948718, "calib/step_q_c_n": 780.0, "calib/step_q_gap": 0.11264481029425683, "calib/step_q_w": 0.326150061500615, "calib/step_q_w_n": 813.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2860.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 473.18359375, "completions/mean_terminated_length": 473.18359375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.20586666666666667, "grad_norm": 0.04571477696299553, "kl": 0.08908843994140625, "learning_rate": 1.9444444444444447e-07, "loss": 0.0191, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03606698662042618, "mask/share_reasoning": 0.8280755281448364, "mask/share_step_conf": 0.13585752248764038, "num_tokens": 44467261.0, "reward": 1.199505090713501, "reward_std": 0.19832319021224976, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7623148560523987, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8864426016807556, "step": 193 }, { "adv/mean_abs_final_conf": 0.6319118142127991, "adv/mean_abs_reasoning": 0.549137532711029, "adv/mean_abs_step_conf": 0.7619778513908386, "adv/ratio_final_to_reasoning": 1.1507350646624406, "adv/ratio_step_to_reasoning": 1.3875901864311502, "adv/std_final_conf": 0.8473681211471558, "adv/std_reasoning": 0.8098301887512207, "adv/std_step_conf": 0.9348081350326538, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7936793422404933, "calib/avg_num_step_conf": 5.85546875, "calib/ece": 0.24752948207171316, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6095617529880478, "calib/gap": 0.41372245632065763, "calib/mean_conf": 0.6714346613545817, "calib/mu_c": 0.8560438848920863, "calib/mu_w": 0.44232142857142864, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18258964143426298, "calib/std_conf": 0.4427708453228523, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48518471337579616, "calib/step_q_c_n": 785.0, "calib/step_q_gap": 0.1487841531517065, "calib/step_q_w": 0.33640056022408965, "calib/step_q_w_n": 714.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 454.81640625, "completions/mean_terminated_length": 456.60003662109375, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.20693333333333333, "grad_norm": 0.05285657197237015, "kl": 0.087890625, "learning_rate": 1.6666666666666668e-07, "loss": 0.0059, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.037625670433044434, "mask/share_reasoning": 0.822894275188446, "mask/share_step_conf": 0.13557374477386475, "num_tokens": 44689638.0, "reward": 1.1636325120925903, "reward_std": 0.24643751978874207, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.728967547416687, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8629273772239685, "step": 194 }, { "adv/mean_abs_final_conf": 0.6661218404769897, "adv/mean_abs_reasoning": 0.4485500156879425, "adv/mean_abs_step_conf": 0.72472083568573, "adv/ratio_final_to_reasoning": 1.4850558849169957, "adv/ratio_step_to_reasoning": 1.615696823851903, "adv/std_final_conf": 0.8545449376106262, "adv/std_reasoning": 0.7206267714500427, "adv/std_step_conf": 0.9352039694786072, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7691960252935863, "calib/avg_num_step_conf": 6.16015625, "calib/ece": 0.19893442622950816, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5860655737704918, "calib/gap": 0.470840108401084, "calib/mean_conf": 0.675655737704918, "calib/mu_c": 0.8338888888888889, "calib/mu_w": 0.3630487804878049, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10532786885245898, "calib/std_conf": 0.43194183675111997, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4587527352297593, "calib/step_q_c_n": 914.0, "calib/step_q_gap": 0.19261397203217256, "calib/step_q_w": 0.2661387631975867, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 457.61328125, "completions/mean_terminated_length": 468.59600830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.208, "grad_norm": 0.0644913911819458, "kl": 0.089080810546875, "learning_rate": 1.3888888888888888e-07, "loss": -0.1316, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.036113277077674866, "mask/share_reasoning": 0.8115068674087524, "mask/share_step_conf": 0.12894240021705627, "num_tokens": 44912771.0, "reward": 1.15213143825531, "reward_std": 0.24076297879219055, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.754950761795044, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.822458028793335, "step": 195 }, { "adv/mean_abs_final_conf": 0.5296871662139893, "adv/mean_abs_reasoning": 0.45094752311706543, "adv/mean_abs_step_conf": 0.7463574409484863, "adv/ratio_final_to_reasoning": 1.1746093260534067, "adv/ratio_step_to_reasoning": 1.6550871280752835, "adv/std_final_conf": 0.7749639749526978, "adv/std_reasoning": 0.739151656627655, "adv/std_step_conf": 0.9348659515380859, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7914329423763387, "calib/avg_num_step_conf": 5.43359375, "calib/ece": 0.2077952755905511, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6850393700787402, "calib/gap": 0.4285938296787352, "calib/mean_conf": 0.7494488188976377, "calib/mu_c": 0.9283108108108107, "calib/mu_w": 0.49971698113207547, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18728346456692907, "calib/std_conf": 0.39647920398366215, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5182375478927204, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.11843491631377301, "calib/step_q_w": 0.39980263157894735, "calib/step_q_w_n": 608.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 391.96484375, "completions/mean_terminated_length": 391.96484375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.20906666666666668, "grad_norm": 0.04935779422521591, "kl": 0.0960540771484375, "learning_rate": 1.1111111111111112e-07, "loss": -0.0108, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04187465459108353, "mask/share_reasoning": 0.8143314123153687, "mask/share_step_conf": 0.14379391074180603, "num_tokens": 45115658.0, "reward": 1.1880606412887573, "reward_std": 0.16772204637527466, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7740910053253174, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8591660261154175, "step": 196 }, { "adv/mean_abs_final_conf": 0.6387770175933838, "adv/mean_abs_reasoning": 0.5504963994026184, "adv/mean_abs_step_conf": 0.7230050563812256, "adv/ratio_final_to_reasoning": 1.1603654779333066, "adv/ratio_step_to_reasoning": 1.3133692739240588, "adv/std_final_conf": 0.8457547426223755, "adv/std_reasoning": 0.8098015189170837, "adv/std_step_conf": 0.9345408082008362, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7607559055118109, "calib/avg_num_step_conf": 6.5, "calib/ece": 0.2756746031746031, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5833333333333334, "calib/gap": 0.36511937007874007, "calib/mean_conf": 0.6560714285714286, "calib/mu_c": 0.8400799999999999, "calib/mu_w": 0.47496062992125987, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21785714285714278, "calib/std_conf": 0.43884141617303263, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.461313672922252, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.14207620015536754, "calib/step_q_w": 0.31923747276688447, "calib/step_q_w_n": 918.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2627.0, "completions/max_terminated_length": 2627.0, "completions/mean_length": 478.875, "completions/mean_terminated_length": 484.5533752441406, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.21013333333333334, "grad_norm": 0.04360397160053253, "kl": 0.08504486083984375, "learning_rate": 8.333333333333334e-08, "loss": -0.0231, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0352955237030983, "mask/share_reasoning": 0.8155674934387207, "mask/share_step_conf": 0.1374182403087616, "num_tokens": 45343306.0, "reward": 1.1531352996826172, "reward_std": 0.22259561717510223, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7032074332237244, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8723545074462891, "step": 197 }, { "adv/mean_abs_final_conf": 0.5296421647071838, "adv/mean_abs_reasoning": 0.46181297302246094, "adv/mean_abs_step_conf": 0.7478198409080505, "adv/ratio_final_to_reasoning": 1.146875890559757, "adv/ratio_step_to_reasoning": 1.6193131951528767, "adv/std_final_conf": 0.759784996509552, "adv/std_reasoning": 0.7206684947013855, "adv/std_step_conf": 0.934968888759613, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8539823008849556, "calib/avg_num_step_conf": 6.46484375, "calib/ece": 0.1557539682539683, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5476190476190477, "calib/gap": 0.5803622588654739, "calib/mean_conf": 0.6147222222222223, "calib/mu_c": 0.8749640287769783, "calib/mu_w": 0.29460176991150444, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10944444444444448, "calib/std_conf": 0.45056085223825754, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4732279495990836, "calib/step_q_c_n": 873.0, "calib/step_q_gap": 0.1517062104686489, "calib/step_q_w": 0.3215217391304347, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 458.8359375, "completions/mean_terminated_length": 460.63531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 99.0, "epoch": 0.2112, "grad_norm": 0.03705943003296852, "kl": 0.09426116943359375, "learning_rate": 5.555555555555556e-08, "loss": 0.0507, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.039732448756694794, "mask/share_reasoning": 0.8041092157363892, "mask/share_step_conf": 0.15225210785865784, "num_tokens": 45566152.0, "reward": 1.2149150371551514, "reward_std": 0.18363887071609497, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.815900444984436, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8728281259536743, "step": 198 }, { "adv/mean_abs_final_conf": 0.6824038028717041, "adv/mean_abs_reasoning": 0.6048569679260254, "adv/mean_abs_step_conf": 0.74017333984375, "adv/ratio_final_to_reasoning": 1.1282068969323054, "adv/ratio_step_to_reasoning": 1.2237163149193875, "adv/std_final_conf": 0.876178503036499, "adv/std_reasoning": 0.826652467250824, "adv/std_step_conf": 0.9352187514305115, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7021883289124669, "calib/avg_num_step_conf": 6.5078125, "calib/ece": 0.2850200803212851, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5983935742971888, "calib/gap": 0.2835285145888594, "calib/mean_conf": 0.6811646586345381, "calib/mu_c": 0.7995862068965517, "calib/mu_w": 0.5160576923076923, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19192771084337348, "calib/std_conf": 0.42563329734788, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.44136732329084594, "calib/step_q_c_n": 863.0, "calib/step_q_gap": 0.12920044906917716, "calib/step_q_w": 0.3121668742216688, "calib/step_q_w_n": 803.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2531.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 503.546875, "completions/mean_terminated_length": 509.5177917480469, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.21226666666666666, "grad_norm": 0.059340838342905045, "kl": 0.0843505859375, "learning_rate": 2.777777777777778e-08, "loss": -0.1241, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03569798544049263, "mask/share_reasoning": 0.8174113035202026, "mask/share_step_conf": 0.13517196476459503, "num_tokens": 45799260.0, "reward": 1.1359705924987793, "reward_std": 0.27152296900749207, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6812281012535095, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8563086986541748, "step": 199 }, { "adv/mean_abs_final_conf": 0.544896125793457, "adv/mean_abs_reasoning": 0.46680164337158203, "adv/mean_abs_step_conf": 0.7455019950866699, "adv/ratio_final_to_reasoning": 1.1672969312143369, "adv/ratio_step_to_reasoning": 1.5970423533690041, "adv/std_final_conf": 0.7886665463447571, "adv/std_reasoning": 0.720587432384491, "adv/std_step_conf": 0.9341596961021423, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8773305813553491, "calib/avg_num_step_conf": 6.453125, "calib/ece": 0.16403225806451616, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6491935483870968, "calib/gap": 0.5541093911248711, "calib/mean_conf": 0.6891129032258064, "calib/mu_c": 0.9013725490196078, "calib/mu_w": 0.3472631578947368, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.11810483870967745, "calib/std_conf": 0.43646307920708033, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.500355421686747, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.2503432562366253, "calib/step_q_w": 0.25001216545012167, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 485.99609375, "completions/mean_terminated_length": 493.7103576660156, "completions/min_length": 0.0, "completions/min_terminated_length": 105.0, "epoch": 0.21333333333333335, "grad_norm": 0.040903497487306595, "kl": 0.09577178955078125, "learning_rate": 0.0, "loss": -0.1536, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03609899431467056, "mask/share_reasoning": 0.8205278515815735, "mask/share_step_conf": 0.12774814665317535, "num_tokens": 46031723.0, "reward": 1.1995675563812256, "reward_std": 0.2133074849843979, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7961195111274719, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8603435754776001, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.015639179518911986, "train_runtime": 14298.7628, "train_samples_per_second": 3.581, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 46031723, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }