{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7498364448547363, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5715035496705603, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9352971315383911, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04303989186882973, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.0136, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 1.0788748264312744, "reward_std": 0.22853493690490723, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420004606246948, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.770571768283844, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.509578923891962, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9354329705238342, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.04042748734354973, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0158, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 1.016056776046753, "reward_std": 0.2184845209121704, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291916012763977, "step": 2 }, { "adv/mean_abs_final_conf": 0.7565299868583679, "adv/mean_abs_reasoning": 0.4385569989681244, "adv/mean_abs_step_conf": 0.7518496513366699, "adv/ratio_final_to_reasoning": 1.725043697030029, "adv/ratio_step_to_reasoning": 1.7143715710972305, "adv/std_final_conf": 0.930081844329834, "adv/std_reasoning": 0.7205978035926819, "adv/std_step_conf": 0.9347808957099915, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44436368441918855, "calib/avg_num_step_conf": 4.98828125, "calib/ece": 0.2470588235294117, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.28627450980392155, "calib/gap": -0.00972974758821199, "calib/mean_conf": 0.8784313725490196, "calib/mu_c": 0.8748447204968943, "calib/mu_w": 0.8845744680851063, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2470588235294117, "calib/std_conf": 0.04320369215429589, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7916621253405993, "calib/step_q_c_n": 734.0, "calib/step_q_gap": 0.024829712817578953, "calib/step_q_w": 0.7668324125230204, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 491.1875, "completions/mean_terminated_length": 493.11376953125, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.0032, "grad_norm": 0.03560638055205345, "kl": 0.00038504600524902344, "learning_rate": 7.5e-07, "loss": 0.0598, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033207207918167114, "mask/share_reasoning": 0.854034960269928, "mask/share_step_conf": 0.1088515967130661, "num_tokens": 689661.0, "reward": 1.0690152645111084, "reward_std": 0.21428318321704865, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6956777572631836, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7459434270858765, "step": 3 }, { "adv/mean_abs_final_conf": 0.7650306224822998, "adv/mean_abs_reasoning": 0.4247799217700958, "adv/mean_abs_step_conf": 0.7311046123504639, "adv/ratio_final_to_reasoning": 1.8010046691810409, "adv/ratio_step_to_reasoning": 1.7211374052330104, "adv/std_final_conf": 0.9282382726669312, "adv/std_reasoning": 0.7012653350830078, "adv/std_step_conf": 0.9353035092353821, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.48399945989738047, "calib/avg_num_step_conf": 4.9765625, "calib/ece": 0.248814229249012, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2490118577075099, "calib/gap": -0.004751552795031144, "calib/mean_conf": 0.8789328063241106, "calib/mu_c": 0.8772049689440995, "calib/mu_w": 0.8819565217391306, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24569169960474324, "calib/std_conf": 0.04266849221724298, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7867173637515843, "calib/step_q_c_n": 789.0, "calib/step_q_gap": 0.00044932251447082905, "calib/step_q_w": 0.7862680412371135, "calib/step_q_w_n": 485.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2856.0, "completions/max_terminated_length": 2856.0, "completions/mean_length": 512.50390625, "completions/mean_terminated_length": 514.5137329101562, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.004266666666666667, "grad_norm": 0.05675892159342766, "kl": 0.00032141804695129395, "learning_rate": 1.0000000000000002e-06, "loss": 0.0046, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03311063349246979, "mask/share_reasoning": 0.8511103391647339, "mask/share_step_conf": 0.11187273263931274, "num_tokens": 927030.0, "reward": 1.0533726215362549, "reward_std": 0.1900576800107956, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.697465181350708, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7238948345184326, "step": 4 }, { "adv/mean_abs_final_conf": 0.7479941844940186, "adv/mean_abs_reasoning": 0.4255312979221344, "adv/mean_abs_step_conf": 0.7695996165275574, "adv/ratio_final_to_reasoning": 1.7577888821491336, "adv/ratio_step_to_reasoning": 1.808561721042625, "adv/std_final_conf": 0.9310461282730103, "adv/std_reasoning": 0.7012953758239746, "adv/std_step_conf": 0.9351333379745483, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.48754801536491676, "calib/avg_num_step_conf": 4.8984375, "calib/ece": 0.32694444444444437, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.32936507936507936, "calib/gap": -0.004367477592829627, "calib/mean_conf": 0.8803571428571427, "calib/mu_c": 0.8784507042253522, "calib/mu_w": 0.8828181818181818, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3219047619047618, "calib/std_conf": 0.04535870513494929, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7951652892561983, "calib/step_q_c_n": 726.0, "calib/step_q_gap": 0.015695592286501414, "calib/step_q_w": 0.7794696969696969, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 507.01171875, "completions/mean_terminated_length": 509.0000305175781, "completions/min_length": 0.0, "completions/min_terminated_length": 195.0, "epoch": 0.005333333333333333, "grad_norm": 0.05365122854709625, "kl": 0.00030153989791870117, "learning_rate": 1.25e-06, "loss": 0.0232, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03370092064142227, "mask/share_reasoning": 0.84963059425354, "mask/share_step_conf": 0.11276228725910187, "num_tokens": 1163513.0, "reward": 0.9934109449386597, "reward_std": 0.19404548406600952, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6383934020996094, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6942647695541382, "step": 5 }, { "adv/mean_abs_final_conf": 0.7626480460166931, "adv/mean_abs_reasoning": 0.3809635639190674, "adv/mean_abs_step_conf": 0.7435075044631958, "adv/ratio_final_to_reasoning": 2.0018923546681004, "adv/ratio_step_to_reasoning": 1.9516499079716398, "adv/std_final_conf": 0.9299441576004028, "adv/std_reasoning": 0.6611693501472473, "adv/std_step_conf": 0.9353631734848022, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4705513784461153, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.330984251968504, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3188976377952756, "calib/gap": -0.004665413533834539, "calib/mean_conf": 0.8821653543307086, "calib/mu_c": 0.8800714285714286, "calib/mu_w": 0.8847368421052632, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.330984251968504, "calib/std_conf": 0.04235717339790602, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8080547112462007, "calib/step_q_c_n": 658.0, "calib/step_q_gap": -0.0005935222407423835, "calib/step_q_w": 0.808648233486943, "calib/step_q_w_n": 651.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 443.55078125, "completions/mean_terminated_length": 445.29022216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.0064, "grad_norm": 0.03383629024028778, "kl": 0.0004996657371520996, "learning_rate": 1.5e-06, "loss": 0.0014, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03738969564437866, "mask/share_reasoning": 0.8328449130058289, "mask/share_step_conf": 0.12585915625095367, "num_tokens": 1383014.0, "reward": 0.9994688630104065, "reward_std": 0.1811923086643219, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6339746117591858, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7047669291496277, "step": 6 }, { "adv/mean_abs_final_conf": 0.7722004652023315, "adv/mean_abs_reasoning": 0.5118991136550903, "adv/mean_abs_step_conf": 0.7482302188873291, "adv/ratio_final_to_reasoning": 1.5085012741839365, "adv/ratio_step_to_reasoning": 1.4616751600618614, "adv/std_final_conf": 0.9312074780464172, "adv/std_reasoning": 0.7576223015785217, "adv/std_step_conf": 0.9354740381240845, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.41784313725490196, "calib/avg_num_step_conf": 5.8671875, "calib/ece": 0.27996047430830046, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3359683794466403, "calib/gap": -0.010246405228757971, "calib/mean_conf": 0.884703557312253, "calib/mu_c": 0.8806535947712419, "calib/mu_w": 0.8908999999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.27996047430830046, "calib/std_conf": 0.04462858434574431, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7931849315068492, "calib/step_q_c_n": 876.0, "calib/step_q_gap": 0.00176640115541149, "calib/step_q_w": 0.7914185303514377, "calib/step_q_w_n": 626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 569.71484375, "completions/mean_terminated_length": 569.71484375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.007466666666666667, "grad_norm": 0.041968848556280136, "kl": 0.0003533661365509033, "learning_rate": 1.75e-06, "loss": 0.0946, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03012840449810028, "mask/share_reasoning": 0.8556883931159973, "mask/share_step_conf": 0.11418319493532181, "num_tokens": 1636285.0, "reward": 1.0396735668182373, "reward_std": 0.2362067699432373, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6677848100662231, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7295831441879272, "step": 7 }, { "adv/mean_abs_final_conf": 0.7601855993270874, "adv/mean_abs_reasoning": 0.4345535933971405, "adv/mean_abs_step_conf": 0.7925187349319458, "adv/ratio_final_to_reasoning": 1.7493483217669548, "adv/ratio_step_to_reasoning": 1.8237537256024008, "adv/std_final_conf": 0.9302745461463928, "adv/std_reasoning": 0.7014146447181702, "adv/std_step_conf": 0.9354472756385803, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.535615989515072, "calib/avg_num_step_conf": 4.8203125, "calib/ece": 0.31590361445783144, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.28112449799196787, "calib/gap": 0.007483617300130918, "calib/mean_conf": 0.8781526104417672, "calib/mu_c": 0.8814285714285713, "calib/mu_w": 0.8739449541284404, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.31590361445783144, "calib/std_conf": 0.04891391507089884, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7990615384615386, "calib/step_q_c_n": 650.0, "calib/step_q_gap": 0.022982771338250885, "calib/step_q_w": 0.7760787671232877, "calib/step_q_w_n": 584.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 546.8984375, "completions/mean_terminated_length": 546.8984375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.008533333333333334, "grad_norm": 0.04877391830086708, "kl": 0.00037592649459838867, "learning_rate": 2.0000000000000003e-06, "loss": 0.0366, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0320667028427124, "mask/share_reasoning": 0.8620612025260925, "mask/share_step_conf": 0.10587209463119507, "num_tokens": 1882803.0, "reward": 1.015367865562439, "reward_std": 0.21866479516029358, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6367086172103882, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7272679805755615, "step": 8 }, { "adv/mean_abs_final_conf": 0.8045555353164673, "adv/mean_abs_reasoning": 0.4165237545967102, "adv/mean_abs_step_conf": 0.7727015614509583, "adv/ratio_final_to_reasoning": 1.9315958008096326, "adv/ratio_step_to_reasoning": 1.8551200331878053, "adv/std_final_conf": 0.9291321039199829, "adv/std_reasoning": 0.681744396686554, "adv/std_step_conf": 0.9353907108306885, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4443561734524992, "calib/avg_num_step_conf": 5.1171875, "calib/ece": 0.2555905511811023, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.29133858267716534, "calib/gap": -0.00815094339622624, "calib/mean_conf": 0.8788976377952756, "calib/mu_c": 0.8758490566037735, "calib/mu_w": 0.8839999999999998, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.25425196850393694, "calib/std_conf": 0.04816684170764448, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7245669291338583, "calib/step_q_c_n": 889.0, "calib/step_q_gap": -0.05692950792077345, "calib/step_q_w": 0.7814964370546318, "calib/step_q_w_n": 421.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 537.3984375, "completions/mean_terminated_length": 539.5059204101562, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.0096, "grad_norm": 0.04915522783994675, "kl": 0.00032514333724975586, "learning_rate": 2.25e-06, "loss": -0.0041, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033639971166849136, "mask/share_reasoning": 0.8549748659133911, "mask/share_step_conf": 0.10747894644737244, "num_tokens": 2127913.0, "reward": 1.02982497215271, "reward_std": 0.21984942257404327, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.686966061592102, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7015809416770935, "step": 9 }, { "adv/mean_abs_final_conf": 0.7565653324127197, "adv/mean_abs_reasoning": 0.4656117558479309, "adv/mean_abs_step_conf": 0.7444936037063599, "adv/ratio_final_to_reasoning": 1.6248845157161678, "adv/ratio_step_to_reasoning": 1.598957917955817, "adv/std_final_conf": 0.9308657646179199, "adv/std_reasoning": 0.7393183708190918, "adv/std_step_conf": 0.9351793527603149, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5120526175213675, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.2644444444444444, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3412698412698413, "calib/gap": 0.004294871794871868, "calib/mean_conf": 0.8834920634920634, "calib/mu_c": 0.8851282051282052, "calib/mu_w": 0.8808333333333334, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2644444444444444, "calib/std_conf": 0.04569251024585318, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7845816733067729, "calib/step_q_c_n": 753.0, "calib/step_q_gap": -0.0060055460369231595, "calib/step_q_w": 0.790587219343696, "calib/step_q_w_n": 579.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2607.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 532.19921875, "completions/mean_terminated_length": 532.19921875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.010666666666666666, "grad_norm": 0.043561290949583054, "kl": 0.0004936754703521729, "learning_rate": 2.5e-06, "loss": 0.0907, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03227228671312332, "mask/share_reasoning": 0.8546050786972046, "mask/share_step_conf": 0.11312257498502731, "num_tokens": 2370956.0, "reward": 1.0441138744354248, "reward_std": 0.2344008982181549, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6824515461921692, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7252049446105957, "step": 10 }, { "adv/mean_abs_final_conf": 0.7800993919372559, "adv/mean_abs_reasoning": 0.38626450300216675, "adv/mean_abs_step_conf": 0.7973983287811279, "adv/ratio_final_to_reasoning": 2.0195989687742024, "adv/ratio_step_to_reasoning": 2.0643841786742048, "adv/std_final_conf": 0.9288783669471741, "adv/std_reasoning": 0.6612656116485596, "adv/std_step_conf": 0.9352825284004211, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4638977212506624, "calib/avg_num_step_conf": 5.32421875, "calib/ece": 0.3072800000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.316, "calib/gap": -0.007354266030736745, "calib/mean_conf": 0.8849600000000001, "calib/mu_c": 0.8819594594594594, "calib/mu_w": 0.8893137254901962, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.30012000000000005, "calib/std_conf": 0.048524204269621986, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7817042253521128, "calib/step_q_c_n": 710.0, "calib/step_q_gap": 0.025777732243383866, "calib/step_q_w": 0.755926493108729, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2646.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 538.734375, "completions/mean_terminated_length": 542.9763793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.011733333333333333, "grad_norm": 36.05608367919922, "kl": 4.125522136688232, "learning_rate": 2.7500000000000004e-06, "loss": 0.8144, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032473787665367126, "mask/share_reasoning": 0.8457951545715332, "mask/share_step_conf": 0.11391851305961609, "num_tokens": 2613352.0, "reward": 1.0236568450927734, "reward_std": 0.19265316426753998, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6501550674438477, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7246680855751038, "step": 11 }, { "adv/mean_abs_final_conf": 0.7623910903930664, "adv/mean_abs_reasoning": 0.4246925711631775, "adv/mean_abs_step_conf": 0.7670254707336426, "adv/ratio_final_to_reasoning": 1.7951599395887166, "adv/ratio_step_to_reasoning": 1.8060722574752366, "adv/std_final_conf": 0.9276826977729797, "adv/std_reasoning": 0.7014279961585999, "adv/std_step_conf": 0.9347568154335022, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5192449219899912, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.20509960159362556, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4063745019920319, "calib/gap": 0.004512805416543975, "calib/mean_conf": 0.8876494023904383, "calib/mu_c": 0.8890697674418605, "calib/mu_w": 0.8845569620253165, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.20374501992031877, "calib/std_conf": 0.05237442380176764, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7802771362586606, "calib/step_q_c_n": 866.0, "calib/step_q_gap": 0.0217426535000399, "calib/step_q_w": 0.7585344827586207, "calib/step_q_w_n": 580.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 481.8671875, "completions/mean_terminated_length": 485.6614074707031, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.0128, "grad_norm": 0.05654134973883629, "kl": 0.0009008646011352539, "learning_rate": 3e-06, "loss": 0.0313, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03626004233956337, "mask/share_reasoning": 0.8287768959999084, "mask/share_step_conf": 0.1271505057811737, "num_tokens": 2840886.0, "reward": 1.1029002666473389, "reward_std": 0.20242473483085632, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7231832146644592, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7691406011581421, "step": 12 }, { "adv/mean_abs_final_conf": 0.7640227675437927, "adv/mean_abs_reasoning": 0.4206831753253937, "adv/mean_abs_step_conf": 0.7589430212974548, "adv/ratio_final_to_reasoning": 1.8161476673100363, "adv/ratio_step_to_reasoning": 1.8040726746688194, "adv/std_final_conf": 0.9303558468818665, "adv/std_reasoning": 0.7012991309165955, "adv/std_step_conf": 0.935462236404419, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6333466082570024, "calib/avg_num_step_conf": 5.01953125, "calib/ece": 0.25407843137254904, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.396078431372549, "calib/gap": 0.022385503783353244, "calib/mean_conf": 0.8861568627450982, "calib/mu_c": 0.894320987654321, "calib/mu_w": 0.8719354838709678, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25247058823529417, "calib/std_conf": 0.0559283058395228, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7778553299492384, "calib/step_q_c_n": 788.0, "calib/step_q_gap": 0.012020319888876307, "calib/step_q_w": 0.7658350100603621, "calib/step_q_w_n": 497.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 478.39453125, "completions/mean_terminated_length": 480.2706298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.013866666666666666, "grad_norm": 0.055792853236198425, "kl": 0.0011698007583618164, "learning_rate": 3.2500000000000002e-06, "loss": 0.0596, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034841444343328476, "mask/share_reasoning": 0.8473033905029297, "mask/share_step_conf": 0.11394891887903214, "num_tokens": 3067947.0, "reward": 1.0867152214050293, "reward_std": 0.1965121030807495, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.708683967590332, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7598310112953186, "step": 13 }, { "adv/mean_abs_final_conf": 0.7682212591171265, "adv/mean_abs_reasoning": 0.4448583722114563, "adv/mean_abs_step_conf": 0.7434722185134888, "adv/ratio_final_to_reasoning": 1.7268895160906734, "adv/ratio_step_to_reasoning": 1.6712559883218094, "adv/std_final_conf": 0.9279747605323792, "adv/std_reasoning": 0.7206664681434631, "adv/std_step_conf": 0.9349461197853088, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.36925911304118963, "calib/avg_num_step_conf": 5.27734375, "calib/ece": 0.3090438247011953, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.47410358565737054, "calib/gap": -0.0182945124358469, "calib/mean_conf": 0.9026693227091633, "calib/mu_c": 0.895234899328859, "calib/mu_w": 0.9135294117647059, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3090438247011953, "calib/std_conf": 0.04100877147377484, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7693725992317542, "calib/step_q_c_n": 781.0, "calib/step_q_gap": 0.02968838870543833, "calib/step_q_w": 0.7396842105263158, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 542.46484375, "completions/mean_terminated_length": 544.5921630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.014933333333333333, "grad_norm": 0.0377659797668457, "kl": 0.0017580986022949219, "learning_rate": 3.5e-06, "loss": 0.0361, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03264409676194191, "mask/share_reasoning": 0.8481143712997437, "mask/share_step_conf": 0.11533529311418533, "num_tokens": 3312218.0, "reward": 1.0354915857315063, "reward_std": 0.2152160406112671, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6392582058906555, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7466707229614258, "step": 14 }, { "adv/mean_abs_final_conf": 0.7421205043792725, "adv/mean_abs_reasoning": 0.460532546043396, "adv/mean_abs_step_conf": 0.7523643970489502, "adv/ratio_final_to_reasoning": 1.6114398662051181, "adv/ratio_step_to_reasoning": 1.6336834465072851, "adv/std_final_conf": 0.9272037148475647, "adv/std_reasoning": 0.7392576336860657, "adv/std_step_conf": 0.9352161884307861, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5078076396665423, "calib/avg_num_step_conf": 4.6875, "calib/ece": 0.3491372549019608, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5490196078431373, "calib/gap": 0.0012206047032473633, "calib/mean_conf": 0.902078431372549, "calib/mu_c": 0.9026241134751772, "calib/mu_w": 0.9014035087719299, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3491372549019608, "calib/std_conf": 0.04888555788868965, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.760141065830721, "calib/step_q_c_n": 638.0, "calib/step_q_gap": 0.0033083256171978492, "calib/step_q_w": 0.7568327402135232, "calib/step_q_w_n": 562.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 458.53125, "completions/mean_terminated_length": 460.3294372558594, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.016, "grad_norm": 0.04049689695239067, "kl": 0.0028886795043945312, "learning_rate": 3.7500000000000005e-06, "loss": 0.0327, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035021279007196426, "mask/share_reasoning": 0.8494170904159546, "mask/share_step_conf": 0.11165538430213928, "num_tokens": 3537482.0, "reward": 1.0145270824432373, "reward_std": 0.2300584316253662, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.622326135635376, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.732610285282135, "step": 15 }, { "adv/mean_abs_final_conf": 0.7558896541595459, "adv/mean_abs_reasoning": 0.3702784776687622, "adv/mean_abs_step_conf": 0.7783836722373962, "adv/ratio_final_to_reasoning": 2.041408560709644, "adv/ratio_step_to_reasoning": 2.1021574819525704, "adv/std_final_conf": 0.9276329874992371, "adv/std_reasoning": 0.661292552947998, "adv/std_step_conf": 0.9352163672447205, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5419096209912536, "calib/avg_num_step_conf": 6.28515625, "calib/ece": 0.3036507936507936, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6547619047619048, "calib/gap": 0.009628942486085479, "calib/mean_conf": 0.9147619047619048, "calib/mu_c": 0.9185064935064934, "calib/mu_w": 0.9088775510204079, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3036507936507936, "calib/std_conf": 0.0450081877156951, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7208686868686869, "calib/step_q_c_n": 990.0, "calib/step_q_gap": -0.014010149964915786, "calib/step_q_w": 0.7348788368336027, "calib/step_q_w_n": 619.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 647.96484375, "completions/mean_terminated_length": 647.96484375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.017066666666666667, "grad_norm": 0.0510561466217041, "kl": 0.004111528396606445, "learning_rate": 4.000000000000001e-06, "loss": 0.0373, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.026287715882062912, "mask/share_reasoning": 0.8636431694030762, "mask/share_step_conf": 0.110069140791893, "num_tokens": 3812209.0, "reward": 1.0580646991729736, "reward_std": 0.1909414380788803, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6621820330619812, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7578397989273071, "step": 16 }, { "adv/mean_abs_final_conf": 0.7609502077102661, "adv/mean_abs_reasoning": 0.5324435234069824, "adv/mean_abs_step_conf": 0.7632114887237549, "adv/ratio_final_to_reasoning": 1.4291660509665371, "adv/ratio_step_to_reasoning": 1.4334130385137223, "adv/std_final_conf": 0.9293497204780579, "adv/std_reasoning": 0.7753995060920715, "adv/std_step_conf": 0.9355502724647522, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.44006581653640475, "calib/avg_num_step_conf": 5.37890625, "calib/ece": 0.1842063492063492, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6785714285714286, "calib/gap": -0.00877828054298635, "calib/mean_conf": 0.9157936507936508, "calib/mu_c": 0.9135294117647059, "calib/mu_w": 0.9223076923076923, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.17896825396825397, "calib/std_conf": 0.05148359539317561, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7065024630541873, "calib/step_q_c_n": 1015.0, "calib/step_q_gap": 0.007110197860817125, "calib/step_q_w": 0.6993922651933702, "calib/step_q_w_n": 362.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2442.0, "completions/max_terminated_length": 2442.0, "completions/mean_length": 498.37109375, "completions/mean_terminated_length": 502.2952880859375, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.018133333333333335, "grad_norm": 0.03440679609775543, "kl": 0.006146430969238281, "learning_rate": 4.25e-06, "loss": 0.0597, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03338541090488434, "mask/share_reasoning": 0.8393343687057495, "mask/share_step_conf": 0.11946772038936615, "num_tokens": 4043320.0, "reward": 1.146507740020752, "reward_std": 0.24300463497638702, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7447628974914551, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8056056499481201, "step": 17 }, { "adv/mean_abs_final_conf": 0.7621678113937378, "adv/mean_abs_reasoning": 0.4120538830757141, "adv/mean_abs_step_conf": 0.7573996782302856, "adv/ratio_final_to_reasoning": 1.8496799634665522, "adv/ratio_step_to_reasoning": 1.838108338105662, "adv/std_final_conf": 0.9268068671226501, "adv/std_reasoning": 0.6816505789756775, "adv/std_step_conf": 0.9356744885444641, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5002542588354946, "calib/avg_num_step_conf": 5.08203125, "calib/ece": 0.37226190476190474, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7261904761904762, "calib/gap": 0.010118230358504676, "calib/mean_conf": 0.9169444444444445, "calib/mu_c": 0.9215217391304347, "calib/mu_w": 0.91140350877193, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3707936507936508, "calib/std_conf": 0.07787727479472598, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.716140350877193, "calib/step_q_c_n": 627.0, "calib/step_q_gap": 0.046362902805976325, "calib/step_q_w": 0.6697774480712166, "calib/step_q_w_n": 674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 512.97265625, "completions/mean_terminated_length": 514.9843139648438, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.0192, "grad_norm": 0.03835965692996979, "kl": 0.007046699523925781, "learning_rate": 4.5e-06, "loss": 0.0275, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033493079245090485, "mask/share_reasoning": 0.8562003374099731, "mask/share_step_conf": 0.10640032589435577, "num_tokens": 4285361.0, "reward": 1.0004057884216309, "reward_std": 0.21100187301635742, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5974195599555969, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7335113286972046, "step": 18 }, { "adv/mean_abs_final_conf": 0.7491005659103394, "adv/mean_abs_reasoning": 0.3827943205833435, "adv/mean_abs_step_conf": 0.7685535550117493, "adv/ratio_final_to_reasoning": 1.956927064039975, "adv/ratio_step_to_reasoning": 2.0077454488889597, "adv/std_final_conf": 0.9226001501083374, "adv/std_reasoning": 0.6612175703048706, "adv/std_step_conf": 0.9356136322021484, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5578053142565151, "calib/avg_num_step_conf": 4.5625, "calib/ece": 0.31964705882352945, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7686274509803922, "calib/gap": 0.03273377618804252, "calib/mean_conf": 0.9157254901960784, "calib/mu_c": 0.9289473684210525, "calib/mu_w": 0.89621359223301, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31964705882352945, "calib/std_conf": 0.08228217314167494, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6997557471264367, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.04212862848236898, "calib/step_q_w": 0.6576271186440678, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1678.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 469.78125, "completions/mean_terminated_length": 469.78125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.020266666666666665, "grad_norm": 0.035939835011959076, "kl": 0.010945320129394531, "learning_rate": 4.75e-06, "loss": 0.0018, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03303908556699753, "mask/share_reasoning": 0.8579931259155273, "mask/share_step_conf": 0.10896774381399155, "num_tokens": 4510385.0, "reward": 1.0813804864883423, "reward_std": 0.19825337827205658, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6634472608566284, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7875632047653198, "step": 19 }, { "adv/mean_abs_final_conf": 0.7338466048240662, "adv/mean_abs_reasoning": 0.39686164259910583, "adv/mean_abs_step_conf": 0.7599142789840698, "adv/ratio_final_to_reasoning": 1.8491245463229848, "adv/ratio_step_to_reasoning": 1.9148090856230862, "adv/std_final_conf": 0.9257152080535889, "adv/std_reasoning": 0.7012358903884888, "adv/std_step_conf": 0.9355400800704956, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4792622324159021, "calib/avg_num_step_conf": 4.96484375, "calib/ece": 0.3584189723320159, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7984189723320159, "calib/gap": 0.007530581039755679, "calib/mean_conf": 0.9275889328063242, "calib/mu_c": 0.9308333333333335, "calib/mu_w": 0.9233027522935778, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3584189723320159, "calib/std_conf": 0.06596888861326226, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6546230440967283, "calib/step_q_c_n": 703.0, "calib/step_q_gap": 0.03510367789954527, "calib/step_q_w": 0.6195193661971831, "calib/step_q_w_n": 568.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 483.69140625, "completions/mean_terminated_length": 483.69140625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.021333333333333333, "grad_norm": 0.027182994410395622, "kl": 0.015130996704101562, "learning_rate": 5e-06, "loss": 0.0232, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03616435080766678, "mask/share_reasoning": 0.8422824144363403, "mask/share_step_conf": 0.1215532124042511, "num_tokens": 4739082.0, "reward": 1.054613709449768, "reward_std": 0.19555900990962982, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6180988550186157, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7878357172012329, "step": 20 }, { "adv/mean_abs_final_conf": 0.7652933597564697, "adv/mean_abs_reasoning": 0.48477011919021606, "adv/mean_abs_step_conf": 0.7567232847213745, "adv/ratio_final_to_reasoning": 1.5786727140584769, "adv/ratio_step_to_reasoning": 1.5609940769151416, "adv/std_final_conf": 0.9191557765007019, "adv/std_reasoning": 0.7392932772636414, "adv/std_step_conf": 0.9357109069824219, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5639366001515534, "calib/avg_num_step_conf": 5.0859375, "calib/ece": 0.358627450980392, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8705882352941177, "calib/gap": 0.00974362212679969, "calib/mean_conf": 0.9390196078431372, "calib/mu_c": 0.9431081081081082, "calib/mu_w": 0.9333644859813085, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.358627450980392, "calib/std_conf": 0.03530566259980002, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6201876675603216, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.015709250294134613, "calib/step_q_w": 0.604478417266187, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 501.375, "completions/mean_terminated_length": 501.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.0224, "grad_norm": 0.025243666023015976, "kl": 0.017017364501953125, "learning_rate": 4.9722222222222224e-06, "loss": 0.0133, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03457701951265335, "mask/share_reasoning": 0.8496730327606201, "mask/share_step_conf": 0.11574994772672653, "num_tokens": 4970394.0, "reward": 1.075460433959961, "reward_std": 0.21974530816078186, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6288824081420898, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8047963976860046, "step": 21 }, { "adv/mean_abs_final_conf": 0.7464021444320679, "adv/mean_abs_reasoning": 0.4103482961654663, "adv/mean_abs_step_conf": 0.7724449038505554, "adv/ratio_final_to_reasoning": 1.818947833844772, "adv/ratio_step_to_reasoning": 1.8824128455478697, "adv/std_final_conf": 0.9118995070457458, "adv/std_reasoning": 0.6815477609634399, "adv/std_step_conf": 0.9356730580329895, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4308304498269896, "calib/avg_num_step_conf": 5.41015625, "calib/ece": 0.27878431372549023, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9019607843137255, "calib/gap": -0.006294117647058783, "calib/mean_conf": 0.9454509803921568, "calib/mu_c": 0.9433529411764706, "calib/mu_w": 0.9496470588235294, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27878431372549023, "calib/std_conf": 0.03242627458331524, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5930643402399127, "calib/step_q_c_n": 917.0, "calib/step_q_gap": -0.0035809589053864554, "calib/step_q_w": 0.5966452991452992, "calib/step_q_w_n": 468.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1832.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 460.67578125, "completions/mean_terminated_length": 460.67578125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.023466666666666667, "grad_norm": 0.027960635721683502, "kl": 0.024440765380859375, "learning_rate": 4.944444444444445e-06, "loss": 0.0032, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03475780785083771, "mask/share_reasoning": 0.839755117893219, "mask/share_step_conf": 0.1254870593547821, "num_tokens": 5190143.0, "reward": 1.1180078983306885, "reward_std": 0.20199835300445557, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6934887170791626, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8069971799850464, "step": 22 }, { "adv/mean_abs_final_conf": 0.7499644160270691, "adv/mean_abs_reasoning": 0.45654457807540894, "adv/mean_abs_step_conf": 0.7715545296669006, "adv/ratio_final_to_reasoning": 1.6426970158940208, "adv/ratio_step_to_reasoning": 1.6899872799266067, "adv/std_final_conf": 0.9265665411949158, "adv/std_reasoning": 0.7392275333404541, "adv/std_step_conf": 0.9358251690864563, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5026871641044869, "calib/avg_num_step_conf": 5.07421875, "calib/ece": 0.4441897233201581, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8932806324110671, "calib/gap": 0.0004255718035246492, "calib/mean_conf": 0.9461660079051383, "calib/mu_c": 0.9463779527559054, "calib/mu_w": 0.9459523809523808, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4441897233201581, "calib/std_conf": 0.034018190185690626, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6132223454833597, "calib/step_q_c_n": 631.0, "calib/step_q_gap": 0.02573731554323999, "calib/step_q_w": 0.5874850299401198, "calib/step_q_w_n": 668.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2450.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 496.140625, "completions/mean_terminated_length": 496.140625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.024533333333333334, "grad_norm": 0.04829743877053261, "kl": 0.04451179504394531, "learning_rate": 4.9166666666666665e-06, "loss": 0.0351, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03656258434057236, "mask/share_reasoning": 0.8382663726806641, "mask/share_step_conf": 0.12517108023166656, "num_tokens": 5421091.0, "reward": 0.9915996789932251, "reward_std": 0.22186483442783356, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5374960899353027, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7669271230697632, "step": 23 }, { "adv/mean_abs_final_conf": 0.7683982849121094, "adv/mean_abs_reasoning": 0.5511972904205322, "adv/mean_abs_step_conf": 0.768555760383606, "adv/ratio_final_to_reasoning": 1.3940530881889226, "adv/ratio_step_to_reasoning": 1.394338785296353, "adv/std_final_conf": 0.9279333353042603, "adv/std_reasoning": 0.7928519248962402, "adv/std_step_conf": 0.9359160661697388, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5019592476489028, "calib/avg_num_step_conf": 5.828125, "calib/ece": 0.4192741935483872, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9274193548387096, "calib/gap": 0.0017450365726229267, "calib/mean_conf": 0.9515322580645161, "calib/mu_c": 0.9523484848484849, "calib/mu_w": 0.950603448275862, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.4192741935483872, "calib/std_conf": 0.028766141467029323, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5671428571428572, "calib/step_q_c_n": 826.0, "calib/step_q_gap": -0.004703989703989664, "calib/step_q_w": 0.5718468468468468, "calib/step_q_w_n": 666.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2861.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 577.125, "completions/mean_terminated_length": 577.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.0256, "grad_norm": 0.02429387718439102, "kl": 0.023302078247070312, "learning_rate": 4.888888888888889e-06, "loss": 0.0614, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03134583681821823, "mask/share_reasoning": 0.8475775718688965, "mask/share_step_conf": 0.12107663601636887, "num_tokens": 5673347.0, "reward": 1.0050941705703735, "reward_std": 0.25456106662750244, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5573132634162903, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7706666588783264, "step": 24 }, { "adv/mean_abs_final_conf": 0.7260433435440063, "adv/mean_abs_reasoning": 0.38976427912712097, "adv/mean_abs_step_conf": 0.762317419052124, "adv/ratio_final_to_reasoning": 1.8627754836076411, "adv/ratio_step_to_reasoning": 1.9558421843051847, "adv/std_final_conf": 0.9239747524261475, "adv/std_reasoning": 0.681587278842926, "adv/std_step_conf": 0.9359145760536194, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5706342939860507, "calib/avg_num_step_conf": 5.51953125, "calib/ece": 0.3552191235059762, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9203187250996016, "calib/gap": 0.0056507435188840605, "calib/mean_conf": 0.9488446215139443, "calib/mu_c": 0.9511409395973154, "calib/mu_w": 0.9454901960784313, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3552191235059762, "calib/std_conf": 0.03377715465001707, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5745710627400769, "calib/step_q_c_n": 781.0, "calib/step_q_gap": -0.0027074182725814744, "calib/step_q_w": 0.5772784810126583, "calib/step_q_w_n": 632.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 506.0859375, "completions/mean_terminated_length": 508.07061767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.02666666666666667, "grad_norm": 0.02284402586519718, "kl": 0.029443740844726562, "learning_rate": 4.861111111111111e-06, "loss": 0.0858, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03279361128807068, "mask/share_reasoning": 0.841393768787384, "mask/share_step_conf": 0.1219063550233841, "num_tokens": 5906129.0, "reward": 1.0461409091949463, "reward_std": 0.20731407403945923, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6217843294143677, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7719982862472534, "step": 25 }, { "adv/mean_abs_final_conf": 0.7309408187866211, "adv/mean_abs_reasoning": 0.47793784737586975, "adv/mean_abs_step_conf": 0.76811283826828, "adv/ratio_final_to_reasoning": 1.5293637505375872, "adv/ratio_step_to_reasoning": 1.6071395945845754, "adv/std_final_conf": 0.9263646602630615, "adv/std_reasoning": 0.757454514503479, "adv/std_step_conf": 0.9359714984893799, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5484908303616913, "calib/avg_num_step_conf": 5.125, "calib/ece": 0.35827450980392156, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9450980392156862, "calib/gap": 0.005112710137544441, "calib/mean_conf": 0.9504313725490194, "calib/mu_c": 0.9525165562913906, "calib/mu_w": 0.9474038461538462, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35827450980392156, "calib/std_conf": 0.02953579078561796, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.588168044077135, "calib/step_q_c_n": 726.0, "calib/step_q_gap": 0.03574483588600863, "calib/step_q_w": 0.5524232081911263, "calib/step_q_w_n": 586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2184.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 489.28515625, "completions/mean_terminated_length": 489.28515625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.027733333333333332, "grad_norm": 0.02680124156177044, "kl": 0.03493499755859375, "learning_rate": 4.833333333333333e-06, "loss": 0.0069, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03132498264312744, "mask/share_reasoning": 0.8564911484718323, "mask/share_step_conf": 0.11218388378620148, "num_tokens": 6136626.0, "reward": 1.0738930702209473, "reward_std": 0.22200658917427063, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6292617321014404, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8008912801742554, "step": 26 }, { "adv/mean_abs_final_conf": 0.7657771110534668, "adv/mean_abs_reasoning": 0.5099864602088928, "adv/mean_abs_step_conf": 0.7635960578918457, "adv/ratio_final_to_reasoning": 1.5015636115903959, "adv/ratio_step_to_reasoning": 1.4972869232235562, "adv/std_final_conf": 0.9304526448249817, "adv/std_reasoning": 0.7575855255126953, "adv/std_step_conf": 0.935930609703064, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4673167996011965, "calib/avg_num_step_conf": 5.71484375, "calib/ece": 0.4856692913385828, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8858267716535433, "calib/gap": -0.012193419740777589, "calib/mean_conf": 0.9255118110236221, "calib/mu_c": 0.9189830508474577, "calib/mu_w": 0.9311764705882353, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4733070866141733, "calib/std_conf": 0.13233344106566278, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.545643879173291, "calib/step_q_c_n": 629.0, "calib/step_q_gap": -0.018696648404646576, "calib/step_q_w": 0.5643405275779376, "calib/step_q_w_n": 834.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 501.94140625, "completions/mean_terminated_length": 501.94140625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.0288, "grad_norm": 0.031133631244301796, "kl": 0.03224945068359375, "learning_rate": 4.805555555555556e-06, "loss": 0.0273, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.032666970044374466, "mask/share_reasoning": 0.844009518623352, "mask/share_step_conf": 0.12332353740930557, "num_tokens": 6370339.0, "reward": 0.9932471513748169, "reward_std": 0.23951643705368042, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.5111820101737976, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7897913455963135, "step": 27 }, { "adv/mean_abs_final_conf": 0.7506414651870728, "adv/mean_abs_reasoning": 0.4154139459133148, "adv/mean_abs_step_conf": 0.747469425201416, "adv/ratio_final_to_reasoning": 1.8069722323277766, "adv/ratio_step_to_reasoning": 1.7993363789413845, "adv/std_final_conf": 0.8933430314064026, "adv/std_reasoning": 0.6817694306373596, "adv/std_step_conf": 0.9356337189674377, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.539148305370258, "calib/avg_num_step_conf": 5.14453125, "calib/ece": 0.3367600000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.924, "calib/gap": 0.01903038878781782, "calib/mean_conf": 0.9487599999999999, "calib/mu_c": 0.9561437908496734, "calib/mu_w": 0.9371134020618556, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3367600000000001, "calib/std_conf": 0.05744965099981025, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5563813229571984, "calib/step_q_c_n": 771.0, "calib/step_q_gap": 0.04348755006342553, "calib/step_q_w": 0.5128937728937729, "calib/step_q_w_n": 546.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2453.0, "completions/max_terminated_length": 2453.0, "completions/mean_length": 575.5234375, "completions/mean_terminated_length": 575.5234375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.029866666666666666, "grad_norm": 0.03289984166622162, "kl": 0.0348358154296875, "learning_rate": 4.777777777777778e-06, "loss": 0.014, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030212881043553352, "mask/share_reasoning": 0.8617681264877319, "mask/share_step_conf": 0.10801897943019867, "num_tokens": 6624617.0, "reward": 1.0612573623657227, "reward_std": 0.22830414772033691, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6390644311904907, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7801127433776855, "step": 28 }, { "adv/mean_abs_final_conf": 0.760231614112854, "adv/mean_abs_reasoning": 0.5034997463226318, "adv/mean_abs_step_conf": 0.7915377616882324, "adv/ratio_final_to_reasoning": 1.5098947311598323, "adv/ratio_step_to_reasoning": 1.5720718182468199, "adv/std_final_conf": 0.9243661165237427, "adv/std_reasoning": 0.757530689239502, "adv/std_step_conf": 0.9357538819313049, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5606481481481483, "calib/avg_num_step_conf": 5.84375, "calib/ece": 0.4767843137254901, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9098039215686274, "calib/gap": 0.009055555555555816, "calib/mean_conf": 0.9473725490196078, "calib/mu_c": 0.9521666666666667, "calib/mu_w": 0.9431111111111109, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4767843137254901, "calib/std_conf": 0.0380061712142284, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5177503852080123, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.008317091229263829, "calib/step_q_w": 0.5094332939787485, "calib/step_q_w_n": 847.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2448.0, "completions/max_terminated_length": 2448.0, "completions/mean_length": 579.4765625, "completions/mean_terminated_length": 579.4765625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.030933333333333334, "grad_norm": 0.025897309184074402, "kl": 0.031734466552734375, "learning_rate": 4.75e-06, "loss": -0.0289, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.029150307178497314, "mask/share_reasoning": 0.8561632037162781, "mask/share_step_conf": 0.11468647420406342, "num_tokens": 6880091.0, "reward": 1.0111935138702393, "reward_std": 0.22310924530029297, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.5168156623840332, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.809443473815918, "step": 29 }, { "adv/mean_abs_final_conf": 0.7617216110229492, "adv/mean_abs_reasoning": 0.5963394641876221, "adv/mean_abs_step_conf": 0.7448407411575317, "adv/ratio_final_to_reasoning": 1.2773288651299022, "adv/ratio_step_to_reasoning": 1.2490213810890567, "adv/std_final_conf": 0.9236946105957031, "adv/std_reasoning": 0.8265910148620605, "adv/std_step_conf": 0.9360052943229675, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5465801886792453, "calib/avg_num_step_conf": 5.54296875, "calib/ece": 0.3615199999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.88, "calib/gap": 0.02419287211740051, "calib/mean_conf": 0.93752, "calib/mu_c": 0.9477777777777779, "calib/mu_w": 0.9235849056603774, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3615199999999998, "calib/std_conf": 0.08375828078464839, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5081538461538462, "calib/step_q_c_n": 780.0, "calib/step_q_gap": 0.03649030937763337, "calib/step_q_w": 0.4716635367762128, "calib/step_q_w_n": 639.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 586.51953125, "completions/mean_terminated_length": 586.51953125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.032, "grad_norm": 0.02579519897699356, "kl": 0.032093048095703125, "learning_rate": 4.722222222222222e-06, "loss": 0.0179, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.028735563158988953, "mask/share_reasoning": 0.8600101470947266, "mask/share_step_conf": 0.1112542599439621, "num_tokens": 7137224.0, "reward": 1.061484932899475, "reward_std": 0.2636525630950928, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6148117184638977, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.800751268863678, "step": 30 }, { "adv/mean_abs_final_conf": 0.732903003692627, "adv/mean_abs_reasoning": 0.3279497027397156, "adv/mean_abs_step_conf": 0.758866548538208, "adv/ratio_final_to_reasoning": 2.23480307367228, "adv/ratio_step_to_reasoning": 2.313972362830586, "adv/std_final_conf": 0.9095056056976318, "adv/std_reasoning": 0.6187405586242676, "adv/std_step_conf": 0.9357991814613342, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5910506094329624, "calib/avg_num_step_conf": 6.1484375, "calib/ece": 0.4873684210526316, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8663967611336032, "calib/gap": 0.03189785373608889, "calib/mean_conf": 0.9367611336032389, "calib/mu_c": 0.9543243243243243, "calib/mu_w": 0.9224264705882355, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4873684210526316, "calib/std_conf": 0.08962379552452024, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5029333333333333, "calib/step_q_c_n": 600.0, "calib/step_q_gap": 0.07074647501711151, "calib/step_q_w": 0.43218685831622183, "calib/step_q_w_n": 974.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2540.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 593.453125, "completions/mean_terminated_length": 598.1259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 195.0, "epoch": 0.03306666666666667, "grad_norm": 0.03246685862541199, "kl": 0.03501129150390625, "learning_rate": 4.694444444444445e-06, "loss": -0.0714, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02911219745874405, "mask/share_reasoning": 0.8506971597671509, "mask/share_step_conf": 0.11237817257642746, "num_tokens": 7395060.0, "reward": 0.979390025138855, "reward_std": 0.20983630418777466, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.5042523145675659, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.783747673034668, "step": 31 }, { "adv/mean_abs_final_conf": 0.7484625577926636, "adv/mean_abs_reasoning": 0.41868317127227783, "adv/mean_abs_step_conf": 0.7483275532722473, "adv/ratio_final_to_reasoning": 1.787658566543922, "adv/ratio_step_to_reasoning": 1.787336116229031, "adv/std_final_conf": 0.9268016815185547, "adv/std_reasoning": 0.6816485524177551, "adv/std_step_conf": 0.9360527992248535, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6109027336300064, "calib/avg_num_step_conf": 5.62890625, "calib/ece": 0.39816733067729093, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7569721115537849, "calib/gap": 0.04753083280355985, "calib/mean_conf": 0.911394422310757, "calib/mu_c": 0.9343076923076922, "calib/mu_w": 0.8867768595041323, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3958167330677292, "calib/std_conf": 0.1273857890494772, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4802011494252874, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.05791927023065652, "calib/step_q_w": 0.4222818791946309, "calib/step_q_w_n": 745.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 534.71484375, "completions/mean_terminated_length": 541.0553588867188, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.034133333333333335, "grad_norm": 0.028719009831547737, "kl": 0.038970947265625, "learning_rate": 4.666666666666667e-06, "loss": -0.0573, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030235659331083298, "mask/share_reasoning": 0.8475057482719421, "mask/share_step_conf": 0.11053981631994247, "num_tokens": 7638651.0, "reward": 1.043382167816162, "reward_std": 0.2221207618713379, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5912359356880188, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7985814809799194, "step": 32 }, { "adv/mean_abs_final_conf": 0.7398884892463684, "adv/mean_abs_reasoning": 0.4711639881134033, "adv/mean_abs_step_conf": 0.7429441809654236, "adv/ratio_final_to_reasoning": 1.5703417661629233, "adv/ratio_step_to_reasoning": 1.576827176330391, "adv/std_final_conf": 0.9138897061347961, "adv/std_reasoning": 0.7392632365226746, "adv/std_step_conf": 0.9357470273971558, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5652297220646625, "calib/avg_num_step_conf": 5.671875, "calib/ece": 0.4093253968253968, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8293650793650794, "calib/gap": 0.039725846095670425, "calib/mean_conf": 0.9212301587301587, "calib/mu_c": 0.9406201550387597, "calib/mu_w": 0.9008943089430893, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4093253968253968, "calib/std_conf": 0.10921288826629788, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5037347767253044, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.017423416276496573, "calib/step_q_w": 0.48631136044880785, "calib/step_q_w_n": 713.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 552.71484375, "completions/mean_terminated_length": 552.71484375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.0352, "grad_norm": 0.026177920401096344, "kl": 0.03797149658203125, "learning_rate": 4.638888888888889e-06, "loss": 0.0651, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030195150524377823, "mask/share_reasoning": 0.8575115203857422, "mask/share_step_conf": 0.11229334771633148, "num_tokens": 7887018.0, "reward": 1.0466405153274536, "reward_std": 0.21856100857257843, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5812917947769165, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.809555172920227, "step": 33 }, { "adv/mean_abs_final_conf": 0.7491120100021362, "adv/mean_abs_reasoning": 0.5132975578308105, "adv/mean_abs_step_conf": 0.7715127468109131, "adv/ratio_final_to_reasoning": 1.459410820436931, "adv/ratio_step_to_reasoning": 1.5030516608559699, "adv/std_final_conf": 0.928077220916748, "adv/std_reasoning": 0.7575961947441101, "adv/std_step_conf": 0.9354490637779236, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.49954850361197123, "calib/avg_num_step_conf": 5.90625, "calib/ece": 0.32480314960629925, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7047244094488189, "calib/gap": 0.0014060887512901132, "calib/mean_conf": 0.8845669291338583, "calib/mu_c": 0.8851315789473684, "calib/mu_w": 0.8837254901960783, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3054724409448819, "calib/std_conf": 0.16156106524765057, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4707071960297767, "calib/step_q_c_n": 806.0, "calib/step_q_gap": 0.04542390991079648, "calib/step_q_w": 0.4252832861189802, "calib/step_q_w_n": 706.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2501.0, "completions/max_terminated_length": 2501.0, "completions/mean_length": 483.0546875, "completions/mean_terminated_length": 484.94903564453125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.03626666666666667, "grad_norm": 0.0237668976187706, "kl": 0.049846649169921875, "learning_rate": 4.611111111111112e-06, "loss": -0.0303, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03377118334174156, "mask/share_reasoning": 0.8307417631149292, "mask/share_step_conf": 0.13158085942268372, "num_tokens": 8115792.0, "reward": 1.1046805381774902, "reward_std": 0.21319980919361115, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6436253786087036, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.832886278629303, "step": 34 }, { "adv/mean_abs_final_conf": 0.768444299697876, "adv/mean_abs_reasoning": 0.6530827283859253, "adv/mean_abs_step_conf": 0.7854447364807129, "adv/ratio_final_to_reasoning": 1.1766415896452558, "adv/ratio_step_to_reasoning": 1.2026726513223163, "adv/std_final_conf": 0.9365459084510803, "adv/std_reasoning": 0.8746518492698669, "adv/std_step_conf": 0.9357449412345886, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5909457364341085, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.35590551181102364, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.562992125984252, "calib/gap": 0.08058790697674423, "calib/mean_conf": 0.8480314960629921, "calib/mu_c": 0.8889600000000001, "calib/mu_w": 0.8083720930232559, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.35590551181102364, "calib/std_conf": 0.18002641700323216, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4165258215962442, "calib/step_q_c_n": 639.0, "calib/step_q_gap": -0.02817463923324892, "calib/step_q_w": 0.4447004608294931, "calib/step_q_w_n": 651.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2127.0, "completions/max_terminated_length": 2127.0, "completions/mean_length": 550.47265625, "completions/mean_terminated_length": 550.47265625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.037333333333333336, "grad_norm": 0.031982872635126114, "kl": 0.043140411376953125, "learning_rate": 4.583333333333333e-06, "loss": -0.0319, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.029373832046985626, "mask/share_reasoning": 0.8712924718856812, "mask/share_step_conf": 0.09933367371559143, "num_tokens": 8365969.0, "reward": 1.0584797859191895, "reward_std": 0.2510361969470978, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6224359273910522, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7994740009307861, "step": 35 }, { "adv/mean_abs_final_conf": 0.7417552471160889, "adv/mean_abs_reasoning": 0.40401333570480347, "adv/mean_abs_step_conf": 0.7431538105010986, "adv/ratio_final_to_reasoning": 1.8359672356411028, "adv/ratio_step_to_reasoning": 1.8394289119309954, "adv/std_final_conf": 0.9318743348121643, "adv/std_reasoning": 0.7204494476318359, "adv/std_step_conf": 0.9359287619590759, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6210376794258373, "calib/avg_num_step_conf": 5.359375, "calib/ece": 0.18619047619047624, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5515873015873016, "calib/gap": 0.058914473684210655, "calib/mean_conf": 0.8628571428571429, "calib/mu_c": 0.8806250000000001, "calib/mu_w": 0.8217105263157894, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17531746031746037, "calib/std_conf": 0.1513581747120194, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4776652452025586, "calib/step_q_c_n": 938.0, "calib/step_q_gap": 0.03552238805970148, "calib/step_q_w": 0.4421428571428571, "calib/step_q_w_n": 434.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 500.69140625, "completions/mean_terminated_length": 502.6549377441406, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.0384, "grad_norm": 0.04614810645580292, "kl": 0.048313140869140625, "learning_rate": 4.555555555555556e-06, "loss": -0.0188, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03415819630026817, "mask/share_reasoning": 0.8374646902084351, "mask/share_step_conf": 0.12447094917297363, "num_tokens": 8596858.0, "reward": 1.1519205570220947, "reward_std": 0.19022798538208008, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7522937059402466, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8114482760429382, "step": 36 }, { "adv/mean_abs_final_conf": 0.7809498310089111, "adv/mean_abs_reasoning": 0.45968902111053467, "adv/mean_abs_step_conf": 0.7330397367477417, "adv/ratio_final_to_reasoning": 1.6988655267908337, "adv/ratio_step_to_reasoning": 1.5946426890440752, "adv/std_final_conf": 0.9347269535064697, "adv/std_reasoning": 0.7394886016845703, "adv/std_step_conf": 0.9361408352851868, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.618536280233528, "calib/avg_num_step_conf": 5.35546875, "calib/ece": 0.3527800829875518, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.4107883817427386, "calib/gap": 0.08926675006950224, "calib/mean_conf": 0.792116182572614, "calib/mu_c": 0.8410091743119265, "calib/mu_w": 0.7517424242424242, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.34630705394190864, "calib/std_conf": 0.222647412558595, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.46208413001912046, "calib/step_q_c_n": 523.0, "calib/step_q_gap": 0.0653860168115733, "calib/step_q_w": 0.39669811320754716, "calib/step_q_w_n": 848.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 592.18359375, "completions/mean_terminated_length": 594.5059204101562, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.039466666666666664, "grad_norm": 0.03609446808695793, "kl": 0.041744232177734375, "learning_rate": 4.527777777777778e-06, "loss": 0.0311, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03040693700313568, "mask/share_reasoning": 0.854725182056427, "mask/share_step_conf": 0.11096163839101791, "num_tokens": 8855553.0, "reward": 1.0080500841140747, "reward_std": 0.2737266421318054, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.5899812579154968, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.7694959044456482, "step": 37 }, { "adv/mean_abs_final_conf": 0.7715527415275574, "adv/mean_abs_reasoning": 0.3854137659072876, "adv/mean_abs_step_conf": 0.7639725804328918, "adv/ratio_final_to_reasoning": 2.0018816393630234, "adv/ratio_step_to_reasoning": 1.9822140463365485, "adv/std_final_conf": 0.9348305463790894, "adv/std_reasoning": 0.7012956142425537, "adv/std_step_conf": 0.9358208179473877, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5929146537842189, "calib/avg_num_step_conf": 5.22265625, "calib/ece": 0.27399999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.432, "calib/gap": 0.07270853462157834, "calib/mean_conf": 0.80048, "calib/mu_c": 0.833925925925926, "calib/mu_w": 0.7612173913043476, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.26724, "calib/std_conf": 0.20289447897860602, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4582369942196532, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.024215908948335363, "calib/step_q_w": 0.43402108527131783, "calib/step_q_w_n": 645.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 525.8125, "completions/mean_terminated_length": 527.87451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.04053333333333333, "grad_norm": 0.028932897374033928, "kl": 0.04598236083984375, "learning_rate": 4.5e-06, "loss": -0.01, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.031843964010477066, "mask/share_reasoning": 0.8506335020065308, "mask/share_step_conf": 0.1136162132024765, "num_tokens": 9097049.0, "reward": 1.0886796712875366, "reward_std": 0.19496215879917145, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6627984642982483, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8091864585876465, "step": 38 }, { "adv/mean_abs_final_conf": 0.7793285846710205, "adv/mean_abs_reasoning": 0.46733757853507996, "adv/mean_abs_step_conf": 0.7500290870666504, "adv/ratio_final_to_reasoning": 1.6675923796111367, "adv/ratio_step_to_reasoning": 1.604897875787557, "adv/std_final_conf": 0.9338850378990173, "adv/std_reasoning": 0.7205843329429626, "adv/std_step_conf": 0.9357541799545288, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6334584115071921, "calib/avg_num_step_conf": 5.57421875, "calib/ece": 0.29735177865612655, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4268774703557312, "calib/gap": 0.10142464040025001, "calib/mean_conf": 0.7642292490118577, "calib/mu_c": 0.8135384615384614, "calib/mu_w": 0.7121138211382114, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2738735177865613, "calib/std_conf": 0.2433444077939215, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4634579439252336, "calib/step_q_c_n": 642.0, "calib/step_q_gap": 0.0573815108042145, "calib/step_q_w": 0.4060764331210191, "calib/step_q_w_n": 785.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 513.671875, "completions/mean_terminated_length": 513.671875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.0416, "grad_norm": 0.027603862807154655, "kl": 0.04595947265625, "learning_rate": 4.472222222222223e-06, "loss": 0.0346, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03283419460058212, "mask/share_reasoning": 0.8497879505157471, "mask/share_step_conf": 0.1173778772354126, "num_tokens": 9334637.0, "reward": 1.10282564163208, "reward_std": 0.19559051096439362, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6709941625595093, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8231047987937927, "step": 39 }, { "adv/mean_abs_final_conf": 0.7777461409568787, "adv/mean_abs_reasoning": 0.5213450193405151, "adv/mean_abs_step_conf": 0.7524310350418091, "adv/ratio_final_to_reasoning": 1.491806984059621, "adv/ratio_step_to_reasoning": 1.4432496851962073, "adv/std_final_conf": 0.9362614154815674, "adv/std_reasoning": 0.7753744125366211, "adv/std_step_conf": 0.9359205961227417, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6162286217303824, "calib/avg_num_step_conf": 4.71484375, "calib/ece": 0.31303149606299213, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.24803149606299213, "calib/gap": 0.09046403420523152, "calib/mean_conf": 0.7386220472440945, "calib/mu_c": 0.7891964285714287, "calib/mu_w": 0.6987323943661972, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30535433070866147, "calib/std_conf": 0.22126175146989444, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46174757281553397, "calib/step_q_c_n": 515.0, "calib/step_q_gap": 0.038727341601661136, "calib/step_q_w": 0.42302023121387283, "calib/step_q_w_n": 692.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 501.30859375, "completions/mean_terminated_length": 503.2745361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.042666666666666665, "grad_norm": 0.037154071033000946, "kl": 0.05255126953125, "learning_rate": 4.444444444444444e-06, "loss": -0.0292, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.033608511090278625, "mask/share_reasoning": 0.8529207706451416, "mask/share_step_conf": 0.10956442356109619, "num_tokens": 9569732.0, "reward": 1.0853300094604492, "reward_std": 0.20308294892311096, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6553597450256348, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8195751905441284, "step": 40 }, { "adv/mean_abs_final_conf": 0.7587120532989502, "adv/mean_abs_reasoning": 0.5101221799850464, "adv/mean_abs_step_conf": 0.7564547061920166, "adv/ratio_final_to_reasoning": 1.4873143789223024, "adv/ratio_step_to_reasoning": 1.4828892682419557, "adv/std_final_conf": 0.9357188940048218, "adv/std_reasoning": 0.7575936317443848, "adv/std_step_conf": 0.9356542229652405, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6559618441971382, "calib/avg_num_step_conf": 4.875, "calib/ece": 0.12010988142292489, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.30039525691699603, "calib/gap": 0.14302249602543704, "calib/mean_conf": 0.7359375494071146, "calib/mu_c": 0.7743783783783783, "calib/mu_w": 0.6313558823529413, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06241106719367588, "calib/std_conf": 0.2340747275470123, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.464314381270903, "calib/step_q_c_n": 897.0, "calib/step_q_gap": 0.040867087823609594, "calib/step_q_w": 0.4234472934472934, "calib/step_q_w_n": 351.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 479.34375, "completions/mean_terminated_length": 479.34375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.04373333333333333, "grad_norm": 0.04717167466878891, "kl": 0.10785293579101562, "learning_rate": 4.416666666666667e-06, "loss": -0.0067, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03561156243085861, "mask/share_reasoning": 0.8464666604995728, "mask/share_step_conf": 0.11792174726724625, "num_tokens": 9799692.0, "reward": 1.181377649307251, "reward_std": 0.18686681985855103, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7954376935958862, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8167534470558167, "step": 41 }, { "adv/mean_abs_final_conf": 0.7862992286682129, "adv/mean_abs_reasoning": 0.42737993597984314, "adv/mean_abs_step_conf": 0.7851310968399048, "adv/ratio_final_to_reasoning": 1.8398131556304453, "adv/ratio_step_to_reasoning": 1.8370799158829358, "adv/std_final_conf": 0.9355712532997131, "adv/std_reasoning": 0.6816651225090027, "adv/std_step_conf": 0.9355069994926453, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.597852564102564, "calib/avg_num_step_conf": 5.4296875, "calib/ece": 0.22527559055118113, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.31496062992125984, "calib/gap": 0.07996923076923079, "calib/mean_conf": 0.7464566929133858, "calib/mu_c": 0.7792, "calib/mu_w": 0.6992307692307692, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19059055118110238, "calib/std_conf": 0.2410493057644755, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.46577608142493643, "calib/step_q_c_n": 786.0, "calib/step_q_gap": 0.02478270394149268, "calib/step_q_w": 0.44099337748344375, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2698.0, "completions/max_terminated_length": 2698.0, "completions/mean_length": 441.1875, "completions/mean_terminated_length": 442.91766357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.0448, "grad_norm": 0.030178029090166092, "kl": 0.058757781982421875, "learning_rate": 4.388888888888889e-06, "loss": 0.0395, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03646181896328926, "mask/share_reasoning": 0.8297966718673706, "mask/share_step_conf": 0.12983526289463043, "num_tokens": 10017004.0, "reward": 1.1211426258087158, "reward_std": 0.19580897688865662, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7046023607254028, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8157469630241394, "step": 42 }, { "adv/mean_abs_final_conf": 0.7810938954353333, "adv/mean_abs_reasoning": 0.5661770105361938, "adv/mean_abs_step_conf": 0.7464275360107422, "adv/ratio_final_to_reasoning": 1.3795930970344485, "adv/ratio_step_to_reasoning": 1.318364260858708, "adv/std_final_conf": 0.9360543489456177, "adv/std_reasoning": 0.7928243279457092, "adv/std_step_conf": 0.9359555244445801, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.626399749765405, "calib/avg_num_step_conf": 4.9453125, "calib/ece": 0.18007874015748032, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.24803149606299213, "calib/gap": 0.11477760400375348, "calib/mean_conf": 0.6747244094488188, "calib/mu_c": 0.7266906474820143, "calib/mu_w": 0.6119130434782608, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15377952755905513, "calib/std_conf": 0.26266150418767414, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.44504098360655736, "calib/step_q_c_n": 732.0, "calib/step_q_gap": -0.009753023884079393, "calib/step_q_w": 0.45479400749063675, "calib/step_q_w_n": 534.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2058.0, "completions/max_terminated_length": 2058.0, "completions/mean_length": 506.80078125, "completions/mean_terminated_length": 506.80078125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.04586666666666667, "grad_norm": 0.032175462692976, "kl": 0.052890777587890625, "learning_rate": 4.361111111111112e-06, "loss": 0.0282, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033847928047180176, "mask/share_reasoning": 0.8536717891693115, "mask/share_step_conf": 0.11248025298118591, "num_tokens": 10251969.0, "reward": 1.1071585416793823, "reward_std": 0.2217944860458374, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.711056649684906, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.798527717590332, "step": 43 }, { "adv/mean_abs_final_conf": 0.7781606912612915, "adv/mean_abs_reasoning": 0.44944122433662415, "adv/mean_abs_step_conf": 0.7437357306480408, "adv/ratio_final_to_reasoning": 1.7313958958924112, "adv/ratio_step_to_reasoning": 1.6548008735642703, "adv/std_final_conf": 0.9344425201416016, "adv/std_reasoning": 0.7206430435180664, "adv/std_step_conf": 0.9358530044555664, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6591481854838711, "calib/avg_num_step_conf": 5.36328125, "calib/ece": 0.23027777777777772, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.23412698412698413, "calib/gap": 0.11502016129032266, "calib/mean_conf": 0.6952777777777778, "calib/mu_c": 0.7518750000000001, "calib/mu_w": 0.6368548387096774, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20880952380952378, "calib/std_conf": 0.23684668334375034, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47359237536656895, "calib/step_q_c_n": 682.0, "calib/step_q_gap": 0.05480800488900023, "calib/step_q_w": 0.4187843704775687, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 530.0, "completions/mean_terminated_length": 532.0784912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.046933333333333334, "grad_norm": 0.031568072736263275, "kl": 0.045703887939453125, "learning_rate": 4.333333333333334e-06, "loss": -0.0645, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03036835603415966, "mask/share_reasoning": 0.8552473783493042, "mask/share_step_conf": 0.11047796905040741, "num_tokens": 10493969.0, "reward": 1.1189405918121338, "reward_std": 0.19654977321624756, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7051723003387451, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8238892555236816, "step": 44 }, { "adv/mean_abs_final_conf": 0.7513673305511475, "adv/mean_abs_reasoning": 0.46464505791664124, "adv/mean_abs_step_conf": 0.7417216300964355, "adv/ratio_final_to_reasoning": 1.6170780636731643, "adv/ratio_step_to_reasoning": 1.5963187759321928, "adv/std_final_conf": 0.9353601336479187, "adv/std_reasoning": 0.75745689868927, "adv/std_step_conf": 0.9355478882789612, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.670344387755102, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.17829365079365092, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2857142857142857, "calib/gap": 0.15907142857142853, "calib/mean_conf": 0.7176587301587302, "calib/mu_c": 0.7883571428571429, "calib/mu_w": 0.6292857142857143, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17019841269841282, "calib/std_conf": 0.2480296853296732, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4704521963824289, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.04230933923957181, "calib/step_q_w": 0.4281428571428571, "calib/step_q_w_n": 672.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2734.0, "completions/max_terminated_length": 2734.0, "completions/mean_length": 506.3046875, "completions/mean_terminated_length": 508.29022216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.048, "grad_norm": 0.02938169613480568, "kl": 0.047039031982421875, "learning_rate": 4.305555555555556e-06, "loss": 0.0387, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03402433916926384, "mask/share_reasoning": 0.8358112573623657, "mask/share_step_conf": 0.12625813484191895, "num_tokens": 10728631.0, "reward": 1.1400502920150757, "reward_std": 0.18121424317359924, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7322214841842651, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8277525901794434, "step": 45 }, { "adv/mean_abs_final_conf": 0.7710835337638855, "adv/mean_abs_reasoning": 0.4401756227016449, "adv/mean_abs_step_conf": 0.7695859670639038, "adv/ratio_final_to_reasoning": 1.7517633735172404, "adv/ratio_step_to_reasoning": 1.748361170799175, "adv/std_final_conf": 0.9323577284812927, "adv/std_reasoning": 0.7014464735984802, "adv/std_step_conf": 0.9356615543365479, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6382832080200502, "calib/avg_num_step_conf": 5.22265625, "calib/ece": 0.2381818181818182, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.31620553359683795, "calib/gap": 0.12006453634085212, "calib/mean_conf": 0.7455335968379447, "calib/mu_c": 0.8024812030075188, "calib/mu_w": 0.6824166666666667, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2290118577075099, "calib/std_conf": 0.23834784561973427, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4649563953488372, "calib/step_q_c_n": 688.0, "calib/step_q_gap": 0.039651310603074474, "calib/step_q_w": 0.42530508474576273, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2371.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 530.8359375, "completions/mean_terminated_length": 532.9176635742188, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.04906666666666667, "grad_norm": 0.028869692236185074, "kl": 0.04286956787109375, "learning_rate": 4.277777777777778e-06, "loss": -0.04, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034582871943712234, "mask/share_reasoning": 0.8415893316268921, "mask/share_step_conf": 0.1199214980006218, "num_tokens": 10969293.0, "reward": 1.102731466293335, "reward_std": 0.1884532868862152, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.691124677658081, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8095587491989136, "step": 46 }, { "adv/mean_abs_final_conf": 0.7545320391654968, "adv/mean_abs_reasoning": 0.36301976442337036, "adv/mean_abs_step_conf": 0.7577338218688965, "adv/ratio_final_to_reasoning": 2.0784874905200117, "adv/ratio_step_to_reasoning": 2.0873073483271627, "adv/std_final_conf": 0.9296429753303528, "adv/std_reasoning": 0.6611769199371338, "adv/std_step_conf": 0.9353532791137695, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6598508230452675, "calib/avg_num_step_conf": 5.78515625, "calib/ece": 0.21817460317460327, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.39285714285714285, "calib/gap": 0.14166666666666683, "calib/mean_conf": 0.7792857142857142, "calib/mu_c": 0.84, "calib/mu_w": 0.6983333333333331, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2130158730158731, "calib/std_conf": 0.22678630796072777, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4585783718104495, "calib/step_q_c_n": 823.0, "calib/step_q_gap": 0.03772730798066237, "calib/step_q_w": 0.42085106382978715, "calib/step_q_w_n": 658.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2823.0, "completions/max_terminated_length": 2823.0, "completions/mean_length": 547.4375, "completions/mean_terminated_length": 549.5843505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.050133333333333335, "grad_norm": 0.027648447081446648, "kl": 0.041889190673828125, "learning_rate": 4.25e-06, "loss": -0.0455, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03150080889463425, "mask/share_reasoning": 0.8493713140487671, "mask/share_step_conf": 0.11522158980369568, "num_tokens": 11215413.0, "reward": 1.137692928314209, "reward_std": 0.17622481286525726, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7184492349624634, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8317077159881592, "step": 47 }, { "adv/mean_abs_final_conf": 0.767864465713501, "adv/mean_abs_reasoning": 0.5330666303634644, "adv/mean_abs_step_conf": 0.7646841406822205, "adv/ratio_final_to_reasoning": 1.440466204365377, "adv/ratio_step_to_reasoning": 1.43450011147918, "adv/std_final_conf": 0.9259854555130005, "adv/std_reasoning": 0.7753552198410034, "adv/std_step_conf": 0.9356560111045837, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5637729756582215, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.36204724409448824, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4330708661417323, "calib/gap": 0.023260059612518647, "calib/mean_conf": 0.7809448818897639, "calib/mu_c": 0.7930327868852459, "calib/mu_w": 0.7697727272727273, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3313385826771654, "calib/std_conf": 0.2423098591228505, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47289121061359873, "calib/step_q_c_n": 603.0, "calib/step_q_gap": 0.026414658015373127, "calib/step_q_w": 0.4464765525982256, "calib/step_q_w_n": 789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 466.66015625, "completions/mean_terminated_length": 468.490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.0512, "grad_norm": 0.03668780252337456, "kl": 0.04692840576171875, "learning_rate": 4.222222222222223e-06, "loss": 0.0596, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03637230396270752, "mask/share_reasoning": 0.8307284116744995, "mask/share_step_conf": 0.12899301946163177, "num_tokens": 11438566.0, "reward": 1.063523292541504, "reward_std": 0.19171887636184692, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6065890789031982, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8183259963989258, "step": 48 }, { "adv/mean_abs_final_conf": 0.7678335309028625, "adv/mean_abs_reasoning": 0.4053008556365967, "adv/mean_abs_step_conf": 0.7596707344055176, "adv/ratio_final_to_reasoning": 1.8944779420631723, "adv/ratio_step_to_reasoning": 1.874337850119562, "adv/std_final_conf": 0.9254300594329834, "adv/std_reasoning": 0.6613235473632812, "adv/std_step_conf": 0.935420572757721, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6970525082959204, "calib/avg_num_step_conf": 5.22265625, "calib/ece": 0.2572933333333333, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.492, "calib/gap": 0.142554709697009, "calib/mean_conf": 0.8173733333333332, "calib/mu_c": 0.8795271867612292, "calib/mu_w": 0.7369724770642202, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2553333333333333, "calib/std_conf": 0.1957939692636114, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.500190336749634, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.08254507681079609, "calib/step_q_w": 0.4176452599388379, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2547.0, "completions/max_terminated_length": 2547.0, "completions/mean_length": 487.578125, "completions/mean_terminated_length": 489.490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.05226666666666667, "grad_norm": 0.03198077902197838, "kl": 0.044513702392578125, "learning_rate": 4.194444444444445e-06, "loss": -0.0769, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0343097522854805, "mask/share_reasoning": 0.8420140743255615, "mask/share_step_conf": 0.11976996064186096, "num_tokens": 11667922.0, "reward": 1.1165080070495605, "reward_std": 0.1887185424566269, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7010080218315125, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8182135224342346, "step": 49 }, { "adv/mean_abs_final_conf": 0.7431448101997375, "adv/mean_abs_reasoning": 0.4307158887386322, "adv/mean_abs_step_conf": 0.7490586042404175, "adv/ratio_final_to_reasoning": 1.7253712473344442, "adv/ratio_step_to_reasoning": 1.7391013979866496, "adv/std_final_conf": 0.9296773672103882, "adv/std_reasoning": 0.7013258934020996, "adv/std_step_conf": 0.9353389143943787, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6683548387096774, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.2447054901960784, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5058823529411764, "calib/gap": 0.09710064516129058, "calib/mean_conf": 0.8279219607843138, "calib/mu_c": 0.8660006451612905, "calib/mu_w": 0.7688999999999999, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23239215686274506, "calib/std_conf": 0.19343981033801264, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4770713391739675, "calib/step_q_c_n": 799.0, "calib/step_q_gap": 0.034895959747323235, "calib/step_q_w": 0.44217537942664425, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1451.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 483.6875, "completions/mean_terminated_length": 485.5843505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.05333333333333334, "grad_norm": 0.03353298455476761, "kl": 0.040256500244140625, "learning_rate": 4.166666666666667e-06, "loss": -0.0523, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034353211522102356, "mask/share_reasoning": 0.836700975894928, "mask/share_step_conf": 0.12503957748413086, "num_tokens": 11897106.0, "reward": 1.1516687870025635, "reward_std": 0.17538967728614807, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7192476987838745, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8425183296203613, "step": 50 }, { "adv/mean_abs_final_conf": 0.7491422891616821, "adv/mean_abs_reasoning": 0.4412249028682709, "adv/mean_abs_step_conf": 0.7408877611160278, "adv/ratio_final_to_reasoning": 1.6978694636040093, "adv/ratio_step_to_reasoning": 1.6791612538180383, "adv/std_final_conf": 0.9151452779769897, "adv/std_reasoning": 0.7205055952072144, "adv/std_step_conf": 0.93574458360672, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7797020123839009, "calib/avg_num_step_conf": 4.98046875, "calib/ece": 0.1741732283464567, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.43700787401574803, "calib/gap": 0.2205959752321981, "calib/mean_conf": 0.7725984251968504, "calib/mu_c": 0.8611842105263157, "calib/mu_w": 0.6405882352941176, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1741732283464567, "calib/std_conf": 0.23705447262105386, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4693412384716732, "calib/step_q_c_n": 759.0, "calib/step_q_gap": 0.04350790513833991, "calib/step_q_w": 0.4258333333333333, "calib/step_q_w_n": 516.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 506.9375, "completions/mean_terminated_length": 506.9375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.0544, "grad_norm": 0.032079264521598816, "kl": 0.040374755859375, "learning_rate": 4.138888888888889e-06, "loss": 0.0071, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03356383740901947, "mask/share_reasoning": 0.8532578945159912, "mask/share_step_conf": 0.11317827552556992, "num_tokens": 12136178.0, "reward": 1.1587438583374023, "reward_std": 0.19736525416374207, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7653058767318726, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8243710994720459, "step": 51 }, { "adv/mean_abs_final_conf": 0.7362066507339478, "adv/mean_abs_reasoning": 0.455300509929657, "adv/mean_abs_step_conf": 0.7593135833740234, "adv/ratio_final_to_reasoning": 1.6169686496676452, "adv/ratio_step_to_reasoning": 1.6677195979669248, "adv/std_final_conf": 0.9249099493026733, "adv/std_reasoning": 0.7391347885131836, "adv/std_step_conf": 0.9352485537528992, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.779368100494861, "calib/avg_num_step_conf": 5.06640625, "calib/ece": 0.06097656249999997, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.44921875, "calib/gap": 0.28866920441568344, "calib/mean_conf": 0.7441015625, "calib/mu_c": 0.8241621621621622, "calib/mu_w": 0.5354929577464788, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.041210937499999975, "calib/std_conf": 0.2688836394428984, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.48005662514156283, "calib/step_q_c_n": 883.0, "calib/step_q_gap": 0.10493585219470292, "calib/step_q_w": 0.3751207729468599, "calib/step_q_w_n": 414.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 460.40625, "completions/mean_terminated_length": 462.2117919921875, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.055466666666666664, "grad_norm": 0.06597699224948883, "kl": 0.044063568115234375, "learning_rate": 4.111111111111111e-06, "loss": -0.021, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03548233211040497, "mask/share_reasoning": 0.8447404503822327, "mask/share_step_conf": 0.11587096750736237, "num_tokens": 12361994.0, "reward": 1.2292792797088623, "reward_std": 0.13924862444400787, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.8377586007118225, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8518874645233154, "step": 52 }, { "adv/mean_abs_final_conf": 0.7574901580810547, "adv/mean_abs_reasoning": 0.44077375531196594, "adv/mean_abs_step_conf": 0.7509580850601196, "adv/ratio_final_to_reasoning": 1.718546417413003, "adv/ratio_step_to_reasoning": 1.703726857622489, "adv/std_final_conf": 0.9158998131752014, "adv/std_reasoning": 0.7014303803443909, "adv/std_step_conf": 0.9353622198104858, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6886863136863137, "calib/avg_num_step_conf": 5.62890625, "calib/ece": 0.240156862745098, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5254901960784314, "calib/gap": 0.1623033216783215, "calib/mean_conf": 0.8003921568627452, "calib/mu_c": 0.8716783216783215, "calib/mu_w": 0.709375, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.23988235294117646, "calib/std_conf": 0.22842822517397507, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4806020942408376, "calib/step_q_c_n": 764.0, "calib/step_q_gap": 0.07710135568840037, "calib/step_q_w": 0.40350073855243723, "calib/step_q_w_n": 677.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2379.0, "completions/max_terminated_length": 2379.0, "completions/mean_length": 504.078125, "completions/mean_terminated_length": 506.054931640625, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.05653333333333333, "grad_norm": 0.031964704394340515, "kl": 0.038970947265625, "learning_rate": 4.083333333333334e-06, "loss": -0.1266, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03250448405742645, "mask/share_reasoning": 0.8465020060539246, "mask/share_step_conf": 0.11708725988864899, "num_tokens": 12596862.0, "reward": 1.1403965950012207, "reward_std": 0.17160692811012268, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7205559015274048, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8333872556686401, "step": 53 }, { "adv/mean_abs_final_conf": 0.6885330677032471, "adv/mean_abs_reasoning": 0.3714979887008667, "adv/mean_abs_step_conf": 0.7581756114959717, "adv/ratio_final_to_reasoning": 1.8533964883929954, "adv/ratio_step_to_reasoning": 2.0408606090905677, "adv/std_final_conf": 0.897620677947998, "adv/std_reasoning": 0.6611714959144592, "adv/std_step_conf": 0.935348391532898, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.752979414951246, "calib/avg_num_step_conf": 5.24609375, "calib/ece": 0.1518181818181819, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6719367588932806, "calib/gap": 0.20716375174121626, "calib/mean_conf": 0.848181818181818, "calib/mu_c": 0.9063186813186812, "calib/mu_w": 0.699154929577465, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14031620553359694, "calib/std_conf": 0.22338364203121278, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5111764705882353, "calib/step_q_c_n": 884.0, "calib/step_q_gap": 0.13037037037037036, "calib/step_q_w": 0.380806100217865, "calib/step_q_w_n": 459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 455.734375, "completions/mean_terminated_length": 457.5216064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.0576, "grad_norm": 0.0508962981402874, "kl": 0.044342041015625, "learning_rate": 4.055555555555556e-06, "loss": 0.0584, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036514509469270706, "mask/share_reasoning": 0.8346399068832397, "mask/share_step_conf": 0.12493934482336044, "num_tokens": 12819762.0, "reward": 1.2022638320922852, "reward_std": 0.15340590476989746, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.8057184219360352, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.839310348033905, "step": 54 }, { "adv/mean_abs_final_conf": 0.7237907648086548, "adv/mean_abs_reasoning": 0.5115185379981995, "adv/mean_abs_step_conf": 0.776884913444519, "adv/ratio_final_to_reasoning": 1.414984425864938, "adv/ratio_step_to_reasoning": 1.5187815411046817, "adv/std_final_conf": 0.888651967048645, "adv/std_reasoning": 0.775328516960144, "adv/std_step_conf": 0.9353832006454468, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8313030984507745, "calib/avg_num_step_conf": 4.91015625, "calib/ece": 0.23161417322834646, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.610236220472441, "calib/gap": 0.2941929035482257, "calib/mean_conf": 0.7703543307086614, "calib/mu_c": 0.9047101449275362, "calib/mu_w": 0.6105172413793105, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22933070866141733, "calib/std_conf": 0.3026605652489857, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5009267563527653, "calib/step_q_c_n": 669.0, "calib/step_q_gap": 0.06662403526432986, "calib/step_q_w": 0.43430272108843543, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1981.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 474.203125, "completions/mean_terminated_length": 476.0627746582031, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.058666666666666666, "grad_norm": 0.05889483913779259, "kl": 0.050426483154296875, "learning_rate": 4.027777777777779e-06, "loss": -0.0184, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03481750190258026, "mask/share_reasoning": 0.8439182043075562, "mask/share_step_conf": 0.11735805869102478, "num_tokens": 13048982.0, "reward": 1.160832405090332, "reward_std": 0.20851004123687744, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7488183379173279, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8443976640701294, "step": 55 }, { "adv/mean_abs_final_conf": 0.7262025475502014, "adv/mean_abs_reasoning": 0.4655173420906067, "adv/mean_abs_step_conf": 0.7637791633605957, "adv/ratio_final_to_reasoning": 1.5599903202077827, "adv/ratio_step_to_reasoning": 1.64071044041134, "adv/std_final_conf": 0.893416702747345, "adv/std_reasoning": 0.73926842212677, "adv/std_step_conf": 0.9353504180908203, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.697511811023622, "calib/avg_num_step_conf": 5.6328125, "calib/ece": 0.35551587301587306, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6428571428571429, "calib/gap": 0.14661228346456678, "calib/mean_conf": 0.8290079365079365, "calib/mu_c": 0.9017322834645669, "calib/mu_w": 0.7551200000000001, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3402777777777778, "calib/std_conf": 0.2505791069221469, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4982884097035041, "calib/step_q_c_n": 742.0, "calib/step_q_gap": 0.03434555256064692, "calib/step_q_w": 0.46394285714285716, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 489.61328125, "completions/mean_terminated_length": 489.61328125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.05973333333333333, "grad_norm": 0.032840728759765625, "kl": 0.04604339599609375, "learning_rate": 4.000000000000001e-06, "loss": 0.0056, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03335815668106079, "mask/share_reasoning": 0.8387473821640015, "mask/share_step_conf": 0.12789444625377655, "num_tokens": 13281163.0, "reward": 1.087411642074585, "reward_std": 0.19331833720207214, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6446441411972046, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8227236270904541, "step": 56 }, { "adv/mean_abs_final_conf": 0.6837377548217773, "adv/mean_abs_reasoning": 0.43261250853538513, "adv/mean_abs_step_conf": 0.7604891657829285, "adv/ratio_final_to_reasoning": 1.5804854028298436, "adv/ratio_step_to_reasoning": 1.7578991609779702, "adv/std_final_conf": 0.8755529522895813, "adv/std_reasoning": 0.7012878656387329, "adv/std_step_conf": 0.9352318644523621, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7409663865546218, "calib/avg_num_step_conf": 5.40625, "calib/ece": 0.18948818897637798, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7047244094488189, "calib/gap": 0.27577871148459376, "calib/mean_conf": 0.8283858267716536, "calib/mu_c": 0.9195882352941176, "calib/mu_w": 0.6438095238095238, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1742913385826772, "calib/std_conf": 0.2828360184672855, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5188636363636364, "calib/step_q_c_n": 924.0, "calib/step_q_gap": 0.0707114624505929, "calib/step_q_w": 0.4481521739130435, "calib/step_q_w_n": 460.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 486.140625, "completions/mean_terminated_length": 489.968505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.0608, "grad_norm": 0.03787647560238838, "kl": 0.0426483154296875, "learning_rate": 3.972222222222223e-06, "loss": -0.0413, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03327067941427231, "mask/share_reasoning": 0.8391324877738953, "mask/share_step_conf": 0.11978430300951004, "num_tokens": 13512407.0, "reward": 1.1986336708068848, "reward_std": 0.1828002631664276, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7892199754714966, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8511983156204224, "step": 57 }, { "adv/mean_abs_final_conf": 0.7383530139923096, "adv/mean_abs_reasoning": 0.5903604030609131, "adv/mean_abs_step_conf": 0.7504562139511108, "adv/ratio_final_to_reasoning": 1.2506818041387622, "adv/ratio_step_to_reasoning": 1.2711831790549122, "adv/std_final_conf": 0.9139520525932312, "adv/std_reasoning": 0.8266242742538452, "adv/std_step_conf": 0.9352567195892334, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.588135026737968, "calib/avg_num_step_conf": 7.5, "calib/ece": 0.29032520325203254, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6138211382113821, "calib/gap": 0.10848128342245988, "calib/mean_conf": 0.775609756097561, "calib/mu_c": 0.8241176470588235, "calib/mu_w": 0.7156363636363636, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.25654471544715446, "calib/std_conf": 0.30846761657250876, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4922186495176849, "calib/step_q_c_n": 933.0, "calib/step_q_gap": 0.11768977413774573, "calib/step_q_w": 0.3745288753799392, "calib/step_q_w_n": 987.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2666.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 593.53515625, "completions/mean_terminated_length": 602.9563598632812, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.06186666666666667, "grad_norm": 0.030713409185409546, "kl": 0.037036895751953125, "learning_rate": 3.944444444444445e-06, "loss": -0.0777, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02800177037715912, "mask/share_reasoning": 0.8364847302436829, "mask/share_step_conf": 0.11988846957683563, "num_tokens": 13770672.0, "reward": 1.0557935237884521, "reward_std": 0.2658042907714844, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6286625266075134, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.791220486164093, "step": 58 }, { "adv/mean_abs_final_conf": 0.6902940273284912, "adv/mean_abs_reasoning": 0.4767693281173706, "adv/mean_abs_step_conf": 0.7880936861038208, "adv/ratio_final_to_reasoning": 1.4478574577233612, "adv/ratio_step_to_reasoning": 1.6529873874558656, "adv/std_final_conf": 0.8746950626373291, "adv/std_reasoning": 0.7206186056137085, "adv/std_step_conf": 0.9348664879798889, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6117868059093224, "calib/avg_num_step_conf": 5.41796875, "calib/ece": 0.3316078431372549, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7411764705882353, "calib/gap": 0.08438104941416191, "calib/mean_conf": 0.8422745098039216, "calib/mu_c": 0.8766887417218543, "calib/mu_w": 0.7923076923076924, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2908627450980392, "calib/std_conf": 0.2866211599834351, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5637988165680473, "calib/step_q_c_n": 845.0, "calib/step_q_gap": 0.04055158409572257, "calib/step_q_w": 0.5232472324723247, "calib/step_q_w_n": 542.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 518.16015625, "completions/mean_terminated_length": 520.1921997070312, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.06293333333333333, "grad_norm": 0.036398790776729584, "kl": 0.04123687744140625, "learning_rate": 3.916666666666667e-06, "loss": 0.0292, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03273506462574005, "mask/share_reasoning": 0.848429799079895, "mask/share_step_conf": 0.11492891609668732, "num_tokens": 14009569.0, "reward": 1.1045868396759033, "reward_std": 0.20788058638572693, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6519827842712402, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8266689777374268, "step": 59 }, { "adv/mean_abs_final_conf": 0.6342928409576416, "adv/mean_abs_reasoning": 0.42958423495292664, "adv/mean_abs_step_conf": 0.7749303579330444, "adv/ratio_final_to_reasoning": 1.476527277652884, "adv/ratio_step_to_reasoning": 1.803907813372063, "adv/std_final_conf": 0.8228415846824646, "adv/std_reasoning": 0.7013217806816101, "adv/std_step_conf": 0.9355958104133606, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7317820156605204, "calib/avg_num_step_conf": 5.2109375, "calib/ece": 0.24665921568627455, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7098039215686275, "calib/gap": 0.30096544581965146, "calib/mean_conf": 0.795536862745098, "calib/mu_c": 0.9218243243243244, "calib/mu_w": 0.6208588785046729, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23090196078431377, "calib/std_conf": 0.33709905516639943, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5604714673913044, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.07986946070234119, "calib/step_q_w": 0.4806020066889632, "calib/step_q_w_n": 598.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 490.54296875, "completions/mean_terminated_length": 490.54296875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.064, "grad_norm": 0.035494614392519, "kl": 0.043849945068359375, "learning_rate": 3.88888888888889e-06, "loss": 0.0345, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03389373794198036, "mask/share_reasoning": 0.8445810675621033, "mask/share_step_conf": 0.12152522802352905, "num_tokens": 14244004.0, "reward": 1.1639959812164307, "reward_std": 0.20684689283370972, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7402294874191284, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.848612368106842, "step": 60 }, { "adv/mean_abs_final_conf": 0.593455970287323, "adv/mean_abs_reasoning": 0.38843584060668945, "adv/mean_abs_step_conf": 0.7466306686401367, "adv/ratio_final_to_reasoning": 1.5278095073832967, "adv/ratio_step_to_reasoning": 1.9221466986001874, "adv/std_final_conf": 0.8274534344673157, "adv/std_reasoning": 0.6612657308578491, "adv/std_step_conf": 0.9348787069320679, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6280560839850631, "calib/avg_num_step_conf": 5.46875, "calib/ece": 0.2711417322834646, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8582677165354331, "calib/gap": 0.12214683294581852, "calib/mean_conf": 0.9068110236220471, "calib/mu_c": 0.9467251461988305, "calib/mu_w": 0.824578313253012, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2523622047244095, "calib/std_conf": 0.240056726638651, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5864446952595938, "calib/step_q_c_n": 886.0, "calib/step_q_gap": 0.09193107658255101, "calib/step_q_w": 0.49451361867704274, "calib/step_q_w_n": 514.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2640.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 459.90234375, "completions/mean_terminated_length": 459.90234375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.06506666666666666, "grad_norm": 0.03402427211403847, "kl": 0.05214691162109375, "learning_rate": 3.861111111111112e-06, "loss": 0.047, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03737509623169899, "mask/share_reasoning": 0.8354184031486511, "mask/share_step_conf": 0.1272064745426178, "num_tokens": 14465803.0, "reward": 1.1424577236175537, "reward_std": 0.19859057664871216, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.715925395488739, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8246393203735352, "step": 61 }, { "adv/mean_abs_final_conf": 0.6989138126373291, "adv/mean_abs_reasoning": 0.5794708132743835, "adv/mean_abs_step_conf": 0.7784146666526794, "adv/ratio_final_to_reasoning": 1.206124271709244, "adv/ratio_step_to_reasoning": 1.3433198856973225, "adv/std_final_conf": 0.8719269037246704, "adv/std_reasoning": 0.7929136157035828, "adv/std_step_conf": 0.9353814125061035, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5977225939269172, "calib/avg_num_step_conf": 5.80078125, "calib/ece": 0.3476800000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7, "calib/gap": 0.12513381369016985, "calib/mean_conf": 0.80552, "calib/mu_c": 0.8635820895522388, "calib/mu_w": 0.738448275862069, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3086000000000001, "calib/std_conf": 0.3257335254467983, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.556652719665272, "calib/step_q_c_n": 717.0, "calib/step_q_gap": 0.06268136549860531, "calib/step_q_w": 0.49397135416666665, "calib/step_q_w_n": 768.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2586.0, "completions/max_terminated_length": 2586.0, "completions/mean_length": 528.84765625, "completions/mean_terminated_length": 530.9215698242188, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.06613333333333334, "grad_norm": 0.04951612651348114, "kl": 0.04522705078125, "learning_rate": 3.833333333333334e-06, "loss": 0.0221, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032482948154211044, "mask/share_reasoning": 0.8465949296951294, "mask/share_step_conf": 0.11701588332653046, "num_tokens": 14708268.0, "reward": 1.0481505393981934, "reward_std": 0.2715667188167572, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6199171543121338, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7842558026313782, "step": 62 }, { "adv/mean_abs_final_conf": 0.6785953044891357, "adv/mean_abs_reasoning": 0.4569165110588074, "adv/mean_abs_step_conf": 0.7319062352180481, "adv/ratio_final_to_reasoning": 1.485162580176047, "adv/ratio_step_to_reasoning": 1.6018380108917714, "adv/std_final_conf": 0.8627391457557678, "adv/std_reasoning": 0.7206259369850159, "adv/std_step_conf": 0.9355800747871399, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6900068681318682, "calib/avg_num_step_conf": 5.28515625, "calib/ece": 0.228207171314741, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6454183266932271, "calib/gap": 0.28906662087912094, "calib/mean_conf": 0.733386454183267, "calib/mu_c": 0.8381875000000001, "calib/mu_w": 0.5491208791208791, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16207171314741034, "calib/std_conf": 0.38128959993644995, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5746572104018913, "calib/step_q_c_n": 846.0, "calib/step_q_gap": 0.046964902709583756, "calib/step_q_w": 0.5276923076923076, "calib/step_q_w_n": 507.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2154.0, "completions/max_terminated_length": 2154.0, "completions/mean_length": 534.83984375, "completions/mean_terminated_length": 536.9373168945312, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.0672, "grad_norm": 0.03342531993985176, "kl": 0.044696807861328125, "learning_rate": 3.8055555555555556e-06, "loss": -0.0119, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03186768665909767, "mask/share_reasoning": 0.8521930575370789, "mask/share_step_conf": 0.11203300207853317, "num_tokens": 14953827.0, "reward": 1.1439241170883179, "reward_std": 0.2590183615684509, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7329293489456177, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8230709433555603, "step": 63 }, { "adv/mean_abs_final_conf": 0.6654384732246399, "adv/mean_abs_reasoning": 0.48860666155815125, "adv/mean_abs_step_conf": 0.7677325010299683, "adv/ratio_final_to_reasoning": 1.3619103577150946, "adv/ratio_step_to_reasoning": 1.5712690010850312, "adv/std_final_conf": 0.8467351198196411, "adv/std_reasoning": 0.739334762096405, "adv/std_step_conf": 0.9357771873474121, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6519786910197869, "calib/avg_num_step_conf": 5.35546875, "calib/ece": 0.23972332015810277, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.616600790513834, "calib/gap": 0.1977564687975647, "calib/mean_conf": 0.7331620553359685, "calib/mu_c": 0.7902222222222223, "calib/mu_w": 0.5924657534246576, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1307114624505929, "calib/std_conf": 0.36325314486712246, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5402518711018711, "calib/step_q_c_n": 962.0, "calib/step_q_gap": 0.01497069750773905, "calib/step_q_w": 0.5252811735941321, "calib/step_q_w_n": 409.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 476.40625, "completions/mean_terminated_length": 482.05535888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.06826666666666667, "grad_norm": 0.07387295365333557, "kl": 0.04605865478515625, "learning_rate": 3.777777777777778e-06, "loss": -0.0693, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03335306793451309, "mask/share_reasoning": 0.8404361009597778, "mask/share_step_conf": 0.11449208855628967, "num_tokens": 15179563.0, "reward": 1.1393107175827026, "reward_std": 0.21538883447647095, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.734772264957428, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8037118911743164, "step": 64 }, { "adv/mean_abs_final_conf": 0.5958322286605835, "adv/mean_abs_reasoning": 0.33800065517425537, "adv/mean_abs_step_conf": 0.7735382318496704, "adv/ratio_final_to_reasoning": 1.7628138275454044, "adv/ratio_step_to_reasoning": 2.2885702143117883, "adv/std_final_conf": 0.8070009350776672, "adv/std_reasoning": 0.6401605010032654, "adv/std_step_conf": 0.9356332421302795, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6378621378621377, "calib/avg_num_step_conf": 5.01171875, "calib/ece": 0.36969934640522883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8823529411764706, "calib/gap": 0.12563457375957354, "calib/mean_conf": 0.9171503267973856, "calib/mu_c": 0.9723310023310022, "calib/mu_w": 0.8466964285714287, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3630326797385622, "calib/std_conf": 0.23368243085687596, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6310841950399327, "calib/step_q_c_n": 793.0, "calib/step_q_gap": 0.029611746060340893, "calib/step_q_w": 0.6014724489795918, "calib/step_q_w_n": 490.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 411.94921875, "completions/mean_terminated_length": 413.5647277832031, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.06933333333333333, "grad_norm": 0.06507488340139389, "kl": 0.05391693115234375, "learning_rate": 3.7500000000000005e-06, "loss": -0.0078, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03861987590789795, "mask/share_reasoning": 0.8285122513771057, "mask/share_step_conf": 0.12896165251731873, "num_tokens": 15390046.0, "reward": 1.0620718002319336, "reward_std": 0.21548622846603394, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6281029582023621, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7900686264038086, "step": 65 }, { "adv/mean_abs_final_conf": 0.6308272480964661, "adv/mean_abs_reasoning": 0.4362999498844147, "adv/mean_abs_step_conf": 0.7803380489349365, "adv/ratio_final_to_reasoning": 1.4458567970580467, "adv/ratio_step_to_reasoning": 1.7885357290131823, "adv/std_final_conf": 0.8291295766830444, "adv/std_reasoning": 0.7013883590698242, "adv/std_step_conf": 0.9355806708335876, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7408925318761385, "calib/avg_num_step_conf": 6.2578125, "calib/ece": 0.27556451612903216, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6532258064516129, "calib/gap": 0.30061670569867294, "calib/mean_conf": 0.7483064516129032, "calib/mu_c": 0.8961904761904762, "calib/mu_w": 0.5955737704918033, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2579032258064515, "calib/std_conf": 0.3661376917407103, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5993078880407124, "calib/step_q_c_n": 655.0, "calib/step_q_gap": 0.11990978877988878, "calib/step_q_w": 0.47939809926082366, "calib/step_q_w_n": 947.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 573.8828125, "completions/mean_terminated_length": 578.4015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.0704, "grad_norm": 0.04619403928518295, "kl": 0.041259765625, "learning_rate": 3.7222222222222225e-06, "loss": -0.1041, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.031027261167764664, "mask/share_reasoning": 0.8438991904258728, "mask/share_step_conf": 0.11726106703281403, "num_tokens": 15643312.0, "reward": 1.090234637260437, "reward_std": 0.222636416554451, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6864187717437744, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8012419939041138, "step": 66 }, { "adv/mean_abs_final_conf": 0.5185285210609436, "adv/mean_abs_reasoning": 0.3017883002758026, "adv/mean_abs_step_conf": 0.7628713846206665, "adv/ratio_final_to_reasoning": 1.7181862934615535, "adv/ratio_step_to_reasoning": 2.5278361815997594, "adv/std_final_conf": 0.7563133239746094, "adv/std_reasoning": 0.5960524678230286, "adv/std_step_conf": 0.9346969127655029, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7570650323459313, "calib/avg_num_step_conf": 5.3984375, "calib/ece": 0.22109448818897628, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7047244094488189, "calib/gap": 0.2940209737827715, "calib/mean_conf": 0.8211102362204725, "calib/mu_c": 0.9241333333333334, "calib/mu_w": 0.6301123595505619, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1962992125984251, "calib/std_conf": 0.3068948556344743, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6197363885624755, "calib/step_q_c_n": 851.0, "calib/step_q_gap": 0.07519778215946238, "calib/step_q_w": 0.5445386064030131, "calib/step_q_w_n": 531.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 509.9296875, "completions/mean_terminated_length": 509.9296875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.07146666666666666, "grad_norm": 0.03376888856291771, "kl": 0.0507659912109375, "learning_rate": 3.694444444444445e-06, "loss": 0.1016, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033446695655584335, "mask/share_reasoning": 0.8498560190200806, "mask/share_step_conf": 0.1166972666978836, "num_tokens": 15878862.0, "reward": 1.1770553588867188, "reward_std": 0.146753191947937, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7765185832977295, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8334989547729492, "step": 67 }, { "adv/mean_abs_final_conf": 0.5941345691680908, "adv/mean_abs_reasoning": 0.3606022596359253, "adv/mean_abs_step_conf": 0.7671284079551697, "adv/ratio_final_to_reasoning": 1.6476174324807245, "adv/ratio_step_to_reasoning": 2.127353302582422, "adv/std_final_conf": 0.8287256360054016, "adv/std_reasoning": 0.6611523628234863, "adv/std_step_conf": 0.9351922273635864, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7173514538558786, "calib/avg_num_step_conf": 5.4609375, "calib/ece": 0.30529644268774697, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.758893280632411, "calib/gap": 0.2553065739570166, "calib/mean_conf": 0.8456126482213439, "calib/mu_c": 0.9596428571428574, "calib/mu_w": 0.7043362831858407, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.29877470355731217, "calib/std_conf": 0.3038841445004291, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6220592592592592, "calib/step_q_c_n": 675.0, "calib/step_q_gap": 0.14553062855386495, "calib/step_q_w": 0.47652863070539425, "calib/step_q_w_n": 723.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 489.4921875, "completions/mean_terminated_length": 489.4921875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.07253333333333334, "grad_norm": 0.9474548101425171, "kl": 1.2303543090820312, "learning_rate": 3.6666666666666666e-06, "loss": 0.0606, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036423999816179276, "mask/share_reasoning": 0.8375931978225708, "mask/share_step_conf": 0.12598282098770142, "num_tokens": 16108260.0, "reward": 1.1203961372375488, "reward_std": 0.1978849619626999, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6930711269378662, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8276473879814148, "step": 68 }, { "adv/mean_abs_final_conf": 0.6566118001937866, "adv/mean_abs_reasoning": 0.47426342964172363, "adv/mean_abs_step_conf": 0.765344500541687, "adv/ratio_final_to_reasoning": 1.384487521396739, "adv/ratio_step_to_reasoning": 1.6137539871456186, "adv/std_final_conf": 0.8505991697311401, "adv/std_reasoning": 0.7206820249557495, "adv/std_step_conf": 0.9354822039604187, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.70145585176782, "calib/avg_num_step_conf": 5.05078125, "calib/ece": 0.24475555555555553, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5158730158730159, "calib/gap": 0.2759776139156741, "calib/mean_conf": 0.6880222222222222, "calib/mu_c": 0.8227255813953489, "calib/mu_w": 0.5467479674796748, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21043650793650792, "calib/std_conf": 0.3722595145497822, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6086429725363489, "calib/step_q_c_n": 619.0, "calib/step_q_gap": 0.07013481229896013, "calib/step_q_w": 0.5385081602373888, "calib/step_q_w_n": 674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2458.0, "completions/max_terminated_length": 2458.0, "completions/mean_length": 547.37890625, "completions/mean_terminated_length": 553.8695678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.0736, "grad_norm": 0.056202232837677, "kl": 0.037876129150390625, "learning_rate": 3.638888888888889e-06, "loss": -0.0615, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030722718685865402, "mask/share_reasoning": 0.8574061393737793, "mask/share_step_conf": 0.1001524031162262, "num_tokens": 16352885.0, "reward": 1.1116247177124023, "reward_std": 0.22466173768043518, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.707231879234314, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8122408390045166, "step": 69 }, { "adv/mean_abs_final_conf": 0.6317081451416016, "adv/mean_abs_reasoning": 0.43293261528015137, "adv/mean_abs_step_conf": 0.7767082452774048, "adv/ratio_final_to_reasoning": 1.4591373411144417, "adv/ratio_step_to_reasoning": 1.7940626736444785, "adv/std_final_conf": 0.8324445486068726, "adv/std_reasoning": 0.7013914585113525, "adv/std_step_conf": 0.9354316592216492, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7892195506503745, "calib/avg_num_step_conf": 5.28515625, "calib/ece": 0.17151147098515523, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5465587044534413, "calib/gap": 0.42268690053869395, "calib/mean_conf": 0.659851551956815, "calib/mu_c": 0.861782945736434, "calib/mu_w": 0.43909604519774004, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1545479082321188, "calib/std_conf": 0.39630890942845737, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5988285570638512, "calib/step_q_c_n": 663.0, "calib/step_q_gap": 0.11718604498655644, "calib/step_q_w": 0.48164251207729475, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2998.0, "completions/max_terminated_length": 2998.0, "completions/mean_length": 557.1015625, "completions/mean_terminated_length": 557.1015625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.07466666666666667, "grad_norm": 0.05277327448129654, "kl": 0.04144287109375, "learning_rate": 3.6111111111111115e-06, "loss": 0.068, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.034348566085100174, "mask/share_reasoning": 0.8481545448303223, "mask/share_step_conf": 0.11749683320522308, "num_tokens": 16602495.0, "reward": 1.129831314086914, "reward_std": 0.22639545798301697, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7542862892150879, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8082718849182129, "step": 70 }, { "adv/mean_abs_final_conf": 0.6897363066673279, "adv/mean_abs_reasoning": 0.5863041877746582, "adv/mean_abs_step_conf": 0.7683776021003723, "adv/ratio_final_to_reasoning": 1.176413747418811, "adv/ratio_step_to_reasoning": 1.3105442842166646, "adv/std_final_conf": 0.873703122138977, "adv/std_reasoning": 0.7929514050483704, "adv/std_step_conf": 0.9352450966835022, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6145792563600783, "calib/avg_num_step_conf": 5.94921875, "calib/ece": 0.2747410358565736, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4701195219123506, "calib/gap": 0.18351272015655573, "calib/mean_conf": 0.6196015936254979, "calib/mu_c": 0.6963698630136986, "calib/mu_w": 0.5128571428571429, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15633466135458163, "calib/std_conf": 0.39579845246986506, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5789646464646465, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.08642930674736432, "calib/step_q_w": 0.4925353397172822, "calib/step_q_w_n": 731.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2698.0, "completions/max_terminated_length": 2698.0, "completions/mean_length": 523.3671875, "completions/mean_terminated_length": 527.4881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.07573333333333333, "grad_norm": 0.04673464596271515, "kl": 0.042041778564453125, "learning_rate": 3.5833333333333335e-06, "loss": -0.0022, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03401986509561539, "mask/share_reasoning": 0.8340585231781006, "mask/share_step_conf": 0.12410911917686462, "num_tokens": 16840885.0, "reward": 1.0984848737716675, "reward_std": 0.22213947772979736, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6739937663078308, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8090673685073853, "step": 71 }, { "adv/mean_abs_final_conf": 0.6500765681266785, "adv/mean_abs_reasoning": 0.39389580488204956, "adv/mean_abs_step_conf": 0.7558625936508179, "adv/ratio_final_to_reasoning": 1.650376978047129, "adv/ratio_step_to_reasoning": 1.9189404514657316, "adv/std_final_conf": 0.8562666773796082, "adv/std_reasoning": 0.6815637350082397, "adv/std_step_conf": 0.9354879856109619, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.825126809352963, "calib/avg_num_step_conf": 5.28125, "calib/ece": 0.14211764705882357, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4392156862745098, "calib/gap": 0.4700569095632809, "calib/mean_conf": 0.604235294117647, "calib/mu_c": 0.8217518248175182, "calib/mu_w": 0.3516949152542373, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10454901960784319, "calib/std_conf": 0.39829256112124584, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5813125845737483, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.07480361230621158, "calib/step_q_w": 0.5065089722675368, "calib/step_q_w_n": 613.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 460.70703125, "completions/mean_terminated_length": 462.5137634277344, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.0768, "grad_norm": 0.052089888602495193, "kl": 0.0478057861328125, "learning_rate": 3.555555555555556e-06, "loss": -0.0143, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034116968512535095, "mask/share_reasoning": 0.8442560434341431, "mask/share_step_conf": 0.11772073805332184, "num_tokens": 17063234.0, "reward": 1.1859991550445557, "reward_std": 0.17564892768859863, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.814927339553833, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8344014286994934, "step": 72 }, { "adv/mean_abs_final_conf": 0.7459884881973267, "adv/mean_abs_reasoning": 0.5748550295829773, "adv/mean_abs_step_conf": 0.7456186413764954, "adv/ratio_final_to_reasoning": 1.2976984627558994, "adv/ratio_step_to_reasoning": 1.2970550886845276, "adv/std_final_conf": 0.913640558719635, "adv/std_reasoning": 0.7929010391235352, "adv/std_step_conf": 0.9357115030288696, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7170414758955022, "calib/avg_num_step_conf": 4.7421875, "calib/ece": 0.20872843915343914, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3888888888888889, "calib/gap": 0.3000431412155489, "calib/mean_conf": 0.5766683862433862, "calib/mu_c": 0.6885892405063291, "calib/mu_w": 0.38854609929078016, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.07920634920634921, "calib/std_conf": 0.3867900005611315, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6130971128608923, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.11475640489629052, "calib/step_q_w": 0.4983407079646018, "calib/step_q_w_n": 452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2013.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 467.76953125, "completions/mean_terminated_length": 469.60394287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.07786666666666667, "grad_norm": 0.0889710932970047, "kl": 0.0463409423828125, "learning_rate": 3.5277777777777784e-06, "loss": -0.0218, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034170910716056824, "mask/share_reasoning": 0.8531113266944885, "mask/share_step_conf": 0.10881149023771286, "num_tokens": 17290015.0, "reward": 1.1512796878814697, "reward_std": 0.221408873796463, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7386392951011658, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8295924663543701, "step": 73 }, { "adv/mean_abs_final_conf": 0.7070341110229492, "adv/mean_abs_reasoning": 0.44265496730804443, "adv/mean_abs_step_conf": 0.7692466974258423, "adv/ratio_final_to_reasoning": 1.59725782661538, "adv/ratio_step_to_reasoning": 1.7378020224282766, "adv/std_final_conf": 0.891591489315033, "adv/std_reasoning": 0.7205852270126343, "adv/std_step_conf": 0.9350951313972473, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7420948616600789, "calib/avg_num_step_conf": 4.890625, "calib/ece": 0.16842105263157894, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.24696356275303644, "calib/gap": 0.34510408432147566, "calib/mean_conf": 0.4353846153846154, "calib/mu_c": 0.5960606060606061, "calib/mu_w": 0.2509565217391304, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.034696356275303666, "calib/std_conf": 0.3724333946459744, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5536338259441708, "calib/step_q_c_n": 609.0, "calib/step_q_gap": 0.07437566109191568, "calib/step_q_w": 0.4792581648522551, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1949.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 486.6796875, "completions/mean_terminated_length": 486.6796875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.07893333333333333, "grad_norm": 3.0365121364593506, "kl": 11.238975524902344, "learning_rate": 3.5e-06, "loss": 0.2472, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.035113103687763214, "mask/share_reasoning": 0.8513389825820923, "mask/share_step_conf": 0.11354796588420868, "num_tokens": 17518533.0, "reward": 1.1350901126861572, "reward_std": 0.1803215742111206, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7471804618835449, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8179372549057007, "step": 74 }, { "adv/mean_abs_final_conf": 0.692913830280304, "adv/mean_abs_reasoning": 0.3745307922363281, "adv/mean_abs_step_conf": 0.7376118898391724, "adv/ratio_final_to_reasoning": 1.8500850788339902, "adv/ratio_step_to_reasoning": 1.9694292301972887, "adv/std_final_conf": 0.871324360370636, "adv/std_reasoning": 0.6613178849220276, "adv/std_step_conf": 0.9350234866142273, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.841669758812616, "calib/avg_num_step_conf": 5.140625, "calib/ece": 0.11362817460317463, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4880952380952381, "calib/gap": 0.49556233766233765, "calib/mean_conf": 0.6377210317460317, "calib/mu_c": 0.7891428571428571, "calib/mu_w": 0.2935805194805195, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.02845238095238099, "calib/std_conf": 0.3893535529573639, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5927868852459016, "calib/step_q_c_n": 915.0, "calib/step_q_gap": 0.11303243803060653, "calib/step_q_w": 0.4797544472152951, "calib/step_q_w_n": 401.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 435.890625, "completions/mean_terminated_length": 437.60003662109375, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.08, "grad_norm": 0.08381512761116028, "kl": 0.0576171875, "learning_rate": 3.4722222222222224e-06, "loss": -0.0208, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.036993805319070816, "mask/share_reasoning": 0.8341166973114014, "mask/share_step_conf": 0.1249832808971405, "num_tokens": 17734873.0, "reward": 1.2211979627609253, "reward_std": 0.16637729108333588, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.8301264047622681, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8524503707885742, "step": 75 }, { "adv/mean_abs_final_conf": 0.6946251392364502, "adv/mean_abs_reasoning": 0.46490082144737244, "adv/mean_abs_step_conf": 0.7597237825393677, "adv/ratio_final_to_reasoning": 1.4941361838722476, "adv/ratio_step_to_reasoning": 1.6341631322012402, "adv/std_final_conf": 0.9021607637405396, "adv/std_reasoning": 0.7573848962783813, "adv/std_step_conf": 0.9354786276817322, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.719203268641471, "calib/avg_num_step_conf": 4.7109375, "calib/ece": 0.23349081364829408, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.38976377952755903, "calib/gap": 0.3165533991601407, "calib/mean_conf": 0.5485564304461943, "calib/mu_c": 0.6594747474747474, "calib/mu_w": 0.34292134831460674, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.06622047244094494, "calib/std_conf": 0.3989292515947965, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5832443257676903, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.08530121854668371, "calib/step_q_w": 0.49794310722100654, "calib/step_q_w_n": 457.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2703.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 465.90625, "completions/mean_terminated_length": 465.90625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.08106666666666666, "grad_norm": 0.1563568115234375, "kl": 0.28119659423828125, "learning_rate": 3.444444444444445e-06, "loss": 0.0588, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03585202246904373, "mask/share_reasoning": 0.8539218306541443, "mask/share_step_conf": 0.11022613197565079, "num_tokens": 17957201.0, "reward": 1.1630396842956543, "reward_std": 0.18635006248950958, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7373994588851929, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8414114713668823, "step": 76 }, { "adv/mean_abs_final_conf": 0.7443857192993164, "adv/mean_abs_reasoning": 0.45837777853012085, "adv/mean_abs_step_conf": 0.7630868554115295, "adv/ratio_final_to_reasoning": 1.6239568193867004, "adv/ratio_step_to_reasoning": 1.66475534188965, "adv/std_final_conf": 0.9162412285804749, "adv/std_reasoning": 0.7015013694763184, "adv/std_step_conf": 0.9357102513313293, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6684374999999999, "calib/avg_num_step_conf": 4.80078125, "calib/ece": 0.24876, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.34, "calib/gap": 0.24712500000000004, "calib/mean_conf": 0.5291600000000001, "calib/mu_c": 0.618125, "calib/mu_w": 0.371, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.06896000000000001, "calib/std_conf": 0.3863020766188036, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.555174978127734, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.06198439996927574, "calib/step_q_w": 0.49319057815845824, "calib/step_q_w_n": 467.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2508.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 453.71875, "completions/mean_terminated_length": 453.71875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.08213333333333334, "grad_norm": 0.07061842828989029, "kl": 0.0658111572265625, "learning_rate": 3.416666666666667e-06, "loss": -0.0496, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03902812302112579, "mask/share_reasoning": 0.8365130424499512, "mask/share_step_conf": 0.12445880472660065, "num_tokens": 18178017.0, "reward": 1.1348567008972168, "reward_std": 0.20125140249729156, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7050395011901855, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8295742273330688, "step": 77 }, { "adv/mean_abs_final_conf": 0.70162034034729, "adv/mean_abs_reasoning": 0.47509390115737915, "adv/mean_abs_step_conf": 0.7606037855148315, "adv/ratio_final_to_reasoning": 1.4768035090285698, "adv/ratio_step_to_reasoning": 1.6009546400446732, "adv/std_final_conf": 0.8772461414337158, "adv/std_reasoning": 0.7392054200172424, "adv/std_step_conf": 0.9353312849998474, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7230111851200202, "calib/avg_num_step_conf": 5.41015625, "calib/ece": 0.2087843137254901, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.4549019607843137, "calib/gap": 0.3062146537639814, "calib/mean_conf": 0.6443137254901962, "calib/mu_c": 0.7752054794520548, "calib/mu_w": 0.4689908256880734, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14027450980392148, "calib/std_conf": 0.38189750653530774, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5682592592592592, "calib/step_q_c_n": 810.0, "calib/step_q_gap": 0.07429404186795496, "calib/step_q_w": 0.4939652173913043, "calib/step_q_w_n": 575.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 506.15234375, "completions/mean_terminated_length": 508.1372985839844, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.0832, "grad_norm": 0.05703939124941826, "kl": 0.07027435302734375, "learning_rate": 3.3888888888888893e-06, "loss": 0.0122, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03177928924560547, "mask/share_reasoning": 0.8523037433624268, "mask/share_step_conf": 0.11201069504022598, "num_tokens": 18415616.0, "reward": 1.1688693761825562, "reward_std": 0.17837125062942505, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7512054443359375, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8488346338272095, "step": 78 }, { "adv/mean_abs_final_conf": 0.6945815086364746, "adv/mean_abs_reasoning": 0.4822331368923187, "adv/mean_abs_step_conf": 0.7536656260490417, "adv/ratio_final_to_reasoning": 1.4403437995003912, "adv/ratio_step_to_reasoning": 1.562865693771959, "adv/std_final_conf": 0.8733789324760437, "adv/std_reasoning": 0.7392861247062683, "adv/std_step_conf": 0.9356251955032349, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7021109976166156, "calib/avg_num_step_conf": 5.32421875, "calib/ece": 0.20346456692913395, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5905511811023622, "calib/gap": 0.2385195778004766, "calib/mean_conf": 0.7600000000000001, "calib/mu_c": 0.8435757575757575, "calib/mu_w": 0.6050561797752809, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15692913385826784, "calib/std_conf": 0.33442158314856196, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.601807182320442, "calib/step_q_c_n": 905.0, "calib/step_q_gap": 0.058073557866293535, "calib/step_q_w": 0.5437336244541484, "calib/step_q_w_n": 458.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 497.23828125, "completions/mean_terminated_length": 499.1882629394531, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.08426666666666667, "grad_norm": 0.040150903165340424, "kl": 0.06087493896484375, "learning_rate": 3.3611111111111117e-06, "loss": -0.079, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.033050116151571274, "mask/share_reasoning": 0.8556927442550659, "mask/share_step_conf": 0.1073509156703949, "num_tokens": 18649285.0, "reward": 1.1646015644073486, "reward_std": 0.22102072834968567, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.74712073802948, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8370131254196167, "step": 79 }, { "adv/mean_abs_final_conf": 0.6892921924591064, "adv/mean_abs_reasoning": 0.5974498987197876, "adv/mean_abs_step_conf": 0.7859007120132446, "adv/ratio_final_to_reasoning": 1.1537238418419988, "adv/ratio_step_to_reasoning": 1.3154252995895863, "adv/std_final_conf": 0.8678206205368042, "adv/std_reasoning": 0.8266484141349792, "adv/std_step_conf": 0.9356747269630432, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6667343073593074, "calib/avg_num_step_conf": 5.86328125, "calib/ece": 0.2576, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.756, "calib/gap": 0.20345643939393931, "calib/mean_conf": 0.8596, "calib/mu_c": 0.9377272727272727, "calib/mu_w": 0.7342708333333334, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2506, "calib/std_conf": 0.2794377211473068, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5855282555282555, "calib/step_q_c_n": 814.0, "calib/step_q_gap": 0.06464033704208383, "calib/step_q_w": 0.5208879184861717, "calib/step_q_w_n": 687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 474.1015625, "completions/mean_terminated_length": 474.1015625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.08533333333333333, "grad_norm": 0.04135843738913536, "kl": 0.078033447265625, "learning_rate": 3.3333333333333333e-06, "loss": -0.0204, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03415698558092117, "mask/share_reasoning": 0.8395260572433472, "mask/share_step_conf": 0.12631690502166748, "num_tokens": 18872815.0, "reward": 1.1109731197357178, "reward_std": 0.27404117584228516, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7014476656913757, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8037697076797485, "step": 80 }, { "adv/mean_abs_final_conf": 0.6472058296203613, "adv/mean_abs_reasoning": 0.5512617826461792, "adv/mean_abs_step_conf": 0.7223784923553467, "adv/ratio_final_to_reasoning": 1.174044437678283, "adv/ratio_step_to_reasoning": 1.310409165111663, "adv/std_final_conf": 0.8557976484298706, "adv/std_reasoning": 0.7928785681724548, "adv/std_step_conf": 0.9358363151550293, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7318347953216374, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.20165289256198346, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.731404958677686, "calib/gap": 0.3349956140350877, "calib/mean_conf": 0.8207438016528926, "calib/mu_c": 0.945328947368421, "calib/mu_w": 0.6103333333333333, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.19714876033057852, "calib/std_conf": 0.31975627414997615, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6455555555555557, "calib/step_q_c_n": 612.0, "calib/step_q_gap": 0.21428990882398408, "calib/step_q_w": 0.4312656467315716, "calib/step_q_w_n": 719.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 502.4921875, "completions/mean_terminated_length": 506.4488220214844, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.0864, "grad_norm": 0.02329258807003498, "kl": 0.07903289794921875, "learning_rate": 3.3055555555555558e-06, "loss": 0.0445, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.034032806754112244, "mask/share_reasoning": 0.8530516624450684, "mask/share_step_conf": 0.1051030158996582, "num_tokens": 19107701.0, "reward": 1.1032999753952026, "reward_std": 0.30462607741355896, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7289878726005554, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.781428873538971, "step": 81 }, { "adv/mean_abs_final_conf": 0.6715503931045532, "adv/mean_abs_reasoning": 0.6163297891616821, "adv/mean_abs_step_conf": 0.7347931861877441, "adv/ratio_final_to_reasoning": 1.0895958704478343, "adv/ratio_step_to_reasoning": 1.192207806776942, "adv/std_final_conf": 0.8576227426528931, "adv/std_reasoning": 0.8266252875328064, "adv/std_step_conf": 0.9357723593711853, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7495399858457183, "calib/avg_num_step_conf": 4.40234375, "calib/ece": 0.2589473684210527, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7611336032388664, "calib/gap": 0.23805944798301482, "calib/mean_conf": 0.8519838056680162, "calib/mu_c": 0.9387261146496815, "calib/mu_w": 0.7006666666666667, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2376518218623483, "calib/std_conf": 0.2900099672670955, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6533693415637861, "calib/step_q_c_n": 648.0, "calib/step_q_gap": 0.09132341254499687, "calib/step_q_w": 0.5620459290187892, "calib/step_q_w_n": 479.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 428.57421875, "completions/mean_terminated_length": 430.2549133300781, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.08746666666666666, "grad_norm": 0.03629959747195244, "kl": 0.09059906005859375, "learning_rate": 3.277777777777778e-06, "loss": -0.0165, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03759324923157692, "mask/share_reasoning": 0.8498210310935974, "mask/share_step_conf": 0.10867946594953537, "num_tokens": 19322968.0, "reward": 1.1178183555603027, "reward_std": 0.3032802641391754, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7175562381744385, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8021576404571533, "step": 82 }, { "adv/mean_abs_final_conf": 0.6760067939758301, "adv/mean_abs_reasoning": 0.5050960183143616, "adv/mean_abs_step_conf": 0.7669603228569031, "adv/ratio_final_to_reasoning": 1.3383728429137944, "adv/ratio_step_to_reasoning": 1.5184446027043563, "adv/std_final_conf": 0.8897445797920227, "adv/std_reasoning": 0.757703423500061, "adv/std_step_conf": 0.9363173842430115, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7130994989262704, "calib/avg_num_step_conf": 4.45703125, "calib/ece": 0.3174964838255978, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.7426160337552743, "calib/gap": 0.24436912431400637, "calib/mean_conf": 0.8375246132208157, "calib/mu_c": 0.9509448818897638, "calib/mu_w": 0.7065757575757574, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.30957805907172997, "calib/std_conf": 0.30996841328257, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.6469359375, "calib/step_q_c_n": 512.0, "calib/step_q_gap": 0.13972131110890296, "calib/step_q_w": 0.507214626391097, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2764.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 517.15234375, "completions/mean_terminated_length": 521.2244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.08853333333333334, "grad_norm": 0.03925936296582222, "kl": 0.0845184326171875, "learning_rate": 3.2500000000000002e-06, "loss": -0.0406, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.032422006130218506, "mask/share_reasoning": 0.8642917275428772, "mask/share_step_conf": 0.0954737514257431, "num_tokens": 19562623.0, "reward": 1.004056692123413, "reward_std": 0.3257407248020172, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6180038452148438, "rewards/format_reward_step": 0.8984375, "rewards/step_l2_reward": 0.7408022284507751, "step": 83 }, { "adv/mean_abs_final_conf": 0.7383853793144226, "adv/mean_abs_reasoning": 0.5751377940177917, "adv/mean_abs_step_conf": 0.7391847968101501, "adv/ratio_final_to_reasoning": 1.2838408238766879, "adv/ratio_step_to_reasoning": 1.285230782081561, "adv/std_final_conf": 0.9037018418312073, "adv/std_reasoning": 0.8099603652954102, "adv/std_step_conf": 0.9362303614616394, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7144144144144143, "calib/avg_num_step_conf": 3.8203125, "calib/ece": 0.3360975609756096, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7845528455284553, "calib/gap": 0.16928528528528552, "calib/mean_conf": 0.8765040650406504, "calib/mu_c": 0.952888888888889, "calib/mu_w": 0.7836036036036035, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.33191056910569094, "calib/std_conf": 0.2616069283157941, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.616244966442953, "calib/step_q_c_n": 596.0, "calib/step_q_gap": 0.024098369584314328, "calib/step_q_w": 0.5921465968586387, "calib/step_q_w_n": 382.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 427.90234375, "completions/mean_terminated_length": 429.5804138183594, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0896, "grad_norm": 0.032014407217502594, "kl": 0.098663330078125, "learning_rate": 3.2222222222222227e-06, "loss": -0.0648, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03773626685142517, "mask/share_reasoning": 0.8612334728240967, "mask/share_step_conf": 0.09712400287389755, "num_tokens": 19778086.0, "reward": 1.0297787189483643, "reward_std": 0.3220594525337219, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6267675161361694, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.758318305015564, "step": 84 }, { "adv/mean_abs_final_conf": 0.7482867240905762, "adv/mean_abs_reasoning": 0.6304100751876831, "adv/mean_abs_step_conf": 0.7630487680435181, "adv/ratio_final_to_reasoning": 1.1869840815405739, "adv/ratio_step_to_reasoning": 1.2104006551867788, "adv/std_final_conf": 0.9141106009483337, "adv/std_reasoning": 0.8590542674064636, "adv/std_step_conf": 0.9360108971595764, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.6614173228346456, "calib/avg_num_step_conf": 4.203125, "calib/ece": 0.3278439716312057, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.7702127659574468, "calib/gap": 0.22586905803441248, "calib/mean_conf": 0.8366950354609929, "calib/mu_c": 0.9404986876640421, "calib/mu_w": 0.7146296296296296, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.31205673758865254, "calib/std_conf": 0.30288713054935484, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.6443396226415095, "calib/step_q_c_n": 477.0, "calib/step_q_gap": 0.13430623365987338, "calib/step_q_w": 0.5100333889816361, "calib/step_q_w_n": 599.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2379.0, "completions/max_terminated_length": 2379.0, "completions/mean_length": 524.30859375, "completions/mean_terminated_length": 526.36474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.09066666666666667, "grad_norm": 0.03913387656211853, "kl": 0.08313751220703125, "learning_rate": 3.1944444444444443e-06, "loss": -0.123, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.033390868455171585, "mask/share_reasoning": 0.8677021265029907, "mask/share_step_conf": 0.09500078111886978, "num_tokens": 20020133.0, "reward": 1.0035605430603027, "reward_std": 0.34797096252441406, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6201419830322266, "rewards/format_reward_step": 0.90234375, "rewards/step_l2_reward": 0.7381943464279175, "step": 85 }, { "adv/mean_abs_final_conf": 0.7321887612342834, "adv/mean_abs_reasoning": 0.5524415969848633, "adv/mean_abs_step_conf": 0.7457665801048279, "adv/ratio_final_to_reasoning": 1.3253686276168397, "adv/ratio_step_to_reasoning": 1.349946463436318, "adv/std_final_conf": 0.9024088978767395, "adv/std_reasoning": 0.8267043828964233, "adv/std_step_conf": 0.936254620552063, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.6933198380566802, "calib/avg_num_step_conf": 3.41796875, "calib/ece": 0.3585714285714286, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.7012987012987013, "calib/gap": 0.16702654071075118, "calib/mean_conf": 0.8194372294372294, "calib/mu_c": 0.9040350877192983, "calib/mu_w": 0.7370085470085471, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.90234375, "calib/pce": 0.34225108225108225, "calib/std_conf": 0.30601493073827, "calib/step_conf_rate": 0.90234375, "calib/step_q_c": 0.6428678304239402, "calib/step_q_c_n": 401.0, "calib/step_q_gap": 0.1263699401285816, "calib/step_q_w": 0.5164978902953586, "calib/step_q_w_n": 474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2842.0, "completions/max_terminated_length": 2842.0, "completions/mean_length": 487.5078125, "completions/mean_terminated_length": 489.4196472167969, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.09173333333333333, "grad_norm": 0.045741576701402664, "kl": 0.0944671630859375, "learning_rate": 3.1666666666666667e-06, "loss": -0.1206, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.03539574146270752, "mask/share_reasoning": 0.8742038607597351, "mask/share_step_conf": 0.08649415522813797, "num_tokens": 20250447.0, "reward": 0.9497416019439697, "reward_std": 0.3561251163482666, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.5509187579154968, "rewards/format_reward_step": 0.87890625, "rewards/step_l2_reward": 0.7219595909118652, "step": 86 }, { "adv/mean_abs_final_conf": 0.6731104850769043, "adv/mean_abs_reasoning": 0.5100500583648682, "adv/mean_abs_step_conf": 0.7500249147415161, "adv/ratio_final_to_reasoning": 1.3196949476582347, "adv/ratio_step_to_reasoning": 1.4704927534876981, "adv/std_final_conf": 0.8464348316192627, "adv/std_reasoning": 0.7577330470085144, "adv/std_step_conf": 0.9360089302062988, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6721701621300016, "calib/avg_num_step_conf": 3.375, "calib/ece": 0.2669795918367348, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.8, "calib/gap": 0.15429619713421605, "calib/mean_conf": 0.8810204081632654, "calib/mu_c": 0.933292181069959, "calib/mu_w": 0.778995983935743, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.2433877551020409, "calib/std_conf": 0.25129036973915836, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.6871928166351606, "calib/step_q_c_n": 529.0, "calib/step_q_gap": 0.08473510519237448, "calib/step_q_w": 0.6024577114427861, "calib/step_q_w_n": 335.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2417.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 423.375, "completions/mean_terminated_length": 425.0353088378906, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.0928, "grad_norm": 0.034470539540052414, "kl": 0.1080780029296875, "learning_rate": 3.138888888888889e-06, "loss": 0.0384, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.04032812640070915, "mask/share_reasoning": 0.8646848201751709, "mask/share_step_conf": 0.09108079969882965, "num_tokens": 20464327.0, "reward": 1.0932562351226807, "reward_std": 0.292274534702301, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6880762577056885, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7885407209396362, "step": 87 }, { "adv/mean_abs_final_conf": 0.6974610090255737, "adv/mean_abs_reasoning": 0.5662336349487305, "adv/mean_abs_step_conf": 0.7676079273223877, "adv/ratio_final_to_reasoning": 1.2317548198787682, "adv/ratio_step_to_reasoning": 1.3556381676123683, "adv/std_final_conf": 0.8608344793319702, "adv/std_reasoning": 0.8100785613059998, "adv/std_step_conf": 0.9360609650611877, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7919852941176472, "calib/avg_num_step_conf": 3.6875, "calib/ece": 0.25492937853107345, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.7161016949152542, "calib/gap": 0.3310803921568626, "calib/mean_conf": 0.8156920903954802, "calib/mu_c": 0.9559803921568627, "calib/mu_w": 0.6249000000000001, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.24717514124293788, "calib/std_conf": 0.3247186835694789, "calib/step_conf_rate": 0.9296875, "calib/step_q_c": 0.6507528957528959, "calib/step_q_c_n": 518.0, "calib/step_q_gap": 0.10406275490782546, "calib/step_q_w": 0.5466901408450704, "calib/step_q_w_n": 426.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 490.2421875, "completions/mean_terminated_length": 492.16473388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.09386666666666667, "grad_norm": 0.041649460792541504, "kl": 0.0970458984375, "learning_rate": 3.1111111111111116e-06, "loss": -0.0827, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.033657241612672806, "mask/share_reasoning": 0.8734369874000549, "mask/share_step_conf": 0.08899955451488495, "num_tokens": 20699677.0, "reward": 1.053786039352417, "reward_std": 0.3305705785751343, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6846892833709717, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": 0.7558801174163818, "step": 88 }, { "adv/mean_abs_final_conf": 0.7344450950622559, "adv/mean_abs_reasoning": 0.635488748550415, "adv/mean_abs_step_conf": 0.7786235213279724, "adv/ratio_final_to_reasoning": 1.155716913537125, "adv/ratio_step_to_reasoning": 1.2252357309300215, "adv/std_final_conf": 0.9001230597496033, "adv/std_reasoning": 0.8431754112243652, "adv/std_step_conf": 0.936394214630127, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.7706845238095239, "calib/avg_num_step_conf": 3.35546875, "calib/ece": 0.2964367816091954, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.5775862068965517, "calib/gap": 0.26714285714285724, "calib/mean_conf": 0.7518678160919541, "calib/mu_c": 0.8808333333333335, "calib/mu_w": 0.6136904761904762, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.2655316091954023, "calib/std_conf": 0.34185410860418225, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.6363118811881188, "calib/step_q_c_n": 404.0, "calib/step_q_gap": 0.1137140789903166, "calib/step_q_w": 0.5225978021978022, "calib/step_q_w_n": 455.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2357.0, "completions/max_terminated_length": 2357.0, "completions/mean_length": 495.140625, "completions/mean_terminated_length": 497.0823974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.09493333333333333, "grad_norm": 0.04524797201156616, "kl": 0.1009368896484375, "learning_rate": 3.0833333333333336e-06, "loss": -0.1369, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.035608772188425064, "mask/share_reasoning": 0.8790637254714966, "mask/share_step_conf": 0.08142121136188507, "num_tokens": 20935321.0, "reward": 0.9926495552062988, "reward_std": 0.3590254783630371, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6327042579650879, "rewards/format_reward_step": 0.890625, "rewards/step_l2_reward": 0.7194381952285767, "step": 89 }, { "adv/mean_abs_final_conf": 0.7435449361801147, "adv/mean_abs_reasoning": 0.5475056171417236, "adv/mean_abs_step_conf": 0.7501516342163086, "adv/ratio_final_to_reasoning": 1.3580590096259153, "adv/ratio_step_to_reasoning": 1.3701259142006745, "adv/std_final_conf": 0.894040048122406, "adv/std_reasoning": 0.7929933071136475, "adv/std_step_conf": 0.9362837076187134, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7313868613138687, "calib/avg_num_step_conf": 4.1640625, "calib/ece": 0.2791701828410689, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.6919831223628692, "calib/gap": 0.21087469586374696, "calib/mean_conf": 0.8173980309423348, "calib/mu_c": 0.906374695863747, "calib/mu_w": 0.6955, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.25925457102672295, "calib/std_conf": 0.30748016228322245, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5799717514124293, "calib/step_q_c_n": 531.0, "calib/step_q_gap": 0.05376614393579382, "calib/step_q_w": 0.5262056074766355, "calib/step_q_w_n": 535.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 492.58203125, "completions/mean_terminated_length": 494.5137634277344, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.096, "grad_norm": 0.03284657001495361, "kl": 0.10390472412109375, "learning_rate": 3.055555555555556e-06, "loss": -0.098, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.0350940003991127, "mask/share_reasoning": 0.8628687858581543, "mask/share_step_conf": 0.09813091158866882, "num_tokens": 21164742.0, "reward": 1.0324347019195557, "reward_std": 0.3419674038887024, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.642932653427124, "rewards/format_reward_step": 0.9140625, "rewards/step_l2_reward": 0.7542078495025635, "step": 90 }, { "adv/mean_abs_final_conf": 0.7523043751716614, "adv/mean_abs_reasoning": 0.6084607839584351, "adv/mean_abs_step_conf": 0.7678749561309814, "adv/ratio_final_to_reasoning": 1.2364056895785949, "adv/ratio_step_to_reasoning": 1.261995803797663, "adv/std_final_conf": 0.9162842035293579, "adv/std_reasoning": 0.8269948363304138, "adv/std_step_conf": 0.9362143278121948, "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.6887827170751973, "calib/avg_num_step_conf": 3.7890625, "calib/ece": 0.27266081871345027, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.7017543859649122, "calib/gap": 0.1607998892120206, "calib/mean_conf": 0.8428654970760234, "calib/mu_c": 0.9014022988505748, "calib/mu_w": 0.7406024096385542, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.23978070175438596, "calib/std_conf": 0.2834935485075731, "calib/step_conf_rate": 0.91796875, "calib/step_q_c": 0.5960701107011072, "calib/step_q_c_n": 542.0, "calib/step_q_gap": 0.1316775873366211, "calib/step_q_w": 0.46439252336448605, "calib/step_q_w_n": 428.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1640.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 489.32421875, "completions/mean_terminated_length": 491.2431640625, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.09706666666666666, "grad_norm": 0.04172343760728836, "kl": 0.1055908203125, "learning_rate": 3.0277777777777776e-06, "loss": -0.1185, "mask/has_final_conf_rate": 0.890625, "mask/share_final_conf": 0.03135697543621063, "mask/share_reasoning": 0.8781741857528687, "mask/share_step_conf": 0.08656258881092072, "num_tokens": 21397721.0, "reward": 1.0270675420761108, "reward_std": 0.3646523356437683, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6375107765197754, "rewards/format_reward_step": 0.87890625, "rewards/step_l2_reward": 0.7517077922821045, "step": 91 }, { "adv/mean_abs_final_conf": 0.7131840586662292, "adv/mean_abs_reasoning": 0.5634198188781738, "adv/mean_abs_step_conf": 0.7377828359603882, "adv/ratio_final_to_reasoning": 1.2658128712728836, "adv/ratio_step_to_reasoning": 1.3094726369927647, "adv/std_final_conf": 0.8928073644638062, "adv/std_reasoning": 0.8266968727111816, "adv/std_step_conf": 0.9362941384315491, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7849395077179808, "calib/avg_num_step_conf": 3.14453125, "calib/ece": 0.22082321187584347, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.659919028340081, "calib/gap": 0.26598688175033613, "calib/mean_conf": 0.7991228070175438, "calib/mu_c": 0.9003485838779957, "calib/mu_w": 0.6343617021276595, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.20025641025641028, "calib/std_conf": 0.3049184381507363, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.6151875, "calib/step_q_c_n": 480.0, "calib/step_q_gap": 0.06992596153846165, "calib/step_q_w": 0.5452615384615384, "calib/step_q_w_n": 325.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 408.37890625, "completions/mean_terminated_length": 409.98040771484375, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.09813333333333334, "grad_norm": 0.0428118035197258, "kl": 0.1136627197265625, "learning_rate": 3e-06, "loss": -0.1626, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03909327834844589, "mask/share_reasoning": 0.87115478515625, "mask/share_step_conf": 0.0858457013964653, "num_tokens": 21608986.0, "reward": 1.0958373546600342, "reward_std": 0.3293219208717346, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.717007040977478, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7784241437911987, "step": 92 }, { "adv/mean_abs_final_conf": 0.7348818778991699, "adv/mean_abs_reasoning": 0.5190085172653198, "adv/mean_abs_step_conf": 0.7640940546989441, "adv/ratio_final_to_reasoning": 1.4159341387522828, "adv/ratio_step_to_reasoning": 1.4722187195019292, "adv/std_final_conf": 0.8839680552482605, "adv/std_reasoning": 0.7756069898605347, "adv/std_step_conf": 0.9361538290977478, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6999163821336492, "calib/avg_num_step_conf": 3.984375, "calib/ece": 0.26783333333333337, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.5541666666666667, "calib/gap": 0.23909553341230583, "calib/mean_conf": 0.7275833333333334, "calib/mu_c": 0.8401574803149606, "calib/mu_w": 0.6010619469026548, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.23312500000000003, "calib/std_conf": 0.3476324587677176, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.5878935698447894, "calib/step_q_c_n": 451.0, "calib/step_q_gap": 0.15009040640014965, "calib/step_q_w": 0.4378031634446397, "calib/step_q_w_n": 569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2115.0, "completions/max_terminated_length": 2115.0, "completions/mean_length": 474.6640625, "completions/mean_terminated_length": 476.5255126953125, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.0992, "grad_norm": 0.027074242010712624, "kl": 0.103118896484375, "learning_rate": 2.9722222222222225e-06, "loss": -0.0436, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.03518085181713104, "mask/share_reasoning": 0.8667430877685547, "mask/share_step_conf": 0.09416983276605606, "num_tokens": 21836276.0, "reward": 1.0502357482910156, "reward_std": 0.32360124588012695, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6509683728218079, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7772729396820068, "step": 93 }, { "adv/mean_abs_final_conf": 0.7581357955932617, "adv/mean_abs_reasoning": 0.5422264337539673, "adv/mean_abs_step_conf": 0.7568789720535278, "adv/ratio_final_to_reasoning": 1.3981904023831901, "adv/ratio_step_to_reasoning": 1.3958725081207644, "adv/std_final_conf": 0.9000263214111328, "adv/std_reasoning": 0.7930181622505188, "adv/std_step_conf": 0.9360536336898804, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.711711711711712, "calib/avg_num_step_conf": 3.6171875, "calib/ece": 0.2289711934156379, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.4691358024691358, "calib/gap": 0.2531674856674858, "calib/mean_conf": 0.6909465020576131, "calib/mu_c": 0.8065909090909091, "calib/mu_w": 0.5534234234234233, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.18835390946502062, "calib/std_conf": 0.3527766538595497, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5513870246085012, "calib/step_q_c_n": 447.0, "calib/step_q_gap": 0.07401750477551577, "calib/step_q_w": 0.4773695198329854, "calib/step_q_w_n": 479.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2849.0, "completions/max_terminated_length": 2849.0, "completions/mean_length": 464.6875, "completions/mean_terminated_length": 466.50982666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.10026666666666667, "grad_norm": 0.041096802800893784, "kl": 0.11529541015625, "learning_rate": 2.944444444444445e-06, "loss": -0.1382, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03776441887021065, "mask/share_reasoning": 0.8714349865913391, "mask/share_step_conf": 0.08689434826374054, "num_tokens": 22063916.0, "reward": 1.0612201690673828, "reward_std": 0.31181395053863525, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6851406097412109, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7654915452003479, "step": 94 }, { "adv/mean_abs_final_conf": 0.6974964141845703, "adv/mean_abs_reasoning": 0.4942771792411804, "adv/mean_abs_step_conf": 0.7726743221282959, "adv/ratio_final_to_reasoning": 1.4111442799268503, "adv/ratio_step_to_reasoning": 1.5632409396576101, "adv/std_final_conf": 0.8614376783370972, "adv/std_reasoning": 0.7395118474960327, "adv/std_step_conf": 0.9359301924705505, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.8170363663799607, "calib/avg_num_step_conf": 3.82421875, "calib/ece": 0.1715319148936169, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.5829787234042553, "calib/gap": 0.35925531914893627, "calib/mean_conf": 0.7333191489361703, "calib/mu_c": 0.8770212765957447, "calib/mu_w": 0.5177659574468084, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.89453125, "calib/pce": 0.1524255319148935, "calib/std_conf": 0.3477918860702431, "calib/step_conf_rate": 0.89453125, "calib/step_q_c": 0.5590125391849531, "calib/step_q_c_n": 638.0, "calib/step_q_gap": 0.10220902012336958, "calib/step_q_w": 0.4568035190615835, "calib/step_q_w_n": 341.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1924.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 467.44921875, "completions/mean_terminated_length": 469.2823791503906, "completions/min_length": 0.0, "completions/min_terminated_length": 60.0, "epoch": 0.10133333333333333, "grad_norm": 0.03094375506043434, "kl": 0.10405731201171875, "learning_rate": 2.916666666666667e-06, "loss": -0.1834, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.0356331393122673, "mask/share_reasoning": 0.8669276833534241, "mask/share_step_conf": 0.09353290498256683, "num_tokens": 22289711.0, "reward": 1.0407851934432983, "reward_std": 0.33165183663368225, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7023831605911255, "rewards/format_reward_step": 0.87890625, "rewards/step_l2_reward": 0.727791428565979, "step": 95 }, { "adv/mean_abs_final_conf": 0.7360479831695557, "adv/mean_abs_reasoning": 0.5627431869506836, "adv/mean_abs_step_conf": 0.7364088892936707, "adv/ratio_final_to_reasoning": 1.307964272580451, "adv/ratio_step_to_reasoning": 1.308605606198492, "adv/std_final_conf": 0.903143584728241, "adv/std_reasoning": 0.8101238012313843, "adv/std_step_conf": 0.9362661242485046, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8285256410256411, "calib/avg_num_step_conf": 4.0390625, "calib/ece": 0.13099173553719007, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.6115702479338843, "calib/gap": 0.4288267740011925, "calib/mean_conf": 0.7756198347107438, "calib/mu_c": 0.9280128205128205, "calib/mu_w": 0.499186046511628, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.13099173553719007, "calib/std_conf": 0.330956271766317, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5867115384615385, "calib/step_q_c_n": 520.0, "calib/step_q_gap": 0.1595909158934451, "calib/step_q_w": 0.42712062256809336, "calib/step_q_w_n": 514.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2613.0, "completions/max_terminated_length": 2613.0, "completions/mean_length": 421.73828125, "completions/mean_terminated_length": 421.73828125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.1024, "grad_norm": 0.031053684651851654, "kl": 0.12139892578125, "learning_rate": 2.888888888888889e-06, "loss": 0.0149, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.039800334721803665, "mask/share_reasoning": 0.8641690611839294, "mask/share_step_conf": 0.0960305854678154, "num_tokens": 22503492.0, "reward": 1.1341462135314941, "reward_std": 0.3485493063926697, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7765765190124512, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.7897896766662598, "step": 96 }, { "adv/mean_abs_final_conf": 0.7563111186027527, "adv/mean_abs_reasoning": 0.5842485427856445, "adv/mean_abs_step_conf": 0.7468534111976624, "adv/ratio_final_to_reasoning": 1.2945023619515237, "adv/ratio_step_to_reasoning": 1.278314546813814, "adv/std_final_conf": 0.9269362092018127, "adv/std_reasoning": 0.8266255259513855, "adv/std_step_conf": 0.9361647367477417, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7186663922617823, "calib/avg_num_step_conf": 4.36328125, "calib/ece": 0.21413223140495874, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.4793388429752066, "calib/gap": 0.29765109418947666, "calib/mean_conf": 0.667603305785124, "calib/mu_c": 0.8065891472868217, "calib/mu_w": 0.5089380530973451, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.1743388429752067, "calib/std_conf": 0.37392495997399144, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.5463506261180681, "calib/step_q_c_n": 559.0, "calib/step_q_gap": 0.136171414648534, "calib/step_q_w": 0.41017921146953407, "calib/step_q_w_n": 558.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1998.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 444.62890625, "completions/mean_terminated_length": 444.62890625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.10346666666666667, "grad_norm": 0.050117410719394684, "kl": 0.1236724853515625, "learning_rate": 2.861111111111111e-06, "loss": -0.1227, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03786986321210861, "mask/share_reasoning": 0.8509825468063354, "mask/share_step_conf": 0.11114759743213654, "num_tokens": 22722389.0, "reward": 1.0693061351776123, "reward_std": 0.3145286738872528, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6830066442489624, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.780820369720459, "step": 97 }, { "adv/mean_abs_final_conf": 0.7835785746574402, "adv/mean_abs_reasoning": 0.6385958790779114, "adv/mean_abs_step_conf": 0.7745162844657898, "adv/ratio_final_to_reasoning": 1.227033559610303, "adv/ratio_step_to_reasoning": 1.2128425970805483, "adv/std_final_conf": 0.9290453791618347, "adv/std_reasoning": 0.843262791633606, "adv/std_step_conf": 0.9362633228302002, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.6779699248120301, "calib/avg_num_step_conf": 3.12109375, "calib/ece": 0.2271101573676681, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.49356223175965663, "calib/gap": 0.27745363408521306, "calib/mean_conf": 0.6723748211731044, "calib/mu_c": 0.7914536340852131, "calib/mu_w": 0.514, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.16433476394849786, "calib/std_conf": 0.37874272681994453, "calib/step_conf_rate": 0.91015625, "calib/step_q_c": 0.572358276643991, "calib/step_q_c_n": 441.0, "calib/step_q_gap": 0.10626889116913057, "calib/step_q_w": 0.4660893854748604, "calib/step_q_w_n": 358.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 480.21484375, "completions/mean_terminated_length": 482.0980529785156, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.10453333333333334, "grad_norm": 0.04808899015188217, "kl": 0.10540771484375, "learning_rate": 2.8333333333333335e-06, "loss": -0.1129, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.03742651268839836, "mask/share_reasoning": 0.8740274310112, "mask/share_step_conf": 0.08463980257511139, "num_tokens": 22951508.0, "reward": 1.0163490772247314, "reward_std": 0.34398770332336426, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6527763605117798, "rewards/format_reward_step": 0.890625, "rewards/step_l2_reward": 0.7319269776344299, "step": 98 }, { "adv/mean_abs_final_conf": 0.7636774778366089, "adv/mean_abs_reasoning": 0.6109292507171631, "adv/mean_abs_step_conf": 0.7603302597999573, "adv/ratio_final_to_reasoning": 1.2500260495632454, "adv/ratio_step_to_reasoning": 1.244547153221775, "adv/std_final_conf": 0.9156864881515503, "adv/std_reasoning": 0.843126118183136, "adv/std_step_conf": 0.9363462328910828, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6918441289559303, "calib/avg_num_step_conf": 4.42578125, "calib/ece": 0.22887005649717512, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.288135593220339, "calib/gap": 0.26150054224588376, "calib/mean_conf": 0.48759887005649716, "calib/mu_c": 0.6405102040816326, "calib/mu_w": 0.37900966183574886, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.15060734463276837, "calib/std_conf": 0.3836272235226863, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.4688373655913979, "calib/step_q_c_n": 496.0, "calib/step_q_gap": 0.07384521488496143, "calib/step_q_w": 0.39499215070643645, "calib/step_q_w_n": 637.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2228.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 556.5703125, "completions/mean_terminated_length": 558.7529907226562, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.1056, "grad_norm": 0.030870946124196053, "kl": 0.1034698486328125, "learning_rate": 2.805555555555556e-06, "loss": -0.1052, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.031477123498916626, "mask/share_reasoning": 0.875032901763916, "mask/share_step_conf": 0.08958369493484497, "num_tokens": 23199790.0, "reward": 1.0112491846084595, "reward_std": 0.3343077301979065, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.6536698937416077, "rewards/format_reward_step": 0.8984375, "rewards/step_l2_reward": 0.7406773567199707, "step": 99 }, { "adv/mean_abs_final_conf": 0.7110211849212646, "adv/mean_abs_reasoning": 0.5518664121627808, "adv/mean_abs_step_conf": 0.7303281426429749, "adv/ratio_final_to_reasoning": 1.2883936569626544, "adv/ratio_step_to_reasoning": 1.3233784962212092, "adv/std_final_conf": 0.9070796966552734, "adv/std_reasoning": 0.8099352717399597, "adv/std_step_conf": 0.9358618855476379, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8162401574803149, "calib/avg_num_step_conf": 4.3203125, "calib/ece": 0.1366261808367072, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.39271255060728744, "calib/gap": 0.4239192913385827, "calib/mean_conf": 0.5830499325236167, "calib/mu_c": 0.7890026246719161, "calib/mu_w": 0.36508333333333337, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.10275303643724698, "calib/std_conf": 0.3924053656183008, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5050714285714285, "calib/step_q_c_n": 560.0, "calib/step_q_gap": 0.12179304029304028, "calib/step_q_w": 0.3832783882783882, "calib/step_q_w_n": 546.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2199.0, "completions/max_terminated_length": 2199.0, "completions/mean_length": 499.890625, "completions/mean_terminated_length": 499.890625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.10666666666666667, "grad_norm": 0.03531169518828392, "kl": 0.10375213623046875, "learning_rate": 2.7777777777777783e-06, "loss": -0.063, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.035181716084480286, "mask/share_reasoning": 0.8687215447425842, "mask/share_step_conf": 0.09609673917293549, "num_tokens": 23435170.0, "reward": 1.1387224197387695, "reward_std": 0.2929524779319763, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7556166648864746, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8228855133056641, "step": 100 }, { "adv/mean_abs_final_conf": 0.780475914478302, "adv/mean_abs_reasoning": 0.5842746496200562, "adv/mean_abs_step_conf": 0.7489468455314636, "adv/ratio_final_to_reasoning": 1.3358031449521766, "adv/ratio_step_to_reasoning": 1.2818403913613075, "adv/std_final_conf": 0.924403965473175, "adv/std_reasoning": 0.8266467452049255, "adv/std_step_conf": 0.935892641544342, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7109784202807458, "calib/avg_num_step_conf": 4.3359375, "calib/ece": 0.21908333333333335, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.2708333333333333, "calib/gap": 0.2959983239053006, "calib/mean_conf": 0.4466666666666666, "calib/mu_c": 0.6057657657657657, "calib/mu_w": 0.30976744186046506, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.10162500000000002, "calib/std_conf": 0.3945024996400178, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.44976545842217486, "calib/step_q_c_n": 469.0, "calib/step_q_gap": 0.1172225567061062, "calib/step_q_w": 0.33254290171606865, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2282.0, "completions/max_terminated_length": 2282.0, "completions/mean_length": 522.67578125, "completions/mean_terminated_length": 522.67578125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.10773333333333333, "grad_norm": 0.03128324821591377, "kl": 0.116485595703125, "learning_rate": 2.7500000000000004e-06, "loss": -0.104, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.03342951089143753, "mask/share_reasoning": 0.8713258504867554, "mask/share_step_conf": 0.09524467587471008, "num_tokens": 23675967.0, "reward": 1.076228141784668, "reward_std": 0.2763334810733795, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.6834926009178162, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7985799908638, "step": 101 }, { "adv/mean_abs_final_conf": 0.6880002021789551, "adv/mean_abs_reasoning": 0.4698541760444641, "adv/mean_abs_step_conf": 0.7566457986831665, "adv/ratio_final_to_reasoning": 1.4642845317902355, "adv/ratio_step_to_reasoning": 1.610384321904084, "adv/std_final_conf": 0.8732596039772034, "adv/std_reasoning": 0.7207974195480347, "adv/std_step_conf": 0.9353622794151306, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8203735144312395, "calib/avg_num_step_conf": 4.25390625, "calib/ece": 0.17494666666666664, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.38, "calib/gap": 0.43766270514997174, "calib/mean_conf": 0.5403333333333333, "calib/mu_c": 0.7066451612903226, "calib/mu_w": 0.2689824561403509, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.04764, "calib/std_conf": 0.3995536676509594, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5064505672609401, "calib/step_q_c_n": 617.0, "calib/step_q_gap": 0.14831497404060107, "calib/step_q_w": 0.358135593220339, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 426.96484375, "completions/mean_terminated_length": 426.96484375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1088, "grad_norm": 0.040686286985874176, "kl": 0.1273956298828125, "learning_rate": 2.7222222222222224e-06, "loss": 0.0351, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.040620945394039154, "mask/share_reasoning": 0.8511543273925781, "mask/share_step_conf": 0.10822470486164093, "num_tokens": 23891966.0, "reward": 1.1479109525680542, "reward_std": 0.22687333822250366, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7660393714904785, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8120423555374146, "step": 102 }, { "adv/mean_abs_final_conf": 0.7139617204666138, "adv/mean_abs_reasoning": 0.5544531345367432, "adv/mean_abs_step_conf": 0.7314602136611938, "adv/ratio_final_to_reasoning": 1.2876863272910222, "adv/ratio_step_to_reasoning": 1.3192462412036747, "adv/std_final_conf": 0.8981577754020691, "adv/std_reasoning": 0.7929568886756897, "adv/std_step_conf": 0.9361253976821899, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7499628749628751, "calib/avg_num_step_conf": 3.859375, "calib/ece": 0.19418410041841005, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.28451882845188287, "calib/gap": 0.31697727947727944, "calib/mean_conf": 0.5151882845188285, "calib/mu_c": 0.6358783783783784, "calib/mu_w": 0.3189010989010989, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.04506276150627617, "calib/std_conf": 0.37221246534829466, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5089373848987109, "calib/step_q_c_n": 543.0, "calib/step_q_gap": 0.1450946882694974, "calib/step_q_w": 0.3638426966292135, "calib/step_q_w_n": 445.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 536.14453125, "completions/mean_terminated_length": 536.14453125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.10986666666666667, "grad_norm": 0.039745040237903595, "kl": 0.10501861572265625, "learning_rate": 2.6944444444444444e-06, "loss": -0.1255, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.03567434847354889, "mask/share_reasoning": 0.8767580986022949, "mask/share_step_conf": 0.08756758272647858, "num_tokens": 24133771.0, "reward": 1.09434175491333, "reward_std": 0.3003833293914795, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7061066031455994, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7878636121749878, "step": 103 }, { "adv/mean_abs_final_conf": 0.7238519787788391, "adv/mean_abs_reasoning": 0.5018754005432129, "adv/mean_abs_step_conf": 0.752862811088562, "adv/ratio_final_to_reasoning": 1.442294198909463, "adv/ratio_step_to_reasoning": 1.50009904903426, "adv/std_final_conf": 0.8926047086715698, "adv/std_reasoning": 0.7394863963127136, "adv/std_step_conf": 0.9357385635375977, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7445542046605875, "calib/avg_num_step_conf": 4.21484375, "calib/ece": 0.16699604743083005, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.2134387351778656, "calib/gap": 0.32023239614994925, "calib/mean_conf": 0.37679841897233207, "calib/mu_c": 0.5552678571428571, "calib/mu_w": 0.2350354609929078, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.050553359683794465, "calib/std_conf": 0.36494791831233164, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5176548672566371, "calib/step_q_c_n": 452.0, "calib/step_q_gap": 0.14708070457721129, "calib/step_q_w": 0.37057416267942583, "calib/step_q_w_n": 627.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2885.0, "completions/max_terminated_length": 2885.0, "completions/mean_length": 458.49609375, "completions/mean_terminated_length": 458.49609375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.11093333333333333, "grad_norm": 0.0384674109518528, "kl": 0.1230621337890625, "learning_rate": 2.666666666666667e-06, "loss": -0.0462, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036535318940877914, "mask/share_reasoning": 0.8619049787521362, "mask/share_step_conf": 0.10155968368053436, "num_tokens": 24357826.0, "reward": 1.1272393465042114, "reward_std": 0.19814637303352356, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.7452765703201294, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8186347484588623, "step": 104 }, { "adv/mean_abs_final_conf": 0.793289065361023, "adv/mean_abs_reasoning": 0.68181312084198, "adv/mean_abs_step_conf": 0.7531487941741943, "adv/ratio_final_to_reasoning": 1.1634992655779048, "adv/ratio_step_to_reasoning": 1.104626430838, "adv/std_final_conf": 0.9366859793663025, "adv/std_reasoning": 0.8904972672462463, "adv/std_step_conf": 0.936499834060669, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7122390360095279, "calib/avg_num_step_conf": 4.3984375, "calib/ece": 0.2141422594142259, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.2510460251046025, "calib/gap": 0.281342300686563, "calib/mean_conf": 0.4468619246861924, "calib/mu_c": 0.5845901639344263, "calib/mu_w": 0.3032478632478633, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.07527196652719664, "calib/std_conf": 0.3829599891896605, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.4785921325051759, "calib/step_q_c_n": 483.0, "calib/step_q_gap": 0.10702137045229876, "calib/step_q_w": 0.37157076205287715, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 495.2734375, "completions/mean_terminated_length": 499.1732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.112, "grad_norm": 0.06704078614711761, "kl": 0.1043853759765625, "learning_rate": 2.6388888888888893e-06, "loss": -0.1896, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.03599028289318085, "mask/share_reasoning": 0.8575528860092163, "mask/share_step_conf": 0.09864436089992523, "num_tokens": 24590376.0, "reward": 1.0415213108062744, "reward_std": 0.3739849925041199, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6758534908294678, "rewards/format_reward_step": 0.90625, "rewards/step_l2_reward": 0.753751277923584, "step": 105 }, { "adv/mean_abs_final_conf": 0.7251718044281006, "adv/mean_abs_reasoning": 0.4511352479457855, "adv/mean_abs_step_conf": 0.7519001364707947, "adv/ratio_final_to_reasoning": 1.6074376979633545, "adv/ratio_step_to_reasoning": 1.6666845250831588, "adv/std_final_conf": 0.9015657305717468, "adv/std_reasoning": 0.7206656336784363, "adv/std_step_conf": 0.9358862638473511, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7424395161290323, "calib/avg_num_step_conf": 4.1796875, "calib/ece": 0.2087301587301587, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3055555555555556, "calib/gap": 0.31643145161290326, "calib/mean_conf": 0.4999206349206349, "calib/mu_c": 0.655625, "calib/mu_w": 0.33919354838709675, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.10035714285714281, "calib/std_conf": 0.38800976921048197, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5021814671814672, "calib/step_q_c_n": 518.0, "calib/step_q_gap": 0.11491093578050099, "calib/step_q_w": 0.3872705314009662, "calib/step_q_w_n": 552.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2565.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 448.51171875, "completions/mean_terminated_length": 448.51171875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.11306666666666666, "grad_norm": 0.03875832259654999, "kl": 0.116058349609375, "learning_rate": 2.6111111111111113e-06, "loss": -0.0122, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03689251095056534, "mask/share_reasoning": 0.8643423914909363, "mask/share_step_conf": 0.09876511245965958, "num_tokens": 24809779.0, "reward": 1.1323719024658203, "reward_std": 0.22136250138282776, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7384449243545532, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.82065749168396, "step": 106 }, { "adv/mean_abs_final_conf": 0.7681862115859985, "adv/mean_abs_reasoning": 0.5777244567871094, "adv/mean_abs_step_conf": 0.7491806745529175, "adv/ratio_final_to_reasoning": 1.329675769411081, "adv/ratio_step_to_reasoning": 1.2967785347349237, "adv/std_final_conf": 0.9274890422821045, "adv/std_reasoning": 0.792971134185791, "adv/std_step_conf": 0.9358506798744202, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7280799475753603, "calib/avg_num_step_conf": 4.62109375, "calib/ece": 0.17730923694779113, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.3493975903614458, "calib/gap": 0.32397575360419384, "calib/mean_conf": 0.5883935742971887, "calib/mu_c": 0.7302142857142856, "calib/mu_w": 0.40623853211009175, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.10172690763052208, "calib/std_conf": 0.3808053051634628, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.48981260647359454, "calib/step_q_c_n": 587.0, "calib/step_q_gap": 0.1215407943930576, "calib/step_q_w": 0.36827181208053694, "calib/step_q_w_n": 596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2529.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 475.2734375, "completions/mean_terminated_length": 475.2734375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.11413333333333334, "grad_norm": 0.07799244672060013, "kl": 0.11484527587890625, "learning_rate": 2.5833333333333337e-06, "loss": 0.0611, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03707215189933777, "mask/share_reasoning": 0.8559058904647827, "mask/share_step_conf": 0.10702195018529892, "num_tokens": 25036065.0, "reward": 1.1105011701583862, "reward_std": 0.2526007294654846, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7247980833053589, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7990319728851318, "step": 107 }, { "adv/mean_abs_final_conf": 0.6599736213684082, "adv/mean_abs_reasoning": 0.5213769674301147, "adv/mean_abs_step_conf": 0.7592152953147888, "adv/ratio_final_to_reasoning": 1.2658281101703461, "adv/ratio_step_to_reasoning": 1.4561734459751599, "adv/std_final_conf": 0.865111231803894, "adv/std_reasoning": 0.7577022314071655, "adv/std_step_conf": 0.9359382390975952, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7108021390374332, "calib/avg_num_step_conf": 4.62109375, "calib/ece": 0.2111200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6, "calib/gap": 0.2970837789661319, "calib/mean_conf": 0.73184, "calib/mu_c": 0.832848484848485, "calib/mu_w": 0.535764705882353, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.14148000000000013, "calib/std_conf": 0.36135054227162855, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5201437908496732, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.07531125496450575, "calib/step_q_w": 0.44483253588516747, "calib/step_q_w_n": 418.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 482.21875, "completions/mean_terminated_length": 484.1098327636719, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.1152, "grad_norm": 0.05277223140001297, "kl": 0.1013336181640625, "learning_rate": 2.5555555555555557e-06, "loss": -0.065, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.035249825567007065, "mask/share_reasoning": 0.851841390132904, "mask/share_step_conf": 0.10900251567363739, "num_tokens": 25262745.0, "reward": 1.1391626596450806, "reward_std": 0.24059459567070007, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7480719089508057, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8061065673828125, "step": 108 }, { "adv/mean_abs_final_conf": 0.6619673371315002, "adv/mean_abs_reasoning": 0.31352001428604126, "adv/mean_abs_step_conf": 0.7552825212478638, "adv/ratio_final_to_reasoning": 2.111403760423255, "adv/ratio_step_to_reasoning": 2.4090408485334485, "adv/std_final_conf": 0.8693623542785645, "adv/std_reasoning": 0.6185252070426941, "adv/std_step_conf": 0.9353784322738647, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8374396748793497, "calib/avg_num_step_conf": 5.0234375, "calib/ece": 0.13673306772908367, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4302788844621514, "calib/gap": 0.5134213868427737, "calib/mean_conf": 0.5750597609561753, "calib/mu_c": 0.8348387096774195, "calib/mu_w": 0.3214173228346457, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10888446215139441, "calib/std_conf": 0.42031516397350965, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5258373983739837, "calib/step_q_c_n": 615.0, "calib/step_q_gap": 0.15632920165267222, "calib/step_q_w": 0.36950819672131147, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2485.0, "completions/max_terminated_length": 2485.0, "completions/mean_length": 492.90625, "completions/mean_terminated_length": 492.90625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.11626666666666667, "grad_norm": 0.031876228749752045, "kl": 0.102783203125, "learning_rate": 2.5277777777777778e-06, "loss": -0.0039, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03490302711725235, "mask/share_reasoning": 0.8533127307891846, "mask/share_step_conf": 0.11178424954414368, "num_tokens": 25493529.0, "reward": 1.1876109838485718, "reward_std": 0.2054915726184845, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7973277568817139, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8581793904304504, "step": 109 }, { "adv/mean_abs_final_conf": 0.764315128326416, "adv/mean_abs_reasoning": 0.5388686656951904, "adv/mean_abs_step_conf": 0.7830692529678345, "adv/ratio_final_to_reasoning": 1.4183699609632687, "adv/ratio_step_to_reasoning": 1.4531727354337864, "adv/std_final_conf": 0.9091663360595703, "adv/std_reasoning": 0.7755135893821716, "adv/std_step_conf": 0.9355852007865906, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6606814600840337, "calib/avg_num_step_conf": 4.69921875, "calib/ece": 0.2866396761133603, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4979757085020243, "calib/gap": 0.23397058823529415, "calib/mean_conf": 0.6352226720647772, "calib/mu_c": 0.7564705882352941, "calib/mu_w": 0.5225, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.2200404858299595, "calib/std_conf": 0.40716638440223923, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5408249496981891, "calib/step_q_c_n": 497.0, "calib/step_q_gap": 0.11504591287099358, "calib/step_q_w": 0.4257790368271955, "calib/step_q_w_n": 706.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1291.0, "completions/max_terminated_length": 1291.0, "completions/mean_length": 436.8203125, "completions/mean_terminated_length": 438.5333557128906, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.11733333333333333, "grad_norm": 0.037987880408763885, "kl": 0.1119537353515625, "learning_rate": 2.5e-06, "loss": 0.0048, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.036573655903339386, "mask/share_reasoning": 0.8427259922027588, "mask/share_step_conf": 0.11679408699274063, "num_tokens": 25710275.0, "reward": 1.06510329246521, "reward_std": 0.23794767260551453, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6501030921936035, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7966315746307373, "step": 110 }, { "adv/mean_abs_final_conf": 0.6385260820388794, "adv/mean_abs_reasoning": 0.5157452821731567, "adv/mean_abs_step_conf": 0.7536460161209106, "adv/ratio_final_to_reasoning": 1.2380648046810443, "adv/ratio_step_to_reasoning": 1.4612756377437515, "adv/std_final_conf": 0.8398423194885254, "adv/std_reasoning": 0.7576634883880615, "adv/std_step_conf": 0.9358636140823364, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7523087293562521, "calib/avg_num_step_conf": 4.07421875, "calib/ece": 0.24954918032786902, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6188524590163934, "calib/gap": 0.3211425682507584, "calib/mean_conf": 0.7228278688524591, "calib/mu_c": 0.8741860465116279, "calib/mu_w": 0.5530434782608695, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.22184426229508214, "calib/std_conf": 0.38164521316469274, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5588198757763976, "calib/step_q_c_n": 483.0, "calib/step_q_gap": 0.1459805900621119, "calib/step_q_w": 0.4128392857142857, "calib/step_q_w_n": 560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2699.0, "completions/max_terminated_length": 2699.0, "completions/mean_length": 460.4375, "completions/mean_terminated_length": 462.2431640625, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.1184, "grad_norm": 0.031583499163389206, "kl": 0.1053466796875, "learning_rate": 2.4722222222222226e-06, "loss": -0.0507, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.037514761090278625, "mask/share_reasoning": 0.8553412556648254, "mask/share_step_conf": 0.10323773324489594, "num_tokens": 25935555.0, "reward": 1.0907256603240967, "reward_std": 0.26927220821380615, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6895140409469604, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8008747100830078, "step": 111 }, { "adv/mean_abs_final_conf": 0.6715695261955261, "adv/mean_abs_reasoning": 0.4684767723083496, "adv/mean_abs_step_conf": 0.7551547288894653, "adv/ratio_final_to_reasoning": 1.4335172326398748, "adv/ratio_step_to_reasoning": 1.6119363296680702, "adv/std_final_conf": 0.8733709454536438, "adv/std_reasoning": 0.7393901944160461, "adv/std_step_conf": 0.9356892704963684, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8344532279314888, "calib/avg_num_step_conf": 4.625, "calib/ece": 0.1204048582995951, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.44534412955465585, "calib/gap": 0.5215131752305664, "calib/mean_conf": 0.57165991902834, "calib/mu_c": 0.814469696969697, "calib/mu_w": 0.2929565217391305, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.07882591093117405, "calib/std_conf": 0.42045001644516367, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5273089171974522, "calib/step_q_c_n": 628.0, "calib/step_q_gap": 0.1731182697154378, "calib/step_q_w": 0.3541906474820144, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2929.0, "completions/max_terminated_length": 2929.0, "completions/mean_length": 538.16796875, "completions/mean_terminated_length": 538.16796875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.11946666666666667, "grad_norm": 0.028904251754283905, "kl": 0.097686767578125, "learning_rate": 2.4444444444444447e-06, "loss": -0.0036, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03284811973571777, "mask/share_reasoning": 0.8688977360725403, "mask/share_step_conf": 0.09825415909290314, "num_tokens": 26181246.0, "reward": 1.163731336593628, "reward_std": 0.22518390417099, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7981777191162109, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8231689929962158, "step": 112 }, { "adv/mean_abs_final_conf": 0.6834825277328491, "adv/mean_abs_reasoning": 0.5273545980453491, "adv/mean_abs_step_conf": 0.7404484152793884, "adv/ratio_final_to_reasoning": 1.2960587245587532, "adv/ratio_step_to_reasoning": 1.404080703996658, "adv/std_final_conf": 0.8908624053001404, "adv/std_reasoning": 0.7928634285926819, "adv/std_step_conf": 0.9354289174079895, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7851299643752476, "calib/avg_num_step_conf": 4.99609375, "calib/ece": 0.19566265060240962, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5662650602409639, "calib/gap": 0.3544748647578836, "calib/mean_conf": 0.6876305220883534, "calib/mu_c": 0.8385314685314685, "calib/mu_w": 0.48405660377358495, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1544979919678715, "calib/std_conf": 0.38450354998612984, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5137192474674385, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.12504577807968337, "calib/step_q_w": 0.3886734693877551, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2325.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 432.34765625, "completions/mean_terminated_length": 432.34765625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.12053333333333334, "grad_norm": 0.03928934410214424, "kl": 0.1289215087890625, "learning_rate": 2.4166666666666667e-06, "loss": 0.0206, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.037928465753793716, "mask/share_reasoning": 0.8393813371658325, "mask/share_step_conf": 0.12269022315740585, "num_tokens": 26397127.0, "reward": 1.1599385738372803, "reward_std": 0.2528593838214874, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7432464361190796, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8474411964416504, "step": 113 }, { "adv/mean_abs_final_conf": 0.6147596836090088, "adv/mean_abs_reasoning": 0.5180281400680542, "adv/mean_abs_step_conf": 0.7503921389579773, "adv/ratio_final_to_reasoning": 1.1867302875250112, "adv/ratio_step_to_reasoning": 1.4485547809418173, "adv/std_final_conf": 0.8503017425537109, "adv/std_reasoning": 0.7753562331199646, "adv/std_step_conf": 0.9356796741485596, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8306440631808281, "calib/avg_num_step_conf": 5.1171875, "calib/ece": 0.16803212851405624, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.46171568627450965, "calib/mean_conf": 0.7612048192771085, "calib/mu_c": 0.9392156862745097, "calib/mu_w": 0.47750000000000004, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15738955823293174, "calib/std_conf": 0.3656541618540561, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5442463958060288, "calib/step_q_c_n": 763.0, "calib/step_q_gap": 0.15053524406928293, "calib/step_q_w": 0.3937111517367459, "calib/step_q_w_n": 547.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 438.015625, "completions/mean_terminated_length": 439.7333679199219, "completions/min_length": 0.0, "completions/min_terminated_length": 23.0, "epoch": 0.1216, "grad_norm": 0.04486739635467529, "kl": 0.1123199462890625, "learning_rate": 2.388888888888889e-06, "loss": -0.0061, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03878115117549896, "mask/share_reasoning": 0.8267152905464172, "mask/share_step_conf": 0.130597323179245, "num_tokens": 26614283.0, "reward": 1.188596248626709, "reward_std": 0.24334082007408142, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.8040202856063843, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8394063711166382, "step": 114 }, { "adv/mean_abs_final_conf": 0.7535337209701538, "adv/mean_abs_reasoning": 0.6055867671966553, "adv/mean_abs_step_conf": 0.7513902187347412, "adv/ratio_final_to_reasoning": 1.2443034785227647, "adv/ratio_step_to_reasoning": 1.2407639324964617, "adv/std_final_conf": 0.9215371608734131, "adv/std_reasoning": 0.8266196846961975, "adv/std_step_conf": 0.9360374212265015, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6653112192622951, "calib/avg_num_step_conf": 4.75390625, "calib/ece": 0.34132, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.624, "calib/gap": 0.2081083504098361, "calib/mean_conf": 0.73524, "calib/mu_c": 0.8367968750000001, "calib/mu_w": 0.628688524590164, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.28228, "calib/std_conf": 0.3860043295093981, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5139455782312926, "calib/step_q_c_n": 588.0, "calib/step_q_gap": 0.09846863069552148, "calib/step_q_w": 0.41547694753577114, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2754.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 440.21875, "completions/mean_terminated_length": 440.21875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.12266666666666666, "grad_norm": 0.05996016785502434, "kl": 0.11113739013671875, "learning_rate": 2.361111111111111e-06, "loss": 0.0315, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03766099363565445, "mask/share_reasoning": 0.8441624641418457, "mask/share_step_conf": 0.11817656457424164, "num_tokens": 26832243.0, "reward": 1.0623983144760132, "reward_std": 0.29575490951538086, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6283395290374756, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8023255467414856, "step": 115 }, { "adv/mean_abs_final_conf": 0.7394832372665405, "adv/mean_abs_reasoning": 0.51995849609375, "adv/mean_abs_step_conf": 0.7374346256256104, "adv/ratio_final_to_reasoning": 1.4221966615066322, "adv/ratio_step_to_reasoning": 1.4182567092675196, "adv/std_final_conf": 0.9036124348640442, "adv/std_reasoning": 0.7754974961280823, "adv/std_step_conf": 0.9353717565536499, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6897636294319481, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.28478087649402395, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5697211155378487, "calib/gap": 0.27373236751810914, "calib/mean_conf": 0.6809561752988048, "calib/mu_c": 0.8216393442622951, "calib/mu_w": 0.547906976744186, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.23984063745019926, "calib/std_conf": 0.4027157337661665, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5136998254799302, "calib/step_q_c_n": 573.0, "calib/step_q_gap": 0.11805785017128828, "calib/step_q_w": 0.3956419753086419, "calib/step_q_w_n": 810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 512.96875, "completions/mean_terminated_length": 512.96875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.12373333333333333, "grad_norm": 0.03793274611234665, "kl": 0.09812164306640625, "learning_rate": 2.3333333333333336e-06, "loss": -0.0113, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034530505537986755, "mask/share_reasoning": 0.8493834137916565, "mask/share_step_conf": 0.11608609557151794, "num_tokens": 27068083.0, "reward": 1.1046454906463623, "reward_std": 0.2591022551059723, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6625652313232422, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8389630317687988, "step": 116 }, { "adv/mean_abs_final_conf": 0.6700632572174072, "adv/mean_abs_reasoning": 0.5251960158348083, "adv/mean_abs_step_conf": 0.7502630949020386, "adv/ratio_final_to_reasoning": 1.2758346160572636, "adv/ratio_step_to_reasoning": 1.42853919733089, "adv/std_final_conf": 0.8910670876502991, "adv/std_reasoning": 0.757550835609436, "adv/std_step_conf": 0.9356370568275452, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7566102756892231, "calib/avg_num_step_conf": 5.21875, "calib/ece": 0.25251968503936995, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.562992125984252, "calib/gap": 0.3700125313283208, "calib/mean_conf": 0.6753543307086615, "calib/mu_c": 0.879298245614035, "calib/mu_w": 0.5092857142857142, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.239527559055118, "calib/std_conf": 0.39802919070650244, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5111211573236889, "calib/step_q_c_n": 553.0, "calib/step_q_gap": 0.1128069810784782, "calib/step_q_w": 0.3983141762452107, "calib/step_q_w_n": 783.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2087.0, "completions/max_terminated_length": 2087.0, "completions/mean_length": 462.67578125, "completions/mean_terminated_length": 462.67578125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.1248, "grad_norm": 0.039365384727716446, "kl": 0.10528564453125, "learning_rate": 2.305555555555556e-06, "loss": 0.0083, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.035405274480581284, "mask/share_reasoning": 0.8448545932769775, "mask/share_step_conf": 0.11974013596773148, "num_tokens": 27293128.0, "reward": 1.1315536499023438, "reward_std": 0.21968162059783936, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7202702760696411, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8368913531303406, "step": 117 }, { "adv/mean_abs_final_conf": 0.586737871170044, "adv/mean_abs_reasoning": 0.4335465431213379, "adv/mean_abs_step_conf": 0.7783301472663879, "adv/ratio_final_to_reasoning": 1.3533445958207813, "adv/ratio_step_to_reasoning": 1.7952631836544353, "adv/std_final_conf": 0.8127620220184326, "adv/std_reasoning": 0.701391875743866, "adv/std_step_conf": 0.9351849555969238, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7986656200941915, "calib/avg_num_step_conf": 5.96875, "calib/ece": 0.21003984063745013, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6454183266932271, "calib/gap": 0.37197082679225535, "calib/mean_conf": 0.7494820717131474, "calib/mu_c": 0.9036054421768707, "calib/mu_w": 0.5316346153846153, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18693227091633458, "calib/std_conf": 0.3743341108835886, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4696088019559902, "calib/step_q_c_n": 818.0, "calib/step_q_gap": 0.10428485829401835, "calib/step_q_w": 0.36532394366197185, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 494.5078125, "completions/mean_terminated_length": 494.5078125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.12586666666666665, "grad_norm": 0.03469162434339523, "kl": 0.097198486328125, "learning_rate": 2.277777777777778e-06, "loss": 0.0425, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03486177325248718, "mask/share_reasoning": 0.8360300660133362, "mask/share_step_conf": 0.12910816073417664, "num_tokens": 27523730.0, "reward": 1.1617202758789062, "reward_std": 0.2108428180217743, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7519367337226868, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8408982753753662, "step": 118 }, { "adv/mean_abs_final_conf": 0.6262631416320801, "adv/mean_abs_reasoning": 0.5967061519622803, "adv/mean_abs_step_conf": 0.7638027667999268, "adv/ratio_final_to_reasoning": 1.04953357623782, "adv/ratio_step_to_reasoning": 1.280031660957652, "adv/std_final_conf": 0.861553966999054, "adv/std_reasoning": 0.8265232443809509, "adv/std_step_conf": 0.9356296062469482, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7408450704225351, "calib/avg_num_step_conf": 4.8984375, "calib/ece": 0.2215873015873016, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.49603174603174605, "calib/gap": 0.3755697823303458, "calib/mean_conf": 0.6175396825396826, "calib/mu_c": 0.7814788732394367, "calib/mu_w": 0.40590909090909083, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.13781746031746034, "calib/std_conf": 0.4239210141088477, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5052832861189802, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 0.11292927152044008, "calib/step_q_w": 0.39235401459854014, "calib/step_q_w_n": 548.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2863.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 525.4453125, "completions/mean_terminated_length": 525.4453125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.12693333333333334, "grad_norm": 0.03453721106052399, "kl": 0.0975494384765625, "learning_rate": 2.25e-06, "loss": -0.0452, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033460669219493866, "mask/share_reasoning": 0.8612064123153687, "mask/share_step_conf": 0.10533291101455688, "num_tokens": 27763308.0, "reward": 1.1733651161193848, "reward_std": 0.208999365568161, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7443429231643677, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8630497455596924, "step": 119 }, { "adv/mean_abs_final_conf": 0.6886662244796753, "adv/mean_abs_reasoning": 0.5544482469558716, "adv/mean_abs_step_conf": 0.7756971120834351, "adv/ratio_final_to_reasoning": 1.242074852361263, "adv/ratio_step_to_reasoning": 1.3990433125946427, "adv/std_final_conf": 0.8938544392585754, "adv/std_reasoning": 0.8264629244804382, "adv/std_step_conf": 0.9358031153678894, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7579295921924015, "calib/avg_num_step_conf": 5.09375, "calib/ece": 0.21085365853658533, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5528455284552846, "calib/gap": 0.42188846287905185, "calib/mean_conf": 0.6501219512195122, "calib/mu_c": 0.813046357615894, "calib/mu_w": 0.3911578947368422, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.12357723577235769, "calib/std_conf": 0.42475864185097756, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.47984756898817343, "calib/step_q_c_n": 761.0, "calib/step_q_gap": 0.08424904228467434, "calib/step_q_w": 0.3955985267034991, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2722.0, "completions/max_terminated_length": 2722.0, "completions/mean_length": 466.35546875, "completions/mean_terminated_length": 466.35546875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.128, "grad_norm": 0.051741085946559906, "kl": 0.106475830078125, "learning_rate": 2.222222222222222e-06, "loss": 0.0191, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03581401705741882, "mask/share_reasoning": 0.8447315692901611, "mask/share_step_conf": 0.11945446580648422, "num_tokens": 27989383.0, "reward": 1.1456115245819092, "reward_std": 0.2601540982723236, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7468265295028687, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8228267431259155, "step": 120 }, { "adv/mean_abs_final_conf": 0.7501630187034607, "adv/mean_abs_reasoning": 0.6250473260879517, "adv/mean_abs_step_conf": 0.7540261149406433, "adv/ratio_final_to_reasoning": 1.2001699509676869, "adv/ratio_step_to_reasoning": 1.206350436950022, "adv/std_final_conf": 0.9055350422859192, "adv/std_reasoning": 0.8267970085144043, "adv/std_step_conf": 0.9355260729789734, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7311981413873216, "calib/avg_num_step_conf": 5.9453125, "calib/ece": 0.24650406504065042, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4959349593495935, "calib/gap": 0.28248390308662463, "calib/mean_conf": 0.6463414634146342, "calib/mu_c": 0.7783969465648855, "calib/mu_w": 0.4959130434782608, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18016260162601627, "calib/std_conf": 0.3992498418888415, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4725547445255474, "calib/step_q_c_n": 685.0, "calib/step_q_gap": 0.10243527021252469, "calib/step_q_w": 0.3701194743130227, "calib/step_q_w_n": 837.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2825.0, "completions/max_terminated_length": 2825.0, "completions/mean_length": 543.90234375, "completions/mean_terminated_length": 548.18505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.12906666666666666, "grad_norm": 0.031234215945005417, "kl": 0.09563446044921875, "learning_rate": 2.1944444444444445e-06, "loss": -0.0565, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03101031668484211, "mask/share_reasoning": 0.849081814289093, "mask/share_step_conf": 0.11209535598754883, "num_tokens": 28233678.0, "reward": 1.1141672134399414, "reward_std": 0.26280272006988525, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6912468671798706, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8283709287643433, "step": 121 }, { "adv/mean_abs_final_conf": 0.6409776210784912, "adv/mean_abs_reasoning": 0.5207710266113281, "adv/mean_abs_step_conf": 0.7637045383453369, "adv/ratio_final_to_reasoning": 1.2308242746324634, "adv/ratio_step_to_reasoning": 1.4664881479962202, "adv/std_final_conf": 0.8490242958068848, "adv/std_reasoning": 0.7752876877784729, "adv/std_step_conf": 0.9349822998046875, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7824970752632264, "calib/avg_num_step_conf": 5.00390625, "calib/ece": 0.1734509803921569, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5803921568627451, "calib/gap": 0.439220720135188, "calib/mean_conf": 0.682156862745098, "calib/mu_c": 0.850955414012739, "calib/mu_w": 0.411734693877551, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11996078431372553, "calib/std_conf": 0.407408422167366, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5227373612823675, "calib/step_q_c_n": 811.0, "calib/step_q_gap": 0.13122672298449511, "calib/step_q_w": 0.39151063829787236, "calib/step_q_w_n": 470.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1929.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 471.87109375, "completions/mean_terminated_length": 473.7215881347656, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.13013333333333332, "grad_norm": 0.031476348638534546, "kl": 0.1068267822265625, "learning_rate": 2.166666666666667e-06, "loss": -0.0158, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0348004549741745, "mask/share_reasoning": 0.8466958999633789, "mask/share_step_conf": 0.11459743976593018, "num_tokens": 28461821.0, "reward": 1.2163907289505005, "reward_std": 0.19097883999347687, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7977089881896973, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.875464916229248, "step": 122 }, { "adv/mean_abs_final_conf": 0.7445669770240784, "adv/mean_abs_reasoning": 0.5697649717330933, "adv/mean_abs_step_conf": 0.7346761226654053, "adv/ratio_final_to_reasoning": 1.3067966862885196, "adv/ratio_step_to_reasoning": 1.2894371523589638, "adv/std_final_conf": 0.9072105884552002, "adv/std_reasoning": 0.8099097609519958, "adv/std_step_conf": 0.9359018206596375, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6791389680278569, "calib/avg_num_step_conf": 5.90234375, "calib/ece": 0.26869047619047615, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.49206349206349204, "calib/gap": 0.2841538461538463, "calib/mean_conf": 0.6205158730158731, "calib/mu_c": 0.7524444444444446, "calib/mu_w": 0.46829059829059827, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17674603174603168, "calib/std_conf": 0.4183273236928797, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.46036601307189545, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.07938746079307502, "calib/step_q_w": 0.38097855227882044, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1896.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 556.60546875, "completions/mean_terminated_length": 558.7882690429688, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.1312, "grad_norm": 0.033415503799915314, "kl": 0.0966949462890625, "learning_rate": 2.138888888888889e-06, "loss": -0.0082, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03071346879005432, "mask/share_reasoning": 0.8498426675796509, "mask/share_step_conf": 0.115537628531456, "num_tokens": 28709600.0, "reward": 1.1285775899887085, "reward_std": 0.25766658782958984, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.699337899684906, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8369823694229126, "step": 123 }, { "adv/mean_abs_final_conf": 0.6646140813827515, "adv/mean_abs_reasoning": 0.45616310834884644, "adv/mean_abs_step_conf": 0.7651708126068115, "adv/ratio_final_to_reasoning": 1.4569658729931623, "adv/ratio_step_to_reasoning": 1.67740617029831, "adv/std_final_conf": 0.8431295156478882, "adv/std_reasoning": 0.7392640113830566, "adv/std_step_conf": 0.9358372092247009, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7841423948220064, "calib/avg_num_step_conf": 5.1328125, "calib/ece": 0.1950592885375494, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.541501976284585, "calib/gap": 0.3796711974110033, "calib/mean_conf": 0.6663636363636364, "calib/mu_c": 0.8209333333333334, "calib/mu_w": 0.4412621359223301, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1342687747035573, "calib/std_conf": 0.3990317061978957, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5136621621621622, "calib/step_q_c_n": 740.0, "calib/step_q_gap": 0.13355763254543745, "calib/step_q_w": 0.38010452961672475, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2241.0, "completions/max_terminated_length": 2241.0, "completions/mean_length": 480.98046875, "completions/mean_terminated_length": 480.98046875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.13226666666666667, "grad_norm": 0.038355790078639984, "kl": 0.1045684814453125, "learning_rate": 2.1111111111111114e-06, "loss": 0.0398, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03441513329744339, "mask/share_reasoning": 0.8505345582962036, "mask/share_step_conf": 0.1150503158569336, "num_tokens": 28939547.0, "reward": 1.1643469333648682, "reward_std": 0.22691610455513, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7644277215003967, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8334689140319824, "step": 124 }, { "adv/mean_abs_final_conf": 0.742373526096344, "adv/mean_abs_reasoning": 0.47091740369796753, "adv/mean_abs_step_conf": 0.7559845447540283, "adv/ratio_final_to_reasoning": 1.5764410494636982, "adv/ratio_step_to_reasoning": 1.60534424682018, "adv/std_final_conf": 0.8906104564666748, "adv/std_reasoning": 0.7393792271614075, "adv/std_step_conf": 0.935725748538971, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6311610772357724, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.3185258964143427, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.47808764940239046, "calib/gap": 0.1802997967479676, "calib/mean_conf": 0.6022709163346613, "calib/mu_c": 0.690625, "calib/mu_w": 0.5103252032520325, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.20541832669322713, "calib/std_conf": 0.4166753519409285, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.47332365747460087, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.07772664254922773, "calib/step_q_w": 0.39559701492537314, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2833.0, "completions/max_terminated_length": 2833.0, "completions/mean_length": 510.54296875, "completions/mean_terminated_length": 510.54296875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.13333333333333333, "grad_norm": 0.030706195160746574, "kl": 0.096435546875, "learning_rate": 2.0833333333333334e-06, "loss": 0.0279, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034426264464855194, "mask/share_reasoning": 0.8521647453308105, "mask/share_step_conf": 0.11340896785259247, "num_tokens": 29175054.0, "reward": 1.0779061317443848, "reward_std": 0.24169524013996124, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6451429724693298, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8107587695121765, "step": 125 }, { "adv/mean_abs_final_conf": 0.5916764140129089, "adv/mean_abs_reasoning": 0.5020506381988525, "adv/mean_abs_step_conf": 0.7531881928443909, "adv/ratio_final_to_reasoning": 1.1785193942499428, "adv/ratio_step_to_reasoning": 1.5002235542345186, "adv/std_final_conf": 0.7995308637619019, "adv/std_reasoning": 0.7754698395729065, "adv/std_step_conf": 0.9352940320968628, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7731808731808733, "calib/avg_num_step_conf": 6.17578125, "calib/ece": 0.2134439834024897, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5352697095435685, "calib/gap": 0.4235973665973665, "calib/mean_conf": 0.6165145228215767, "calib/mu_c": 0.8116153846153845, "calib/mu_w": 0.388018018018018, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.14526970954356855, "calib/std_conf": 0.4341896262370006, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5084081041968163, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.18336316037659156, "calib/step_q_w": 0.3250449438202247, "calib/step_q_w_n": 890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 515.25, "completions/mean_terminated_length": 521.3596801757812, "completions/min_length": 0.0, "completions/min_terminated_length": 21.0, "epoch": 0.1344, "grad_norm": 0.032088715583086014, "kl": 0.094940185546875, "learning_rate": 2.0555555555555555e-06, "loss": -0.0959, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03380977362394333, "mask/share_reasoning": 0.8281418085098267, "mask/share_step_conf": 0.1263296753168106, "num_tokens": 29412422.0, "reward": 1.1038548946380615, "reward_std": 0.2703258693218231, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7186906337738037, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7999711632728577, "step": 126 }, { "adv/mean_abs_final_conf": 0.6639594435691833, "adv/mean_abs_reasoning": 0.42932218313217163, "adv/mean_abs_step_conf": 0.7333925366401672, "adv/ratio_final_to_reasoning": 1.5465295520608493, "adv/ratio_step_to_reasoning": 1.7082567951406884, "adv/std_final_conf": 0.8485727906227112, "adv/std_reasoning": 0.7205451726913452, "adv/std_step_conf": 0.9357307553291321, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.740875, "calib/avg_num_step_conf": 5.71484375, "calib/ece": 0.26549407114624507, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4743083003952569, "calib/gap": 0.327150625, "calib/mean_conf": 0.5762450592885376, "calib/mu_c": 0.74176, "calib/mu_w": 0.414609375, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17383399209486164, "calib/std_conf": 0.4357109169931961, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47575716234652116, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.09505853220953492, "calib/step_q_w": 0.38069863013698624, "calib/step_q_w_n": 730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 460.34375, "completions/mean_terminated_length": 462.1490478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 35.0, "epoch": 0.13546666666666668, "grad_norm": 0.0348670557141304, "kl": 0.1072998046875, "learning_rate": 2.027777777777778e-06, "loss": -0.061, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0351860374212265, "mask/share_reasoning": 0.8259862661361694, "mask/share_step_conf": 0.13492143154144287, "num_tokens": 29633942.0, "reward": 1.1170024871826172, "reward_std": 0.22174401581287384, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7007777094841003, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8263181447982788, "step": 127 }, { "adv/mean_abs_final_conf": 0.7019212245941162, "adv/mean_abs_reasoning": 0.5486740469932556, "adv/mean_abs_step_conf": 0.7690510749816895, "adv/ratio_final_to_reasoning": 1.2793045861029113, "adv/ratio_step_to_reasoning": 1.401653821966073, "adv/std_final_conf": 0.8775027990341187, "adv/std_reasoning": 0.7754642963409424, "adv/std_step_conf": 0.9357042908668518, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7367886178861789, "calib/avg_num_step_conf": 4.9921875, "calib/ece": 0.29071129707112975, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5439330543933054, "calib/gap": 0.3590573310905523, "calib/mean_conf": 0.6215062761506277, "calib/mu_c": 0.8062931034482759, "calib/mu_w": 0.44723577235772355, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.21343096234309628, "calib/std_conf": 0.43609650068580086, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5024716981132075, "calib/step_q_c_n": 530.0, "calib/step_q_gap": 0.1295572596105336, "calib/step_q_w": 0.37291443850267386, "calib/step_q_w_n": 748.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 541.0703125, "completions/mean_terminated_length": 543.1921997070312, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.13653333333333334, "grad_norm": 0.12452156841754913, "kl": 0.4658660888671875, "learning_rate": 2.0000000000000003e-06, "loss": -0.0899, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.035640161484479904, "mask/share_reasoning": 0.8514869809150696, "mask/share_step_conf": 0.10896661132574081, "num_tokens": 29879120.0, "reward": 1.0647579431533813, "reward_std": 0.2794821560382843, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6730015873908997, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7866345643997192, "step": 128 }, { "adv/mean_abs_final_conf": 0.6740026473999023, "adv/mean_abs_reasoning": 0.49119341373443604, "adv/mean_abs_step_conf": 0.758554220199585, "adv/ratio_final_to_reasoning": 1.372173625610343, "adv/ratio_step_to_reasoning": 1.544308614467086, "adv/std_final_conf": 0.8902783393859863, "adv/std_reasoning": 0.775271475315094, "adv/std_step_conf": 0.9356626868247986, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6853947368421052, "calib/avg_num_step_conf": 5.78515625, "calib/ece": 0.2503571428571427, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5436507936507936, "calib/gap": 0.2583078947368421, "calib/mean_conf": 0.709404761904762, "calib/mu_c": 0.8119078947368421, "calib/mu_w": 0.5536, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17829365079365067, "calib/std_conf": 0.3797921879252788, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4949701314217443, "calib/step_q_c_n": 837.0, "calib/step_q_gap": 0.06855708794348342, "calib/step_q_w": 0.42641304347826087, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2260.0, "completions/max_terminated_length": 2260.0, "completions/mean_length": 464.4765625, "completions/mean_terminated_length": 464.4765625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1376, "grad_norm": 0.03653028607368469, "kl": 0.10650634765625, "learning_rate": 1.9722222222222224e-06, "loss": -0.0337, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03688127547502518, "mask/share_reasoning": 0.8289004564285278, "mask/share_step_conf": 0.1342182755470276, "num_tokens": 30100410.0, "reward": 1.1344451904296875, "reward_std": 0.21902605891227722, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7134796977043152, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.827044665813446, "step": 129 }, { "adv/mean_abs_final_conf": 0.6065406799316406, "adv/mean_abs_reasoning": 0.4211353659629822, "adv/mean_abs_step_conf": 0.7387837767601013, "adv/ratio_final_to_reasoning": 1.4402511139018317, "adv/ratio_step_to_reasoning": 1.7542667666268628, "adv/std_final_conf": 0.8230589628219604, "adv/std_reasoning": 0.7014061212539673, "adv/std_step_conf": 0.9356253147125244, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7767603730233814, "calib/avg_num_step_conf": 5.05078125, "calib/ece": 0.19618473895582333, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.606425702811245, "calib/gap": 0.42343357210433835, "calib/mean_conf": 0.6834136546184739, "calib/mu_c": 0.8500662251655628, "calib/mu_w": 0.4266326530612245, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.13658634538152614, "calib/std_conf": 0.41361564911772564, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5404630969609262, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.1171740604160757, "calib/step_q_w": 0.4232890365448505, "calib/step_q_w_n": 602.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1970.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 455.4921875, "completions/mean_terminated_length": 455.4921875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.13866666666666666, "grad_norm": 0.036019667983055115, "kl": 0.108154296875, "learning_rate": 1.944444444444445e-06, "loss": 0.0524, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03684893622994423, "mask/share_reasoning": 0.8434115648269653, "mask/share_step_conf": 0.11973953992128372, "num_tokens": 30322304.0, "reward": 1.1682312488555908, "reward_std": 0.23199787735939026, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7613061666488647, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8422915935516357, "step": 130 }, { "adv/mean_abs_final_conf": 0.7187561392784119, "adv/mean_abs_reasoning": 0.4014168381690979, "adv/mean_abs_step_conf": 0.7524584531784058, "adv/ratio_final_to_reasoning": 1.7905480561222347, "adv/ratio_step_to_reasoning": 1.8745064522216945, "adv/std_final_conf": 0.8986456394195557, "adv/std_reasoning": 0.7012752294540405, "adv/std_step_conf": 0.9357667565345764, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7624935765673175, "calib/avg_num_step_conf": 5.48828125, "calib/ece": 0.20059760956175296, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.3904382470119522, "calib/gap": 0.38298111510791355, "calib/mean_conf": 0.5310358565737051, "calib/mu_c": 0.7431249999999999, "calib/mu_w": 0.3601438848920864, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14270916334661352, "calib/std_conf": 0.4212839919835041, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.500577849117175, "calib/step_q_c_n": 623.0, "calib/step_q_gap": 0.11279012533200872, "calib/step_q_w": 0.38778772378516624, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 460.75390625, "completions/mean_terminated_length": 464.38189697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.13973333333333332, "grad_norm": 0.033635661005973816, "kl": 0.1043853759765625, "learning_rate": 1.916666666666667e-06, "loss": -0.085, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03563493862748146, "mask/share_reasoning": 0.8335763216018677, "mask/share_step_conf": 0.12297628074884415, "num_tokens": 30546465.0, "reward": 1.1304597854614258, "reward_std": 0.22982972860336304, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.7387917637825012, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8262102603912354, "step": 131 }, { "adv/mean_abs_final_conf": 0.6320432424545288, "adv/mean_abs_reasoning": 0.5479155778884888, "adv/mean_abs_step_conf": 0.7798007726669312, "adv/ratio_final_to_reasoning": 1.153541289864844, "adv/ratio_step_to_reasoning": 1.4232133637668456, "adv/std_final_conf": 0.8763335943222046, "adv/std_reasoning": 0.792772114276886, "adv/std_step_conf": 0.9349549412727356, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8292570978952201, "calib/avg_num_step_conf": 5.4140625, "calib/ece": 0.1606746031746032, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4880952380952381, "calib/gap": 0.46420890433414214, "calib/mean_conf": 0.6292460317460317, "calib/mu_c": 0.8152980132450331, "calib/mu_w": 0.351089108910891, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09535714285714289, "calib/std_conf": 0.41082648379314624, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5584105960264901, "calib/step_q_c_n": 755.0, "calib/step_q_gap": 0.16641376559859788, "calib/step_q_w": 0.39199683042789224, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1271.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 463.515625, "completions/mean_terminated_length": 465.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.1408, "grad_norm": 0.03424832969903946, "kl": 0.10589599609375, "learning_rate": 1.888888888888889e-06, "loss": -0.0352, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03672264143824577, "mask/share_reasoning": 0.8319817781448364, "mask/share_step_conf": 0.1273893415927887, "num_tokens": 30770717.0, "reward": 1.1921403408050537, "reward_std": 0.20535215735435486, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7974933981895447, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8479623198509216, "step": 132 }, { "adv/mean_abs_final_conf": 0.7448168992996216, "adv/mean_abs_reasoning": 0.5736385583877563, "adv/mean_abs_step_conf": 0.7350783348083496, "adv/ratio_final_to_reasoning": 1.2984080104255398, "adv/ratio_step_to_reasoning": 1.2814311800697793, "adv/std_final_conf": 0.8978567719459534, "adv/std_reasoning": 0.8099358677864075, "adv/std_step_conf": 0.9359154105186462, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7202092871157619, "calib/avg_num_step_conf": 6.3359375, "calib/ece": 0.2692369477911647, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3614457831325301, "calib/gap": 0.320380640941792, "calib/mean_conf": 0.4864257028112449, "calib/mu_c": 0.6652727272727272, "calib/mu_w": 0.3448920863309352, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15694779116465862, "calib/std_conf": 0.42975999127701225, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46619533527696794, "calib/step_q_c_n": 686.0, "calib/step_q_gap": 0.09176157459320722, "calib/step_q_w": 0.3744337606837607, "calib/step_q_w_n": 936.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2130.0, "completions/max_terminated_length": 2130.0, "completions/mean_length": 569.0, "completions/mean_terminated_length": 573.4802856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.14186666666666667, "grad_norm": 0.04488792642951012, "kl": 0.08905792236328125, "learning_rate": 1.8611111111111113e-06, "loss": -0.1102, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030352286994457245, "mask/share_reasoning": 0.8483153581619263, "mask/share_step_conf": 0.11351984739303589, "num_tokens": 31022725.0, "reward": 1.120848298072815, "reward_std": 0.2642374038696289, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.7049039006233215, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8370283842086792, "step": 133 }, { "adv/mean_abs_final_conf": 0.7327480912208557, "adv/mean_abs_reasoning": 0.5926192998886108, "adv/mean_abs_step_conf": 0.7497801780700684, "adv/ratio_final_to_reasoning": 1.2364566786106757, "adv/ratio_step_to_reasoning": 1.265197029882418, "adv/std_final_conf": 0.9129754900932312, "adv/std_reasoning": 0.8265247941017151, "adv/std_step_conf": 0.9356361031532288, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.737291449426486, "calib/avg_num_step_conf": 5.16015625, "calib/ece": 0.20534136546184734, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3092369477911647, "calib/gap": 0.3447536496350365, "calib/mean_conf": 0.461566265060241, "calib/mu_c": 0.65125, "calib/mu_w": 0.3064963503649635, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10855421686746983, "calib/std_conf": 0.4173645039328073, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5047840531561462, "calib/step_q_c_n": 602.0, "calib/step_q_gap": 0.11189114634112529, "calib/step_q_w": 0.39289290681502087, "calib/step_q_w_n": 719.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1945.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 525.98046875, "completions/mean_terminated_length": 528.0431518554688, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.14293333333333333, "grad_norm": 0.030651414766907692, "kl": 0.0981597900390625, "learning_rate": 1.8333333333333333e-06, "loss": -0.0658, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.031976088881492615, "mask/share_reasoning": 0.8612660765647888, "mask/share_step_conf": 0.10285161435604095, "num_tokens": 31266328.0, "reward": 1.1207489967346191, "reward_std": 0.22252947092056274, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.7283519506454468, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8207431435585022, "step": 134 }, { "adv/mean_abs_final_conf": 0.7493464946746826, "adv/mean_abs_reasoning": 0.5230491161346436, "adv/mean_abs_step_conf": 0.7491369843482971, "adv/ratio_final_to_reasoning": 1.4326503411618146, "adv/ratio_step_to_reasoning": 1.432249785420637, "adv/std_final_conf": 0.913597822189331, "adv/std_reasoning": 0.7754340767860413, "adv/std_step_conf": 0.9354913830757141, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6451528952504879, "calib/avg_num_step_conf": 5.64453125, "calib/ece": 0.2869588313413015, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.41434262948207173, "calib/gap": 0.2268954673606593, "calib/mean_conf": 0.5647543160690571, "calib/mu_c": 0.6605747126436782, "calib/mu_w": 0.4336792452830189, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.13701195219123513, "calib/std_conf": 0.41709440920609037, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4684268125854993, "calib/step_q_c_n": 731.0, "calib/step_q_gap": 0.06258647645104554, "calib/step_q_w": 0.40584033613445375, "calib/step_q_w_n": 714.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2558.0, "completions/max_terminated_length": 2558.0, "completions/mean_length": 508.18359375, "completions/mean_terminated_length": 508.18359375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.144, "grad_norm": 0.0449938029050827, "kl": 0.094482421875, "learning_rate": 1.8055555555555557e-06, "loss": 0.0734, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03592613339424133, "mask/share_reasoning": 0.8439304828643799, "mask/share_step_conf": 0.12014340609312057, "num_tokens": 31502303.0, "reward": 1.1263656616210938, "reward_std": 0.23089656233787537, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6755272150039673, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8462612628936768, "step": 135 }, { "adv/mean_abs_final_conf": 0.6829756498336792, "adv/mean_abs_reasoning": 0.4903065860271454, "adv/mean_abs_step_conf": 0.7625067234039307, "adv/ratio_final_to_reasoning": 1.3929563038663055, "adv/ratio_step_to_reasoning": 1.5551631267741428, "adv/std_final_conf": 0.8802697062492371, "adv/std_reasoning": 0.7576611638069153, "adv/std_step_conf": 0.9353592991828918, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8413996292372881, "calib/avg_num_step_conf": 5.6875, "calib/ece": 0.16715447154471547, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3130081300813008, "calib/gap": 0.46579978813559336, "calib/mean_conf": 0.45186991869918697, "calib/mu_c": 0.6942372881355934, "calib/mu_w": 0.22843750000000002, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.06967479674796749, "calib/std_conf": 0.4207443067906461, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.47006163328197237, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.1524903817330257, "calib/step_q_w": 0.31757125154894666, "calib/step_q_w_n": 807.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 495.5, "completions/mean_terminated_length": 499.4015808105469, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.14506666666666668, "grad_norm": 0.029826823621988297, "kl": 0.10265350341796875, "learning_rate": 1.777777777777778e-06, "loss": 0.0001, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03438597545027733, "mask/share_reasoning": 0.83380126953125, "mask/share_step_conf": 0.12400020658969879, "num_tokens": 31737639.0, "reward": 1.1563998460769653, "reward_std": 0.21687453985214233, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7736788988113403, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8364971876144409, "step": 136 }, { "adv/mean_abs_final_conf": 0.6984125375747681, "adv/mean_abs_reasoning": 0.4689386487007141, "adv/mean_abs_step_conf": 0.7421602606773376, "adv/ratio_final_to_reasoning": 1.4893473581455827, "adv/ratio_step_to_reasoning": 1.5826382891101793, "adv/std_final_conf": 0.8773199319839478, "adv/std_reasoning": 0.7392724752426147, "adv/std_step_conf": 0.9357515573501587, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7205172194688017, "calib/avg_num_step_conf": 5.60546875, "calib/ece": 0.2425498007968127, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3745019920318725, "calib/gap": 0.3167702376413776, "calib/mean_conf": 0.5148207171314741, "calib/mu_c": 0.6776229508196722, "calib/mu_w": 0.36085271317829454, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13565737051792826, "calib/std_conf": 0.421878114099864, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4733026113671275, "calib/step_q_c_n": 651.0, "calib/step_q_gap": 0.09498628483651522, "calib/step_q_w": 0.37831632653061226, "calib/step_q_w_n": 784.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 468.09375, "completions/mean_terminated_length": 469.929443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.14613333333333334, "grad_norm": 0.04951392114162445, "kl": 0.116912841796875, "learning_rate": 1.75e-06, "loss": 0.0458, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035196349024772644, "mask/share_reasoning": 0.8351722955703735, "mask/share_step_conf": 0.1257251352071762, "num_tokens": 31964455.0, "reward": 1.132115364074707, "reward_std": 0.2261495590209961, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7153968811035156, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8382851481437683, "step": 137 }, { "adv/mean_abs_final_conf": 0.7322195172309875, "adv/mean_abs_reasoning": 0.5432929992675781, "adv/mean_abs_step_conf": 0.7652676701545715, "adv/ratio_final_to_reasoning": 1.3477433322684154, "adv/ratio_step_to_reasoning": 1.408572669234172, "adv/std_final_conf": 0.910886287689209, "adv/std_reasoning": 0.7755010724067688, "adv/std_step_conf": 0.9350913763046265, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7641581279423982, "calib/avg_num_step_conf": 5.34765625, "calib/ece": 0.21911646586345376, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.37349397590361444, "calib/gap": 0.4090729714760454, "calib/mean_conf": 0.5021686746987952, "calib/mu_c": 0.653312101910828, "calib/mu_w": 0.24423913043478263, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.045381526104417626, "calib/std_conf": 0.4253665035142027, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4698145859085291, "calib/step_q_c_n": 809.0, "calib/step_q_gap": 0.10417172876567199, "calib/step_q_w": 0.3656428571428571, "calib/step_q_w_n": 560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 486.3515625, "completions/mean_terminated_length": 486.3515625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.1472, "grad_norm": 0.06074017286300659, "kl": 0.10494232177734375, "learning_rate": 1.7222222222222224e-06, "loss": -0.0467, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.036103568971157074, "mask/share_reasoning": 0.8429389595985413, "mask/share_step_conf": 0.12095746397972107, "num_tokens": 32193297.0, "reward": 1.163382649421692, "reward_std": 0.20006373524665833, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7394359111785889, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8467612266540527, "step": 138 }, { "adv/mean_abs_final_conf": 0.6884328722953796, "adv/mean_abs_reasoning": 0.514777660369873, "adv/mean_abs_step_conf": 0.782334566116333, "adv/ratio_final_to_reasoning": 1.3373402252940298, "adv/ratio_step_to_reasoning": 1.5197523636791417, "adv/std_final_conf": 0.8758470416069031, "adv/std_reasoning": 0.7753527164459229, "adv/std_step_conf": 0.9349035620689392, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7622080679405521, "calib/avg_num_step_conf": 5.296875, "calib/ece": 0.22660079051383397, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.31620553359683795, "calib/gap": 0.3659156050955414, "calib/mean_conf": 0.4933201581027668, "calib/mu_c": 0.6321656050955414, "calib/mu_w": 0.26625000000000004, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.049683794466403156, "calib/std_conf": 0.4093251510925892, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49825699745547075, "calib/step_q_c_n": 786.0, "calib/step_q_gap": 0.10916927815722516, "calib/step_q_w": 0.3890877192982456, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2838.0, "completions/max_terminated_length": 2838.0, "completions/mean_length": 449.41796875, "completions/mean_terminated_length": 449.41796875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.14826666666666666, "grad_norm": 0.09828542172908783, "kl": 0.111328125, "learning_rate": 1.6944444444444446e-06, "loss": 0.0748, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037707164883613586, "mask/share_reasoning": 0.8396942615509033, "mask/share_step_conf": 0.1225985437631607, "num_tokens": 32411444.0, "reward": 1.1785929203033447, "reward_std": 0.19305641949176788, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7442941665649414, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8617194890975952, "step": 139 }, { "adv/mean_abs_final_conf": 0.6502406597137451, "adv/mean_abs_reasoning": 0.6026022434234619, "adv/mean_abs_step_conf": 0.7315477132797241, "adv/ratio_final_to_reasoning": 1.0790544954158205, "adv/ratio_step_to_reasoning": 1.2139810650616005, "adv/std_final_conf": 0.8286058902740479, "adv/std_reasoning": 0.8265973329544067, "adv/std_step_conf": 0.935325026512146, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7739368998628258, "calib/avg_num_step_conf": 4.9609375, "calib/ece": 0.22099206349206352, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.48412698412698413, "calib/gap": 0.3839876543209877, "calib/mean_conf": 0.6066269841269841, "calib/mu_c": 0.7437654320987654, "calib/mu_w": 0.35977777777777775, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09238095238095242, "calib/std_conf": 0.41818406052623064, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5448609431680774, "calib/step_q_c_n": 827.0, "calib/step_q_gap": 0.10422888899200516, "calib/step_q_w": 0.44063205417607226, "calib/step_q_w_n": 443.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 475.62109375, "completions/mean_terminated_length": 475.62109375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.14933333333333335, "grad_norm": 0.03260481357574463, "kl": 0.0984954833984375, "learning_rate": 1.6666666666666667e-06, "loss": -0.0596, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035779692232608795, "mask/share_reasoning": 0.8465887904167175, "mask/share_step_conf": 0.11763153970241547, "num_tokens": 32638219.0, "reward": 1.177355408668518, "reward_std": 0.1992952972650528, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7544379234313965, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8522652387619019, "step": 140 }, { "adv/mean_abs_final_conf": 0.652328372001648, "adv/mean_abs_reasoning": 0.46055957674980164, "adv/mean_abs_step_conf": 0.7508203983306885, "adv/ratio_final_to_reasoning": 1.416382168416019, "adv/ratio_step_to_reasoning": 1.630235123171851, "adv/std_final_conf": 0.8770766258239746, "adv/std_reasoning": 0.739296019077301, "adv/std_step_conf": 0.9351637363433838, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8754497001998668, "calib/avg_num_step_conf": 5.4921875, "calib/ece": 0.09928853754940715, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4980237154150198, "calib/gap": 0.567321785476349, "calib/mean_conf": 0.6248221343873518, "calib/mu_c": 0.8378481012658228, "calib/mu_w": 0.2705263157894737, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.049802371541502015, "calib/std_conf": 0.41490851887586083, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5223926380368098, "calib/step_q_c_n": 815.0, "calib/step_q_gap": 0.12699500690313803, "calib/step_q_w": 0.39539763113367177, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2510.0, "completions/max_terminated_length": 2510.0, "completions/mean_length": 523.765625, "completions/mean_terminated_length": 523.765625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1504, "grad_norm": 0.037485457956790924, "kl": 0.0937347412109375, "learning_rate": 1.638888888888889e-06, "loss": 0.0771, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03306419402360916, "mask/share_reasoning": 0.8539596199989319, "mask/share_step_conf": 0.11297617852687836, "num_tokens": 32879399.0, "reward": 1.2163152694702148, "reward_std": 0.1883358359336853, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.8347440958023071, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8532785177230835, "step": 141 }, { "adv/mean_abs_final_conf": 0.7393682599067688, "adv/mean_abs_reasoning": 0.49121570587158203, "adv/mean_abs_step_conf": 0.7508542537689209, "adv/ratio_final_to_reasoning": 1.5051804147729368, "adv/ratio_step_to_reasoning": 1.528563205113022, "adv/std_final_conf": 0.9064189195632935, "adv/std_reasoning": 0.7752906084060669, "adv/std_step_conf": 0.9352670907974243, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8096784363177806, "calib/avg_num_step_conf": 5.6875, "calib/ece": 0.1771031746031746, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.43253968253968256, "calib/gap": 0.4667931904161412, "calib/mean_conf": 0.5708333333333334, "calib/mu_c": 0.8116393442622951, "calib/mu_w": 0.3448461538461539, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.1319047619047619, "calib/std_conf": 0.4265072479333529, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5488201438848921, "calib/step_q_c_n": 695.0, "calib/step_q_gap": 0.1478608797587423, "calib/step_q_w": 0.4009592641261498, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 504.42578125, "completions/mean_terminated_length": 506.4039611816406, "completions/min_length": 0.0, "completions/min_terminated_length": 33.0, "epoch": 0.15146666666666667, "grad_norm": 0.03455263748764992, "kl": 0.09305572509765625, "learning_rate": 1.6111111111111113e-06, "loss": -0.0546, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03336789086461067, "mask/share_reasoning": 0.8411300182342529, "mask/share_step_conf": 0.12159579992294312, "num_tokens": 33113692.0, "reward": 1.1650047302246094, "reward_std": 0.22650645673274994, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7737703323364258, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8437427282333374, "step": 142 }, { "adv/mean_abs_final_conf": 0.6579672694206238, "adv/mean_abs_reasoning": 0.5036629438400269, "adv/mean_abs_step_conf": 0.7606317400932312, "adv/ratio_final_to_reasoning": 1.306364260995955, "adv/ratio_step_to_reasoning": 1.5101999251603124, "adv/std_final_conf": 0.8390932083129883, "adv/std_reasoning": 0.7575718760490417, "adv/std_step_conf": 0.9355660676956177, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8124918672739101, "calib/avg_num_step_conf": 5.4296875, "calib/ece": 0.18055776892430278, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4262948207171315, "calib/gap": 0.41892843201040986, "calib/mean_conf": 0.5998406374501992, "calib/mu_c": 0.7767586206896552, "calib/mu_w": 0.3578301886792453, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.10135458167330677, "calib/std_conf": 0.4079225066312555, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5018259803921569, "calib/step_q_c_n": 816.0, "calib/step_q_gap": 0.11198560582769695, "calib/step_q_w": 0.3898403745644599, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 495.2265625, "completions/mean_terminated_length": 497.1686706542969, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.15253333333333333, "grad_norm": 0.3523845970630646, "kl": 0.6697616577148438, "learning_rate": 1.5833333333333333e-06, "loss": -0.0899, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03295125439763069, "mask/share_reasoning": 0.8419654965400696, "mask/share_step_conf": 0.12117701023817062, "num_tokens": 33347806.0, "reward": 1.1694198846817017, "reward_std": 0.20234212279319763, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7747402191162109, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.837003767490387, "step": 143 }, { "adv/mean_abs_final_conf": 0.6553045511245728, "adv/mean_abs_reasoning": 0.5001832842826843, "adv/mean_abs_step_conf": 0.7231369018554688, "adv/ratio_final_to_reasoning": 1.310128850196081, "adv/ratio_step_to_reasoning": 1.445743839465814, "adv/std_final_conf": 0.8448795676231384, "adv/std_reasoning": 0.7576124668121338, "adv/std_step_conf": 0.9357913136482239, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7382836071069394, "calib/avg_num_step_conf": 5.56640625, "calib/ece": 0.2080158730158729, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5198412698412699, "calib/gap": 0.37706469996647674, "calib/mean_conf": 0.6242857142857142, "calib/mu_c": 0.7664331210191083, "calib/mu_w": 0.3893684210526316, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10464285714285702, "calib/std_conf": 0.42411051377106473, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5042337164750957, "calib/step_q_c_n": 870.0, "calib/step_q_gap": 0.10962110386248308, "calib/step_q_w": 0.39461261261261266, "calib/step_q_w_n": 555.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1939.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 493.74609375, "completions/mean_terminated_length": 493.74609375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.1536, "grad_norm": 0.035648465156555176, "kl": 0.1121368408203125, "learning_rate": 1.5555555555555558e-06, "loss": 0.036, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03582587465643883, "mask/share_reasoning": 0.8412730097770691, "mask/share_step_conf": 0.12290111184120178, "num_tokens": 33578333.0, "reward": 1.1678181886672974, "reward_std": 0.21501518785953522, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7465636730194092, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8468817472457886, "step": 144 }, { "adv/mean_abs_final_conf": 0.6195697784423828, "adv/mean_abs_reasoning": 0.4878042936325073, "adv/mean_abs_step_conf": 0.7531744241714478, "adv/ratio_final_to_reasoning": 1.2701195674779002, "adv/ratio_step_to_reasoning": 1.5440094193571405, "adv/std_final_conf": 0.8374097347259521, "adv/std_reasoning": 0.7394381761550903, "adv/std_step_conf": 0.9352357983589172, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8002614379084968, "calib/avg_num_step_conf": 6.0625, "calib/ece": 0.14268774703557313, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.49407114624505927, "calib/gap": 0.41498169934640516, "calib/mean_conf": 0.6635573122529644, "calib/mu_c": 0.8275816993464051, "calib/mu_w": 0.41259999999999997, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10075098814229248, "calib/std_conf": 0.38478574921930536, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5201180637544274, "calib/step_q_c_n": 847.0, "calib/step_q_gap": 0.14028827652038484, "calib/step_q_w": 0.37982978723404254, "calib/step_q_w_n": 705.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2910.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 488.4765625, "completions/mean_terminated_length": 488.4765625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.15466666666666667, "grad_norm": 0.03285250440239906, "kl": 0.094085693359375, "learning_rate": 1.527777777777778e-06, "loss": 0.1492, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03602126985788345, "mask/share_reasoning": 0.8300851583480835, "mask/share_step_conf": 0.13389351963996887, "num_tokens": 33806087.0, "reward": 1.1905624866485596, "reward_std": 0.19327059388160706, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7945300340652466, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8467923402786255, "step": 145 }, { "adv/mean_abs_final_conf": 0.7271376848220825, "adv/mean_abs_reasoning": 0.5126863121986389, "adv/mean_abs_step_conf": 0.7399911880493164, "adv/ratio_final_to_reasoning": 1.4182896393386741, "adv/ratio_step_to_reasoning": 1.4433605314639428, "adv/std_final_conf": 0.8864182233810425, "adv/std_reasoning": 0.7927311062812805, "adv/std_step_conf": 0.9353997111320496, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7329771290806021, "calib/avg_num_step_conf": 6.0234375, "calib/ece": 0.23456349206349206, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4126984126984127, "calib/gap": 0.34000716752459753, "calib/mean_conf": 0.5587698412698413, "calib/mu_c": 0.7598058252427183, "calib/mu_w": 0.4197986577181208, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1923015873015873, "calib/std_conf": 0.4220016486595798, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.49958400000000003, "calib/step_q_c_n": 625.0, "calib/step_q_gap": 0.10310635550708835, "calib/step_q_w": 0.3964776444929117, "calib/step_q_w_n": 917.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 511.69140625, "completions/mean_terminated_length": 515.720458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.15573333333333333, "grad_norm": 0.04114718735218048, "kl": 0.105072021484375, "learning_rate": 1.5e-06, "loss": 0.0185, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032341986894607544, "mask/share_reasoning": 0.8382286429405212, "mask/share_step_conf": 0.12161687016487122, "num_tokens": 34044296.0, "reward": 1.1270300149917603, "reward_std": 0.2149513065814972, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.7105585932731628, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8446260094642639, "step": 146 }, { "adv/mean_abs_final_conf": 0.6262341737747192, "adv/mean_abs_reasoning": 0.40930497646331787, "adv/mean_abs_step_conf": 0.7368854284286499, "adv/ratio_final_to_reasoning": 1.5299940381517514, "adv/ratio_step_to_reasoning": 1.8003334208048407, "adv/std_final_conf": 0.8180379867553711, "adv/std_reasoning": 0.7014212608337402, "adv/std_step_conf": 0.9353666305541992, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7938918988648092, "calib/avg_num_step_conf": 5.8125, "calib/ece": 0.21275999999999995, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.54, "calib/gap": 0.47253869969040235, "calib/mean_conf": 0.60636, "calib/mu_c": 0.8634210526315789, "calib/mu_w": 0.3908823529411765, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.18155999999999994, "calib/std_conf": 0.44288141798905944, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5424237804878049, "calib/step_q_c_n": 656.0, "calib/step_q_gap": 0.1334093574108819, "calib/step_q_w": 0.40901442307692304, "calib/step_q_w_n": 832.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2295.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 536.19140625, "completions/mean_terminated_length": 536.19140625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.1568, "grad_norm": 0.04610983282327652, "kl": 0.0912017822265625, "learning_rate": 1.4722222222222225e-06, "loss": -0.0097, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03225596994161606, "mask/share_reasoning": 0.8475143313407898, "mask/share_step_conf": 0.12022969126701355, "num_tokens": 34285241.0, "reward": 1.1384716033935547, "reward_std": 0.26133859157562256, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7456488013267517, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8328420519828796, "step": 147 }, { "adv/mean_abs_final_conf": 0.5644453763961792, "adv/mean_abs_reasoning": 0.3946724534034729, "adv/mean_abs_step_conf": 0.7691161036491394, "adv/ratio_final_to_reasoning": 1.4301615720293197, "adv/ratio_step_to_reasoning": 1.948745338106669, "adv/std_final_conf": 0.7765293717384338, "adv/std_reasoning": 0.6815993189811707, "adv/std_step_conf": 0.9353897571563721, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7558139534883721, "calib/avg_num_step_conf": 5.8203125, "calib/ece": 0.1956521739130434, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6324110671936759, "calib/gap": 0.38987438989376966, "calib/mean_conf": 0.7222134387351778, "calib/mu_c": 0.8470348837209302, "calib/mu_w": 0.4571604938271605, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11901185770750983, "calib/std_conf": 0.40146549955043687, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.494751968503937, "calib/step_q_c_n": 1016.0, "calib/step_q_gap": 0.08905576597229142, "calib/step_q_w": 0.40569620253164557, "calib/step_q_w_n": 474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 476.58203125, "completions/mean_terminated_length": 478.4510192871094, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.15786666666666666, "grad_norm": 0.04696307331323624, "kl": 0.0993499755859375, "learning_rate": 1.4444444444444445e-06, "loss": -0.0649, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.036317791789770126, "mask/share_reasoning": 0.8251909017562866, "mask/share_step_conf": 0.13458506762981415, "num_tokens": 34512358.0, "reward": 1.1845102310180664, "reward_std": 0.16278135776519775, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7798437476158142, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8380970358848572, "step": 148 }, { "adv/mean_abs_final_conf": 0.6602301001548767, "adv/mean_abs_reasoning": 0.5817610025405884, "adv/mean_abs_step_conf": 0.7569714784622192, "adv/ratio_final_to_reasoning": 1.134882017308841, "adv/ratio_step_to_reasoning": 1.3011726037951585, "adv/std_final_conf": 0.8782934546470642, "adv/std_reasoning": 0.8265743255615234, "adv/std_step_conf": 0.9354011416435242, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8224085855664803, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.1583466666666667, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.552, "calib/gap": 0.5193470856628752, "calib/mean_conf": 0.6600533333333333, "calib/mu_c": 0.903107769423559, "calib/mu_w": 0.3837606837606837, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.14320000000000005, "calib/std_conf": 0.41802636471878185, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5346911764705883, "calib/step_q_c_n": 680.0, "calib/step_q_gap": 0.1386883315203749, "calib/step_q_w": 0.3960028449502134, "calib/step_q_w_n": 703.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1900.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 507.0546875, "completions/mean_terminated_length": 507.0546875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.15893333333333334, "grad_norm": 0.03461736813187599, "kl": 0.09224700927734375, "learning_rate": 1.4166666666666667e-06, "loss": 0.0572, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03385186567902565, "mask/share_reasoning": 0.8462405204772949, "mask/share_step_conf": 0.11990756541490555, "num_tokens": 34746620.0, "reward": 1.1829655170440674, "reward_std": 0.23066285252571106, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7954006195068359, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8485827445983887, "step": 149 }, { "adv/mean_abs_final_conf": 0.7037779688835144, "adv/mean_abs_reasoning": 0.5218222141265869, "adv/mean_abs_step_conf": 0.7530713081359863, "adv/ratio_final_to_reasoning": 1.3486930027719892, "adv/ratio_step_to_reasoning": 1.4431568602276896, "adv/std_final_conf": 0.8822007775306702, "adv/std_reasoning": 0.7575706839561462, "adv/std_step_conf": 0.9354367256164551, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7739515198153135, "calib/avg_num_step_conf": 5.49609375, "calib/ece": 0.2362549800796813, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6215139442231076, "calib/gap": 0.38889444658201877, "calib/mean_conf": 0.7207171314741037, "calib/mu_c": 0.8957971014492754, "calib/mu_w": 0.5069026548672566, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.20358565737051795, "calib/std_conf": 0.40392432980510673, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5245503355704698, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.13456544131065107, "calib/step_q_w": 0.38998489425981875, "calib/step_q_w_n": 662.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2144.0, "completions/max_terminated_length": 2144.0, "completions/mean_length": 460.9921875, "completions/mean_terminated_length": 460.9921875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.16, "grad_norm": 0.0454990491271019, "kl": 0.10170745849609375, "learning_rate": 1.3888888888888892e-06, "loss": 0.025, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0382879413664341, "mask/share_reasoning": 0.8248406648635864, "mask/share_step_conf": 0.1368713676929474, "num_tokens": 34969594.0, "reward": 1.1438206434249878, "reward_std": 0.2367713451385498, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7262125015258789, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8399109244346619, "step": 150 }, { "adv/mean_abs_final_conf": 0.7293127775192261, "adv/mean_abs_reasoning": 0.5311736464500427, "adv/mean_abs_step_conf": 0.7511847019195557, "adv/ratio_final_to_reasoning": 1.3730213883790985, "adv/ratio_step_to_reasoning": 1.4141979876823674, "adv/std_final_conf": 0.9007099866867065, "adv/std_reasoning": 0.7754827737808228, "adv/std_step_conf": 0.9359373450279236, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7826836728053621, "calib/avg_num_step_conf": 5.74609375, "calib/ece": 0.18302419354838714, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4717741935483871, "calib/gap": 0.4641276761892367, "calib/mean_conf": 0.5780241935483871, "calib/mu_c": 0.815702479338843, "calib/mu_w": 0.35157480314960626, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.13657258064516137, "calib/std_conf": 0.4409535664917214, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4977232142857143, "calib/step_q_c_n": 672.0, "calib/step_q_gap": 0.14567064857858042, "calib/step_q_w": 0.3520525657071339, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2129.0, "completions/max_terminated_length": 2129.0, "completions/mean_length": 524.390625, "completions/mean_terminated_length": 526.4470825195312, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.16106666666666666, "grad_norm": 0.03694000095129013, "kl": 0.0893096923828125, "learning_rate": 1.3611111111111112e-06, "loss": -0.0521, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03346676751971245, "mask/share_reasoning": 0.8463730216026306, "mask/share_step_conf": 0.11625394225120544, "num_tokens": 35210862.0, "reward": 1.126600742340088, "reward_std": 0.23628029227256775, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.751246452331543, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8096367120742798, "step": 151 }, { "adv/mean_abs_final_conf": 0.7440125942230225, "adv/mean_abs_reasoning": 0.6250966787338257, "adv/mean_abs_step_conf": 0.7569712400436401, "adv/ratio_final_to_reasoning": 1.1902360379358738, "adv/ratio_step_to_reasoning": 1.210966664511696, "adv/std_final_conf": 0.8971284627914429, "adv/std_reasoning": 0.8266865015029907, "adv/std_step_conf": 0.9358736276626587, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7288417966146875, "calib/avg_num_step_conf": 5.47265625, "calib/ece": 0.23567346938775513, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.40408163265306124, "calib/gap": 0.35499533519925375, "calib/mean_conf": 0.553795918367347, "calib/mu_c": 0.730569105691057, "calib/mu_w": 0.3755737704918033, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.14371428571428574, "calib/std_conf": 0.4257429031412937, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.46891547049441784, "calib/step_q_c_n": 627.0, "calib/step_q_gap": 0.12126689168304827, "calib/step_q_w": 0.34764857881136957, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 497.62890625, "completions/mean_terminated_length": 499.5804138183594, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.16213333333333332, "grad_norm": 0.038726769387722015, "kl": 0.09711456298828125, "learning_rate": 1.3333333333333334e-06, "loss": -0.11, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03548308089375496, "mask/share_reasoning": 0.8377888202667236, "mask/share_step_conf": 0.1228218674659729, "num_tokens": 35443647.0, "reward": 1.0959519147872925, "reward_std": 0.2796997129917145, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7023949027061462, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8029018640518188, "step": 152 }, { "adv/mean_abs_final_conf": 0.718186616897583, "adv/mean_abs_reasoning": 0.5393495559692383, "adv/mean_abs_step_conf": 0.7579127550125122, "adv/ratio_final_to_reasoning": 1.3315791381471809, "adv/ratio_step_to_reasoning": 1.4052347807174974, "adv/std_final_conf": 0.8757449388504028, "adv/std_reasoning": 0.7927350997924805, "adv/std_step_conf": 0.9352637529373169, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6873619233268355, "calib/avg_num_step_conf": 6.203125, "calib/ece": 0.3093172690763052, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3534136546184739, "calib/gap": 0.25028849902534117, "calib/mean_conf": 0.48955823293172696, "calib/mu_c": 0.6041481481481482, "calib/mu_w": 0.35385964912280704, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.12835341365461844, "calib/std_conf": 0.430426385244852, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.46790502793296096, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.13672383527241055, "calib/step_q_w": 0.3311811926605504, "calib/step_q_w_n": 872.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2287.0, "completions/max_terminated_length": 2287.0, "completions/mean_length": 515.41015625, "completions/mean_terminated_length": 517.431396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.1632, "grad_norm": 0.03288364037871361, "kl": 0.09600830078125, "learning_rate": 1.3055555555555556e-06, "loss": 0.0006, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03258873149752617, "mask/share_reasoning": 0.8409442901611328, "mask/share_step_conf": 0.1225607693195343, "num_tokens": 35682912.0, "reward": 1.0964607000350952, "reward_std": 0.2279350906610489, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6613730192184448, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8225946426391602, "step": 153 }, { "adv/mean_abs_final_conf": 0.6926400661468506, "adv/mean_abs_reasoning": 0.46163809299468994, "adv/mean_abs_step_conf": 0.7453674077987671, "adv/ratio_final_to_reasoning": 1.5003962555464803, "adv/ratio_step_to_reasoning": 1.6146141731144807, "adv/std_final_conf": 0.8909844756126404, "adv/std_reasoning": 0.7392441034317017, "adv/std_step_conf": 0.9355279803276062, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.730795739348371, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.21984189723320158, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.45849802371541504, "calib/gap": 0.41358583959899753, "calib/mean_conf": 0.558498023715415, "calib/mu_c": 0.7759166666666667, "calib/mu_w": 0.36233082706766917, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15201581027667985, "calib/std_conf": 0.44135180973523674, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5033224222585925, "calib/step_q_c_n": 611.0, "calib/step_q_gap": 0.12163286181903205, "calib/step_q_w": 0.3816895604395604, "calib/step_q_w_n": 728.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 469.078125, "completions/mean_terminated_length": 470.91766357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.16426666666666667, "grad_norm": 0.0505913607776165, "kl": 0.112213134765625, "learning_rate": 1.2777777777777779e-06, "loss": -0.0439, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03482554107904434, "mask/share_reasoning": 0.8422807455062866, "mask/share_step_conf": 0.11898745596408844, "num_tokens": 35907436.0, "reward": 1.1507153511047363, "reward_std": 0.2105042189359665, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7422734498977661, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8456881642341614, "step": 154 }, { "adv/mean_abs_final_conf": 0.770971417427063, "adv/mean_abs_reasoning": 0.47824519872665405, "adv/mean_abs_step_conf": 0.7767425775527954, "adv/ratio_final_to_reasoning": 1.6120839675543082, "adv/ratio_step_to_reasoning": 1.624151334129233, "adv/std_final_conf": 0.9311116337776184, "adv/std_reasoning": 0.7392560243606567, "adv/std_step_conf": 0.9357247352600098, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6687376968503935, "calib/avg_num_step_conf": 5.59375, "calib/ece": 0.252, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2784313725490196, "calib/gap": 0.27756828248031495, "calib/mean_conf": 0.4494117647058824, "calib/mu_c": 0.5887401574803149, "calib/mu_w": 0.311171875, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10168627450980394, "calib/std_conf": 0.4219581419845835, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4082369942196532, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.07377753476019372, "calib/step_q_w": 0.3344594594594595, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2650.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 453.2890625, "completions/mean_terminated_length": 453.2890625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.16533333333333333, "grad_norm": 0.039521224796772, "kl": 0.1176300048828125, "learning_rate": 1.25e-06, "loss": 0.0123, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03726223111152649, "mask/share_reasoning": 0.8310775756835938, "mask/share_step_conf": 0.13166022300720215, "num_tokens": 36130694.0, "reward": 1.1223604679107666, "reward_std": 0.20777511596679688, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7056055068969727, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8271185159683228, "step": 155 }, { "adv/mean_abs_final_conf": 0.699209451675415, "adv/mean_abs_reasoning": 0.4771537184715271, "adv/mean_abs_step_conf": 0.7692943811416626, "adv/ratio_final_to_reasoning": 1.465375673724606, "adv/ratio_step_to_reasoning": 1.6122569129419206, "adv/std_final_conf": 0.8949079513549805, "adv/std_reasoning": 0.7392115592956543, "adv/std_step_conf": 0.9355168342590332, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7784376571141276, "calib/avg_num_step_conf": 5.9296875, "calib/ece": 0.23126482213438737, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.48616600790513836, "calib/gap": 0.4132151835093013, "calib/mean_conf": 0.5733201581027668, "calib/mu_c": 0.7644117647058825, "calib/mu_w": 0.35119658119658115, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1335177865612648, "calib/std_conf": 0.4449177703800726, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44553410553410555, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.11217378164746589, "calib/step_q_w": 0.33336032388663966, "calib/step_q_w_n": 741.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 472.9140625, "completions/mean_terminated_length": 474.7686462402344, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.1664, "grad_norm": 0.03909357264637947, "kl": 0.10540771484375, "learning_rate": 1.2222222222222223e-06, "loss": -0.0407, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035646237432956696, "mask/share_reasoning": 0.827398419380188, "mask/share_step_conf": 0.1330491006374359, "num_tokens": 36356520.0, "reward": 1.1607627868652344, "reward_std": 0.20456355810165405, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7487425804138184, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.845917820930481, "step": 156 }, { "adv/mean_abs_final_conf": 0.6460797190666199, "adv/mean_abs_reasoning": 0.4546028971672058, "adv/mean_abs_step_conf": 0.7391088604927063, "adv/ratio_final_to_reasoning": 1.4211957800809785, "adv/ratio_step_to_reasoning": 1.62583403031164, "adv/std_final_conf": 0.8762221932411194, "adv/std_reasoning": 0.7391842007637024, "adv/std_step_conf": 0.9354991316795349, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7898527528809218, "calib/avg_num_step_conf": 6.56640625, "calib/ece": 0.18361111111111106, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.49206349206349204, "calib/gap": 0.5060909090909091, "calib/mean_conf": 0.5740873015873017, "calib/mu_c": 0.795, "calib/mu_w": 0.28890909090909095, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09710317460317458, "calib/std_conf": 0.44950453358310655, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.46499999999999997, "calib/step_q_c_n": 928.0, "calib/step_q_gap": 0.12450863213811414, "calib/step_q_w": 0.34049136786188583, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2257.0, "completions/max_terminated_length": 2257.0, "completions/mean_length": 498.671875, "completions/mean_terminated_length": 498.671875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.16746666666666668, "grad_norm": 0.07070081681013107, "kl": 0.099639892578125, "learning_rate": 1.1944444444444446e-06, "loss": 0.0253, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03461083769798279, "mask/share_reasoning": 0.8251791000366211, "mask/share_step_conf": 0.1402100920677185, "num_tokens": 36587908.0, "reward": 1.1632893085479736, "reward_std": 0.20778614282608032, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7844120860099792, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8229026794433594, "step": 157 }, { "adv/mean_abs_final_conf": 0.7103238701820374, "adv/mean_abs_reasoning": 0.48325201869010925, "adv/mean_abs_step_conf": 0.771677553653717, "adv/ratio_final_to_reasoning": 1.4698828824500791, "adv/ratio_step_to_reasoning": 1.5968428973052338, "adv/std_final_conf": 0.8656406998634338, "adv/std_reasoning": 0.7394470572471619, "adv/std_step_conf": 0.935847282409668, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7064367816091954, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.237, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.472, "calib/gap": 0.3402331691297209, "calib/mean_conf": 0.60724, "calib/mu_c": 0.7501379310344828, "calib/mu_w": 0.4099047619047619, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13212, "calib/std_conf": 0.4316201830313314, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48336461126005364, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.12709318268862502, "calib/step_q_w": 0.3562714285714286, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 493.94921875, "completions/mean_terminated_length": 493.94921875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.16853333333333334, "grad_norm": 0.031547460705041885, "kl": 0.10711669921875, "learning_rate": 1.1666666666666668e-06, "loss": 0.0507, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03697506710886955, "mask/share_reasoning": 0.8330240249633789, "mask/share_step_conf": 0.13000091910362244, "num_tokens": 36819599.0, "reward": 1.1219482421875, "reward_std": 0.25272446870803833, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7178941369056702, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.81160569190979, "step": 158 }, { "adv/mean_abs_final_conf": 0.6777149438858032, "adv/mean_abs_reasoning": 0.5489553213119507, "adv/mean_abs_step_conf": 0.7470642924308777, "adv/ratio_final_to_reasoning": 1.2345539201006912, "adv/ratio_step_to_reasoning": 1.3608835973125564, "adv/std_final_conf": 0.8770651817321777, "adv/std_reasoning": 0.7754817008972168, "adv/std_step_conf": 0.9351387619972229, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7349534777799466, "calib/avg_num_step_conf": 5.12890625, "calib/ece": 0.23884000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.436, "calib/gap": 0.3636664714685406, "calib/mean_conf": 0.55364, "calib/mu_c": 0.7121985815602837, "calib/mu_w": 0.3485321100917431, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.11424000000000001, "calib/std_conf": 0.44136691131075967, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4798344827586207, "calib/step_q_c_n": 725.0, "calib/step_q_gap": 0.13508958479943706, "calib/step_q_w": 0.3447448979591836, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 432.18359375, "completions/mean_terminated_length": 435.58660888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.1696, "grad_norm": 0.05171238258481026, "kl": 0.1108245849609375, "learning_rate": 1.138888888888889e-06, "loss": -0.0979, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03822503238916397, "mask/share_reasoning": 0.8270611763000488, "mask/share_step_conf": 0.1269012987613678, "num_tokens": 37035022.0, "reward": 1.1491003036499023, "reward_std": 0.22951547801494598, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7168339490890503, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8511192798614502, "step": 159 }, { "adv/mean_abs_final_conf": 0.7331618666648865, "adv/mean_abs_reasoning": 0.559362530708313, "adv/mean_abs_step_conf": 0.7502003908157349, "adv/ratio_final_to_reasoning": 1.3107096496729838, "adv/ratio_step_to_reasoning": 1.341170260127661, "adv/std_final_conf": 0.9073959589004517, "adv/std_reasoning": 0.792900562286377, "adv/std_step_conf": 0.9360281825065613, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7023730830248547, "calib/avg_num_step_conf": 6.2578125, "calib/ece": 0.2711382113821138, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.43089430894308944, "calib/gap": 0.3433593336858805, "calib/mean_conf": 0.5052032520325204, "calib/mu_c": 0.6782786885245902, "calib/mu_w": 0.33491935483870966, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14020325203252035, "calib/std_conf": 0.4591258034613807, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4382012195121951, "calib/step_q_c_n": 656.0, "calib/step_q_gap": 0.11328578610838957, "calib/step_q_w": 0.3249154334038055, "calib/step_q_w_n": 946.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 507.83203125, "completions/mean_terminated_length": 511.8307189941406, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.17066666666666666, "grad_norm": 0.03477999195456505, "kl": 0.110809326171875, "learning_rate": 1.111111111111111e-06, "loss": 0.0361, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03321065753698349, "mask/share_reasoning": 0.8289576768875122, "mask/share_step_conf": 0.1300192028284073, "num_tokens": 37269867.0, "reward": 1.0891594886779785, "reward_std": 0.2826342284679413, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6830366849899292, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8057091236114502, "step": 160 }, { "adv/mean_abs_final_conf": 0.666127622127533, "adv/mean_abs_reasoning": 0.4846762716770172, "adv/mean_abs_step_conf": 0.7656421661376953, "adv/ratio_final_to_reasoning": 1.3743763849273662, "adv/ratio_step_to_reasoning": 1.5796980600855794, "adv/std_final_conf": 0.8550475239753723, "adv/std_reasoning": 0.739328145980835, "adv/std_step_conf": 0.9354023337364197, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7183908045977012, "calib/avg_num_step_conf": 5.25, "calib/ece": 0.29035573122529645, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.41106719367588934, "calib/gap": 0.3515240797322857, "calib/mean_conf": 0.5158102766798419, "calib/mu_c": 0.6255747126436781, "calib/mu_w": 0.2740506329113924, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.059209486166007914, "calib/std_conf": 0.4429900153822679, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4532934782608695, "calib/step_q_c_n": 920.0, "calib/step_q_gap": 0.10897744052502045, "calib/step_q_w": 0.34431603773584907, "calib/step_q_w_n": 424.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 451.203125, "completions/mean_terminated_length": 451.203125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.17173333333333332, "grad_norm": 0.06945797055959702, "kl": 0.1070098876953125, "learning_rate": 1.0833333333333335e-06, "loss": 0.0184, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03743017092347145, "mask/share_reasoning": 0.8344805240631104, "mask/share_step_conf": 0.12808924913406372, "num_tokens": 37489295.0, "reward": 1.1474106311798096, "reward_std": 0.19529061019420624, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6996968984603882, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8415412306785583, "step": 161 }, { "adv/mean_abs_final_conf": 0.6987650394439697, "adv/mean_abs_reasoning": 0.584774374961853, "adv/mean_abs_step_conf": 0.7395343780517578, "adv/ratio_final_to_reasoning": 1.1949310184625528, "adv/ratio_step_to_reasoning": 1.2646490847004033, "adv/std_final_conf": 0.902337372303009, "adv/std_reasoning": 0.8099278807640076, "adv/std_step_conf": 0.9356653094291687, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7526661197703035, "calib/avg_num_step_conf": 5.1796875, "calib/ece": 0.2203187250996015, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.545816733067729, "calib/gap": 0.4302433688815969, "calib/mean_conf": 0.6168924302788844, "calib/mu_c": 0.7745911949685534, "calib/mu_w": 0.3443478260869565, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10187250996015929, "calib/std_conf": 0.44126031062801013, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4804425837320574, "calib/step_q_c_n": 836.0, "calib/step_q_gap": 0.08754462454838391, "calib/step_q_w": 0.3928979591836735, "calib/step_q_w_n": 490.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2978.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 443.77734375, "completions/mean_terminated_length": 445.5176696777344, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.1728, "grad_norm": 0.04064112529158592, "kl": 0.1116790771484375, "learning_rate": 1.0555555555555557e-06, "loss": 0.0137, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0373004712164402, "mask/share_reasoning": 0.8302710652351379, "mask/share_step_conf": 0.12852223217487335, "num_tokens": 37707046.0, "reward": 1.1640316247940063, "reward_std": 0.22133593261241913, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7575312852859497, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8329587578773499, "step": 162 }, { "adv/mean_abs_final_conf": 0.6372281312942505, "adv/mean_abs_reasoning": 0.5325202941894531, "adv/mean_abs_step_conf": 0.7637654542922974, "adv/ratio_final_to_reasoning": 1.1966269421978983, "adv/ratio_step_to_reasoning": 1.43424666181938, "adv/std_final_conf": 0.8429669737815857, "adv/std_reasoning": 0.7576155662536621, "adv/std_step_conf": 0.9351708292961121, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7866588495863462, "calib/avg_num_step_conf": 5.94140625, "calib/ece": 0.2068951612903226, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.39919354838709675, "calib/gap": 0.46308188391635724, "calib/mean_conf": 0.5001209677419355, "calib/mu_c": 0.7223255813953489, "calib/mu_w": 0.2592436974789916, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.09342741935483871, "calib/std_conf": 0.45073281000163135, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.47199481865284976, "calib/step_q_c_n": 772.0, "calib/step_q_gap": 0.123129665114799, "calib/step_q_w": 0.34886515353805075, "calib/step_q_w_n": 749.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2149.0, "completions/max_terminated_length": 2149.0, "completions/mean_length": 503.72265625, "completions/mean_terminated_length": 505.69805908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.17386666666666667, "grad_norm": 0.03318033367395401, "kl": 0.101806640625, "learning_rate": 1.0277777777777777e-06, "loss": -0.0523, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03390524163842201, "mask/share_reasoning": 0.8307154178619385, "mask/share_step_conf": 0.13147307932376862, "num_tokens": 37940831.0, "reward": 1.1570065021514893, "reward_std": 0.1966712772846222, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7536964416503906, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8433359861373901, "step": 163 }, { "adv/mean_abs_final_conf": 0.6957120895385742, "adv/mean_abs_reasoning": 0.48879551887512207, "adv/mean_abs_step_conf": 0.7435122132301331, "adv/ratio_final_to_reasoning": 1.423319287254586, "adv/ratio_step_to_reasoning": 1.5211109441862258, "adv/std_final_conf": 0.8903563618659973, "adv/std_reasoning": 0.7753127217292786, "adv/std_step_conf": 0.9353837370872498, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7536496350364963, "calib/avg_num_step_conf": 5.953125, "calib/ece": 0.258102766798419, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3438735177865613, "calib/gap": 0.3506519003272086, "calib/mean_conf": 0.46229249011857704, "calib/mu_c": 0.6230656934306569, "calib/mu_w": 0.27241379310344827, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08944664031620557, "calib/std_conf": 0.43647650628239665, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4358097686375321, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.11571593485737125, "calib/step_q_w": 0.3200938337801609, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 524.1171875, "completions/mean_terminated_length": 526.172607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.17493333333333333, "grad_norm": 0.04268093407154083, "kl": 0.107574462890625, "learning_rate": 1.0000000000000002e-06, "loss": -0.0644, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.031301628798246384, "mask/share_reasoning": 0.8485506772994995, "mask/share_step_conf": 0.1162414401769638, "num_tokens": 38181141.0, "reward": 1.1473987102508545, "reward_std": 0.2089129090309143, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7205109596252441, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8463993072509766, "step": 164 }, { "adv/mean_abs_final_conf": 0.7059795260429382, "adv/mean_abs_reasoning": 0.5269777178764343, "adv/mean_abs_step_conf": 0.7742398977279663, "adv/ratio_final_to_reasoning": 1.339676236953298, "adv/ratio_step_to_reasoning": 1.469208035679243, "adv/std_final_conf": 0.8767625689506531, "adv/std_reasoning": 0.775352954864502, "adv/std_step_conf": 0.9353828430175781, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7407479421867911, "calib/avg_num_step_conf": 5.82421875, "calib/ece": 0.23928000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.452, "calib/gap": 0.4209799727785339, "calib/mean_conf": 0.5307999999999999, "calib/mu_c": 0.7648648648648648, "calib/mu_w": 0.34388489208633094, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.16304, "calib/std_conf": 0.45047459417818453, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.44881913303437965, "calib/step_q_c_n": 669.0, "calib/step_q_gap": 0.08493835444557185, "calib/step_q_w": 0.3638807785888078, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 513.6484375, "completions/mean_terminated_length": 515.6627807617188, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.176, "grad_norm": 0.03866741061210632, "kl": 0.107147216796875, "learning_rate": 9.722222222222224e-07, "loss": -0.0191, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03287225589156151, "mask/share_reasoning": 0.8358644247055054, "mask/share_step_conf": 0.12735706567764282, "num_tokens": 38418211.0, "reward": 1.1342720985412598, "reward_std": 0.22282421588897705, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.7291538715362549, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8387600779533386, "step": 165 }, { "adv/mean_abs_final_conf": 0.6282885074615479, "adv/mean_abs_reasoning": 0.5605138540267944, "adv/mean_abs_step_conf": 0.7245954275131226, "adv/ratio_final_to_reasoning": 1.1209152154721114, "adv/ratio_step_to_reasoning": 1.292734197928468, "adv/std_final_conf": 0.8683748841285706, "adv/std_reasoning": 0.8098090291023254, "adv/std_step_conf": 0.9353573322296143, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8162691885964912, "calib/avg_num_step_conf": 6.4765625, "calib/ece": 0.19221774193548385, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4959677419354839, "calib/gap": 0.5316611842105263, "calib/mean_conf": 0.5836693548387096, "calib/mu_c": 0.7894736842105263, "calib/mu_w": 0.2578125, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08149193548387097, "calib/std_conf": 0.45787799864321854, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.46612267250821465, "calib/step_q_c_n": 913.0, "calib/step_q_gap": 0.14354549130016098, "calib/step_q_w": 0.3225771812080537, "calib/step_q_w_n": 745.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 525.0, "completions/mean_terminated_length": 527.058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.17706666666666668, "grad_norm": 0.04430662468075752, "kl": 0.10025787353515625, "learning_rate": 9.444444444444445e-07, "loss": 0.039, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.032747916877269745, "mask/share_reasoning": 0.8334342241287231, "mask/share_step_conf": 0.1299116164445877, "num_tokens": 38658795.0, "reward": 1.1746551990509033, "reward_std": 0.23381714522838593, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7690227031707764, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8467544317245483, "step": 166 }, { "adv/mean_abs_final_conf": 0.6404507160186768, "adv/mean_abs_reasoning": 0.5007696151733398, "adv/mean_abs_step_conf": 0.7638825178146362, "adv/ratio_final_to_reasoning": 1.2789328597682323, "adv/ratio_step_to_reasoning": 1.525417067387806, "adv/std_final_conf": 0.8442684412002563, "adv/std_reasoning": 0.7394320368766785, "adv/std_step_conf": 0.9355236887931824, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6737989235165127, "calib/avg_num_step_conf": 5.484375, "calib/ece": 0.2622, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.596, "calib/gap": 0.2984603628148049, "calib/mean_conf": 0.6941200000000001, "calib/mu_c": 0.8146979865771812, "calib/mu_w": 0.5162376237623763, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.18016000000000001, "calib/std_conf": 0.409020568675953, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.48212837837837835, "calib/step_q_c_n": 888.0, "calib/step_q_gap": 0.033213649696207825, "calib/step_q_w": 0.4489147286821705, "calib/step_q_w_n": 516.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 499.2578125, "completions/mean_terminated_length": 499.2578125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.17813333333333334, "grad_norm": 0.032998669892549515, "kl": 0.0972747802734375, "learning_rate": 9.166666666666666e-07, "loss": 0.0018, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.034088607877492905, "mask/share_reasoning": 0.845373809337616, "mask/share_step_conf": 0.12053757905960083, "num_tokens": 38892213.0, "reward": 1.1221466064453125, "reward_std": 0.2021704912185669, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7050972580909729, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.818838894367218, "step": 167 }, { "adv/mean_abs_final_conf": 0.6557689905166626, "adv/mean_abs_reasoning": 0.5603863000869751, "adv/mean_abs_step_conf": 0.7279667854309082, "adv/ratio_final_to_reasoning": 1.1702088191929096, "adv/ratio_step_to_reasoning": 1.2990445792802638, "adv/std_final_conf": 0.8571983575820923, "adv/std_reasoning": 0.7929483652114868, "adv/std_step_conf": 0.9356951713562012, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7758103241296519, "calib/avg_num_step_conf": 5.921875, "calib/ece": 0.19943775100401606, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4899598393574297, "calib/gap": 0.43086434573829513, "calib/mean_conf": 0.5988755020080322, "calib/mu_c": 0.7753741496598638, "calib/mu_w": 0.3445098039215687, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10397590361445784, "calib/std_conf": 0.43865294503399493, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4667781908302354, "calib/step_q_c_n": 807.0, "calib/step_q_gap": 0.13769497503333838, "calib/step_q_w": 0.32908321579689703, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2245.0, "completions/max_terminated_length": 2245.0, "completions/mean_length": 522.91796875, "completions/mean_terminated_length": 527.0354614257812, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.1792, "grad_norm": 0.030330434441566467, "kl": 0.10137939453125, "learning_rate": 8.88888888888889e-07, "loss": -0.0273, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03261226788163185, "mask/share_reasoning": 0.8453991413116455, "mask/share_step_conf": 0.11417609453201294, "num_tokens": 39130752.0, "reward": 1.1684155464172363, "reward_std": 0.22862845659255981, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7529062032699585, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8496999740600586, "step": 168 }, { "adv/mean_abs_final_conf": 0.6607838869094849, "adv/mean_abs_reasoning": 0.4686254560947418, "adv/mean_abs_step_conf": 0.7495909333229065, "adv/ratio_final_to_reasoning": 1.4100469326102816, "adv/ratio_step_to_reasoning": 1.5995523153385034, "adv/std_final_conf": 0.8423540592193604, "adv/std_reasoning": 0.7207648754119873, "adv/std_step_conf": 0.9355702996253967, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7115801933320183, "calib/avg_num_step_conf": 5.4609375, "calib/ece": 0.25689516129032264, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.532258064516129, "calib/gap": 0.3188104162556717, "calib/mean_conf": 0.6358467741935484, "calib/mu_c": 0.7785401459854014, "calib/mu_w": 0.45972972972972975, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.17016129032258068, "calib/std_conf": 0.43216177422638546, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5004310344827586, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.1279524020041261, "calib/step_q_w": 0.3724786324786325, "calib/step_q_w_n": 702.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 508.15234375, "completions/mean_terminated_length": 508.15234375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.18026666666666666, "grad_norm": 0.034315966069698334, "kl": 0.099456787109375, "learning_rate": 8.611111111111112e-07, "loss": 0.0557, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034047387540340424, "mask/share_reasoning": 0.8473041653633118, "mask/share_step_conf": 0.11864843964576721, "num_tokens": 39365023.0, "reward": 1.1078245639801025, "reward_std": 0.23867198824882507, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6902972459793091, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.817422091960907, "step": 169 }, { "adv/mean_abs_final_conf": 0.6348492503166199, "adv/mean_abs_reasoning": 0.44265952706336975, "adv/mean_abs_step_conf": 0.7500869631767273, "adv/ratio_final_to_reasoning": 1.434170534017981, "adv/ratio_step_to_reasoning": 1.694500891357405, "adv/std_final_conf": 0.845905065536499, "adv/std_reasoning": 0.7205403447151184, "adv/std_step_conf": 0.935236394405365, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7798478089740225, "calib/avg_num_step_conf": 5.671875, "calib/ece": 0.22203187250996018, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5896414342629482, "calib/gap": 0.41587181842036225, "calib/mean_conf": 0.6696812749003984, "calib/mu_c": 0.8403378378378379, "calib/mu_w": 0.42446601941747564, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.15103585657370522, "calib/std_conf": 0.42753613813375685, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4838546255506608, "calib/step_q_c_n": 908.0, "calib/step_q_gap": 0.12379947849183726, "calib/step_q_w": 0.3600551470588235, "calib/step_q_w_n": 544.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2053.0, "completions/max_terminated_length": 2053.0, "completions/mean_length": 499.71875, "completions/mean_terminated_length": 499.71875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.18133333333333335, "grad_norm": 0.05624116212129593, "kl": 0.09752655029296875, "learning_rate": 8.333333333333333e-07, "loss": -0.0172, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032660022377967834, "mask/share_reasoning": 0.8419471979141235, "mask/share_step_conf": 0.12539277970790863, "num_tokens": 39597103.0, "reward": 1.164198637008667, "reward_std": 0.22839687764644623, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7475042939186096, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8471578359603882, "step": 170 }, { "adv/mean_abs_final_conf": 0.7149491906166077, "adv/mean_abs_reasoning": 0.5040189623832703, "adv/mean_abs_step_conf": 0.7576113343238831, "adv/ratio_final_to_reasoning": 1.4184966121828966, "adv/ratio_step_to_reasoning": 1.503140538089069, "adv/std_final_conf": 0.8838882446289062, "adv/std_reasoning": 0.7576205730438232, "adv/std_step_conf": 0.9358006715774536, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6920087064676617, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.29799212598425195, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.484251968503937, "calib/gap": 0.2762300995024875, "calib/mean_conf": 0.5831889763779528, "calib/mu_c": 0.7289166666666667, "calib/mu_w": 0.45268656716417915, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2043700787401575, "calib/std_conf": 0.4422914425957622, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48197406340057636, "calib/step_q_c_n": 694.0, "calib/step_q_gap": 0.10611972166388167, "calib/step_q_w": 0.3758543417366947, "calib/step_q_w_n": 714.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 465.953125, "completions/mean_terminated_length": 467.7804260253906, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.1824, "grad_norm": 0.044052887707948685, "kl": 0.1040191650390625, "learning_rate": 8.055555555555557e-07, "loss": -0.0613, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0346250906586647, "mask/share_reasoning": 0.8380456566810608, "mask/share_step_conf": 0.1234230250120163, "num_tokens": 39823283.0, "reward": 1.1075711250305176, "reward_std": 0.23493888974189758, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.6674386858940125, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.837531566619873, "step": 171 }, { "adv/mean_abs_final_conf": 0.6492525339126587, "adv/mean_abs_reasoning": 0.5081915855407715, "adv/mean_abs_step_conf": 0.7804287672042847, "adv/ratio_final_to_reasoning": 1.2775743487011555, "adv/ratio_step_to_reasoning": 1.5356979324516422, "adv/std_final_conf": 0.8524985313415527, "adv/std_reasoning": 0.757496178150177, "adv/std_step_conf": 0.934755802154541, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7525674786043448, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.2151778656126483, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5731225296442688, "calib/gap": 0.3166366030283081, "calib/mean_conf": 0.7010276679841898, "calib/mu_c": 0.8236774193548387, "calib/mu_w": 0.5070408163265306, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15177865612648225, "calib/std_conf": 0.38632622970259445, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4635011709601874, "calib/step_q_c_n": 854.0, "calib/step_q_gap": 0.0510630084160178, "calib/step_q_w": 0.4124381625441696, "calib/step_q_w_n": 566.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2372.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 474.51953125, "completions/mean_terminated_length": 474.51953125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.18346666666666667, "grad_norm": 0.04483543336391449, "kl": 0.1094207763671875, "learning_rate": 7.777777777777779e-07, "loss": 0.061, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.035601723939180374, "mask/share_reasoning": 0.8355083465576172, "mask/share_step_conf": 0.12888994812965393, "num_tokens": 40048112.0, "reward": 1.1799538135528564, "reward_std": 0.2019103467464447, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7470546960830688, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8627352714538574, "step": 172 }, { "adv/mean_abs_final_conf": 0.7111342549324036, "adv/mean_abs_reasoning": 0.5352945327758789, "adv/mean_abs_step_conf": 0.7427831292152405, "adv/ratio_final_to_reasoning": 1.328491533893821, "adv/ratio_step_to_reasoning": 1.3876157586803421, "adv/std_final_conf": 0.8797129988670349, "adv/std_reasoning": 0.792944610118866, "adv/std_step_conf": 0.9360061883926392, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6726320673427455, "calib/avg_num_step_conf": 6.3203125, "calib/ece": 0.29532258064516126, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6612903225806451, "calib/gap": 0.2637847153178233, "calib/mean_conf": 0.7501612903225807, "calib/mu_c": 0.8639716312056738, "calib/mu_w": 0.6001869158878504, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.23846774193548384, "calib/std_conf": 0.3867940684715662, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4869066366704162, "calib/step_q_c_n": 889.0, "calib/step_q_gap": 0.06949922926300889, "calib/step_q_w": 0.41740740740740734, "calib/step_q_w_n": 729.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2397.0, "completions/max_terminated_length": 2397.0, "completions/mean_length": 525.81640625, "completions/mean_terminated_length": 525.81640625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.18453333333333333, "grad_norm": 0.03908439353108406, "kl": 0.095733642578125, "learning_rate": 7.5e-07, "loss": 0.0406, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0347096249461174, "mask/share_reasoning": 0.8326051235198975, "mask/share_step_conf": 0.13268522918224335, "num_tokens": 40285881.0, "reward": 1.0948352813720703, "reward_std": 0.27466046810150146, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6795969009399414, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8035906553268433, "step": 173 }, { "adv/mean_abs_final_conf": 0.7004639506340027, "adv/mean_abs_reasoning": 0.5624732375144958, "adv/mean_abs_step_conf": 0.7741360664367676, "adv/ratio_final_to_reasoning": 1.245328495501887, "adv/ratio_step_to_reasoning": 1.3763073774986794, "adv/std_final_conf": 0.8960103988647461, "adv/std_reasoning": 0.7756208777427673, "adv/std_step_conf": 0.9355176687240601, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6280317004850722, "calib/avg_num_step_conf": 5.9140625, "calib/ece": 0.29665289256198346, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.4834710743801653, "calib/gap": 0.22471476395436202, "calib/mean_conf": 0.6067355371900827, "calib/mu_c": 0.7172357723577234, "calib/mu_w": 0.4925210084033614, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.19756198347107434, "calib/std_conf": 0.4254473633542015, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.44660598179453836, "calib/step_q_c_n": 769.0, "calib/step_q_gap": 0.04566638447910215, "calib/step_q_w": 0.4009395973154362, "calib/step_q_w_n": 745.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2512.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 543.96484375, "completions/mean_terminated_length": 548.248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.1856, "grad_norm": 0.03411554917693138, "kl": 0.09084320068359375, "learning_rate": 7.222222222222222e-07, "loss": -0.0468, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03144906461238861, "mask/share_reasoning": 0.8394503593444824, "mask/share_step_conf": 0.12128806114196777, "num_tokens": 40529368.0, "reward": 1.045793056488037, "reward_std": 0.24938157200813293, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6349597573280334, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7815009355545044, "step": 174 }, { "adv/mean_abs_final_conf": 0.717574417591095, "adv/mean_abs_reasoning": 0.6078211665153503, "adv/mean_abs_step_conf": 0.7437684535980225, "adv/ratio_final_to_reasoning": 1.1805683268731195, "adv/ratio_step_to_reasoning": 1.2236632986344658, "adv/std_final_conf": 0.9007578492164612, "adv/std_reasoning": 0.826562762260437, "adv/std_step_conf": 0.9361304044723511, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7264182499331013, "calib/avg_num_step_conf": 6.4296875, "calib/ece": 0.26903614457831326, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.46586345381526106, "calib/gap": 0.37825929890286336, "calib/mean_conf": 0.5473493975903615, "calib/mu_c": 0.7721782178217823, "calib/mu_w": 0.39391891891891895, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2053815261044177, "calib/std_conf": 0.4547935256945654, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49954861111111115, "calib/step_q_c_n": 576.0, "calib/step_q_gap": 0.15431496625129804, "calib/step_q_w": 0.3452336448598131, "calib/step_q_w_n": 1070.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2487.0, "completions/max_terminated_length": 2487.0, "completions/mean_length": 544.6328125, "completions/mean_terminated_length": 546.7686767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.18666666666666668, "grad_norm": 0.03185137361288071, "kl": 0.0924072265625, "learning_rate": 6.944444444444446e-07, "loss": -0.0444, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03254655748605728, "mask/share_reasoning": 0.8343169689178467, "mask/share_step_conf": 0.12923020124435425, "num_tokens": 40774618.0, "reward": 1.0986486673355103, "reward_std": 0.2614133954048157, "rewards/accuracy_reward_step": 0.3984375, "rewards/final_brier_reward_step": 0.6948410272598267, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8188250064849854, "step": 175 }, { "adv/mean_abs_final_conf": 0.7038910984992981, "adv/mean_abs_reasoning": 0.5159138441085815, "adv/mean_abs_step_conf": 0.7356275916099548, "adv/ratio_final_to_reasoning": 1.3643578410180326, "adv/ratio_step_to_reasoning": 1.4258729437295956, "adv/std_final_conf": 0.8717449307441711, "adv/std_reasoning": 0.757626473903656, "adv/std_step_conf": 0.935823917388916, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7927789514115613, "calib/avg_num_step_conf": 5.59375, "calib/ece": 0.23728000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.552, "calib/gap": 0.44018052621471115, "calib/mean_conf": 0.6571199999999999, "calib/mu_c": 0.8807317073170733, "calib/mu_w": 0.44055118110236213, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.20120000000000007, "calib/std_conf": 0.42556844995840565, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5200438596491228, "calib/step_q_c_n": 684.0, "calib/step_q_gap": 0.1446427901304062, "calib/step_q_w": 0.3754010695187166, "calib/step_q_w_n": 748.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 481.30859375, "completions/mean_terminated_length": 483.19610595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.18773333333333334, "grad_norm": 0.04285878688097, "kl": 0.10150909423828125, "learning_rate": 6.666666666666667e-07, "loss": -0.0869, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03696080669760704, "mask/share_reasoning": 0.8263058066368103, "mask/share_step_conf": 0.13282713294029236, "num_tokens": 41001897.0, "reward": 1.1082634925842285, "reward_std": 0.2980533540248871, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7168636322021484, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8091504573822021, "step": 176 }, { "adv/mean_abs_final_conf": 0.7086734771728516, "adv/mean_abs_reasoning": 0.44564753770828247, "adv/mean_abs_step_conf": 0.7541524767875671, "adv/ratio_final_to_reasoning": 1.590210687165838, "adv/ratio_step_to_reasoning": 1.6922621869869494, "adv/std_final_conf": 0.8821295499801636, "adv/std_reasoning": 0.7014589309692383, "adv/std_step_conf": 0.9357683658599854, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7694280078895464, "calib/avg_num_step_conf": 5.73828125, "calib/ece": 0.22299595141700407, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.48582995951417, "calib/gap": 0.4125726495726495, "calib/mean_conf": 0.5868016194331984, "calib/mu_c": 0.7822307692307692, "calib/mu_w": 0.3696581196581197, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.14174089068825912, "calib/std_conf": 0.437853492955034, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5092661870503596, "calib/step_q_c_n": 695.0, "calib/step_q_gap": 0.12377523097800819, "calib/step_q_w": 0.38549095607235145, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2223.0, "completions/max_terminated_length": 2223.0, "completions/mean_length": 506.140625, "completions/mean_terminated_length": 506.140625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.1888, "grad_norm": 0.03434273600578308, "kl": 0.09429168701171875, "learning_rate": 6.388888888888889e-07, "loss": -0.0578, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033864229917526245, "mask/share_reasoning": 0.8405582904815674, "mask/share_step_conf": 0.12557752430438995, "num_tokens": 41235301.0, "reward": 1.1220314502716064, "reward_std": 0.24766887724399567, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.73037189245224, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8127731084823608, "step": 177 }, { "adv/mean_abs_final_conf": 0.6909327507019043, "adv/mean_abs_reasoning": 0.5780574083328247, "adv/mean_abs_step_conf": 0.7503166794776917, "adv/ratio_final_to_reasoning": 1.1952666651131127, "adv/ratio_step_to_reasoning": 1.297996822913627, "adv/std_final_conf": 0.88179612159729, "adv/std_reasoning": 0.7929460406303406, "adv/std_step_conf": 0.9349659085273743, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.796986493374108, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.16919631093544138, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4782608695652174, "calib/gap": 0.47001868841318395, "calib/mean_conf": 0.6095388669301712, "calib/mu_c": 0.8120370370370371, "calib/mu_w": 0.34201834862385316, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10478260869565219, "calib/std_conf": 0.42505284789110764, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.517627345844504, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.1893374907720402, "calib/step_q_w": 0.3282898550724638, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 480.34375, "completions/mean_terminated_length": 480.34375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.18986666666666666, "grad_norm": 0.03665749356150627, "kl": 0.09930419921875, "learning_rate": 6.111111111111112e-07, "loss": 0.0778, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03533283621072769, "mask/share_reasoning": 0.8403965830802917, "mask/share_step_conf": 0.12427057325839996, "num_tokens": 41464341.0, "reward": 1.1939667463302612, "reward_std": 0.2174018770456314, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7896803021430969, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8587312698364258, "step": 178 }, { "adv/mean_abs_final_conf": 0.7072902917861938, "adv/mean_abs_reasoning": 0.6448341608047485, "adv/mean_abs_step_conf": 0.7562251687049866, "adv/ratio_final_to_reasoning": 1.096856113986735, "adv/ratio_step_to_reasoning": 1.1727436520441517, "adv/std_final_conf": 0.8755064010620117, "adv/std_reasoning": 0.858925461769104, "adv/std_step_conf": 0.9356964826583862, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.752932788904012, "calib/avg_num_step_conf": 6.18359375, "calib/ece": 0.23392000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.452, "calib/gap": 0.3749381035711972, "calib/mean_conf": 0.55288, "calib/mu_c": 0.7193525179856116, "calib/mu_w": 0.3444144144144144, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11540000000000006, "calib/std_conf": 0.4377493639058771, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4893333333333333, "calib/step_q_c_n": 795.0, "calib/step_q_gap": 0.09919373942470378, "calib/step_q_w": 0.3901395939086295, "calib/step_q_w_n": 788.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1992.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 508.3984375, "completions/mean_terminated_length": 508.3984375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.19093333333333334, "grad_norm": 0.051263369619846344, "kl": 0.1013641357421875, "learning_rate": 5.833333333333334e-07, "loss": 0.0806, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03382003307342529, "mask/share_reasoning": 0.8366349339485168, "mask/share_step_conf": 0.12954503297805786, "num_tokens": 41700755.0, "reward": 1.1520042419433594, "reward_std": 0.22504480183124542, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7291203141212463, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8473212718963623, "step": 179 }, { "adv/mean_abs_final_conf": 0.6792027950286865, "adv/mean_abs_reasoning": 0.5286693572998047, "adv/mean_abs_step_conf": 0.7685763239860535, "adv/ratio_final_to_reasoning": 1.2847402363127989, "adv/ratio_step_to_reasoning": 1.4537939704158023, "adv/std_final_conf": 0.862787127494812, "adv/std_reasoning": 0.7755132913589478, "adv/std_step_conf": 0.9360314607620239, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7416482655459791, "calib/avg_num_step_conf": 6.01953125, "calib/ece": 0.2315416666666667, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.525, "calib/gap": 0.3361200940237909, "calib/mean_conf": 0.6391249999999999, "calib/mu_c": 0.7805755395683454, "calib/mu_w": 0.44445544554455446, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.14575000000000005, "calib/std_conf": 0.4145555664905892, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.47737545565006073, "calib/step_q_c_n": 823.0, "calib/step_q_gap": 0.11126124952192701, "calib/step_q_w": 0.3661142061281337, "calib/step_q_w_n": 718.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2819.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 561.98828125, "completions/mean_terminated_length": 564.1921997070312, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.192, "grad_norm": 0.034965962171554565, "kl": 0.09717559814453125, "learning_rate": 5.555555555555555e-07, "loss": 0.0278, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.030879775062203407, "mask/share_reasoning": 0.8455160856246948, "mask/share_step_conf": 0.11969786882400513, "num_tokens": 41948480.0, "reward": 1.082472801208496, "reward_std": 0.2739405035972595, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6942156553268433, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.7830908298492432, "step": 180 }, { "adv/mean_abs_final_conf": 0.6977521181106567, "adv/mean_abs_reasoning": 0.5350258350372314, "adv/mean_abs_step_conf": 0.7366287708282471, "adv/ratio_final_to_reasoning": 1.3041465895980546, "adv/ratio_step_to_reasoning": 1.3768097213043673, "adv/std_final_conf": 0.8979544043540955, "adv/std_reasoning": 0.7927014231681824, "adv/std_step_conf": 0.9357931613922119, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8018730158730158, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.19780876494023902, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.42231075697211157, "calib/gap": 0.4372215873015873, "calib/mean_conf": 0.5604382470119521, "calib/mu_c": 0.77992, "calib/mu_w": 0.34269841269841267, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.13011952191235057, "calib/std_conf": 0.4297606662437984, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5105908419497784, "calib/step_q_c_n": 677.0, "calib/step_q_gap": 0.13949113520491035, "calib/step_q_w": 0.3710997067448681, "calib/step_q_w_n": 682.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 468.33203125, "completions/mean_terminated_length": 468.33203125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.19306666666666666, "grad_norm": 0.031503282487392426, "kl": 0.1115264892578125, "learning_rate": 5.277777777777779e-07, "loss": 0.0721, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035831812769174576, "mask/share_reasoning": 0.8384501934051514, "mask/share_step_conf": 0.12571796774864197, "num_tokens": 42174637.0, "reward": 1.1525561809539795, "reward_std": 0.23784999549388885, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7531113624572754, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8393547534942627, "step": 181 }, { "adv/mean_abs_final_conf": 0.6776944398880005, "adv/mean_abs_reasoning": 0.47964245080947876, "adv/mean_abs_step_conf": 0.7465130686759949, "adv/ratio_final_to_reasoning": 1.4129158892092957, "adv/ratio_step_to_reasoning": 1.556394909199814, "adv/std_final_conf": 0.8647273182868958, "adv/std_reasoning": 0.7575068473815918, "adv/std_step_conf": 0.9357609748840332, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8048875855327469, "calib/avg_num_step_conf": 5.62890625, "calib/ece": 0.15850393700787402, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.547244094488189, "calib/gap": 0.4334688823721082, "calib/mean_conf": 0.6879527559055119, "calib/mu_c": 0.8569032258064516, "calib/mu_w": 0.42343434343434344, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11811023622047245, "calib/std_conf": 0.3957447650037054, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5147479484173506, "calib/step_q_c_n": 853.0, "calib/step_q_gap": 0.1278431865125887, "calib/step_q_w": 0.3869047619047619, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 479.85546875, "completions/mean_terminated_length": 481.7372741699219, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.19413333333333332, "grad_norm": 0.044035859405994415, "kl": 0.09618377685546875, "learning_rate": 5.000000000000001e-07, "loss": -0.0291, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03356048837304115, "mask/share_reasoning": 0.8339300751686096, "mask/share_step_conf": 0.12860319018363953, "num_tokens": 42403640.0, "reward": 1.1931354999542236, "reward_std": 0.2289653867483139, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7957609295845032, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.847839891910553, "step": 182 }, { "adv/mean_abs_final_conf": 0.6630797982215881, "adv/mean_abs_reasoning": 0.5739421844482422, "adv/mean_abs_step_conf": 0.7572264671325684, "adv/ratio_final_to_reasoning": 1.1553076532596016, "adv/ratio_step_to_reasoning": 1.319342762477942, "adv/std_final_conf": 0.8448971509933472, "adv/std_reasoning": 0.7929164171218872, "adv/std_step_conf": 0.9354925155639648, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7529853620955316, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.25356, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.452, "calib/gap": 0.3791692347200822, "calib/mean_conf": 0.5473199999999999, "calib/mu_c": 0.7262878787878788, "calib/mu_w": 0.34711864406779663, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.13643999999999998, "calib/std_conf": 0.44979953045773624, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4964082278481013, "calib/step_q_c_n": 632.0, "calib/step_q_gap": 0.12648812132346748, "calib/step_q_w": 0.3699201065246338, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 525.8515625, "completions/mean_terminated_length": 527.9137573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.1952, "grad_norm": 0.04003230854868889, "kl": 0.08823394775390625, "learning_rate": 4.7222222222222226e-07, "loss": 0.01, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03344167023897171, "mask/share_reasoning": 0.8502011299133301, "mask/share_step_conf": 0.11245097219944, "num_tokens": 42644938.0, "reward": 1.126890778541565, "reward_std": 0.2072310894727707, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7198058366775513, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8236920833587646, "step": 183 }, { "adv/mean_abs_final_conf": 0.7117974758148193, "adv/mean_abs_reasoning": 0.5680974125862122, "adv/mean_abs_step_conf": 0.7626612186431885, "adv/ratio_final_to_reasoning": 1.2529496879319089, "adv/ratio_step_to_reasoning": 1.342483175854018, "adv/std_final_conf": 0.8946519494056702, "adv/std_reasoning": 0.7928260564804077, "adv/std_step_conf": 0.9355891942977905, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7530124514660597, "calib/avg_num_step_conf": 6.2421875, "calib/ece": 0.21844621513944223, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6215139442231076, "calib/gap": 0.378046592582675, "calib/mean_conf": 0.7079282868525898, "calib/mu_c": 0.8540259740259739, "calib/mu_w": 0.47597938144329893, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15641434262948206, "calib/std_conf": 0.4043780464107534, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4989277652370203, "calib/step_q_c_n": 886.0, "calib/step_q_gap": 0.1237592259111776, "calib/step_q_w": 0.3751685393258427, "calib/step_q_w_n": 712.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 525.46484375, "completions/mean_terminated_length": 525.46484375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.19626666666666667, "grad_norm": 0.03780568763613701, "kl": 0.09539031982421875, "learning_rate": 4.444444444444445e-07, "loss": 0.107, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03317577391862869, "mask/share_reasoning": 0.8409853577613831, "mask/share_step_conf": 0.12583887577056885, "num_tokens": 42884737.0, "reward": 1.1625230312347412, "reward_std": 0.257118821144104, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7507980465888977, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8390820026397705, "step": 184 }, { "adv/mean_abs_final_conf": 0.6712584495544434, "adv/mean_abs_reasoning": 0.48389190435409546, "adv/mean_abs_step_conf": 0.7558167576789856, "adv/ratio_final_to_reasoning": 1.3872074393359544, "adv/ratio_step_to_reasoning": 1.5619537150303404, "adv/std_final_conf": 0.8619060516357422, "adv/std_reasoning": 0.7576537132263184, "adv/std_step_conf": 0.9357088208198547, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8066108387799564, "calib/avg_num_step_conf": 6.15625, "calib/ece": 0.22610655737704916, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5983606557377049, "calib/gap": 0.42056372549019605, "calib/mean_conf": 0.6977459016393442, "calib/mu_c": 0.8838970588235294, "calib/mu_w": 0.4633333333333334, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18323770491803276, "calib/std_conf": 0.40918681976184174, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5336063408190225, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.19398485119753284, "calib/step_q_w": 0.33962148962148964, "calib/step_q_w_n": 819.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2703.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 541.83984375, "completions/mean_terminated_length": 541.83984375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.19733333333333333, "grad_norm": 0.05819058418273926, "kl": 0.0870513916015625, "learning_rate": 4.1666666666666667e-07, "loss": 0.1115, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03250384330749512, "mask/share_reasoning": 0.8443625569343567, "mask/share_step_conf": 0.1231335997581482, "num_tokens": 43130368.0, "reward": 1.122775673866272, "reward_std": 0.2509518265724182, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7374019622802734, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8075161576271057, "step": 185 }, { "adv/mean_abs_final_conf": 0.6365198493003845, "adv/mean_abs_reasoning": 0.5316267609596252, "adv/mean_abs_step_conf": 0.7644907236099243, "adv/ratio_final_to_reasoning": 1.1973058845860574, "adv/ratio_step_to_reasoning": 1.4380215213958805, "adv/std_final_conf": 0.8457080721855164, "adv/std_reasoning": 0.7753575444221497, "adv/std_step_conf": 0.9350101947784424, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7555590062111801, "calib/avg_num_step_conf": 6.046875, "calib/ece": 0.24968627450980385, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5411764705882353, "calib/gap": 0.3616521739130435, "calib/mean_conf": 0.6489019607843138, "calib/mu_c": 0.812, "calib/mu_w": 0.45034782608695656, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1747843137254901, "calib/std_conf": 0.433074247807235, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48518159806295397, "calib/step_q_c_n": 826.0, "calib/step_q_gap": 0.08296553158095954, "calib/step_q_w": 0.4022160664819944, "calib/step_q_w_n": 722.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 480.0390625, "completions/mean_terminated_length": 481.9216003417969, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.1984, "grad_norm": 0.027318695560097694, "kl": 0.101898193359375, "learning_rate": 3.8888888888888895e-07, "loss": 0.0403, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03359612450003624, "mask/share_reasoning": 0.83244788646698, "mask/share_step_conf": 0.1300496906042099, "num_tokens": 43358298.0, "reward": 1.1724010705947876, "reward_std": 0.20041513442993164, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7310941219329834, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8700760006904602, "step": 186 }, { "adv/mean_abs_final_conf": 0.7441973686218262, "adv/mean_abs_reasoning": 0.664376974105835, "adv/mean_abs_step_conf": 0.7474873661994934, "adv/ratio_final_to_reasoning": 1.1201432283582962, "adv/ratio_step_to_reasoning": 1.1250952325756236, "adv/std_final_conf": 0.8918111324310303, "adv/std_reasoning": 0.8590844869613647, "adv/std_step_conf": 0.9357367157936096, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6873687664041995, "calib/avg_num_step_conf": 6.9375, "calib/ece": 0.2960728744939271, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4939271255060729, "calib/gap": 0.280001312335958, "calib/mean_conf": 0.6168016194331983, "calib/mu_c": 0.7528346456692913, "calib/mu_w": 0.47283333333333327, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.19935222672064778, "calib/std_conf": 0.4320152066568983, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4732522407170295, "calib/step_q_c_n": 781.0, "calib/step_q_gap": 0.1257517382044666, "calib/step_q_w": 0.34750050251256287, "calib/step_q_w_n": 995.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 575.44140625, "completions/mean_terminated_length": 575.44140625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.19946666666666665, "grad_norm": 0.03541375696659088, "kl": 0.0881195068359375, "learning_rate": 3.611111111111111e-07, "loss": -0.0178, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.030242277309298515, "mask/share_reasoning": 0.8467020988464355, "mask/share_step_conf": 0.12305556237697601, "num_tokens": 43607155.0, "reward": 1.0910013914108276, "reward_std": 0.30093953013420105, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6646523475646973, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8172961473464966, "step": 187 }, { "adv/mean_abs_final_conf": 0.6986135244369507, "adv/mean_abs_reasoning": 0.5794321298599243, "adv/mean_abs_step_conf": 0.7691971063613892, "adv/ratio_final_to_reasoning": 1.205686547975581, "adv/ratio_step_to_reasoning": 1.3275016463918559, "adv/std_final_conf": 0.8814206719398499, "adv/std_reasoning": 0.7929037809371948, "adv/std_step_conf": 0.9354791641235352, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7335718932986337, "calib/avg_num_step_conf": 6.03125, "calib/ece": 0.24537848605577695, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.601593625498008, "calib/gap": 0.3321600520494469, "calib/mean_conf": 0.7028286852589642, "calib/mu_c": 0.843103448275862, "calib/mu_w": 0.5109433962264152, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18525896414342635, "calib/std_conf": 0.40743435730411504, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.52737922705314, "calib/step_q_c_n": 828.0, "calib/step_q_gap": 0.12147140582408972, "calib/step_q_w": 0.4059078212290503, "calib/step_q_w_n": 716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 523.77734375, "completions/mean_terminated_length": 525.8314208984375, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.20053333333333334, "grad_norm": 0.03188059478998184, "kl": 0.08913421630859375, "learning_rate": 3.3333333333333335e-07, "loss": 0.0705, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034026190638542175, "mask/share_reasoning": 0.8330436944961548, "mask/share_step_conf": 0.12902390956878662, "num_tokens": 43845314.0, "reward": 1.1429827213287354, "reward_std": 0.2559507489204407, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7220597863197327, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8363538384437561, "step": 188 }, { "adv/mean_abs_final_conf": 0.6727781295776367, "adv/mean_abs_reasoning": 0.44140952825546265, "adv/mean_abs_step_conf": 0.7335576415061951, "adv/ratio_final_to_reasoning": 1.5241586021864737, "adv/ratio_step_to_reasoning": 1.6618527570199022, "adv/std_final_conf": 0.8597487807273865, "adv/std_reasoning": 0.7014789581298828, "adv/std_step_conf": 0.9356318116188049, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7398478835978837, "calib/avg_num_step_conf": 5.55859375, "calib/ece": 0.24214574898785426, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4331983805668016, "calib/gap": 0.3963564814814815, "calib/mean_conf": 0.5297570850202429, "calib/mu_c": 0.7094814814814815, "calib/mu_w": 0.313125, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11267206477732798, "calib/std_conf": 0.4506593703872346, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5149048316251831, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.2036345613549128, "calib/step_q_w": 0.31127027027027027, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 478.1875, "completions/mean_terminated_length": 483.85772705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 89.0, "epoch": 0.2016, "grad_norm": 0.03390476480126381, "kl": 0.1073455810546875, "learning_rate": 3.055555555555556e-07, "loss": -0.0692, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03491278365254402, "mask/share_reasoning": 0.8374266624450684, "mask/share_step_conf": 0.11594181507825851, "num_tokens": 44075498.0, "reward": 1.1270095109939575, "reward_std": 0.2504253387451172, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7151448726654053, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8274785280227661, "step": 189 }, { "adv/mean_abs_final_conf": 0.6990896463394165, "adv/mean_abs_reasoning": 0.48381108045578003, "adv/mean_abs_step_conf": 0.7443450093269348, "adv/ratio_final_to_reasoning": 1.4449641080581097, "adv/ratio_step_to_reasoning": 1.5385034353196616, "adv/std_final_conf": 0.8977615237236023, "adv/std_reasoning": 0.7576168179512024, "adv/std_step_conf": 0.9358354210853577, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7738014626218852, "calib/avg_num_step_conf": 6.3203125, "calib/ece": 0.19955284552845526, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.540650406504065, "calib/gap": 0.41882448537378114, "calib/mean_conf": 0.6448373983739837, "calib/mu_c": 0.8219014084507043, "calib/mu_w": 0.4030769230769231, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1335772357723577, "calib/std_conf": 0.4239618765263017, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4915936739659368, "calib/step_q_c_n": 822.0, "calib/step_q_gap": 0.14889266894081116, "calib/step_q_w": 0.3427010050251256, "calib/step_q_w_n": 796.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3054.0, "completions/max_terminated_length": 3054.0, "completions/mean_length": 564.60546875, "completions/mean_terminated_length": 571.3004150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.20266666666666666, "grad_norm": 0.04439045488834381, "kl": 0.08817291259765625, "learning_rate": 2.7777777777777776e-07, "loss": -0.0007, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.029904117807745934, "mask/share_reasoning": 0.8434927463531494, "mask/share_step_conf": 0.11488443613052368, "num_tokens": 44325645.0, "reward": 1.1512739658355713, "reward_std": 0.2505730092525482, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7457519769668579, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8352595567703247, "step": 190 }, { "adv/mean_abs_final_conf": 0.7039273977279663, "adv/mean_abs_reasoning": 0.5166229605674744, "adv/mean_abs_step_conf": 0.7615114450454712, "adv/ratio_final_to_reasoning": 1.3625553865332487, "adv/ratio_step_to_reasoning": 1.4740178102208308, "adv/std_final_conf": 0.8850259184837341, "adv/std_reasoning": 0.7576173543930054, "adv/std_step_conf": 0.9355537295341492, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7208251721741328, "calib/avg_num_step_conf": 6.55859375, "calib/ece": 0.2646428571428571, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6190476190476191, "calib/gap": 0.36280849181777985, "calib/mean_conf": 0.7055158730158729, "calib/mu_c": 0.876842105263158, "calib/mu_w": 0.5140336134453781, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2211904761904762, "calib/std_conf": 0.41151690420499143, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5186729857819905, "calib/step_q_c_n": 844.0, "calib/step_q_gap": 0.13002627919516418, "calib/step_q_w": 0.3886467065868263, "calib/step_q_w_n": 835.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1930.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 504.484375, "completions/mean_terminated_length": 504.484375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.20373333333333332, "grad_norm": 0.03353596478700638, "kl": 0.094268798828125, "learning_rate": 2.5000000000000004e-07, "loss": 0.0216, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03465589880943298, "mask/share_reasoning": 0.8255807161331177, "mask/share_step_conf": 0.13976339995861053, "num_tokens": 44558961.0, "reward": 1.1210756301879883, "reward_std": 0.23884853720664978, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.714613676071167, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.818879246711731, "step": 191 }, { "adv/mean_abs_final_conf": 0.6482102274894714, "adv/mean_abs_reasoning": 0.49863550066947937, "adv/mean_abs_step_conf": 0.7356137037277222, "adv/ratio_final_to_reasoning": 1.2999680660907007, "adv/ratio_step_to_reasoning": 1.4752533719321437, "adv/std_final_conf": 0.8706212043762207, "adv/std_reasoning": 0.75757896900177, "adv/std_step_conf": 0.9354583024978638, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.832940251572327, "calib/avg_num_step_conf": 5.59765625, "calib/ece": 0.16419999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.552, "calib/gap": 0.530704926624738, "calib/mean_conf": 0.65012, "calib/mu_c": 0.8751388888888889, "calib/mu_w": 0.3444339622641509, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.11916, "calib/std_conf": 0.4264837459974295, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.48260172626387177, "calib/step_q_c_n": 811.0, "calib/step_q_gap": 0.10440236935068847, "calib/step_q_w": 0.3781993569131833, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 507.06640625, "completions/mean_terminated_length": 509.054931640625, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.2048, "grad_norm": 0.061243936419487, "kl": 0.10663604736328125, "learning_rate": 2.2222222222222224e-07, "loss": 0.0477, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035767100751399994, "mask/share_reasoning": 0.837902307510376, "mask/share_step_conf": 0.12242428958415985, "num_tokens": 44793746.0, "reward": 1.1880981922149658, "reward_std": 0.2168246954679489, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.8043121099472046, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8432351350784302, "step": 192 }, { "adv/mean_abs_final_conf": 0.7263143062591553, "adv/mean_abs_reasoning": 0.6314749717712402, "adv/mean_abs_step_conf": 0.7740864157676697, "adv/ratio_final_to_reasoning": 1.150187004596394, "adv/ratio_step_to_reasoning": 1.225838632363235, "adv/std_final_conf": 0.8906956315040588, "adv/std_reasoning": 0.8266628384590149, "adv/std_step_conf": 0.9355929493904114, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7353971232020013, "calib/avg_num_step_conf": 5.69140625, "calib/ece": 0.2540316205533597, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.45849802371541504, "calib/gap": 0.31774108818011254, "calib/mean_conf": 0.6152173913043478, "calib/mu_c": 0.7696923076923077, "calib/mu_w": 0.45195121951219513, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17770750988142292, "calib/std_conf": 0.4199242969769833, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.508815592203898, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.12327128840642965, "calib/step_q_w": 0.3855443037974684, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 504.51171875, "completions/mean_terminated_length": 504.51171875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.20586666666666667, "grad_norm": 0.042873747646808624, "kl": 0.0930938720703125, "learning_rate": 1.9444444444444447e-07, "loss": 0.0569, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.033232901245355606, "mask/share_reasoning": 0.8482795357704163, "mask/share_step_conf": 0.11848757416009903, "num_tokens": 45028613.0, "reward": 1.1263048648834229, "reward_std": 0.26204511523246765, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7099542617797852, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8289578557014465, "step": 193 }, { "adv/mean_abs_final_conf": 0.6581943035125732, "adv/mean_abs_reasoning": 0.5350709557533264, "adv/mean_abs_step_conf": 0.7595569491386414, "adv/ratio_final_to_reasoning": 1.2301065801373978, "adv/ratio_step_to_reasoning": 1.4195443444865383, "adv/std_final_conf": 0.8700836300849915, "adv/std_reasoning": 0.7928699254989624, "adv/std_step_conf": 0.9356017708778381, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.801510989010989, "calib/avg_num_step_conf": 5.4140625, "calib/ece": 0.19689243027888442, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.49800796812749004, "calib/gap": 0.42150641025641017, "calib/mean_conf": 0.6286852589641435, "calib/mu_c": 0.8033333333333332, "calib/mu_w": 0.38182692307692306, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11996015936254979, "calib/std_conf": 0.42559387209635524, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5106290956749673, "calib/step_q_c_n": 763.0, "calib/step_q_gap": 0.13867082922231883, "calib/step_q_w": 0.37195826645264846, "calib/step_q_w_n": 623.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 489.75, "completions/mean_terminated_length": 491.6706237792969, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.20693333333333333, "grad_norm": 0.031372662633657455, "kl": 0.09343719482421875, "learning_rate": 1.6666666666666668e-07, "loss": 0.0224, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03451238200068474, "mask/share_reasoning": 0.845749020576477, "mask/share_step_conf": 0.11583234369754791, "num_tokens": 45259933.0, "reward": 1.1746629476547241, "reward_std": 0.23227962851524353, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7637102007865906, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8497854471206665, "step": 194 }, { "adv/mean_abs_final_conf": 0.6965253353118896, "adv/mean_abs_reasoning": 0.5378446578979492, "adv/mean_abs_step_conf": 0.7651320695877075, "adv/ratio_final_to_reasoning": 1.2950306842018473, "adv/ratio_step_to_reasoning": 1.4225893264015348, "adv/std_final_conf": 0.8900267481803894, "adv/std_reasoning": 0.792778491973877, "adv/std_step_conf": 0.9356444478034973, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7591954022988506, "calib/avg_num_step_conf": 5.828125, "calib/ece": 0.2272509960159363, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5099601593625498, "calib/gap": 0.4037758620689656, "calib/mean_conf": 0.6113944223107569, "calib/mu_c": 0.798, "calib/mu_w": 0.39422413793103445, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15039840637450205, "calib/std_conf": 0.43467445322633635, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5138539042821159, "calib/step_q_c_n": 794.0, "calib/step_q_gap": 0.11080232835088377, "calib/step_q_w": 0.4030515759312321, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2257.0, "completions/max_terminated_length": 2257.0, "completions/mean_length": 490.859375, "completions/mean_terminated_length": 494.7243957519531, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.208, "grad_norm": 0.041009191423654556, "kl": 0.10141754150390625, "learning_rate": 1.3888888888888888e-07, "loss": -0.0664, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034289367496967316, "mask/share_reasoning": 0.8296006917953491, "mask/share_step_conf": 0.12829747796058655, "num_tokens": 45491577.0, "reward": 1.1288546323776245, "reward_std": 0.25321608781814575, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.735246479511261, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8149751424789429, "step": 195 }, { "adv/mean_abs_final_conf": 0.639642596244812, "adv/mean_abs_reasoning": 0.43357157707214355, "adv/mean_abs_step_conf": 0.7559478282928467, "adv/ratio_final_to_reasoning": 1.4752871960939902, "adv/ratio_step_to_reasoning": 1.7435364038336438, "adv/std_final_conf": 0.8292726278305054, "adv/std_reasoning": 0.7013278007507324, "adv/std_step_conf": 0.9355693459510803, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7500616294835449, "calib/avg_num_step_conf": 5.3203125, "calib/ece": 0.22945098039215683, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.611764705882353, "calib/gap": 0.3932916307161346, "calib/mean_conf": 0.7307843137254901, "calib/mu_c": 0.9189473684210526, "calib/mu_w": 0.525655737704918, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2193333333333333, "calib/std_conf": 0.3851388501197819, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5326361031518624, "calib/step_q_c_n": 698.0, "calib/step_q_gap": 0.08710899471812744, "calib/step_q_w": 0.44552710843373494, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 421.9609375, "completions/mean_terminated_length": 423.61572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.20906666666666668, "grad_norm": 0.03640659153461456, "kl": 0.1021728515625, "learning_rate": 1.1111111111111112e-07, "loss": -0.0044, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03759670630097389, "mask/share_reasoning": 0.827721118927002, "mask/share_step_conf": 0.13077595829963684, "num_tokens": 45702143.0, "reward": 1.1514766216278076, "reward_std": 0.18845239281654358, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7516941428184509, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.832089364528656, "step": 196 }, { "adv/mean_abs_final_conf": 0.7016844749450684, "adv/mean_abs_reasoning": 0.6250609755516052, "adv/mean_abs_step_conf": 0.7250959277153015, "adv/ratio_final_to_reasoning": 1.1225856394663645, "adv/ratio_step_to_reasoning": 1.1600403097880447, "adv/std_final_conf": 0.8904477953910828, "adv/std_reasoning": 0.8589926958084106, "adv/std_step_conf": 0.9358147382736206, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7472128597355459, "calib/avg_num_step_conf": 6.12109375, "calib/ece": 0.24686746987951802, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5261044176706827, "calib/gap": 0.35282473424941674, "calib/mean_conf": 0.6451807228915661, "calib/mu_c": 0.8095488721804512, "calib/mu_w": 0.4567241379310345, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17895582329317267, "calib/std_conf": 0.4208769619953476, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4814709677419355, "calib/step_q_c_n": 775.0, "calib/step_q_gap": 0.08943813945910717, "calib/step_q_w": 0.3920328282828283, "calib/step_q_w_n": 792.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2619.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 518.7578125, "completions/mean_terminated_length": 518.7578125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.21013333333333334, "grad_norm": 0.03738636150956154, "kl": 0.0984039306640625, "learning_rate": 8.333333333333334e-08, "loss": 0.0697, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032930146902799606, "mask/share_reasoning": 0.8381119966506958, "mask/share_step_conf": 0.1289578378200531, "num_tokens": 45940001.0, "reward": 1.1338474750518799, "reward_std": 0.2597096562385559, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7132207155227661, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8373578190803528, "step": 197 }, { "adv/mean_abs_final_conf": 0.6680877208709717, "adv/mean_abs_reasoning": 0.5190901756286621, "adv/mean_abs_step_conf": 0.7632308006286621, "adv/ratio_final_to_reasoning": 1.2870359568293908, "adv/ratio_step_to_reasoning": 1.470324110265283, "adv/std_final_conf": 0.8786221146583557, "adv/std_reasoning": 0.7753717303276062, "adv/std_step_conf": 0.9356816411018372, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8258064516129031, "calib/avg_num_step_conf": 6.07421875, "calib/ece": 0.15480314960629915, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.48031496062992124, "calib/gap": 0.5219745845552297, "calib/mean_conf": 0.5991338582677166, "calib/mu_c": 0.8025806451612902, "calib/mu_w": 0.2806060606060606, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07185039370078736, "calib/std_conf": 0.4342103405922523, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49769696335078534, "calib/step_q_c_n": 955.0, "calib/step_q_gap": 0.10981363001745204, "calib/step_q_w": 0.3878833333333333, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2560.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 457.97265625, "completions/mean_terminated_length": 457.97265625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.2112, "grad_norm": 0.05018523335456848, "kl": 0.1059112548828125, "learning_rate": 5.555555555555556e-08, "loss": 0.0534, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.037562139332294464, "mask/share_reasoning": 0.8166549205780029, "mask/share_step_conf": 0.14578291773796082, "num_tokens": 46162626.0, "reward": 1.2054414749145508, "reward_std": 0.20616331696510315, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.8151402473449707, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.851328432559967, "step": 198 }, { "adv/mean_abs_final_conf": 0.7400945425033569, "adv/mean_abs_reasoning": 0.6025816202163696, "adv/mean_abs_step_conf": 0.77257239818573, "adv/ratio_final_to_reasoning": 1.2282063004802741, "adv/ratio_step_to_reasoning": 1.2821041536386748, "adv/std_final_conf": 0.906673014163971, "adv/std_reasoning": 0.8266450762748718, "adv/std_step_conf": 0.9356889724731445, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6141747828375889, "calib/avg_num_step_conf": 5.8671875, "calib/ece": 0.32704453441295545, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.582995951417004, "calib/gap": 0.2014648591734668, "calib/mean_conf": 0.6980566801619432, "calib/mu_c": 0.7926717557251909, "calib/mu_w": 0.5912068965517241, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.24736842105263157, "calib/std_conf": 0.412814312622985, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5140081521739132, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.11232407906686354, "calib/step_q_w": 0.4016840731070496, "calib/step_q_w_n": 766.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 544.16796875, "completions/mean_terminated_length": 546.302001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.21226666666666666, "grad_norm": 0.03123634122312069, "kl": 0.0904998779296875, "learning_rate": 2.777777777777778e-08, "loss": 0.0078, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03431296348571777, "mask/share_reasoning": 0.833673894405365, "mask/share_step_conf": 0.12810686230659485, "num_tokens": 46406133.0, "reward": 1.0824717283248901, "reward_std": 0.27605992555618286, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6297984719276428, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8265548944473267, "step": 199 }, { "adv/mean_abs_final_conf": 0.6329021453857422, "adv/mean_abs_reasoning": 0.4508885145187378, "adv/mean_abs_step_conf": 0.7580750584602356, "adv/ratio_final_to_reasoning": 1.403677683077111, "adv/ratio_step_to_reasoning": 1.6812915699779527, "adv/std_final_conf": 0.8475625514984131, "adv/std_reasoning": 0.701388418674469, "adv/std_step_conf": 0.9355641007423401, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8224703149199794, "calib/avg_num_step_conf": 5.75, "calib/ece": 0.191106719367589, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5652173913043478, "calib/gap": 0.49758711925658233, "calib/mean_conf": 0.6452569169960475, "calib/mu_c": 0.8497986577181208, "calib/mu_w": 0.35221153846153846, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12371541501976291, "calib/std_conf": 0.4400858176299625, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5602846534653466, "calib/step_q_c_n": 808.0, "calib/step_q_gap": 0.21865814744125028, "calib/step_q_w": 0.34162650602409633, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 520.86328125, "completions/mean_terminated_length": 520.86328125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.21333333333333335, "grad_norm": 0.033193521201610565, "kl": 0.0963134765625, "learning_rate": 0.0, "loss": 0.0717, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03284844756126404, "mask/share_reasoning": 0.8499813079833984, "mask/share_step_conf": 0.11717026680707932, "num_tokens": 46647522.0, "reward": 1.2013983726501465, "reward_std": 0.2132171094417572, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7925854921340942, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8640990853309631, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.0034122529719024896, "train_runtime": 12611.9052, "train_samples_per_second": 4.06, "train_steps_per_second": 0.016 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 46647522, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }