{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7490277290344238, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343300461769104, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04266543686389923, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.078, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 0.7281402349472046, "reward_std": 0.16804265975952148, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420004606246948, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7698483467102051, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345317482948303, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.03963975980877876, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0095, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 0.6806339025497437, "reward_std": 0.16487614810466766, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291916012763977, "step": 2 }, { "adv/mean_abs_final_conf": 0.7796330451965332, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7743935585021973, "adv/std_final_conf": 0.9288880825042725, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.932464599609375, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4512251048218029, "calib/avg_num_step_conf": 4.90625, "calib/ece": 0.25462745098039213, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.30196078431372547, "calib/gap": -0.006965408805031492, "calib/mean_conf": 0.8781568627450981, "calib/mu_c": 0.8755345911949685, "calib/mu_w": 0.8825, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.25462745098039213, "calib/std_conf": 0.05181542375757666, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7867372881355932, "calib/step_q_c_n": 708.0, "calib/step_q_gap": 0.024255536310775594, "calib/step_q_w": 0.7624817518248176, "calib/step_q_w_n": 548.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1080.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 490.4765625, "completions/mean_terminated_length": 492.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.05618441477417946, "kl": 0.0011872649192810059, "learning_rate": 7.5e-07, "loss": -0.0191, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03281779587268829, "mask/share_reasoning": 0.8549892902374268, "mask/share_step_conf": 0.10828666388988495, "num_tokens": 689479.0, "reward": 0.7213048934936523, "reward_std": 0.13984259963035583, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6905413866043091, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7520683407783508, "step": 3 }, { "adv/mean_abs_final_conf": 0.7581098079681396, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7545291185379028, "adv/std_final_conf": 0.9290906190872192, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343487620353699, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4615896358543418, "calib/avg_num_step_conf": 4.90234375, "calib/ece": 0.20826771653543305, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2637795275590551, "calib/gap": -0.0029355742296919285, "calib/mean_conf": 0.8775590551181104, "calib/mu_c": 0.8765882352941176, "calib/mu_w": 0.8795238095238095, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20826771653543305, "calib/std_conf": 0.048783896520646235, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.793607594936709, "calib/step_q_c_n": 790.0, "calib/step_q_gap": 0.006854906764665936, "calib/step_q_w": 0.786752688172043, "calib/step_q_w_n": 465.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 501.4609375, "completions/mean_terminated_length": 503.427490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.004266666666666667, "grad_norm": 0.04018299654126167, "kl": 0.000284343957901001, "learning_rate": 1.0000000000000002e-06, "loss": 0.0582, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03364722430706024, "mask/share_reasoning": 0.849086344242096, "mask/share_step_conf": 0.11336017400026321, "num_tokens": 924021.0, "reward": 0.7345324754714966, "reward_std": 0.13953471183776855, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7220828533172607, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7469820976257324, "step": 4 }, { "adv/mean_abs_final_conf": 0.7413392066955566, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7512904405593872, "adv/std_final_conf": 0.9298239946365356, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348950982093811, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4331679894179894, "calib/avg_num_step_conf": 4.62109375, "calib/ece": 0.33457489878542507, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.3157894736842105, "calib/gap": -0.00674735449735453, "calib/mean_conf": 0.8811336032388664, "calib/mu_c": 0.878074074074074, "calib/mu_w": 0.8848214285714285, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.33457489878542507, "calib/std_conf": 0.04259224764063817, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.7943534482758621, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.0008832224031720681, "calib/step_q_w": 0.79347022587269, "calib/step_q_w_n": 487.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 513.0, "completions/mean_terminated_length": 515.0117797851562, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.005333333333333333, "grad_norm": 0.03751188516616821, "kl": 0.0002925395965576172, "learning_rate": 1.25e-06, "loss": -0.0334, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.034141309559345245, "mask/share_reasoning": 0.852514386177063, "mask/share_step_conf": 0.10943801701068878, "num_tokens": 1162037.0, "reward": 0.6394835710525513, "reward_std": 0.17741966247558594, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6105879545211792, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6683791875839233, "step": 5 }, { "adv/mean_abs_final_conf": 0.7694669961929321, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7502487897872925, "adv/std_final_conf": 0.9307279586791992, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.933992326259613, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5237428348097969, "calib/avg_num_step_conf": 4.9140625, "calib/ece": 0.2790513833992096, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2608695652173913, "calib/gap": 0.0010552371026577578, "calib/mean_conf": 0.8798418972332016, "calib/mu_c": 0.880263157894737, "calib/mu_w": 0.8792079207920792, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2790513833992096, "calib/std_conf": 0.041168393988522095, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7931697054698457, "calib/step_q_c_n": 713.0, "calib/step_q_gap": -0.009380753245750606, "calib/step_q_w": 0.8025504587155963, "calib/step_q_w_n": 545.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2863.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 449.0234375, "completions/mean_terminated_length": 450.7843322753906, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.0064, "grad_norm": 0.05391272157430649, "kl": 0.0003655552864074707, "learning_rate": 1.5e-06, "loss": -0.0082, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03775656595826149, "mask/share_reasoning": 0.8353100419044495, "mask/share_step_conf": 0.12302714586257935, "num_tokens": 1382939.0, "reward": 0.689347505569458, "reward_std": 0.153395414352417, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6682343482971191, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7104606032371521, "step": 6 }, { "adv/mean_abs_final_conf": 0.7352026700973511, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7448313236236572, "adv/std_final_conf": 0.9303086996078491, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341685175895691, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.49784187448616063, "calib/avg_num_step_conf": 5.39453125, "calib/ece": 0.24169960474308289, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.30039525691699603, "calib/gap": -0.0010311044121676938, "calib/mean_conf": 0.8822529644268774, "calib/mu_c": 0.881890243902439, "calib/mu_w": 0.8829213483146067, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.23786561264822126, "calib/std_conf": 0.04291805825539821, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7909617486338798, "calib/step_q_c_n": 915.0, "calib/step_q_gap": 0.005403808719716685, "calib/step_q_w": 0.7855579399141631, "calib/step_q_w_n": 466.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2242.0, "completions/max_terminated_length": 2242.0, "completions/mean_length": 525.84375, "completions/mean_terminated_length": 527.9058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 195.0, "epoch": 0.007466666666666667, "grad_norm": 0.060856908559799194, "kl": 0.00033777952194213867, "learning_rate": 1.75e-06, "loss": -0.0041, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030560888350009918, "mask/share_reasoning": 0.8551996946334839, "mask/share_step_conf": 0.11033320426940918, "num_tokens": 1624979.0, "reward": 0.7218343019485474, "reward_std": 0.17086157202720642, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7019370794296265, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7417315244674683, "step": 7 }, { "adv/mean_abs_final_conf": 0.7708044648170471, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7599292993545532, "adv/std_final_conf": 0.9317449331283569, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343979954719543, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4968477531857814, "calib/avg_num_step_conf": 4.890625, "calib/ece": 0.29518218623481773, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.25101214574898784, "calib/gap": 0.013062374245472741, "calib/mean_conf": 0.870080971659919, "calib/mu_c": 0.8756338028169014, "calib/mu_w": 0.8625714285714287, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.29518218623481773, "calib/std_conf": 0.07809986290186875, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7903883495145632, "calib/step_q_c_n": 618.0, "calib/step_q_gap": 0.026240084530335994, "calib/step_q_w": 0.7641482649842272, "calib/step_q_w_n": 634.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2577.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 536.0078125, "completions/mean_terminated_length": 538.10986328125, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.008533333333333334, "grad_norm": 0.043951455503702164, "kl": 0.0003973245620727539, "learning_rate": 2.0000000000000003e-06, "loss": -0.0311, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03252224624156952, "mask/share_reasoning": 0.860336184501648, "mask/share_step_conf": 0.10323531180620193, "num_tokens": 1868709.0, "reward": 0.6831406354904175, "reward_std": 0.1717548966407776, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6414449214935303, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7248363494873047, "step": 8 }, { "adv/mean_abs_final_conf": 0.7700271606445312, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7614004611968994, "adv/std_final_conf": 0.9310073852539062, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345640540122986, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4621662701559609, "calib/avg_num_step_conf": 5.0703125, "calib/ece": 0.2667588932806325, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3359683794466403, "calib/gap": -0.006087761036214778, "calib/mean_conf": 0.8833596837944664, "calib/mu_c": 0.881025641025641, "calib/mu_w": 0.8871134020618557, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2667588932806325, "calib/std_conf": 0.04391170933248677, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7720178799489145, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.023901375094545596, "calib/step_q_w": 0.7481165048543689, "calib/step_q_w_n": 515.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2465.0, "completions/max_terminated_length": 2465.0, "completions/mean_length": 489.2734375, "completions/mean_terminated_length": 493.1259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.0096, "grad_norm": 0.03829352185130119, "kl": 0.00040724873542785645, "learning_rate": 2.25e-06, "loss": -0.02, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03479592502117157, "mask/share_reasoning": 0.8491111993789673, "mask/share_step_conf": 0.10828033089637756, "num_tokens": 2101499.0, "reward": 0.6831810474395752, "reward_std": 0.17485444247722626, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6700422167778015, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6963199377059937, "step": 9 }, { "adv/mean_abs_final_conf": 0.7428078055381775, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7591912150382996, "adv/std_final_conf": 0.9304345846176147, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9342936873435974, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5017532467532467, "calib/avg_num_step_conf": 5.1328125, "calib/ece": 0.27909448818897636, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3700787401574803, "calib/gap": 0.017472727272727084, "calib/mean_conf": 0.8853937007874015, "calib/mu_c": 0.8922727272727271, "calib/mu_w": 0.8748, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27909448818897636, "calib/std_conf": 0.08844034935136104, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7868376536215461, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.0044474954493142205, "calib/step_q_w": 0.7823901581722319, "calib/step_q_w_n": 569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 522.12890625, "completions/mean_terminated_length": 522.12890625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.010666666666666666, "grad_norm": 0.03278151899576187, "kl": 0.0004666447639465332, "learning_rate": 2.5e-06, "loss": 0.0261, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03185880556702614, "mask/share_reasoning": 0.8579164743423462, "mask/share_step_conf": 0.11022467166185379, "num_tokens": 2341964.0, "reward": 0.7027997970581055, "reward_std": 0.162948340177536, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6785824298858643, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7270171642303467, "step": 10 }, { "adv/mean_abs_final_conf": 0.7659250497817993, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7720210552215576, "adv/std_final_conf": 0.9277737736701965, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9336807727813721, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47283041401273884, "calib/avg_num_step_conf": 5.43359375, "calib/ece": 0.2828853754940712, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.35177865612648224, "calib/gap": -0.019252255838641208, "calib/mean_conf": 0.8794071146245058, "calib/mu_c": 0.8721019108280256, "calib/mu_w": 0.8913541666666668, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.27086956521739136, "calib/std_conf": 0.09333544982717665, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7728014616321559, "calib/step_q_c_n": 821.0, "calib/step_q_gap": 0.0013628651409278714, "calib/step_q_w": 0.771438596491228, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3004.0, "completions/max_terminated_length": 3004.0, "completions/mean_length": 536.41796875, "completions/mean_terminated_length": 536.41796875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.011733333333333333, "grad_norm": 0.04829133301973343, "kl": 0.0008193850517272949, "learning_rate": 2.7500000000000004e-06, "loss": 0.0151, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0325528159737587, "mask/share_reasoning": 0.8514052629470825, "mask/share_step_conf": 0.11604189872741699, "num_tokens": 2583767.0, "reward": 0.7033008337020874, "reward_std": 0.1320822387933731, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6716292500495911, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7349722981452942, "step": 11 }, { "adv/mean_abs_final_conf": 0.7572861909866333, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7546399831771851, "adv/std_final_conf": 0.9274834394454956, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9344656467437744, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5159368168676398, "calib/avg_num_step_conf": 5.703125, "calib/ece": 0.2420799999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.472, "calib/gap": 0.004847330935759175, "calib/mean_conf": 0.89408, "calib/mu_c": 0.8957668711656441, "calib/mu_w": 0.890919540229885, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2420799999999999, "calib/std_conf": 0.05309005179880691, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7620438799076213, "calib/step_q_c_n": 866.0, "calib/step_q_gap": -0.0006497227859814103, "calib/step_q_w": 0.7626936026936028, "calib/step_q_w_n": 594.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2856.0, "completions/max_terminated_length": 2856.0, "completions/mean_length": 484.40625, "completions/mean_terminated_length": 488.220458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.0128, "grad_norm": 0.04951918497681618, "kl": 0.0016373395919799805, "learning_rate": 3e-06, "loss": 0.001, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0348307266831398, "mask/share_reasoning": 0.8321601152420044, "mask/share_step_conf": 0.12519662082195282, "num_tokens": 2811951.0, "reward": 0.72372967004776, "reward_std": 0.17495250701904297, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6914949417114258, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7559643983840942, "step": 12 }, { "adv/mean_abs_final_conf": 0.7482670545578003, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.756947934627533, "adv/std_final_conf": 0.9266350865364075, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343085289001465, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5362989801395598, "calib/avg_num_step_conf": 4.8125, "calib/ece": 0.26543307086614176, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5118110236220472, "calib/gap": 0.0064251207729467685, "calib/mean_conf": 0.9032283464566928, "calib/mu_c": 0.9055555555555554, "calib/mu_w": 0.8991304347826087, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26543307086614176, "calib/std_conf": 0.04485114294251648, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7694840000000001, "calib/step_q_c_n": 750.0, "calib/step_q_gap": 0.018488149377593466, "calib/step_q_w": 0.7509958506224066, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 486.62109375, "completions/mean_terminated_length": 486.62109375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.013866666666666666, "grad_norm": 0.027871543541550636, "kl": 0.0021517276763916016, "learning_rate": 3.2500000000000002e-06, "loss": -0.0376, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03444457799196243, "mask/share_reasoning": 0.8526432514190674, "mask/share_step_conf": 0.1129121482372284, "num_tokens": 3041118.0, "reward": 0.733703076839447, "reward_std": 0.151658296585083, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6940249800682068, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7733811140060425, "step": 13 }, { "adv/mean_abs_final_conf": 0.7391790151596069, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7640714645385742, "adv/std_final_conf": 0.9275602102279663, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341883063316345, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.38958755916159565, "calib/avg_num_step_conf": 5.32421875, "calib/ece": 0.33461538461538465, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6963562753036437, "calib/gap": -0.012419878296146103, "calib/mean_conf": 0.9194736842105263, "calib/mu_c": 0.9143448275862068, "calib/mu_w": 0.9267647058823529, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.33352226720647776, "calib/std_conf": 0.035939196230548164, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7463983628922237, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.03850947400333482, "calib/step_q_w": 0.7078888888888889, "calib/step_q_w_n": 630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2752.0, "completions/max_terminated_length": 2752.0, "completions/mean_length": 554.06640625, "completions/mean_terminated_length": 556.2392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.014933333333333333, "grad_norm": 0.036938756704330444, "kl": 0.00412750244140625, "learning_rate": 3.5e-06, "loss": 0.0002, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03238217532634735, "mask/share_reasoning": 0.849589467048645, "mask/share_step_conf": 0.11412206292152405, "num_tokens": 3288359.0, "reward": 0.6798146963119507, "reward_std": 0.16921201348304749, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6168820858001709, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.74274742603302, "step": 14 }, { "adv/mean_abs_final_conf": 0.7596204280853271, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.75995934009552, "adv/std_final_conf": 0.9207594394683838, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343336224555969, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.49663684243924106, "calib/avg_num_step_conf": 4.890625, "calib/ece": 0.33113281249999993, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.76953125, "calib/gap": 0.0013852401802143532, "calib/mean_conf": 0.9287890624999999, "calib/mu_c": 0.9293464052287581, "calib/mu_w": 0.9279611650485438, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33113281249999993, "calib/std_conf": 0.04057807912372261, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7134652114597545, "calib/step_q_c_n": 733.0, "calib/step_q_gap": -0.007710125727143469, "calib/step_q_w": 0.721175337186898, "calib/step_q_w_n": 519.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 466.4375, "completions/mean_terminated_length": 468.2666931152344, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.016, "grad_norm": 0.035972610116004944, "kl": 0.008241653442382812, "learning_rate": 3.7500000000000005e-06, "loss": -0.0063, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034142546355724335, "mask/share_reasoning": 0.8487222790718079, "mask/share_step_conf": 0.11322891712188721, "num_tokens": 3515647.0, "reward": 0.7133145332336426, "reward_std": 0.1503565013408661, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6489074230194092, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.7777215242385864, "step": 15 }, { "adv/mean_abs_final_conf": 0.7493153214454651, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7484169006347656, "adv/std_final_conf": 0.9216910600662231, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349498152732849, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5604395604395604, "calib/avg_num_step_conf": 5.953125, "calib/ece": 0.35545816733067703, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8764940239043825, "calib/gap": 0.010113814756671835, "calib/mean_conf": 0.9411155378486055, "calib/mu_c": 0.9453061224489795, "calib/mu_w": 0.9351923076923077, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.35545816733067703, "calib/std_conf": 0.0362202013476857, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6621888888888889, "calib/step_q_c_n": 900.0, "calib/step_q_gap": 0.011259401709401762, "calib/step_q_w": 0.6509294871794872, "calib/step_q_w_n": 624.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 635.55078125, "completions/mean_terminated_length": 640.5551147460938, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.017066666666666667, "grad_norm": 0.02724113129079342, "kl": 0.009087562561035156, "learning_rate": 4.000000000000001e-06, "loss": -0.0956, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.025532642379403114, "mask/share_reasoning": 0.8610584735870361, "mask/share_step_conf": 0.10559634864330292, "num_tokens": 3787196.0, "reward": 0.6936075687408447, "reward_std": 0.16829201579093933, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6143804788589478, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7728347182273865, "step": 16 }, { "adv/mean_abs_final_conf": 0.7701978087425232, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7553335428237915, "adv/std_final_conf": 0.9151206612586975, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934809684753418, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5359654731457801, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.21357142857142852, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9246031746031746, "calib/gap": 0.004705882352941337, "calib/mean_conf": 0.9437301587301588, "calib/mu_c": 0.9450000000000002, "calib/mu_w": 0.9402941176470588, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21357142857142852, "calib/std_conf": 0.026359364077647593, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6380529300567107, "calib/step_q_c_n": 1058.0, "calib/step_q_gap": -0.004592572588791977, "calib/step_q_w": 0.6426455026455027, "calib/step_q_w_n": 378.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2565.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 534.19921875, "completions/mean_terminated_length": 536.2941284179688, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.018133333333333335, "grad_norm": 0.0346217043697834, "kl": 0.013601303100585938, "learning_rate": 4.25e-06, "loss": 0.02, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03202342242002487, "mask/share_reasoning": 0.8436535000801086, "mask/share_step_conf": 0.1204168051481247, "num_tokens": 4027479.0, "reward": 0.7808581590652466, "reward_std": 0.17788267135620117, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7462132573127747, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8155031204223633, "step": 17 }, { "adv/mean_abs_final_conf": 0.7502391934394836, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7626986503601074, "adv/std_final_conf": 0.9088346362113953, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348260164260864, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.57067383739323, "calib/avg_num_step_conf": 4.85546875, "calib/ece": 0.38055118110236213, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9566929133858267, "calib/gap": 0.007304017715912603, "calib/mean_conf": 0.9514173228346455, "calib/mu_c": 0.954551724137931, "calib/mu_w": 0.9472477064220184, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.38055118110236213, "calib/std_conf": 0.031050489984311938, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6232016348773842, "calib/step_q_c_n": 734.0, "calib/step_q_gap": -0.042810152941869184, "calib/step_q_w": 0.6660117878192534, "calib/step_q_w_n": 509.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 504.359375, "completions/mean_terminated_length": 508.3307189941406, "completions/min_length": 0.0, "completions/min_terminated_length": 195.0, "epoch": 0.0192, "grad_norm": 0.029888764023780823, "kl": 0.015348434448242188, "learning_rate": 4.5e-06, "loss": -0.0963, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03175097703933716, "mask/share_reasoning": 0.8558838367462158, "mask/share_step_conf": 0.10455270856618881, "num_tokens": 4267315.0, "reward": 0.6815993785858154, "reward_std": 0.15395517647266388, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6041250228881836, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.759073793888092, "step": 18 }, { "adv/mean_abs_final_conf": 0.7441777586936951, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7759392857551575, "adv/std_final_conf": 0.8993977904319763, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9340354800224304, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4692715231788079, "calib/avg_num_step_conf": 4.68359375, "calib/ece": 0.35298804780876497, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9760956175298805, "calib/gap": 0.013600000000000168, "calib/mean_conf": 0.954581673306773, "calib/mu_c": 0.9600000000000001, "calib/mu_w": 0.9463999999999999, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.35298804780876497, "calib/std_conf": 0.07735238031133577, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6428904109589041, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.009777404562315684, "calib/step_q_w": 0.6331130063965884, "calib/step_q_w_n": 469.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 487.98828125, "completions/mean_terminated_length": 489.9019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 225.0, "epoch": 0.020266666666666665, "grad_norm": 10893.642578125, "kl": 10240.019634246826, "learning_rate": 4.75e-06, "loss": 534.4695, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031817466020584106, "mask/share_reasoning": 0.8552243709564209, "mask/share_step_conf": 0.10905186831951141, "num_tokens": 4497000.0, "reward": 0.70301353931427, "reward_std": 0.1717141568660736, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6160424947738647, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7899844646453857, "step": 19 }, { "adv/mean_abs_final_conf": 0.7263522744178772, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.754086971282959, "adv/std_final_conf": 0.891075074672699, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345456957817078, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.47034706331045006, "calib/avg_num_step_conf": 5.5234375, "calib/ece": 0.416984126984127, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9722222222222222, "calib/gap": 0.000877192982455921, "calib/mean_conf": 0.9646031746031747, "calib/mu_c": 0.9649999999999999, "calib/mu_w": 0.9641228070175439, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.416984126984127, "calib/std_conf": 0.0233554586818447, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6103757225433526, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.030154115895153155, "calib/step_q_w": 0.5802216066481994, "calib/step_q_w_n": 722.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2558.0, "completions/max_terminated_length": 2558.0, "completions/mean_length": 492.3515625, "completions/mean_terminated_length": 496.22833251953125, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.021333333333333333, "grad_norm": 0.029597941786050797, "kl": 0.023279190063476562, "learning_rate": 5e-06, "loss": -0.0616, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03490706533193588, "mask/share_reasoning": 0.8316913843154907, "mask/share_step_conf": 0.1255890280008316, "num_tokens": 4727914.0, "reward": 0.673703134059906, "reward_std": 0.13693217933177948, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5692453384399414, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7781610488891602, "step": 20 }, { "adv/mean_abs_final_conf": 0.6890993118286133, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7478216886520386, "adv/std_final_conf": 0.8967701196670532, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352716207504272, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5542207792207793, "calib/avg_num_step_conf": 5.76953125, "calib/ece": 0.3635826771653544, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.0032688311688313654, "calib/mean_conf": 0.9698818897637796, "calib/mu_c": 0.9711688311688312, "calib/mu_w": 0.9678999999999999, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3635826771653544, "calib/std_conf": 0.017534718843846057, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5828265765765765, "calib/step_q_c_n": 888.0, "calib/step_q_gap": -0.012793117820707023, "calib/step_q_w": 0.5956196943972836, "calib/step_q_w_n": 589.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 486.63671875, "completions/mean_terminated_length": 490.468505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.0224, "grad_norm": 0.01837567612528801, "kl": 0.026103973388671875, "learning_rate": 4.9722222222222224e-06, "loss": -0.026, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03301333636045456, "mask/share_reasoning": 0.8315001726150513, "mask/share_step_conf": 0.1276739537715912, "num_tokens": 4955453.0, "reward": 0.7198125720024109, "reward_std": 0.18437299132347107, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6252046823501587, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8144204616546631, "step": 21 }, { "adv/mean_abs_final_conf": 0.6960532665252686, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7869043946266174, "adv/std_final_conf": 0.8804850578308105, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348463416099548, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47161892071952033, "calib/avg_num_step_conf": 5.83984375, "calib/ece": 0.34031620553359687, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9881422924901185, "calib/gap": 0.016654896735510016, "calib/mean_conf": 0.9648221343873518, "calib/mu_c": 0.9710759493670886, "calib/mu_w": 0.9544210526315786, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.34031620553359687, "calib/std_conf": 0.07940283534646002, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6132855567805954, "calib/step_q_c_n": 907.0, "calib/step_q_gap": 0.03634678127039126, "calib/step_q_w": 0.5769387755102041, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2813.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 484.32421875, "completions/mean_terminated_length": 488.1377868652344, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.023466666666666667, "grad_norm": 0.023580927401781082, "kl": 0.038417816162109375, "learning_rate": 4.944444444444445e-06, "loss": -0.0517, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032605595886707306, "mask/share_reasoning": 0.8292992115020752, "mask/share_step_conf": 0.1302826702594757, "num_tokens": 5181256.0, "reward": 0.724952757358551, "reward_std": 0.16283224523067474, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6435617208480835, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8063437938690186, "step": 22 }, { "adv/mean_abs_final_conf": 0.7656569480895996, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7499951124191284, "adv/std_final_conf": 0.9161418080329895, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356306195259094, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5094650471637162, "calib/avg_num_step_conf": 5.625, "calib/ece": 0.40884462151394435, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": -0.005951673342809216, "calib/mean_conf": 0.9670916334661355, "calib/mu_c": 0.964507042253521, "calib/mu_w": 0.9704587155963302, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4050996015936256, "calib/std_conf": 0.06065304938847371, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6222773536895674, "calib/step_q_c_n": 786.0, "calib/step_q_gap": 0.0031504423745827292, "calib/step_q_w": 0.6191269113149847, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 549.26171875, "completions/mean_terminated_length": 551.4157104492188, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.024533333333333334, "grad_norm": 0.022318795323371887, "kl": 0.03360748291015625, "learning_rate": 4.9166666666666665e-06, "loss": -0.0417, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03272176906466484, "mask/share_reasoning": 0.838639497756958, "mask/share_step_conf": 0.12473248690366745, "num_tokens": 5425803.0, "reward": 0.661957859992981, "reward_std": 0.21865949034690857, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5673785209655762, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7565371990203857, "step": 23 }, { "adv/mean_abs_final_conf": 0.7508785724639893, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7865563631057739, "adv/std_final_conf": 0.9145070910453796, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9358435273170471, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5549862041781632, "calib/avg_num_step_conf": 6.13671875, "calib/ece": 0.45259109311740886, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": 0.004272106162133804, "calib/mean_conf": 0.9748582995951417, "calib/mu_c": 0.9768992248062016, "calib/mu_w": 0.9726271186440678, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.45259109311740886, "calib/std_conf": 0.01714076520831078, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6104135876042908, "calib/step_q_c_n": 839.0, "calib/step_q_gap": 0.013647603997733349, "calib/step_q_w": 0.5967659836065574, "calib/step_q_w_n": 732.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2805.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 611.875, "completions/mean_terminated_length": 611.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.0256, "grad_norm": 0.02459198795258999, "kl": 0.032527923583984375, "learning_rate": 4.888888888888889e-06, "loss": -0.1041, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.029947519302368164, "mask/share_reasoning": 0.8479431867599487, "mask/share_step_conf": 0.1221093088388443, "num_tokens": 5686955.0, "reward": 0.63507080078125, "reward_std": 0.24100688099861145, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5282472968101501, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7418943643569946, "step": 24 }, { "adv/mean_abs_final_conf": 0.7561845779418945, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7850910425186157, "adv/std_final_conf": 0.8727063536643982, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347834587097168, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.44064102564102564, "calib/avg_num_step_conf": 5.9375, "calib/ece": 0.385236220472441, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0030628205128204744, "calib/mean_conf": 0.9757874015748031, "calib/mu_c": 0.9745333333333334, "calib/mu_w": 0.9775961538461538, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.385236220472441, "calib/std_conf": 0.015521450699946315, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6261529808773904, "calib/step_q_c_n": 889.0, "calib/step_q_gap": 0.03292001732746963, "calib/step_q_w": 0.5932329635499207, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 492.82421875, "completions/mean_terminated_length": 494.75689697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.02666666666666667, "grad_norm": 0.020179076120257378, "kl": 0.038753509521484375, "learning_rate": 4.861111111111111e-06, "loss": -0.043, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.032434768974781036, "mask/share_reasoning": 0.8340907096862793, "mask/share_step_conf": 0.1295682191848755, "num_tokens": 5916342.0, "reward": 0.6886754035949707, "reward_std": 0.14827287197113037, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6033198833465576, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7740308046340942, "step": 25 }, { "adv/mean_abs_final_conf": 0.7001343965530396, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7647371292114258, "adv/std_final_conf": 0.8727540373802185, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355230331420898, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5272166105499438, "calib/avg_num_step_conf": 5.83203125, "calib/ece": 0.32838000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": 0.0009238215488213131, "calib/mean_conf": 0.9748600000000002, "calib/mu_c": 0.9751851851851852, "calib/mu_w": 0.9742613636363638, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3276200000000001, "calib/std_conf": 0.018397836829366656, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6341238317757009, "calib/step_q_c_n": 856.0, "calib/step_q_gap": 0.02394015830631313, "calib/step_q_w": 0.6101836734693877, "calib/step_q_w_n": 637.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2478.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 494.59375, "completions/mean_terminated_length": 500.4585266113281, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.027733333333333332, "grad_norm": 0.024296032264828682, "kl": 0.043674468994140625, "learning_rate": 4.833333333333333e-06, "loss": -0.014, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.029950639232993126, "mask/share_reasoning": 0.8423521518707275, "mask/share_step_conf": 0.11597850918769836, "num_tokens": 6148198.0, "reward": 0.7104045152664185, "reward_std": 0.16006594896316528, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6456553936004639, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.775153636932373, "step": 26 }, { "adv/mean_abs_final_conf": 0.7514786720275879, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7520699501037598, "adv/std_final_conf": 0.9026116728782654, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352880716323853, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.49590769230769227, "calib/avg_num_step_conf": 6.41796875, "calib/ece": 0.4653333333333335, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.984313725490196, "calib/gap": 0.0012892307692307092, "calib/mean_conf": 0.9751372549019608, "calib/mu_c": 0.9757692307692307, "calib/mu_w": 0.97448, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4653333333333335, "calib/std_conf": 0.022732044936042178, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6068387096774194, "calib/step_q_c_n": 775.0, "calib/step_q_gap": 0.0008364055299538986, "calib/step_q_w": 0.6060023041474655, "calib/step_q_w_n": 868.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2313.0, "completions/max_terminated_length": 2313.0, "completions/mean_length": 516.35546875, "completions/mean_terminated_length": 516.35546875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.0288, "grad_norm": 0.021031202748417854, "kl": 0.04630279541015625, "learning_rate": 4.805555555555556e-06, "loss": 0.0115, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03199648857116699, "mask/share_reasoning": 0.8362272381782532, "mask/share_step_conf": 0.13177627325057983, "num_tokens": 6385601.0, "reward": 0.6513494253158569, "reward_std": 0.18348070979118347, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5315262079238892, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7711726427078247, "step": 27 }, { "adv/mean_abs_final_conf": 0.7151632308959961, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.786544144153595, "adv/std_final_conf": 0.8922801613807678, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356615543365479, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5129892037786774, "calib/avg_num_step_conf": 5.5859375, "calib/ece": 0.3602310756972111, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": -0.005878340080971323, "calib/mean_conf": 0.9730517928286853, "calib/mu_c": 0.9708269230769232, "calib/mu_w": 0.9767052631578945, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.35588446215139435, "calib/std_conf": 0.06291309147116418, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6100180995475114, "calib/step_q_c_n": 884.0, "calib/step_q_gap": 0.0076701141995260125, "calib/step_q_w": 0.6023479853479854, "calib/step_q_w_n": 546.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2751.0, "completions/max_terminated_length": 2751.0, "completions/mean_length": 562.87890625, "completions/mean_terminated_length": 565.0863037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.029866666666666666, "grad_norm": 0.023689938709139824, "kl": 0.045177459716796875, "learning_rate": 4.777777777777778e-06, "loss": -0.0802, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030054759234189987, "mask/share_reasoning": 0.8516813516616821, "mask/share_step_conf": 0.11435763537883759, "num_tokens": 6636642.0, "reward": 0.6863582134246826, "reward_std": 0.20521211624145508, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6138904094696045, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.758825957775116, "step": 28 }, { "adv/mean_abs_final_conf": 0.7489525079727173, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7721630334854126, "adv/std_final_conf": 0.9078423380851746, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354410171508789, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5187202604557977, "calib/avg_num_step_conf": 6.43359375, "calib/ece": 0.4936363636363637, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.01489669421487616, "calib/mean_conf": 0.971897233201581, "calib/mu_c": 0.9796694214876035, "calib/mu_w": 0.9647727272727273, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4936363636363637, "calib/std_conf": 0.08247383890068877, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6070542635658915, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.023874423932443656, "calib/step_q_w": 0.5831798396334479, "calib/step_q_w_n": 873.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2349.0, "completions/max_terminated_length": 2349.0, "completions/mean_length": 585.59375, "completions/mean_terminated_length": 587.8902587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.030933333333333334, "grad_norm": 0.024896398186683655, "kl": 0.0430755615234375, "learning_rate": 4.75e-06, "loss": -0.0228, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.027839384973049164, "mask/share_reasoning": 0.8492968082427979, "mask/share_step_conf": 0.11895756423473358, "num_tokens": 6893682.0, "reward": 0.6369415521621704, "reward_std": 0.20564672350883484, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.5014816522598267, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7724014520645142, "step": 29 }, { "adv/mean_abs_final_conf": 0.7562463283538818, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.750690221786499, "adv/std_final_conf": 0.8935472965240479, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354106783866882, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5064412238325282, "calib/avg_num_step_conf": 6.1328125, "calib/ece": 0.4310799999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.984, "calib/gap": 0.015848631239935407, "calib/mean_conf": 0.97108, "calib/mu_c": 0.9783703703703702, "calib/mu_w": 0.9625217391304348, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4310799999999999, "calib/std_conf": 0.08527621942839633, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5898925410872313, "calib/step_q_c_n": 791.0, "calib/step_q_gap": 0.03692527536194257, "calib/step_q_w": 0.5529672657252888, "calib/step_q_w_n": 779.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 606.51171875, "completions/mean_terminated_length": 606.51171875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.032, "grad_norm": 0.02309613674879074, "kl": 0.0448760986328125, "learning_rate": 4.722222222222222e-06, "loss": -0.0328, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.027759820222854614, "mask/share_reasoning": 0.8556734323501587, "mask/share_step_conf": 0.1165667176246643, "num_tokens": 7155933.0, "reward": 0.6598018407821655, "reward_std": 0.19287419319152832, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5530972480773926, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7665064334869385, "step": 30 }, { "adv/mean_abs_final_conf": 0.7802824974060059, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7845057249069214, "adv/std_final_conf": 0.9099698066711426, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353131055831909, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4822432962720733, "calib/avg_num_step_conf": 7.1484375, "calib/ece": 0.5296787148594378, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9799196787148594, "calib/gap": 0.006041203400915407, "calib/mean_conf": 0.9714457831325302, "calib/mu_c": 0.9748181818181818, "calib/mu_w": 0.9687769784172664, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.5296787148594378, "calib/std_conf": 0.06643169992789887, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5463068493150685, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.040461394769613857, "calib/step_q_w": 0.5058454545454546, "calib/step_q_w_n": 1100.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 641.30859375, "completions/mean_terminated_length": 643.8235473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.03306666666666667, "grad_norm": 0.04329806938767433, "kl": 0.044368743896484375, "learning_rate": 4.694444444444445e-06, "loss": -0.0532, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.026912782341241837, "mask/share_reasoning": 0.8497459888458252, "mask/share_step_conf": 0.11943497508764267, "num_tokens": 7426020.0, "reward": 0.6124132871627808, "reward_std": 0.18781393766403198, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.4585081934928894, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7663182616233826, "step": 31 }, { "adv/mean_abs_final_conf": 0.7666586637496948, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7693814039230347, "adv/std_final_conf": 0.9167671799659729, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356940984725952, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47760025062656647, "calib/avg_num_step_conf": 6.06640625, "calib/ece": 0.5046996047430831, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": -0.003230576441102362, "calib/mean_conf": 0.9770316205533598, "calib/mu_c": 0.9753333333333335, "calib/mu_w": 0.9785639097744359, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.503711462450593, "calib/std_conf": 0.0259186738909823, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5566849710982659, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.044272659832296, "calib/step_q_w": 0.5124123112659699, "calib/step_q_w_n": 861.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 582.82421875, "completions/mean_terminated_length": 582.82421875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.034133333333333335, "grad_norm": 0.03413167595863342, "kl": 0.05112457275390625, "learning_rate": 4.666666666666667e-06, "loss": -0.058, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.028594808652997017, "mask/share_reasoning": 0.854396641254425, "mask/share_step_conf": 0.11700853705406189, "num_tokens": 7681927.0, "reward": 0.6248780488967896, "reward_std": 0.19497355818748474, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.48585769534111023, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.763898491859436, "step": 32 }, { "adv/mean_abs_final_conf": 0.7173354029655457, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7580817341804504, "adv/std_final_conf": 0.8851724863052368, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935470700263977, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4924653215636822, "calib/avg_num_step_conf": 6.55859375, "calib/ece": 0.458452380952381, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9880952380952381, "calib/gap": 0.0009167717528373309, "calib/mean_conf": 0.9743253968253969, "calib/mu_c": 0.9747692307692308, "calib/mu_w": 0.9738524590163935, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.458452380952381, "calib/std_conf": 0.018255822887149574, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5375353418308226, "calib/step_q_c_n": 863.0, "calib/step_q_gap": -0.002783408169177326, "calib/step_q_w": 0.54031875, "calib/step_q_w_n": 816.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 574.7265625, "completions/mean_terminated_length": 574.7265625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.0352, "grad_norm": 0.01975160650908947, "kl": 0.053974151611328125, "learning_rate": 4.638888888888889e-06, "loss": -0.0549, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.028635134920477867, "mask/share_reasoning": 0.8466184139251709, "mask/share_step_conf": 0.12474644184112549, "num_tokens": 7935929.0, "reward": 0.655128538608551, "reward_std": 0.17714199423789978, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5317574143409729, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7784996628761292, "step": 33 }, { "adv/mean_abs_final_conf": 0.7687841057777405, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7759872078895569, "adv/std_final_conf": 0.9181791543960571, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351248145103455, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5015461315158399, "calib/avg_num_step_conf": 6.55859375, "calib/ece": 0.43280632411067205, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9723320158102767, "calib/gap": -0.01255080146409182, "calib/mean_conf": 0.966086956521739, "calib/mu_c": 0.960431654676259, "calib/mu_w": 0.9729824561403508, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4247430830039527, "calib/std_conf": 0.07909133377861023, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5107519675925926, "calib/step_q_c_n": 864.0, "calib/step_q_gap": 0.02507221299136564, "calib/step_q_w": 0.4856797546012269, "calib/step_q_w_n": 815.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 515.11328125, "completions/mean_terminated_length": 517.1333618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.03626666666666667, "grad_norm": 0.020340269431471825, "kl": 0.06337738037109375, "learning_rate": 4.611111111111112e-06, "loss": -0.0421, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030964922159910202, "mask/share_reasoning": 0.8273770809173584, "mask/share_step_conf": 0.1377517580986023, "num_tokens": 8172910.0, "reward": 0.6818004250526428, "reward_std": 0.1963166892528534, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5597124695777893, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8038883805274963, "step": 34 }, { "adv/mean_abs_final_conf": 0.7449852824211121, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.759742021560669, "adv/std_final_conf": 0.8946696519851685, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355137348175049, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4529529529529529, "calib/avg_num_step_conf": 5.7421875, "calib/ece": 0.4090627450980392, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": -0.0017976726726727232, "calib/mean_conf": 0.9737686274509804, "calib/mu_c": 0.972986111111111, "calib/mu_w": 0.9747837837837837, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4090627450980392, "calib/std_conf": 0.016829403426287484, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5247587064676617, "calib/step_q_c_n": 804.0, "calib/step_q_gap": 0.03468963739859254, "calib/step_q_w": 0.4900690690690691, "calib/step_q_w_n": 666.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 574.77734375, "completions/mean_terminated_length": 574.77734375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.037333333333333336, "grad_norm": 0.019349532201886177, "kl": 0.05916595458984375, "learning_rate": 4.583333333333333e-06, "loss": -0.0447, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.02842956781387329, "mask/share_reasoning": 0.8613673448562622, "mask/share_step_conf": 0.11020311713218689, "num_tokens": 8429309.0, "reward": 0.6868324875831604, "reward_std": 0.1873733252286911, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5831688046455383, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7904962301254272, "step": 35 }, { "adv/mean_abs_final_conf": 0.7299237847328186, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7725102305412292, "adv/std_final_conf": 0.8623014092445374, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352320432662964, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4311440677966102, "calib/avg_num_step_conf": 6.34765625, "calib/ece": 0.20968525896414342, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": -0.004596133474576103, "calib/mean_conf": 0.9723147410358566, "calib/mu_c": 0.971234375, "calib/mu_w": 0.9758305084745761, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20852988047808765, "calib/std_conf": 0.019501775938964513, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4856729200652528, "calib/step_q_c_n": 1226.0, "calib/step_q_gap": 0.010750614300841788, "calib/step_q_w": 0.474922305764411, "calib/step_q_w_n": 399.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 542.81640625, "completions/mean_terminated_length": 542.81640625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.0384, "grad_norm": 0.045901015400886536, "kl": 0.062183380126953125, "learning_rate": 4.555555555555556e-06, "loss": -0.0415, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03236595168709755, "mask/share_reasoning": 0.8319715261459351, "mask/share_step_conf": 0.1356624960899353, "num_tokens": 8670982.0, "reward": 0.7835397720336914, "reward_std": 0.1716996133327484, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7599384784698486, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8071410655975342, "step": 36 }, { "adv/mean_abs_final_conf": 0.7526949048042297, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7663756608963013, "adv/std_final_conf": 0.8950009346008301, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353793263435364, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5237055733678969, "calib/avg_num_step_conf": 6.44140625, "calib/ece": 0.5281967213114757, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": 0.00850262637287691, "calib/mean_conf": 0.96672131147541, "calib/mu_c": 0.9714953271028037, "calib/mu_w": 0.9629927007299268, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.5281967213114757, "calib/std_conf": 0.06370967220686559, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5241205211726383, "calib/step_q_c_n": 614.0, "calib/step_q_gap": 0.08163356465089916, "calib/step_q_w": 0.44248695652173914, "calib/step_q_w_n": 1035.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2927.0, "completions/max_terminated_length": 2927.0, "completions/mean_length": 608.45703125, "completions/mean_terminated_length": 608.45703125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.039466666666666664, "grad_norm": 0.030415819957852364, "kl": 0.05530548095703125, "learning_rate": 4.527777777777778e-06, "loss": -0.0411, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.028621872887015343, "mask/share_reasoning": 0.8487331867218018, "mask/share_step_conf": 0.12264493852853775, "num_tokens": 8933843.0, "reward": 0.6066591739654541, "reward_std": 0.16862811148166656, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.4526539146900177, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7606644630432129, "step": 37 }, { "adv/mean_abs_final_conf": 0.737849771976471, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7637723684310913, "adv/std_final_conf": 0.9148930907249451, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354915022850037, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6269546027742748, "calib/avg_num_step_conf": 6.27734375, "calib/ece": 0.487936507936508, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9761904761904762, "calib/gap": 0.006776796973518517, "calib/mean_conf": 0.9641269841269843, "calib/mu_c": 0.9676229508196723, "calib/mu_w": 0.9608461538461538, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.48396825396825405, "calib/std_conf": 0.08244545903990914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4998193548387097, "calib/step_q_c_n": 775.0, "calib/step_q_gap": 0.024038585607940477, "calib/step_q_w": 0.47578076923076923, "calib/step_q_w_n": 832.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2648.0, "completions/max_terminated_length": 2648.0, "completions/mean_length": 577.52734375, "completions/mean_terminated_length": 577.52734375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.04053333333333333, "grad_norm": 0.02135089784860611, "kl": 0.06496429443359375, "learning_rate": 4.5e-06, "loss": -0.0589, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02866470254957676, "mask/share_reasoning": 0.849726676940918, "mask/share_step_conf": 0.12160862237215042, "num_tokens": 9188578.0, "reward": 0.6399968862533569, "reward_std": 0.1794516146183014, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.5083702802658081, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7716234922409058, "step": 38 }, { "adv/mean_abs_final_conf": 0.7724220752716064, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7736121416091919, "adv/std_final_conf": 0.9094294309616089, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355300068855286, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4988117573483427, "calib/avg_num_step_conf": 6.46875, "calib/ece": 0.4747391304347827, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9802371541501976, "calib/gap": 0.02197204502814265, "calib/mean_conf": 0.960905138339921, "calib/mu_c": 0.9721951219512196, "calib/mu_w": 0.950223076923077, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4747391304347827, "calib/std_conf": 0.0965838169105595, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5251319463087248, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.07652711645142518, "calib/step_q_w": 0.44860482985729966, "calib/step_q_w_n": 911.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3063.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 581.4140625, "completions/mean_terminated_length": 581.4140625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.0416, "grad_norm": 0.02459811419248581, "kl": 0.05318450927734375, "learning_rate": 4.472222222222223e-06, "loss": -0.0369, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02947234734892845, "mask/share_reasoning": 0.846280038356781, "mask/share_step_conf": 0.12424758821725845, "num_tokens": 9443508.0, "reward": 0.6575571298599243, "reward_std": 0.1858455240726471, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.5201391577720642, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7949751019477844, "step": 39 }, { "adv/mean_abs_final_conf": 0.7673207521438599, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7538126707077026, "adv/std_final_conf": 0.911514163017273, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354920983314514, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5258962868117798, "calib/avg_num_step_conf": 5.91015625, "calib/ece": 0.523015873015873, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9880952380952381, "calib/gap": 0.0213341869398207, "calib/mean_conf": 0.9595238095238096, "calib/mu_c": 0.9715454545454544, "calib/mu_w": 0.9502112676056337, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.523015873015873, "calib/std_conf": 0.10624438093718476, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5221390675241158, "calib/step_q_c_n": 622.0, "calib/step_q_gap": 0.035979696031410935, "calib/step_q_w": 0.48615937149270483, "calib/step_q_w_n": 891.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 621.14453125, "completions/mean_terminated_length": 621.14453125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.042666666666666665, "grad_norm": 0.029524317011237144, "kl": 0.06084442138671875, "learning_rate": 4.444444444444444e-06, "loss": -0.0962, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.028228700160980225, "mask/share_reasoning": 0.8570345640182495, "mask/share_step_conf": 0.11473676562309265, "num_tokens": 9709281.0, "reward": 0.6209744215011597, "reward_std": 0.1786309778690338, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.47219765186309814, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7697511315345764, "step": 40 }, { "adv/mean_abs_final_conf": 0.6746046543121338, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7511892914772034, "adv/std_final_conf": 0.8650860786437988, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935126781463623, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5606282552083334, "calib/avg_num_step_conf": 5.78125, "calib/ece": 0.2192226562499999, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.99609375, "calib/gap": 0.0058177083333335045, "calib/mean_conf": 0.9692226562500001, "calib/mu_c": 0.9706770833333334, "calib/mu_w": 0.9648593749999999, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2192226562499999, "calib/std_conf": 0.01716490658128777, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5236314847942756, "calib/step_q_c_n": 1118.0, "calib/step_q_gap": -0.01351768647644258, "calib/step_q_w": 0.5371491712707182, "calib/step_q_w_n": 362.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1476.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 501.53515625, "completions/mean_terminated_length": 503.5019836425781, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.04373333333333333, "grad_norm": 0.03243149816989899, "kl": 0.06836700439453125, "learning_rate": 4.416666666666667e-06, "loss": -0.0395, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03247971460223198, "mask/share_reasoning": 0.8331398963928223, "mask/share_step_conf": 0.13047410547733307, "num_tokens": 9944922.0, "reward": 0.790048360824585, "reward_std": 0.1570318639278412, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7657284736633301, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8143682479858398, "step": 41 }, { "adv/mean_abs_final_conf": 0.7103356122970581, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7571755647659302, "adv/std_final_conf": 0.907038688659668, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935427188873291, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5013734466971878, "calib/avg_num_step_conf": 6.0859375, "calib/ece": 0.4032128514056226, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.963855421686747, "calib/gap": 0.014945716154349031, "calib/mean_conf": 0.956706827309237, "calib/mu_c": 0.9633093525179854, "calib/mu_w": 0.9483636363636364, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.400843373493976, "calib/std_conf": 0.08885934009920263, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.49707305389221557, "calib/step_q_c_n": 835.0, "calib/step_q_gap": 0.02447208570411047, "calib/step_q_w": 0.4726009681881051, "calib/step_q_w_n": 723.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 512.11328125, "completions/mean_terminated_length": 512.11328125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.0448, "grad_norm": 0.02139648236334324, "kl": 0.06438064575195312, "learning_rate": 4.388888888888889e-06, "loss": -0.0541, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03238638862967491, "mask/share_reasoning": 0.8317169547080994, "mask/share_step_conf": 0.13589666783809662, "num_tokens": 10180391.0, "reward": 0.6827822327613831, "reward_std": 0.18697020411491394, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5778406262397766, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7877238988876343, "step": 42 }, { "adv/mean_abs_final_conf": 0.7501436471939087, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7426331043243408, "adv/std_final_conf": 0.9032301306724548, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354005455970764, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5683174603174603, "calib/avg_num_step_conf": 6.08984375, "calib/ece": 0.4669322709163347, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": 0.009430476190476322, "calib/mean_conf": 0.9635059760956176, "calib/mu_c": 0.96824, "calib/mu_w": 0.9588095238095237, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.46621513944223114, "calib/std_conf": 0.06264154461454985, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5180706319702602, "calib/step_q_c_n": 807.0, "calib/step_q_gap": 0.003663717076643147, "calib/step_q_w": 0.514406914893617, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 552.61328125, "completions/mean_terminated_length": 554.7803955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.04586666666666667, "grad_norm": 0.021705729886889458, "kl": 0.060863494873046875, "learning_rate": 4.361111111111112e-06, "loss": -0.0639, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030267784371972084, "mask/share_reasoning": 0.8412609100341797, "mask/share_step_conf": 0.12456506490707397, "num_tokens": 10427084.0, "reward": 0.6544273495674133, "reward_std": 0.19733819365501404, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5236749649047852, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.785179615020752, "step": 43 }, { "adv/mean_abs_final_conf": 0.7124887108802795, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7413974404335022, "adv/std_final_conf": 0.8838324546813965, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353353381156921, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5385459710743802, "calib/avg_num_step_conf": 6.57421875, "calib/ece": 0.47598393574297176, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9879518072289156, "calib/gap": 0.008950800619834665, "calib/mean_conf": 0.9619277108433736, "calib/mu_c": 0.9665289256198348, "calib/mu_w": 0.9575781250000002, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.47598393574297176, "calib/std_conf": 0.06288494941264759, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5137832818532818, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.002010434171162645, "calib/step_q_w": 0.5117728476821192, "calib/step_q_w_n": 906.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 586.66015625, "completions/mean_terminated_length": 588.9608154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 52.0, "epoch": 0.046933333333333334, "grad_norm": 0.03233075141906738, "kl": 0.052997589111328125, "learning_rate": 4.333333333333334e-06, "loss": -0.0734, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.027658920735120773, "mask/share_reasoning": 0.8431069850921631, "mask/share_step_conf": 0.12532782554626465, "num_tokens": 10683589.0, "reward": 0.6391909122467041, "reward_std": 0.17529763281345367, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.5095155835151672, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7688661813735962, "step": 44 }, { "adv/mean_abs_final_conf": 0.7306512594223022, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.75229811668396, "adv/std_final_conf": 0.8930631279945374, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353950619697571, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5981738203145828, "calib/avg_num_step_conf": 6.6328125, "calib/ece": 0.4441183673469387, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9755102040816327, "calib/gap": 0.03739562783257799, "calib/mean_conf": 0.9502408163265307, "calib/mu_c": 0.9687096774193549, "calib/mu_w": 0.9313140495867769, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4441183673469387, "calib/std_conf": 0.12407513839073704, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47270334928229657, "calib/step_q_c_n": 836.0, "calib/step_q_gap": 0.0007989409296283445, "calib/step_q_w": 0.4719044083526682, "calib/step_q_w_n": 862.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2427.0, "completions/max_terminated_length": 2427.0, "completions/mean_length": 596.7578125, "completions/mean_terminated_length": 599.0980834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.048, "grad_norm": 0.023054039105772972, "kl": 0.060699462890625, "learning_rate": 4.305555555555556e-06, "loss": -0.0544, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02872752770781517, "mask/share_reasoning": 0.8372390866279602, "mask/share_step_conf": 0.13012711703777313, "num_tokens": 10941407.0, "reward": 0.6488115787506104, "reward_std": 0.1848917305469513, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5322019457817078, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7654211521148682, "step": 45 }, { "adv/mean_abs_final_conf": 0.7364819049835205, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7673929929733276, "adv/std_final_conf": 0.8841802477836609, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350518584251404, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.524188137412775, "calib/avg_num_step_conf": 7.16015625, "calib/ece": 0.40813008130081296, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.991869918699187, "calib/gap": 0.0017190016103058037, "calib/mean_conf": 0.9677235772357724, "calib/mu_c": 0.968478260869565, "calib/mu_w": 0.9667592592592592, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4074390243902439, "calib/std_conf": 0.020114024882143353, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4751558577405858, "calib/step_q_c_n": 956.0, "calib/step_q_gap": 0.06045802421721064, "calib/step_q_w": 0.41469783352337514, "calib/step_q_w_n": 877.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2677.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 591.49609375, "completions/mean_terminated_length": 596.153564453125, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.04906666666666667, "grad_norm": 0.020944081246852875, "kl": 0.05698394775390625, "learning_rate": 4.277777777777778e-06, "loss": -0.0826, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.030672479420900345, "mask/share_reasoning": 0.8261983394622803, "mask/share_step_conf": 0.13531672954559326, "num_tokens": 11197598.0, "reward": 0.6781916618347168, "reward_std": 0.1851722002029419, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.565719485282898, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7906638383865356, "step": 46 }, { "adv/mean_abs_final_conf": 0.7587028741836548, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7555489540100098, "adv/std_final_conf": 0.8831453323364258, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348834156990051, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5939169337606838, "calib/avg_num_step_conf": 6.80859375, "calib/ece": 0.3897580645161291, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9919354838709677, "calib/gap": 0.006014957264957266, "calib/mean_conf": 0.9623387096774194, "calib/mu_c": 0.9648611111111111, "calib/mu_w": 0.9588461538461538, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.385725806451613, "calib/std_conf": 0.08767709149218292, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4837613160518444, "calib/step_q_c_n": 1003.0, "calib/step_q_gap": 0.039261316051844464, "calib/step_q_w": 0.44449999999999995, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 620.75390625, "completions/mean_terminated_length": 623.1882934570312, "completions/min_length": 0.0, "completions/min_terminated_length": 240.0, "epoch": 0.050133333333333335, "grad_norm": 0.031001385301351547, "kl": 0.054290771484375, "learning_rate": 4.25e-06, "loss": -0.0831, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.027538388967514038, "mask/share_reasoning": 0.8438159823417664, "mask/share_step_conf": 0.1247393935918808, "num_tokens": 11462487.0, "reward": 0.6905703544616699, "reward_std": 0.18100741505622864, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5871163606643677, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7940243482589722, "step": 47 }, { "adv/mean_abs_final_conf": 0.7352026700973511, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7768785953521729, "adv/std_final_conf": 0.8923227190971375, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934985339641571, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5392866290983607, "calib/avg_num_step_conf": 6.50390625, "calib/ece": 0.4834000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.984, "calib/gap": 0.0008196721311477528, "calib/mean_conf": 0.9554000000000001, "calib/mu_c": 0.9558196721311478, "calib/mu_w": 0.9550000000000001, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4754000000000001, "calib/std_conf": 0.12248771366957585, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5137683029453015, "calib/step_q_c_n": 713.0, "calib/step_q_gap": 0.04376200042429307, "calib/step_q_w": 0.47000630252100845, "calib/step_q_w_n": 952.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2722.0, "completions/max_terminated_length": 2722.0, "completions/mean_length": 548.0859375, "completions/mean_terminated_length": 552.4015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.0512, "grad_norm": 0.021089090034365654, "kl": 0.058563232421875, "learning_rate": 4.222222222222223e-06, "loss": -0.0529, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.031192826107144356, "mask/share_reasoning": 0.826957106590271, "mask/share_step_conf": 0.1340375393629074, "num_tokens": 11706485.0, "reward": 0.6443170309066772, "reward_std": 0.1885329931974411, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.5049683451652527, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.783665657043457, "step": 48 }, { "adv/mean_abs_final_conf": 0.7109791040420532, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7729066610336304, "adv/std_final_conf": 0.8787632584571838, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349692463874817, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6469702042182792, "calib/avg_num_step_conf": 6.421875, "calib/ece": 0.3863709677419357, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.005446267157683593, "calib/mean_conf": 0.9710483870967744, "calib/mu_c": 0.9733103448275862, "calib/mu_w": 0.9678640776699026, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3863709677419357, "calib/std_conf": 0.011381829632604772, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.53194055313987, "calib/step_q_c_n": 922.0, "calib/step_q_gap": 0.0627411071564905, "calib/step_q_w": 0.4691994459833795, "calib/step_q_w_n": 722.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2418.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 534.62109375, "completions/mean_terminated_length": 540.9605102539062, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.05226666666666667, "grad_norm": 0.044350311160087585, "kl": 0.06148529052734375, "learning_rate": 4.194444444444445e-06, "loss": -0.0972, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.029532913118600845, "mask/share_reasoning": 0.8296065330505371, "mask/share_step_conf": 0.12914179265499115, "num_tokens": 11947884.0, "reward": 0.6952216029167175, "reward_std": 0.176457017660141, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5913281440734863, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7991151213645935, "step": 49 }, { "adv/mean_abs_final_conf": 0.7298943996429443, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7647660374641418, "adv/std_final_conf": 0.8696848750114441, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347518086433411, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.567796052631579, "calib/avg_num_step_conf": 6.09765625, "calib/ece": 0.36623015873015874, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.014100000000000223, "calib/mean_conf": 0.969404761904762, "calib/mu_c": 0.975, "calib/mu_w": 0.9608999999999998, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36623015873015874, "calib/std_conf": 0.059355356877224465, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5335596026490066, "calib/step_q_c_n": 906.0, "calib/step_q_gap": 0.02541667287801419, "calib/step_q_w": 0.5081429297709924, "calib/step_q_w_n": 655.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2129.0, "completions/max_terminated_length": 2129.0, "completions/mean_length": 551.9375, "completions/mean_terminated_length": 556.283447265625, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.05333333333333334, "grad_norm": 0.030394205823540688, "kl": 0.05947113037109375, "learning_rate": 4.166666666666667e-06, "loss": -0.0736, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030669772997498512, "mask/share_reasoning": 0.8324046730995178, "mask/share_step_conf": 0.12911301851272583, "num_tokens": 12194540.0, "reward": 0.7142380475997925, "reward_std": 0.1702766716480255, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6199074387550354, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8085687756538391, "step": 50 }, { "adv/mean_abs_final_conf": 0.7021123170852661, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7562159895896912, "adv/std_final_conf": 0.8730514049530029, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346850514411926, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5702390545259199, "calib/avg_num_step_conf": 6.3984375, "calib/ece": 0.3886008064516131, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0035915928015042287, "calib/mean_conf": 0.977310483870968, "calib/mu_c": 0.9787876712328768, "calib/mu_w": 0.9751960784313726, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3886008064516131, "calib/std_conf": 0.01285826782186358, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.530438864628821, "calib/step_q_c_n": 916.0, "calib/step_q_gap": 0.028391773216078664, "calib/step_q_w": 0.5020470914127423, "calib/step_q_w_n": 722.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2477.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 594.25, "completions/mean_terminated_length": 596.5804443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 193.0, "epoch": 0.0544, "grad_norm": 0.02666338160634041, "kl": 0.050075531005859375, "learning_rate": 4.138888888888889e-06, "loss": -0.1021, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02876748889684677, "mask/share_reasoning": 0.8479665517807007, "mask/share_step_conf": 0.11935971677303314, "num_tokens": 12455964.0, "reward": 0.688745379447937, "reward_std": 0.16984319686889648, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5894192457199097, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7880715131759644, "step": 51 }, { "adv/mean_abs_final_conf": 0.6500508785247803, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7584314942359924, "adv/std_final_conf": 0.8351729512214661, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345820546150208, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6318421979570271, "calib/avg_num_step_conf": 6.25, "calib/ece": 0.31503571428571436, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9801587301587301, "calib/gap": 0.009007749207467164, "calib/mean_conf": 0.977734126984127, "calib/mu_c": 0.9807724550898202, "calib/mu_w": 0.9717647058823531, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.31503571428571436, "calib/std_conf": 0.01906205264975339, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5362592592592592, "calib/step_q_c_n": 1026.0, "calib/step_q_gap": 0.07207459026971214, "calib/step_q_w": 0.4641846689895471, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2510.0, "completions/max_terminated_length": 2510.0, "completions/mean_length": 547.9375, "completions/mean_terminated_length": 552.251953125, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.055466666666666664, "grad_norm": 0.03358688950538635, "kl": 0.0588531494140625, "learning_rate": 4.111111111111111e-06, "loss": -0.0612, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.029904557392001152, "mask/share_reasoning": 0.8369725942611694, "mask/share_step_conf": 0.12531036138534546, "num_tokens": 12704188.0, "reward": 0.7537912130355835, "reward_std": 0.16553309559822083, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6702480316162109, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.837334394454956, "step": 52 }, { "adv/mean_abs_final_conf": 0.6293481588363647, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7667917013168335, "adv/std_final_conf": 0.8136529922485352, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934159517288208, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5346151405546038, "calib/avg_num_step_conf": 6.45703125, "calib/ece": 0.38500000000000023, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.003469763309854712, "calib/mean_conf": 0.9826562500000002, "calib/mu_c": 0.9840522875816994, "calib/mu_w": 0.9805825242718447, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38500000000000023, "calib/std_conf": 0.012214513331995676, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5208728897715988, "calib/step_q_c_n": 1007.0, "calib/step_q_gap": 0.014289298440329556, "calib/step_q_w": 0.5065835913312693, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 520.89453125, "completions/mean_terminated_length": 522.937255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.05653333333333333, "grad_norm": 0.02033303678035736, "kl": 0.06036376953125, "learning_rate": 4.083333333333334e-06, "loss": -0.0346, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03004293330013752, "mask/share_reasoning": 0.8352757692337036, "mask/share_step_conf": 0.1307750642299652, "num_tokens": 12943361.0, "reward": 0.7163116931915283, "reward_std": 0.15128588676452637, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6128312349319458, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8197921514511108, "step": 53 }, { "adv/mean_abs_final_conf": 0.5297777056694031, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7386770248413086, "adv/std_final_conf": 0.7592636346817017, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9339870810508728, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.62256006006006, "calib/avg_num_step_conf": 6.09375, "calib/ece": 0.26154724409448826, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9763779527559056, "calib/gap": 0.05311576576576593, "calib/mean_conf": 0.9702086614173229, "calib/mu_c": 0.9856833333333334, "calib/mu_w": 0.9325675675675674, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26154724409448826, "calib/std_conf": 0.11417657862312314, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5778975069252077, "calib/step_q_c_n": 1083.0, "calib/step_q_gap": 0.081314697700889, "calib/step_q_w": 0.4965828092243187, "calib/step_q_w_n": 477.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 493.16796875, "completions/mean_terminated_length": 495.10198974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.0576, "grad_norm": 0.020669786259531975, "kl": 0.05802154541015625, "learning_rate": 4.055555555555556e-06, "loss": -0.0352, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03211697190999985, "mask/share_reasoning": 0.8330959677696228, "mask/share_step_conf": 0.13088083267211914, "num_tokens": 13175844.0, "reward": 0.7794877290725708, "reward_std": 0.12982261180877686, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.728294312953949, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8306810259819031, "step": 54 }, { "adv/mean_abs_final_conf": 0.6623827219009399, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7723299264907837, "adv/std_final_conf": 0.8290383219718933, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347923398017883, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.61313240728941, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.44158964143426305, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9800796812749004, "calib/gap": 0.03541691092137156, "calib/mean_conf": 0.9714701195219123, "calib/mu_c": 0.9881203007518798, "calib/mu_w": 0.9527033898305083, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44158964143426305, "calib/std_conf": 0.11249645458594695, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5656493333333333, "calib/step_q_c_n": 750.0, "calib/step_q_gap": 0.020755089692101825, "calib/step_q_w": 0.5448942436412315, "calib/step_q_w_n": 747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 539.51953125, "completions/mean_terminated_length": 539.51953125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.058666666666666666, "grad_norm": 0.02158159762620926, "kl": 0.051700592041015625, "learning_rate": 4.027777777777779e-06, "loss": -0.1093, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03160548582673073, "mask/share_reasoning": 0.8430644273757935, "mask/share_step_conf": 0.1253300905227661, "num_tokens": 13421785.0, "reward": 0.6731129884719849, "reward_std": 0.1911894977092743, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5499264597892761, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7962995767593384, "step": 55 }, { "adv/mean_abs_final_conf": 0.6451209187507629, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7391470670700073, "adv/std_final_conf": 0.846241295337677, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934406578540802, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5542320539484464, "calib/avg_num_step_conf": 6.48046875, "calib/ece": 0.49583333333333346, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": 0.006778218944980208, "calib/mean_conf": 0.9839285714285716, "calib/mu_c": 0.9873983739837399, "calib/mu_w": 0.9806201550387597, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.49583333333333346, "calib/std_conf": 0.039107641150699465, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5789016602809706, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.05683545023530845, "calib/step_q_w": 0.5220662100456621, "calib/step_q_w_n": 876.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2869.0, "completions/max_terminated_length": 2869.0, "completions/mean_length": 555.90625, "completions/mean_terminated_length": 555.90625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.05973333333333333, "grad_norm": 0.021703317761421204, "kl": 0.05384063720703125, "learning_rate": 4.000000000000001e-06, "loss": -0.0124, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030571769922971725, "mask/share_reasoning": 0.8379673957824707, "mask/share_step_conf": 0.13146084547042847, "num_tokens": 13670937.0, "reward": 0.6544367074966431, "reward_std": 0.18766939640045166, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.4979339838027954, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.810939371585846, "step": 56 }, { "adv/mean_abs_final_conf": 0.5386355519294739, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7594729661941528, "adv/std_final_conf": 0.7653334736824036, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345945715904236, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6114978975666919, "calib/avg_num_step_conf": 6.0390625, "calib/ece": 0.33146031746031757, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.02635913696836001, "calib/mean_conf": 0.9782857142857143, "calib/mu_c": 0.9875950920245399, "calib/mu_w": 0.9612359550561799, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33146031746031757, "calib/std_conf": 0.08794883587874432, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5837346723044398, "calib/step_q_c_n": 946.0, "calib/step_q_gap": 0.05833467230443978, "calib/step_q_w": 0.5254, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2808.0, "completions/max_terminated_length": 2808.0, "completions/mean_length": 525.4609375, "completions/mean_terminated_length": 529.5984497070312, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.0608, "grad_norm": 0.022320527583360672, "kl": 0.0495452880859375, "learning_rate": 3.972222222222223e-06, "loss": -0.0612, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030892664566636086, "mask/share_reasoning": 0.8362057209014893, "mask/share_step_conf": 0.1250891387462616, "num_tokens": 13912247.0, "reward": 0.735082745552063, "reward_std": 0.15631389617919922, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6555935740470886, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8145719766616821, "step": 57 }, { "adv/mean_abs_final_conf": 0.6802141666412354, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7571486830711365, "adv/std_final_conf": 0.8551135063171387, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345582723617554, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5296961756815771, "calib/avg_num_step_conf": 6.99609375, "calib/ece": 0.5134645669291338, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9881889763779528, "calib/gap": 0.021724374571089777, "calib/mean_conf": 0.9740944881889764, "calib/mu_c": 0.9858119658119657, "calib/mu_w": 0.964087591240876, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5134645669291338, "calib/std_conf": 0.10456399412444867, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5574524714828898, "calib/step_q_c_n": 789.0, "calib/step_q_gap": 0.03602123395793977, "calib/step_q_w": 0.52143123752495, "calib/step_q_w_n": 1002.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2614.0, "completions/max_terminated_length": 2614.0, "completions/mean_length": 593.2109375, "completions/mean_terminated_length": 593.2109375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.06186666666666667, "grad_norm": 0.024006003513932228, "kl": 0.04694366455078125, "learning_rate": 3.944444444444445e-06, "loss": -0.0508, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02852752059698105, "mask/share_reasoning": 0.8432724475860596, "mask/share_step_conf": 0.12820005416870117, "num_tokens": 14170429.0, "reward": 0.6374404430389404, "reward_std": 0.20299085974693298, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.48395466804504395, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7909262180328369, "step": 58 }, { "adv/mean_abs_final_conf": 0.5568619966506958, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7574281096458435, "adv/std_final_conf": 0.7681616544723511, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9340794086456299, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.601207883026065, "calib/avg_num_step_conf": 6.1171875, "calib/ece": 0.41395256916996054, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": 0.01675524475524459, "calib/mean_conf": 0.9765612648221345, "calib/mu_c": 0.9838461538461536, "calib/mu_w": 0.967090909090909, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4126482213438736, "calib/std_conf": 0.0853430255930299, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5790709134615385, "calib/step_q_c_n": 832.0, "calib/step_q_gap": 0.050446935259903625, "calib/step_q_w": 0.5286239782016349, "calib/step_q_w_n": 734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 539.78515625, "completions/mean_terminated_length": 539.78515625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.06293333333333333, "grad_norm": 0.02217506803572178, "kl": 0.05609893798828125, "learning_rate": 3.916666666666667e-06, "loss": -0.0667, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03168308734893799, "mask/share_reasoning": 0.842609703540802, "mask/share_step_conf": 0.1257072240114212, "num_tokens": 14414862.0, "reward": 0.6876438856124878, "reward_std": 0.1636931449174881, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5791339874267578, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7961536645889282, "step": 59 }, { "adv/mean_abs_final_conf": 0.631127119064331, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.749966025352478, "adv/std_final_conf": 0.8395038843154907, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346640706062317, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5691349934469201, "calib/avg_num_step_conf": 5.74609375, "calib/ece": 0.42136546184738954, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9879518072289156, "calib/gap": 0.009851245085190086, "calib/mean_conf": 0.9797590361445785, "calib/mu_c": 0.9840714285714285, "calib/mu_w": 0.9742201834862384, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41943775100401604, "calib/std_conf": 0.06649532828258188, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5907919463087248, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.05000544493131431, "calib/step_q_w": 0.5407865013774105, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 547.9765625, "completions/mean_terminated_length": 550.1255493164062, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.064, "grad_norm": 0.027142062783241272, "kl": 0.049556732177734375, "learning_rate": 3.88888888888889e-06, "loss": -0.0199, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.031455203890800476, "mask/share_reasoning": 0.8425741195678711, "mask/share_step_conf": 0.12206438928842545, "num_tokens": 14664000.0, "reward": 0.6877095699310303, "reward_std": 0.20329821109771729, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5601452589035034, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8152737617492676, "step": 60 }, { "adv/mean_abs_final_conf": 0.5472803115844727, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.759605884552002, "adv/std_final_conf": 0.7648850679397583, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9337583780288696, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5260767854685565, "calib/avg_num_step_conf": 5.953125, "calib/ece": 0.3245098039215687, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.880418329469062e-05, "calib/mean_conf": 0.9872549019607844, "calib/mu_c": 0.9872781065088758, "calib/mu_w": 0.9872093023255811, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3245098039215687, "calib/std_conf": 0.008279062031875546, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5917596253902185, "calib/step_q_c_n": 961.0, "calib/step_q_gap": 0.04479692556783843, "calib/step_q_w": 0.5469626998223801, "calib/step_q_w_n": 563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2885.0, "completions/max_terminated_length": 2885.0, "completions/mean_length": 464.95703125, "completions/mean_terminated_length": 464.95703125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.06506666666666666, "grad_norm": 0.018482623621821404, "kl": 0.0516357421875, "learning_rate": 3.861111111111112e-06, "loss": -0.0108, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03660412132740021, "mask/share_reasoning": 0.8234680891036987, "mask/share_step_conf": 0.13992780447006226, "num_tokens": 14887093.0, "reward": 0.7381289005279541, "reward_std": 0.15750961005687714, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.6685198545455933, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8077378273010254, "step": 61 }, { "adv/mean_abs_final_conf": 0.585199773311615, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7739436626434326, "adv/std_final_conf": 0.7961553335189819, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935192883014679, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5288201160541587, "calib/avg_num_step_conf": 6.05859375, "calib/ece": 0.42384860557768916, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9760956175298805, "calib/gap": 0.007553449387492028, "calib/mean_conf": 0.9806613545816734, "calib/mu_c": 0.9839716312056737, "calib/mu_w": 0.9764181818181816, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4213784860557768, "calib/std_conf": 0.053871257743940894, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5654795396419438, "calib/step_q_c_n": 782.0, "calib/step_q_gap": 0.05029358385520777, "calib/step_q_w": 0.515185955786736, "calib/step_q_w_n": 769.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2692.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 537.0234375, "completions/mean_terminated_length": 539.1294555664062, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.06613333333333334, "grad_norm": 0.022088345140218735, "kl": 0.05084991455078125, "learning_rate": 3.833333333333334e-06, "loss": -0.0193, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03091275691986084, "mask/share_reasoning": 0.8408281803131104, "mask/share_step_conf": 0.12435280531644821, "num_tokens": 15131651.0, "reward": 0.6745160818099976, "reward_std": 0.2127370536327362, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5678347945213318, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7811973094940186, "step": 62 }, { "adv/mean_abs_final_conf": 0.6797653436660767, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7620604634284973, "adv/std_final_conf": 0.8699020147323608, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348795413970947, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6173139158576052, "calib/avg_num_step_conf": 5.86328125, "calib/ece": 0.3892055335968381, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": 0.008766796116504882, "calib/mean_conf": 0.9820909090909093, "calib/mu_c": 0.98566, "calib/mu_w": 0.9768932038834951, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3892055335968381, "calib/std_conf": 0.019291287952678496, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5847171597633136, "calib/step_q_c_n": 845.0, "calib/step_q_gap": 0.04589703781209409, "calib/step_q_w": 0.5388201219512195, "calib/step_q_w_n": 656.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2445.0, "completions/max_terminated_length": 2445.0, "completions/mean_length": 575.84375, "completions/mean_terminated_length": 575.84375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.0672, "grad_norm": 0.02352093905210495, "kl": 0.04439544677734375, "learning_rate": 3.8055555555555556e-06, "loss": -0.0372, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030028386041522026, "mask/share_reasoning": 0.8535323143005371, "mask/share_step_conf": 0.11643929779529572, "num_tokens": 15387707.0, "reward": 0.7122840285301208, "reward_std": 0.21272410452365875, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6038464307785034, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8207215666770935, "step": 63 }, { "adv/mean_abs_final_conf": 0.7428898811340332, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7612664103507996, "adv/std_final_conf": 0.8729450106620789, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934707522392273, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5357508641659199, "calib/avg_num_step_conf": 5.82421875, "calib/ece": 0.4023320158102767, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9881422924901185, "calib/gap": 0.010471130457047617, "calib/mean_conf": 0.9794071146245059, "calib/mu_c": 0.983835616438356, "calib/mu_w": 0.9733644859813084, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4023320158102767, "calib/std_conf": 0.06388017027932623, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5645569620253165, "calib/step_q_c_n": 869.0, "calib/step_q_gap": 0.009211302861329318, "calib/step_q_w": 0.5553456591639871, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 519.44921875, "completions/mean_terminated_length": 519.44921875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.06826666666666667, "grad_norm": 0.026329757645726204, "kl": 0.050830841064453125, "learning_rate": 3.777777777777778e-06, "loss": -0.0354, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0327141210436821, "mask/share_reasoning": 0.8409000635147095, "mask/share_step_conf": 0.12638577818870544, "num_tokens": 15624462.0, "reward": 0.6897340416908264, "reward_std": 0.23057618737220764, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.584220290184021, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7952476739883423, "step": 64 }, { "adv/mean_abs_final_conf": 0.5738707780838013, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7580282688140869, "adv/std_final_conf": 0.8090132474899292, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9344430565834045, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.511667917917918, "calib/avg_num_step_conf": 5.26953125, "calib/ece": 0.4193333333333335, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9803921568627451, "calib/gap": 0.0030048798798798515, "calib/mean_conf": 0.9840392156862745, "calib/mu_c": 0.9853472222222224, "calib/mu_w": 0.9823423423423425, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4193333333333335, "calib/std_conf": 0.02298674876951273, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6007589743589744, "calib/step_q_c_n": 780.0, "calib/step_q_gap": 0.014379361002208157, "calib/step_q_w": 0.5863796133567662, "calib/step_q_w_n": 569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2528.0, "completions/max_terminated_length": 2528.0, "completions/mean_length": 437.65234375, "completions/mean_terminated_length": 437.65234375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.06933333333333333, "grad_norm": 0.03262828290462494, "kl": 0.06385040283203125, "learning_rate": 3.7500000000000005e-06, "loss": -0.0167, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.037267137318849564, "mask/share_reasoning": 0.8303743600845337, "mask/share_step_conf": 0.13235852122306824, "num_tokens": 15841525.0, "reward": 0.6861559152603149, "reward_std": 0.15331321954727173, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5766515135765076, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7956601977348328, "step": 65 }, { "adv/mean_abs_final_conf": 0.6304284930229187, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7556469440460205, "adv/std_final_conf": 0.8624138832092285, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9332277178764343, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6001536393316689, "calib/avg_num_step_conf": 6.37890625, "calib/ece": 0.46936000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.98, "calib/gap": 0.014055438192177028, "calib/mean_conf": 0.9708800000000001, "calib/mu_c": 0.977795275590551, "calib/mu_w": 0.963739837398374, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.46612000000000003, "calib/std_conf": 0.08142251285731729, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5993985815602836, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.11795353845683526, "calib/step_q_w": 0.4814450431034483, "calib/step_q_w_n": 928.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2788.0, "completions/max_terminated_length": 2788.0, "completions/mean_length": 565.57421875, "completions/mean_terminated_length": 567.7921752929688, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.0704, "grad_norm": 0.02561318129301071, "kl": 0.04900360107421875, "learning_rate": 3.7222222222222225e-06, "loss": -0.0122, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030985288321971893, "mask/share_reasoning": 0.8431658744812012, "mask/share_step_conf": 0.12194260954856873, "num_tokens": 16092664.0, "reward": 0.6660435795783997, "reward_std": 0.1502438485622406, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5234804749488831, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8086066246032715, "step": 66 }, { "adv/mean_abs_final_conf": 0.6108351945877075, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7629013657569885, "adv/std_final_conf": 0.8174425363540649, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9334231019020081, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6787662337662338, "calib/avg_num_step_conf": 5.87890625, "calib/ece": 0.37299212598425197, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.968503937007874, "calib/gap": 0.010122077922077755, "calib/mean_conf": 0.9739370078740157, "calib/mu_c": 0.9779220779220779, "calib/mu_w": 0.9678000000000001, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37031496062992125, "calib/std_conf": 0.0479034047795392, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5916325802615934, "calib/step_q_c_n": 841.0, "calib/step_q_gap": 0.07866571279171386, "calib/step_q_w": 0.5129668674698795, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2978.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 530.28515625, "completions/mean_terminated_length": 530.28515625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.07146666666666666, "grad_norm": 0.025846382603049278, "kl": 0.048938751220703125, "learning_rate": 3.694444444444445e-06, "loss": 0.0253, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0322788842022419, "mask/share_reasoning": 0.8486857414245605, "mask/share_step_conf": 0.11903537809848785, "num_tokens": 16333425.0, "reward": 0.7303739786148071, "reward_std": 0.13507795333862305, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6216679811477661, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8390799760818481, "step": 67 }, { "adv/mean_abs_final_conf": 0.6445533037185669, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7679104804992676, "adv/std_final_conf": 0.8494598865509033, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9337493181228638, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6095303506017792, "calib/avg_num_step_conf": 5.83984375, "calib/ece": 0.3850199203187251, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9561752988047809, "calib/gap": 0.02775379382522236, "calib/mean_conf": 0.9706772908366533, "calib/mu_c": 0.9821768707482993, "calib/mu_w": 0.9544230769230769, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3850199203187251, "calib/std_conf": 0.0737818702981451, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.577835064935065, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.0759178235557546, "calib/step_q_w": 0.5019172413793104, "calib/step_q_w_n": 725.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2757.0, "completions/max_terminated_length": 2757.0, "completions/mean_length": 511.71875, "completions/mean_terminated_length": 511.71875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.07253333333333334, "grad_norm": 0.024281207472085953, "kl": 0.0457611083984375, "learning_rate": 3.6666666666666666e-06, "loss": -0.036, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03625905513763428, "mask/share_reasoning": 0.8334545493125916, "mask/share_step_conf": 0.13028644025325775, "num_tokens": 16568513.0, "reward": 0.713841438293457, "reward_std": 0.16001370549201965, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6050695180892944, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8226132988929749, "step": 68 }, { "adv/mean_abs_final_conf": 0.7450152635574341, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7667806148529053, "adv/std_final_conf": 0.8999899625778198, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348978400230408, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.585109819121447, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.4769196787148596, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9116465863453815, "calib/gap": 0.01980600775193797, "calib/mean_conf": 0.958847389558233, "calib/mu_c": 0.9691083333333333, "calib/mu_w": 0.9493023255813954, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.4769196787148596, "calib/std_conf": 0.07261490550305577, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.564964560862866, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.04633939873575832, "calib/step_q_w": 0.5186251621271076, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2557.0, "completions/max_terminated_length": 2557.0, "completions/mean_length": 576.28125, "completions/mean_terminated_length": 576.28125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.0736, "grad_norm": 0.0308236014097929, "kl": 0.047393798828125, "learning_rate": 3.638888888888889e-06, "loss": -0.1002, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030492305755615234, "mask/share_reasoning": 0.8605618476867676, "mask/share_step_conf": 0.10894586145877838, "num_tokens": 16820537.0, "reward": 0.6534982919692993, "reward_std": 0.19954201579093933, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.5091630816459656, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7978335618972778, "step": 69 }, { "adv/mean_abs_final_conf": 0.6634742021560669, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7432050704956055, "adv/std_final_conf": 0.8920466899871826, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345405697822571, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8211942257217847, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.4060728744939272, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8744939271255061, "calib/gap": 0.11165485564304456, "calib/mean_conf": 0.9202429149797571, "calib/mu_c": 0.9744881889763778, "calib/mu_w": 0.8628333333333332, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4060728744939272, "calib/std_conf": 0.18078982613860656, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5868950867052023, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.08678325874821313, "calib/step_q_w": 0.5001118279569892, "calib/step_q_w_n": 744.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2439.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 549.66796875, "completions/mean_terminated_length": 551.8235473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.07466666666666667, "grad_norm": 0.026835061609745026, "kl": 0.038166046142578125, "learning_rate": 3.6111111111111115e-06, "loss": -0.0659, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033462610095739365, "mask/share_reasoning": 0.8393990993499756, "mask/share_step_conf": 0.12323200702667236, "num_tokens": 17068244.0, "reward": 0.6982115507125854, "reward_std": 0.17986395955085754, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5866332054138184, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.809789776802063, "step": 70 }, { "adv/mean_abs_final_conf": 0.7221082448959351, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7683535218238831, "adv/std_final_conf": 0.897761881351471, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347231984138489, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6057759887547123, "calib/avg_num_step_conf": 6.0859375, "calib/ece": 0.4014285714285715, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8174603174603174, "calib/gap": 0.005144719187272551, "calib/mean_conf": 0.9160317460317461, "calib/mu_c": 0.9182978723404257, "calib/mu_w": 0.9131531531531532, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37896825396825407, "calib/std_conf": 0.1487998515375143, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5435220125786164, "calib/step_q_c_n": 795.0, "calib/step_q_gap": 0.01309999422999264, "calib/step_q_w": 0.5304220183486238, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 524.51171875, "completions/mean_terminated_length": 526.5686645507812, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.07573333333333333, "grad_norm": 0.03249030187726021, "kl": 0.04720306396484375, "learning_rate": 3.5833333333333335e-06, "loss": -0.0642, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03268418461084366, "mask/share_reasoning": 0.8373799324035645, "mask/share_step_conf": 0.1260296255350113, "num_tokens": 17306927.0, "reward": 0.696674108505249, "reward_std": 0.1723061352968216, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5973577499389648, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7959904670715332, "step": 71 }, { "adv/mean_abs_final_conf": 0.7195709347724915, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7566730976104736, "adv/std_final_conf": 0.9068205952644348, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341710805892944, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7483070672248215, "calib/avg_num_step_conf": 5.5703125, "calib/ece": 0.4055294117647057, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.792156862745098, "calib/gap": 0.08504617089386846, "calib/mean_conf": 0.9192549019607842, "calib/mu_c": 0.9606106870229008, "calib/mu_w": 0.8755645161290323, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4055294117647057, "calib/std_conf": 0.12842573853830094, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5965980795610425, "calib/step_q_c_n": 729.0, "calib/step_q_gap": 0.0690299303501386, "calib/step_q_w": 0.5275681492109039, "calib/step_q_w_n": 697.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 490.640625, "completions/mean_terminated_length": 490.640625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.0768, "grad_norm": 0.05706058070063591, "kl": 0.04985809326171875, "learning_rate": 3.555555555555556e-06, "loss": -0.0217, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03344859182834625, "mask/share_reasoning": 0.8429282903671265, "mask/share_step_conf": 0.12362314760684967, "num_tokens": 17536939.0, "reward": 0.7275208234786987, "reward_std": 0.16687637567520142, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6093425750732422, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8456990718841553, "step": 72 }, { "adv/mean_abs_final_conf": 0.7557467222213745, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7590437531471252, "adv/std_final_conf": 0.9078731536865234, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9344611167907715, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6708327530984544, "calib/avg_num_step_conf": 5.53125, "calib/ece": 0.26715415019762856, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8063241106719368, "calib/gap": 0.0615018799610082, "calib/mean_conf": 0.9267588932806324, "calib/mu_c": 0.9476646706586825, "calib/mu_w": 0.8861627906976743, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2669169960474309, "calib/std_conf": 0.11452155660328063, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6067189384800965, "calib/step_q_c_n": 829.0, "calib/step_q_gap": 0.11199338481740484, "calib/step_q_w": 0.49472555366269166, "calib/step_q_w_n": 587.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 504.953125, "completions/mean_terminated_length": 504.953125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.07786666666666667, "grad_norm": 0.04618416354060173, "kl": 0.038722991943359375, "learning_rate": 3.5277777777777784e-06, "loss": 0.0321, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032340243458747864, "mask/share_reasoning": 0.8512711524963379, "mask/share_step_conf": 0.11638855934143066, "num_tokens": 17773239.0, "reward": 0.7775202393531799, "reward_std": 0.1803993135690689, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7105652093887329, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8444753289222717, "step": 73 }, { "adv/mean_abs_final_conf": 0.7238792181015015, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7787186503410339, "adv/std_final_conf": 0.912177324295044, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341882467269897, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6956156716417911, "calib/avg_num_step_conf": 5.37890625, "calib/ece": 0.34161023622047243, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.610236220472441, "calib/gap": 0.0932620646766168, "calib/mean_conf": 0.8501929133858268, "calib/mu_c": 0.8942537313432835, "calib/mu_w": 0.8009916666666667, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.33212204724409444, "calib/std_conf": 0.20350657815253712, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5297560283687943, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.038605730749746714, "calib/step_q_w": 0.4911502976190476, "calib/step_q_w_n": 672.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1261.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 482.01171875, "completions/mean_terminated_length": 483.9019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.07893333333333333, "grad_norm": 0.043702512979507446, "kl": 0.06292724609375, "learning_rate": 3.5e-06, "loss": -0.0573, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034550879150629044, "mask/share_reasoning": 0.8362744450569153, "mask/share_step_conf": 0.12526842951774597, "num_tokens": 18000562.0, "reward": 0.7414271235466003, "reward_std": 0.15016880631446838, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6466495990753174, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8362046480178833, "step": 74 }, { "adv/mean_abs_final_conf": 0.7428607940673828, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7638285160064697, "adv/std_final_conf": 0.9031973481178284, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.933974027633667, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7541257805530777, "calib/avg_num_step_conf": 5.6171875, "calib/ece": 0.17786561264822126, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6482213438735178, "calib/gap": 0.16685325602140932, "calib/mean_conf": 0.8758102766798419, "calib/mu_c": 0.9259322033898304, "calib/mu_w": 0.7590789473684211, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17703557312252957, "calib/std_conf": 0.1793572063880419, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5425856697819315, "calib/step_q_c_n": 963.0, "calib/step_q_gap": 0.07795409083456306, "calib/step_q_w": 0.4646315789473684, "calib/step_q_w_n": 475.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1871.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 477.46484375, "completions/mean_terminated_length": 479.3372802734375, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.08, "grad_norm": 0.05451728403568268, "kl": 0.05059051513671875, "learning_rate": 3.4722222222222224e-06, "loss": -0.0453, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03529536724090576, "mask/share_reasoning": 0.8326973915100098, "mask/share_step_conf": 0.12810099124908447, "num_tokens": 18227545.0, "reward": 0.8223235607147217, "reward_std": 0.134343683719635, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7874187231063843, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8572283387184143, "step": 75 }, { "adv/mean_abs_final_conf": 0.6845461130142212, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7664108276367188, "adv/std_final_conf": 0.8880889415740967, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9336568713188171, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6964849354375898, "calib/avg_num_step_conf": 5.4765625, "calib/ece": 0.18813492063492057, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6190476190476191, "calib/gap": 0.15908034433285512, "calib/mean_conf": 0.8217063492063492, "calib/mu_c": 0.8734705882352942, "calib/mu_w": 0.7143902439024391, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16761904761904756, "calib/std_conf": 0.24617362460619346, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.50348623853211, "calib/step_q_c_n": 872.0, "calib/step_q_gap": 0.09935416306041195, "calib/step_q_w": 0.4041320754716981, "calib/step_q_w_n": 530.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2924.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 509.1171875, "completions/mean_terminated_length": 509.1171875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.08106666666666666, "grad_norm": 0.04806952923536301, "kl": 0.2750282287597656, "learning_rate": 3.444444444444445e-06, "loss": -0.013, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03508011996746063, "mask/share_reasoning": 0.8423846960067749, "mask/share_step_conf": 0.12253521382808685, "num_tokens": 18460935.0, "reward": 0.8059649467468262, "reward_std": 0.15631715953350067, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7560847997665405, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8558450937271118, "step": 76 }, { "adv/mean_abs_final_conf": 0.7558318376541138, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7622382044792175, "adv/std_final_conf": 0.926922082901001, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341432452201843, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.590406162464986, "calib/avg_num_step_conf": 5.53515625, "calib/ece": 0.19968379446640322, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5217391304347826, "calib/gap": 0.12250070028011195, "calib/mean_conf": 0.786403162055336, "calib/mu_c": 0.8275595238095238, "calib/mu_w": 0.7050588235294118, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16102766798418977, "calib/std_conf": 0.25909409584904747, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49516896918172154, "calib/step_q_c_n": 941.0, "calib/step_q_gap": 0.04163955741701569, "calib/step_q_w": 0.45352941176470585, "calib/step_q_w_n": 476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 477.453125, "completions/mean_terminated_length": 479.3255310058594, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.08213333333333334, "grad_norm": 0.05456750467419624, "kl": 0.0598602294921875, "learning_rate": 3.416666666666667e-06, "loss": -0.0511, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037136346101760864, "mask/share_reasoning": 0.8228158354759216, "mask/share_step_conf": 0.1361415535211563, "num_tokens": 18687827.0, "reward": 0.7886154055595398, "reward_std": 0.15151908993721008, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7406773567199707, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8365534543991089, "step": 77 }, { "adv/mean_abs_final_conf": 0.7504051923751831, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7541760206222534, "adv/std_final_conf": 0.9153277277946472, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9337966442108154, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6337736337736338, "calib/avg_num_step_conf": 6.2890625, "calib/ece": 0.19170916334661353, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4581673306772908, "calib/gap": 0.14532808857808865, "calib/mean_conf": 0.7227131474103586, "calib/mu_c": 0.7852447552447553, "calib/mu_w": 0.6399166666666667, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17235059760956176, "calib/std_conf": 0.290884441579542, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4497088465845465, "calib/step_q_c_n": 893.0, "calib/step_q_gap": 0.027031022316764075, "calib/step_q_w": 0.42267782426778244, "calib/step_q_w_n": 717.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2564.0, "completions/max_terminated_length": 2564.0, "completions/mean_length": 560.30078125, "completions/mean_terminated_length": 560.30078125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.0832, "grad_norm": 0.043129973113536835, "kl": 0.06533050537109375, "learning_rate": 3.3888888888888893e-06, "loss": -0.0323, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030203985050320625, "mask/share_reasoning": 0.847260594367981, "mask/share_step_conf": 0.12253537029027939, "num_tokens": 18939288.0, "reward": 0.7746736407279968, "reward_std": 0.16919748485088348, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7040666341781616, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.845280647277832, "step": 78 }, { "adv/mean_abs_final_conf": 0.7007652521133423, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7351545095443726, "adv/std_final_conf": 0.9115530848503113, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9309883713722229, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6958598726114651, "calib/avg_num_step_conf": 6.41796875, "calib/ece": 0.18126482213438738, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.47035573122529645, "calib/gap": 0.18178144904458582, "calib/mean_conf": 0.7593675889328063, "calib/mu_c": 0.8283439490445859, "calib/mu_w": 0.6465625, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16003952569169966, "calib/std_conf": 0.26704871292707055, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4517631048387097, "calib/step_q_c_n": 992.0, "calib/step_q_gap": 0.043038988095238095, "calib/step_q_w": 0.4087241167434716, "calib/step_q_w_n": 651.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2536.0, "completions/max_terminated_length": 2536.0, "completions/mean_length": 548.93359375, "completions/mean_terminated_length": 548.93359375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.08426666666666667, "grad_norm": 0.03477565571665764, "kl": 0.05063629150390625, "learning_rate": 3.3611111111111117e-06, "loss": -0.0617, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030998801812529564, "mask/share_reasoning": 0.8435428142547607, "mask/share_step_conf": 0.12545835971832275, "num_tokens": 19186191.0, "reward": 0.8097378015518188, "reward_std": 0.13792501389980316, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7506546974182129, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8688209056854248, "step": 79 }, { "adv/mean_abs_final_conf": 0.6384887099266052, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7562040090560913, "adv/std_final_conf": 0.8373948931694031, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9335655570030212, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.688919259882254, "calib/avg_num_step_conf": 6.0390625, "calib/ece": 0.20148437500000005, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.71484375, "calib/gap": 0.1692038127277823, "calib/mean_conf": 0.8758593750000001, "calib/mu_c": 0.9300574712643678, "calib/mu_w": 0.7608536585365855, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19882812500000008, "calib/std_conf": 0.21077619700670516, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4759594202898551, "calib/step_q_c_n": 1035.0, "calib/step_q_gap": 0.04247605434073576, "calib/step_q_w": 0.43348336594911935, "calib/step_q_w_n": 511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 470.046875, "completions/mean_terminated_length": 471.8902282714844, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.08533333333333333, "grad_norm": 0.11761131137609482, "kl": 0.435760498046875, "learning_rate": 3.3333333333333333e-06, "loss": -0.0153, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034512780606746674, "mask/share_reasoning": 0.8250635862350464, "mask/share_step_conf": 0.13651734590530396, "num_tokens": 19408683.0, "reward": 0.8117129802703857, "reward_std": 0.16109926998615265, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7730531096458435, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.850372850894928, "step": 80 }, { "adv/mean_abs_final_conf": 0.6516029238700867, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7672962546348572, "adv/std_final_conf": 0.8549244999885559, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934212327003479, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7298333333333333, "calib/avg_num_step_conf": 6.3203125, "calib/ece": 0.22283999999999993, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.648, "calib/gap": 0.23826666666666663, "calib/mean_conf": 0.8063600000000001, "calib/mu_c": 0.9016666666666666, "calib/mu_w": 0.6634, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21459999999999993, "calib/std_conf": 0.28221968464300995, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49048453473132375, "calib/step_q_c_n": 763.0, "calib/step_q_gap": 0.11166582128103136, "calib/step_q_w": 0.3788187134502924, "calib/step_q_w_n": 855.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2883.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 535.36328125, "completions/mean_terminated_length": 539.5787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.0864, "grad_norm": 0.4449962079524994, "kl": 0.2868461608886719, "learning_rate": 3.3055555555555558e-06, "loss": -0.0571, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03392023593187332, "mask/share_reasoning": 0.8354710340499878, "mask/share_step_conf": 0.12279621511697769, "num_tokens": 19651984.0, "reward": 0.7835407257080078, "reward_std": 0.16362658143043518, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7345074415206909, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8325740098953247, "step": 81 }, { "adv/mean_abs_final_conf": 0.5848948955535889, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7582812309265137, "adv/std_final_conf": 0.8100059628486633, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9338537454605103, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.683876329787234, "calib/avg_num_step_conf": 5.6953125, "calib/ece": 0.2776771653543308, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7992125984251969, "calib/gap": 0.14309042553191487, "calib/mean_conf": 0.8927952755905513, "calib/mu_c": 0.94575, "calib/mu_w": 0.8026595744680851, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.27027559055118117, "calib/std_conf": 0.21623400996211056, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5147575057736721, "calib/step_q_c_n": 866.0, "calib/step_q_gap": 0.09261223550340181, "calib/step_q_w": 0.42214527027027027, "calib/step_q_w_n": 592.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2854.0, "completions/max_terminated_length": 2854.0, "completions/mean_length": 474.65234375, "completions/mean_terminated_length": 474.65234375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.08746666666666666, "grad_norm": 0.03431045264005661, "kl": 0.057056427001953125, "learning_rate": 3.277777777777778e-06, "loss": -0.0801, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03597622364759445, "mask/share_reasoning": 0.8335458040237427, "mask/share_step_conf": 0.1304779350757599, "num_tokens": 19879047.0, "reward": 0.7802107334136963, "reward_std": 0.14769387245178223, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7121269702911377, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8482944369316101, "step": 82 }, { "adv/mean_abs_final_conf": 0.5824180245399475, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7441925406455994, "adv/std_final_conf": 0.8023613691329956, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9336576461791992, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6964833759590793, "calib/avg_num_step_conf": 5.7421875, "calib/ece": 0.3376095617529881, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7649402390438247, "calib/gap": 0.18492710997442463, "calib/mean_conf": 0.8492430278884463, "calib/mu_c": 0.9339705882352942, "calib/mu_w": 0.7490434782608696, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3225099601593626, "calib/std_conf": 0.27422877132987855, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5241968162083936, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.11191183546384942, "calib/step_q_w": 0.4122849807445442, "calib/step_q_w_n": 779.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2663.0, "completions/max_terminated_length": 2663.0, "completions/mean_length": 529.0546875, "completions/mean_terminated_length": 531.1294555664062, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.08853333333333334, "grad_norm": 0.0549030564725399, "kl": 0.053955078125, "learning_rate": 3.2500000000000002e-06, "loss": -0.0636, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03301708400249481, "mask/share_reasoning": 0.8444280624389648, "mask/share_step_conf": 0.11864862591028214, "num_tokens": 20121749.0, "reward": 0.7538405060768127, "reward_std": 0.16312643885612488, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6607023477554321, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8469786047935486, "step": 83 }, { "adv/mean_abs_final_conf": 0.595024585723877, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7575274705886841, "adv/std_final_conf": 0.8171303272247314, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9340332746505737, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7007662835249041, "calib/avg_num_step_conf": 5.328125, "calib/ece": 0.30197628458498027, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7351778656126482, "calib/gap": 0.1695472541507025, "calib/mean_conf": 0.8494861660079052, "calib/mu_c": 0.9218620689655171, "calib/mu_w": 0.7523148148148147, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2891699604743083, "calib/std_conf": 0.2684211869489573, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5160706806282723, "calib/step_q_c_n": 764.0, "calib/step_q_gap": 0.06495401396160566, "calib/step_q_w": 0.45111666666666667, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2568.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 477.73828125, "completions/mean_terminated_length": 479.6117858886719, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.0896, "grad_norm": 0.02916264347732067, "kl": 0.05471038818359375, "learning_rate": 3.2222222222222227e-06, "loss": -0.0731, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03586849942803383, "mask/share_reasoning": 0.8404359221458435, "mask/share_step_conf": 0.11978927254676819, "num_tokens": 20349970.0, "reward": 0.7618058919906616, "reward_std": 0.1683914214372635, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6817960739135742, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8418155908584595, "step": 84 }, { "adv/mean_abs_final_conf": 0.6161090135574341, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7381773591041565, "adv/std_final_conf": 0.8443658351898193, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348047375679016, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7081042157632889, "calib/avg_num_step_conf": 5.51171875, "calib/ece": 0.31729838709677416, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7661290322580645, "calib/gap": 0.22803482587064694, "calib/mean_conf": 0.8498790322580647, "calib/mu_c": 0.9547014925373136, "calib/mu_w": 0.7266666666666667, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.31342741935483864, "calib/std_conf": 0.2724685271183283, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5175361527967258, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.07892258347519193, "calib/step_q_w": 0.4386135693215339, "calib/step_q_w_n": 678.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 543.35546875, "completions/mean_terminated_length": 547.6338500976562, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.09066666666666667, "grad_norm": 0.03892885521054268, "kl": 0.055267333984375, "learning_rate": 3.1944444444444443e-06, "loss": -0.1226, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03346104174852371, "mask/share_reasoning": 0.837459921836853, "mask/share_step_conf": 0.12126647680997849, "num_tokens": 20596893.0, "reward": 0.7521539926528931, "reward_std": 0.21601596474647522, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6710773706436157, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8332306742668152, "step": 85 }, { "adv/mean_abs_final_conf": 0.5772691965103149, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7407020330429077, "adv/std_final_conf": 0.8088988661766052, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9334074258804321, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6682353694455115, "calib/avg_num_step_conf": 5.86328125, "calib/ece": 0.2626294820717131, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.649402390438247, "calib/gap": 0.2195722883851967, "calib/mean_conf": 0.7728286852589642, "calib/mu_c": 0.8725547445255476, "calib/mu_w": 0.6529824561403509, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24482071713147407, "calib/std_conf": 0.32558431465607224, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5037477303988996, "calib/step_q_c_n": 727.0, "calib/step_q_gap": 0.08956168388727176, "calib/step_q_w": 0.4141860465116279, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 511.859375, "completions/mean_terminated_length": 517.9288940429688, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.09173333333333333, "grad_norm": 0.04384787008166313, "kl": 0.05931854248046875, "learning_rate": 3.1666666666666667e-06, "loss": -0.0858, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03420304134488106, "mask/share_reasoning": 0.8318629860877991, "mask/share_step_conf": 0.12221519649028778, "num_tokens": 20833441.0, "reward": 0.7689346075057983, "reward_std": 0.14764101803302765, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6896851062774658, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8481841087341309, "step": 86 }, { "adv/mean_abs_final_conf": 0.5547035932540894, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7570780515670776, "adv/std_final_conf": 0.7982771396636963, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9332447052001953, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5813353313353313, "calib/avg_num_step_conf": 5.5078125, "calib/ece": 0.21056451612903215, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7782258064516129, "calib/gap": 0.14928404928404915, "calib/mean_conf": 0.8651612903225808, "calib/mu_c": 0.9048901098901099, "calib/mu_w": 0.7556060606060607, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1709274193548386, "calib/std_conf": 0.25982774650987445, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5179849537037037, "calib/step_q_c_n": 864.0, "calib/step_q_gap": 0.0944153566341066, "calib/step_q_w": 0.42356959706959707, "calib/step_q_w_n": 546.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2828.0, "completions/max_terminated_length": 2828.0, "completions/mean_length": 465.76171875, "completions/mean_terminated_length": 471.28460693359375, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.0928, "grad_norm": 0.03590984642505646, "kl": 0.06134796142578125, "learning_rate": 3.138888888888889e-06, "loss": -0.0383, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.037011705338954926, "mask/share_reasoning": 0.829144299030304, "mask/share_step_conf": 0.12212523818016052, "num_tokens": 21058172.0, "reward": 0.7914502024650574, "reward_std": 0.1662648767232895, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7531968951225281, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8297035694122314, "step": 87 }, { "adv/mean_abs_final_conf": 0.6191021203994751, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7256580591201782, "adv/std_final_conf": 0.8432807922363281, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.933613121509552, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.772495361781076, "calib/avg_num_step_conf": 5.57421875, "calib/ece": 0.1913492063492063, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6468253968253969, "calib/gap": 0.31662337662337636, "calib/mean_conf": 0.7706349206349208, "calib/mu_c": 0.8937662337662337, "calib/mu_w": 0.5771428571428573, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1754365079365079, "calib/std_conf": 0.3207883296370722, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47935856992639325, "calib/step_q_c_n": 951.0, "calib/step_q_gap": 0.06557705732135127, "calib/step_q_w": 0.413781512605042, "calib/step_q_w_n": 476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2173.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 512.1640625, "completions/mean_terminated_length": 514.172607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.09386666666666667, "grad_norm": 0.04056946933269501, "kl": 0.0717620849609375, "learning_rate": 3.1111111111111116e-06, "loss": -0.0657, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03307269513607025, "mask/share_reasoning": 0.8433363437652588, "mask/share_step_conf": 0.11968475580215454, "num_tokens": 21299134.0, "reward": 0.8107873201370239, "reward_std": 0.16860932111740112, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.768801212310791, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8527735471725464, "step": 88 }, { "adv/mean_abs_final_conf": 0.6300060749053955, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7494813203811646, "adv/std_final_conf": 0.849608838558197, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.932580828666687, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7548370832545535, "calib/avg_num_step_conf": 5.5546875, "calib/ece": 0.18722222222222218, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.49206349206349204, "calib/gap": 0.2934959349593498, "calib/mean_conf": 0.6700793650793652, "calib/mu_c": 0.8133333333333335, "calib/mu_w": 0.5198373983739837, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17269841269841263, "calib/std_conf": 0.3497663529239085, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49108462623413257, "calib/step_q_c_n": 709.0, "calib/step_q_gap": 0.0835109936955632, "calib/step_q_w": 0.4075736325385694, "calib/step_q_w_n": 713.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2751.0, "completions/max_terminated_length": 2751.0, "completions/mean_length": 514.46875, "completions/mean_terminated_length": 516.486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.09493333333333333, "grad_norm": 0.042645957320928574, "kl": 0.063812255859375, "learning_rate": 3.0833333333333336e-06, "loss": -0.0877, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03544268012046814, "mask/share_reasoning": 0.8371367454528809, "mask/share_step_conf": 0.12351429462432861, "num_tokens": 21539726.0, "reward": 0.786715030670166, "reward_std": 0.16263370215892792, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7260781526565552, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8473520278930664, "step": 89 }, { "adv/mean_abs_final_conf": 0.5670766830444336, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7358335256576538, "adv/std_final_conf": 0.8081160187721252, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9330614805221558, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7155321125265394, "calib/avg_num_step_conf": 5.81640625, "calib/ece": 0.20063241106719354, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.616600790513834, "calib/gap": 0.27885748407643296, "calib/mean_conf": 0.7399209486166008, "calib/mu_c": 0.845732484076433, "calib/mu_w": 0.566875, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15999999999999984, "calib/std_conf": 0.3403311478451047, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4967665505226481, "calib/step_q_c_n": 861.0, "calib/step_q_gap": 0.08776495816596014, "calib/step_q_w": 0.409001592356688, "calib/step_q_w_n": 628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2710.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 485.67578125, "completions/mean_terminated_length": 487.5804138183594, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.096, "grad_norm": 0.045059166848659515, "kl": 0.06566619873046875, "learning_rate": 3.055555555555556e-06, "loss": -0.0261, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034171998500823975, "mask/share_reasoning": 0.8349112272262573, "mask/share_step_conf": 0.1270105242729187, "num_tokens": 21767379.0, "reward": 0.8125478029251099, "reward_std": 0.12349230796098709, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7568085789680481, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8682869672775269, "step": 90 }, { "adv/mean_abs_final_conf": 0.6261551380157471, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7426559329032898, "adv/std_final_conf": 0.8279377222061157, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9330735206604004, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6463037634408602, "calib/avg_num_step_conf": 5.4609375, "calib/ece": 0.24760956175298796, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6135458167330677, "calib/gap": 0.1969865591397848, "calib/mean_conf": 0.7395617529880478, "calib/mu_c": 0.8149032258064516, "calib/mu_w": 0.6179166666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18482071713147402, "calib/std_conf": 0.33865595483960037, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4733818181818181, "calib/step_q_c_n": 825.0, "calib/step_q_gap": 0.05048478502300491, "calib/step_q_w": 0.4228970331588132, "calib/step_q_w_n": 573.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 500.83203125, "completions/mean_terminated_length": 500.83203125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.09706666666666666, "grad_norm": 0.059418559074401855, "kl": 0.0816192626953125, "learning_rate": 3.0277777777777776e-06, "loss": 0.019, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033103737980127335, "mask/share_reasoning": 0.8485387563705444, "mask/share_step_conf": 0.11835750937461853, "num_tokens": 22003304.0, "reward": 0.7714670300483704, "reward_std": 0.1684153825044632, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7130800485610962, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8298540711402893, "step": 91 }, { "adv/mean_abs_final_conf": 0.5845734477043152, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7730364799499512, "adv/std_final_conf": 0.8114038705825806, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9334744215011597, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7061542357857414, "calib/avg_num_step_conf": 4.96484375, "calib/ece": 0.2132941176470587, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.615686274509804, "calib/gap": 0.25577117892870704, "calib/mean_conf": 0.7429411764705883, "calib/mu_c": 0.849261744966443, "calib/mu_w": 0.593490566037736, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18596078431372534, "calib/std_conf": 0.3379447345330534, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5279789621318374, "calib/step_q_c_n": 713.0, "calib/step_q_gap": 0.08138666822502727, "calib/step_q_w": 0.4465922939068101, "calib/step_q_w_n": 558.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 462.87890625, "completions/mean_terminated_length": 462.87890625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.09813333333333334, "grad_norm": 0.045869529247283936, "kl": 0.066070556640625, "learning_rate": 3e-06, "loss": 0.0002, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.036917682737112045, "mask/share_reasoning": 0.847240149974823, "mask/share_step_conf": 0.11584216356277466, "num_tokens": 22228521.0, "reward": 0.8019878268241882, "reward_std": 0.14549410343170166, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7390902042388916, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8648854494094849, "step": 92 }, { "adv/mean_abs_final_conf": 0.654515266418457, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7242127656936646, "adv/std_final_conf": 0.8611568212509155, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343360662460327, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7357438568376068, "calib/avg_num_step_conf": 6.22265625, "calib/ece": 0.1570564516129033, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5040322580645161, "calib/gap": 0.29607371794871806, "calib/mean_conf": 0.6822983870967742, "calib/mu_c": 0.8064583333333334, "calib/mu_w": 0.5103846153846153, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12935483870967748, "calib/std_conf": 0.3451600255290363, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.519379042690815, "calib/step_q_c_n": 773.0, "calib/step_q_gap": 0.09700099391032718, "calib/step_q_w": 0.42237804878048785, "calib/step_q_w_n": 820.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3043.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 528.12890625, "completions/mean_terminated_length": 530.2000122070312, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.0992, "grad_norm": 0.04389479383826256, "kl": 0.0801239013671875, "learning_rate": 2.9722222222222225e-06, "loss": -0.0759, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.033197760581970215, "mask/share_reasoning": 0.8333528637886047, "mask/share_step_conf": 0.12954315543174744, "num_tokens": 22469498.0, "reward": 0.7923904657363892, "reward_std": 0.1726185381412506, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7471199035644531, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.83766108751297, "step": 93 }, { "adv/mean_abs_final_conf": 0.5801839828491211, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7660778760910034, "adv/std_final_conf": 0.8194279074668884, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.933512806892395, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7599281452492462, "calib/avg_num_step_conf": 5.29296875, "calib/ece": 0.17992063492063484, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5674603174603174, "calib/gap": 0.3208494258035539, "calib/mean_conf": 0.7055555555555556, "calib/mu_c": 0.8443356643356642, "calib/mu_w": 0.5234862385321103, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15900793650793643, "calib/std_conf": 0.34628540287877224, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5386538461538461, "calib/step_q_c_n": 728.0, "calib/step_q_gap": 0.1127048828364618, "calib/step_q_w": 0.42594896331738435, "calib/step_q_w_n": 627.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 488.55078125, "completions/mean_terminated_length": 490.4667053222656, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.10026666666666667, "grad_norm": 0.04189695045351982, "kl": 0.0657196044921875, "learning_rate": 2.944444444444445e-06, "loss": -0.0902, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03538179397583008, "mask/share_reasoning": 0.8432852625846863, "mask/share_step_conf": 0.11742669343948364, "num_tokens": 22703247.0, "reward": 0.8022552728652954, "reward_std": 0.1326061487197876, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.760992169380188, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8435183763504028, "step": 94 }, { "adv/mean_abs_final_conf": 0.5955591201782227, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7577732801437378, "adv/std_final_conf": 0.8261606097221375, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9340616464614868, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7149068322981367, "calib/avg_num_step_conf": 5.26953125, "calib/ece": 0.20290836653386435, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6414342629482072, "calib/gap": 0.2563685300207039, "calib/mean_conf": 0.7457768924302791, "calib/mu_c": 0.8377018633540373, "calib/mu_w": 0.5813333333333334, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15362549800796793, "calib/std_conf": 0.3445322685722691, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5504002306805075, "calib/step_q_c_n": 867.0, "calib/step_q_gap": 0.07861599831536226, "calib/step_q_w": 0.4717842323651452, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2620.0, "completions/max_terminated_length": 2620.0, "completions/mean_length": 503.22265625, "completions/mean_terminated_length": 503.22265625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.10133333333333333, "grad_norm": 0.0395638570189476, "kl": 0.06453323364257812, "learning_rate": 2.916666666666667e-06, "loss": -0.0433, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035683706402778625, "mask/share_reasoning": 0.8422435522079468, "mask/share_step_conf": 0.122072733938694, "num_tokens": 22938200.0, "reward": 0.7931810617446899, "reward_std": 0.1584271341562271, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7435300946235657, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.842832088470459, "step": 95 }, { "adv/mean_abs_final_conf": 0.5166634321212769, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7450376749038696, "adv/std_final_conf": 0.7603433728218079, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9338791370391846, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7806013431013431, "calib/avg_num_step_conf": 4.671875, "calib/ece": 0.14169291338582685, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7401574803149606, "calib/gap": 0.3162042124542125, "calib/mean_conf": 0.8294881889763781, "calib/mu_c": 0.9191208791208791, "calib/mu_w": 0.6029166666666667, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12732283464566937, "calib/std_conf": 0.28928117172950024, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5911166253101737, "calib/step_q_c_n": 806.0, "calib/step_q_gap": 0.11347559966914811, "calib/step_q_w": 0.4776410256410256, "calib/step_q_w_n": 390.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2846.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 435.5859375, "completions/mean_terminated_length": 435.5859375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.1024, "grad_norm": 0.07836330682039261, "kl": 0.0711212158203125, "learning_rate": 2.888888888888889e-06, "loss": -0.0184, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.039429161697626114, "mask/share_reasoning": 0.8440896272659302, "mask/share_step_conf": 0.11648118495941162, "num_tokens": 23155526.0, "reward": 0.8367966413497925, "reward_std": 0.1331299990415573, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.8224198818206787, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8511732816696167, "step": 96 }, { "adv/mean_abs_final_conf": 0.5952975153923035, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7472293376922607, "adv/std_final_conf": 0.8256112337112427, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934228241443634, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6639682539682541, "calib/avg_num_step_conf": 4.99609375, "calib/ece": 0.2432156862745097, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6823529411764706, "calib/gap": 0.20131428571428567, "calib/mean_conf": 0.8073725490196081, "calib/mu_c": 0.8902666666666667, "calib/mu_w": 0.688952380952381, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2311764705882352, "calib/std_conf": 0.2928610699722091, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6027027027027027, "calib/step_q_c_n": 703.0, "calib/step_q_gap": 0.10918360548048056, "calib/step_q_w": 0.4935190972222221, "calib/step_q_w_n": 576.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 433.79296875, "completions/mean_terminated_length": 433.79296875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.10346666666666667, "grad_norm": 0.05005478858947754, "kl": 0.06501007080078125, "learning_rate": 2.861111111111111e-06, "loss": -0.0095, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.038694001734256744, "mask/share_reasoning": 0.8357928395271301, "mask/share_step_conf": 0.12551318109035492, "num_tokens": 23371649.0, "reward": 0.7806545495986938, "reward_std": 0.1640351563692093, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7153007388114929, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.84600830078125, "step": 97 }, { "adv/mean_abs_final_conf": 0.6123366951942444, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7438055872917175, "adv/std_final_conf": 0.8297456502914429, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343807697296143, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6935420743639922, "calib/avg_num_step_conf": 4.48046875, "calib/ece": 0.26298804780876495, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7609561752988048, "calib/gap": 0.2340821917808219, "calib/mean_conf": 0.8301593625498008, "calib/mu_c": 0.928082191780822, "calib/mu_w": 0.6940000000000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2557370517928287, "calib/std_conf": 0.2952518920902296, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6388505747126437, "calib/step_q_c_n": 609.0, "calib/step_q_gap": 0.13706618809554338, "calib/step_q_w": 0.5017843866171003, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2519.0, "completions/max_terminated_length": 2519.0, "completions/mean_length": 464.91796875, "completions/mean_terminated_length": 468.5787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.10453333333333334, "grad_norm": 0.04752293601632118, "kl": 0.059108734130859375, "learning_rate": 2.8333333333333335e-06, "loss": 0.0004, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03807282820343971, "mask/share_reasoning": 0.8472254276275635, "mask/share_step_conf": 0.10688920319080353, "num_tokens": 23596852.0, "reward": 0.7616933584213257, "reward_std": 0.18782877922058105, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7075746059417725, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8158121109008789, "step": 98 }, { "adv/mean_abs_final_conf": 0.6620317697525024, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7611058354377747, "adv/std_final_conf": 0.8592643737792969, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352956414222717, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.735962294859623, "calib/avg_num_step_conf": 4.8515625, "calib/ece": 0.32655870445344126, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5668016194331984, "calib/gap": 0.2586864234368639, "calib/mean_conf": 0.7210526315789474, "calib/mu_c": 0.8739603960396037, "calib/mu_w": 0.6152739726027399, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.3193522267206478, "calib/std_conf": 0.33710800486864156, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5883026584867076, "calib/step_q_c_n": 489.0, "calib/step_q_gap": 0.08741554029281651, "calib/step_q_w": 0.5008871181938911, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2383.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 528.14453125, "completions/mean_terminated_length": 534.4071655273438, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.1056, "grad_norm": 0.0692669078707695, "kl": 0.049961090087890625, "learning_rate": 2.805555555555556e-06, "loss": -0.1378, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03356289118528366, "mask/share_reasoning": 0.8508045673370361, "mask/share_step_conf": 0.10391384363174438, "num_tokens": 23837857.0, "reward": 0.7011371850967407, "reward_std": 0.22175775468349457, "rewards/accuracy_reward_step": 0.39453125, "rewards/final_brier_reward_step": 0.6394370794296265, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.762837290763855, "step": 99 }, { "adv/mean_abs_final_conf": 0.6037321090698242, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7761998176574707, "adv/std_final_conf": 0.8127713203430176, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341810345649719, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7539454347964987, "calib/avg_num_step_conf": 4.8359375, "calib/ece": 0.22964285714285715, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6746031746031746, "calib/gap": 0.31578876749089524, "calib/mean_conf": 0.7761507936507938, "calib/mu_c": 0.9152482269503547, "calib/mu_w": 0.5994594594594594, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22313492063492063, "calib/std_conf": 0.3298169662548136, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6311295180722891, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.12743613828134837, "calib/step_q_w": 0.5036933797909408, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2863.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 501.0625, "completions/mean_terminated_length": 501.0625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.10666666666666667, "grad_norm": 0.06930825859308243, "kl": 0.053985595703125, "learning_rate": 2.7777777777777783e-06, "loss": -0.0075, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035849228501319885, "mask/share_reasoning": 0.853874921798706, "mask/share_step_conf": 0.11027580499649048, "num_tokens": 24073537.0, "reward": 0.7880070805549622, "reward_std": 0.18869704008102417, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7400109171867371, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8360031843185425, "step": 100 }, { "adv/mean_abs_final_conf": 0.6109015941619873, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7448518872261047, "adv/std_final_conf": 0.8460630178451538, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343430399894714, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7195292620865139, "calib/avg_num_step_conf": 4.56640625, "calib/ece": 0.189402390438247, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5139442231075697, "calib/gap": 0.2612124681933843, "calib/mean_conf": 0.6891633466135458, "calib/mu_c": 0.8140458015267177, "calib/mu_w": 0.5528333333333334, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.1783266932270916, "calib/std_conf": 0.3360487793205386, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5690834697217676, "calib/step_q_c_n": 611.0, "calib/step_q_gap": 0.08410139086872098, "calib/step_q_w": 0.4849820788530466, "calib/step_q_w_n": 558.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2261.0, "completions/max_terminated_length": 2261.0, "completions/mean_length": 472.94921875, "completions/mean_terminated_length": 474.803955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.10773333333333333, "grad_norm": 0.057462096214294434, "kl": 0.0570526123046875, "learning_rate": 2.7500000000000004e-06, "loss": 0.0116, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03623095527291298, "mask/share_reasoning": 0.8547747135162354, "mask/share_step_conf": 0.10508811473846436, "num_tokens": 24301604.0, "reward": 0.755883514881134, "reward_std": 0.16906806826591492, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7198277711868286, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7919393181800842, "step": 101 }, { "adv/mean_abs_final_conf": 0.6098600625991821, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7454801797866821, "adv/std_final_conf": 0.8268715143203735, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9331259727478027, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8043790849673204, "calib/avg_num_step_conf": 4.5859375, "calib/ece": 0.1588537549407114, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5810276679841897, "calib/gap": 0.3814215686274509, "calib/mean_conf": 0.7331620553359685, "calib/mu_c": 0.8839215686274511, "calib/mu_w": 0.5025000000000002, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1436363636363636, "calib/std_conf": 0.3366216155834159, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6155306451612903, "calib/step_q_c_n": 620.0, "calib/step_q_gap": 0.15861909281471986, "calib/step_q_w": 0.45691155234657044, "calib/step_q_w_n": 554.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2521.0, "completions/max_terminated_length": 2521.0, "completions/mean_length": 417.44140625, "completions/mean_terminated_length": 419.0784606933594, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.1088, "grad_norm": 0.055821649730205536, "kl": 0.065460205078125, "learning_rate": 2.7222222222222224e-06, "loss": 0.0168, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04316910356283188, "mask/share_reasoning": 0.8308782577514648, "mask/share_step_conf": 0.12204640358686447, "num_tokens": 24515165.0, "reward": 0.8302434086799622, "reward_std": 0.1441684067249298, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.8039737939834595, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8565130233764648, "step": 102 }, { "adv/mean_abs_final_conf": 0.47849929332733154, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7427432537078857, "adv/std_final_conf": 0.7303540110588074, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9331086277961731, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7756747860434497, "calib/avg_num_step_conf": 4.25, "calib/ece": 0.16671936758893272, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5849802371541502, "calib/gap": 0.29099868334430545, "calib/mean_conf": 0.7420553359683795, "calib/mu_c": 0.8547741935483871, "calib/mu_w": 0.5637755102040817, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.14806324110671928, "calib/std_conf": 0.32512892684644035, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6111454823889739, "calib/step_q_c_n": 653.0, "calib/step_q_gap": 0.10831329848092786, "calib/step_q_w": 0.5028321839080461, "calib/step_q_w_n": 435.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2081.0, "completions/max_terminated_length": 2081.0, "completions/mean_length": 490.5390625, "completions/mean_terminated_length": 490.5390625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.10986666666666667, "grad_norm": 0.05225846916437149, "kl": 0.05483245849609375, "learning_rate": 2.6944444444444444e-06, "loss": -0.0374, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036248303949832916, "mask/share_reasoning": 0.8617591857910156, "mask/share_step_conf": 0.10199250280857086, "num_tokens": 24745295.0, "reward": 0.804502010345459, "reward_std": 0.13372300565242767, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7684851288795471, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8405188322067261, "step": 103 }, { "adv/mean_abs_final_conf": 0.6716189384460449, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7436313629150391, "adv/std_final_conf": 0.8857026100158691, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934344470500946, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7777537930183792, "calib/avg_num_step_conf": 5.0234375, "calib/ece": 0.15611764705882353, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.40784313725490196, "calib/gap": 0.36534353028247196, "calib/mean_conf": 0.6069411764705882, "calib/mu_c": 0.7989256198347108, "calib/mu_w": 0.4335820895522388, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14427450980392154, "calib/std_conf": 0.3562374957841152, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5718568994889267, "calib/step_q_c_n": 587.0, "calib/step_q_gap": 0.1134920926219739, "calib/step_q_w": 0.4583648068669528, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 458.58984375, "completions/mean_terminated_length": 460.3882751464844, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.11093333333333333, "grad_norm": 0.06820174306631088, "kl": 0.0662841796875, "learning_rate": 2.666666666666667e-06, "loss": -0.0621, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03526720404624939, "mask/share_reasoning": 0.8457778692245483, "mask/share_step_conf": 0.11504866182804108, "num_tokens": 24969374.0, "reward": 0.8067743182182312, "reward_std": 0.14951029419898987, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.781417965888977, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8321306705474854, "step": 104 }, { "adv/mean_abs_final_conf": 0.7189830541610718, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7476561665534973, "adv/std_final_conf": 0.9116973876953125, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934876561164856, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7085136619081455, "calib/avg_num_step_conf": 4.19140625, "calib/ece": 0.19212, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.444, "calib/gap": 0.2806310961824172, "calib/mean_conf": 0.6106, "calib/mu_c": 0.7374452554744526, "calib/mu_w": 0.4568141592920354, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12736000000000003, "calib/std_conf": 0.3685941399425661, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5463205828779599, "calib/step_q_c_n": 549.0, "calib/step_q_gap": 0.1413396668474256, "calib/step_q_w": 0.4049809160305343, "calib/step_q_w_n": 524.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 463.27734375, "completions/mean_terminated_length": 468.7707824707031, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.112, "grad_norm": 0.058943260461091995, "kl": 0.063934326171875, "learning_rate": 2.6388888888888893e-06, "loss": -0.073, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03884696960449219, "mask/share_reasoning": 0.8451870679855347, "mask/share_step_conf": 0.10424716770648956, "num_tokens": 25193733.0, "reward": 0.7750270962715149, "reward_std": 0.19859516620635986, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7339316606521606, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8161225914955139, "step": 105 }, { "adv/mean_abs_final_conf": 0.6058851480484009, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7565438151359558, "adv/std_final_conf": 0.8274356126785278, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341694116592407, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.756379062948406, "calib/avg_num_step_conf": 4.70703125, "calib/ece": 0.1823070866141732, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.531496062992126, "calib/gap": 0.3437558175806351, "calib/mean_conf": 0.666984251968504, "calib/mu_c": 0.8253284671532847, "calib/mu_w": 0.48157264957264956, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1549606299212598, "calib/std_conf": 0.36867525099588855, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.555424292845258, "calib/step_q_c_n": 601.0, "calib/step_q_gap": 0.1408630345671123, "calib/step_q_w": 0.41456125827814566, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 456.69921875, "completions/mean_terminated_length": 456.69921875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.11306666666666666, "grad_norm": 0.09702730923891068, "kl": 0.1219024658203125, "learning_rate": 2.6111111111111113e-06, "loss": -0.0079, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.036941565573215485, "mask/share_reasoning": 0.8542109727859497, "mask/share_step_conf": 0.1088474690914154, "num_tokens": 25415232.0, "reward": 0.8071750402450562, "reward_std": 0.15165673196315765, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7641385793685913, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.850211501121521, "step": 106 }, { "adv/mean_abs_final_conf": 0.5851588249206543, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7315694689750671, "adv/std_final_conf": 0.8237432837486267, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343537092208862, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7436145510835913, "calib/avg_num_step_conf": 4.90234375, "calib/ece": 0.17437007874015742, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.515748031496063, "calib/gap": 0.3058230134158927, "calib/mean_conf": 0.6931102362204725, "calib/mu_c": 0.815921052631579, "calib/mu_w": 0.5100980392156863, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13452755905511804, "calib/std_conf": 0.3450290579648353, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5534992223950232, "calib/step_q_c_n": 643.0, "calib/step_q_gap": 0.15557438579371607, "calib/step_q_w": 0.3979248366013072, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2708.0, "completions/max_terminated_length": 2708.0, "completions/mean_length": 463.87890625, "completions/mean_terminated_length": 463.87890625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.11413333333333334, "grad_norm": 0.05170339345932007, "kl": 0.06480026245117188, "learning_rate": 2.5833333333333337e-06, "loss": 0.0818, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03820245712995529, "mask/share_reasoning": 0.848260760307312, "mask/share_step_conf": 0.11353675276041031, "num_tokens": 25638601.0, "reward": 0.8206563591957092, "reward_std": 0.14223712682724, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7725800275802612, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.868732750415802, "step": 107 }, { "adv/mean_abs_final_conf": 0.5855361819267273, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7494146823883057, "adv/std_final_conf": 0.8314166069030762, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9335763454437256, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.740392796369824, "calib/avg_num_step_conf": 5.1328125, "calib/ece": 0.15551181102362197, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6377952755905512, "calib/gap": 0.3404169030062393, "calib/mean_conf": 0.7590551181102363, "calib/mu_c": 0.8689534883720929, "calib/mu_w": 0.5285365853658536, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.11870078740157472, "calib/std_conf": 0.3434870582230404, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5365839909808343, "calib/step_q_c_n": 887.0, "calib/step_q_gap": 0.08348328840472191, "calib/step_q_w": 0.4531007025761124, "calib/step_q_w_n": 427.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 493.80859375, "completions/mean_terminated_length": 495.7451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 91.0, "epoch": 0.1152, "grad_norm": 0.04973267391324043, "kl": 0.071014404296875, "learning_rate": 2.5555555555555557e-06, "loss": -0.0653, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03569352626800537, "mask/share_reasoning": 0.8444836139678955, "mask/share_step_conf": 0.11591663956642151, "num_tokens": 25868248.0, "reward": 0.8126301765441895, "reward_std": 0.1515788584947586, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7874480485916138, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8378124237060547, "step": 108 }, { "adv/mean_abs_final_conf": 0.5870604515075684, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7535527944564819, "adv/std_final_conf": 0.8096221685409546, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9340676069259644, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8179909560723514, "calib/avg_num_step_conf": 4.98046875, "calib/ece": 0.13384738955823292, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.46987951807228917, "calib/gap": 0.467429069767442, "calib/mean_conf": 0.5943453815261045, "calib/mu_c": 0.8196124031007753, "calib/mu_w": 0.35218333333333335, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10506024096385541, "calib/std_conf": 0.3991824088947798, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5310361067503925, "calib/step_q_c_n": 637.0, "calib/step_q_gap": 0.20546243903879374, "calib/step_q_w": 0.32557366771159874, "calib/step_q_w_n": 638.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 496.203125, "completions/mean_terminated_length": 500.1102294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.11626666666666667, "grad_norm": 0.045122113078832626, "kl": 0.06939697265625, "learning_rate": 2.5277777777777778e-06, "loss": -0.0858, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.035198867321014404, "mask/share_reasoning": 0.8491263389587402, "mask/share_step_conf": 0.10786230862140656, "num_tokens": 26099876.0, "reward": 0.8194234371185303, "reward_std": 0.16372306644916534, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7931814193725586, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8456655144691467, "step": 109 }, { "adv/mean_abs_final_conf": 0.7107264995574951, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7442977428436279, "adv/std_final_conf": 0.8976032733917236, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9342363476753235, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6777255639097743, "calib/avg_num_step_conf": 3.9921875, "calib/ece": 0.24999999999999994, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5770750988142292, "calib/gap": 0.28871303258145364, "calib/mean_conf": 0.6849407114624507, "calib/mu_c": 0.8218796992481203, "calib/mu_w": 0.5331666666666667, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.20462450592885373, "calib/std_conf": 0.3900842318619402, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5682617187500001, "calib/step_q_c_n": 512.0, "calib/step_q_gap": 0.13175191482843146, "calib/step_q_w": 0.43650980392156863, "calib/step_q_w_n": 510.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2629.0, "completions/max_terminated_length": 2629.0, "completions/mean_length": 431.84375, "completions/mean_terminated_length": 431.84375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.11733333333333333, "grad_norm": 0.06673236936330795, "kl": 0.06946563720703125, "learning_rate": 2.5e-06, "loss": -0.0362, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03971089422702789, "mask/share_reasoning": 0.8535101413726807, "mask/share_step_conf": 0.10677894949913025, "num_tokens": 26315348.0, "reward": 0.7538522481918335, "reward_std": 0.19147515296936035, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.704800009727478, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8029043674468994, "step": 110 }, { "adv/mean_abs_final_conf": 0.5409973859786987, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7609439492225647, "adv/std_final_conf": 0.7927687764167786, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348319172859192, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7098196120981961, "calib/avg_num_step_conf": 3.90625, "calib/ece": 0.22963562753036426, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.611336032388664, "calib/gap": 0.2984239793842397, "calib/mean_conf": 0.7072874493927125, "calib/mu_c": 0.8293150684931507, "calib/mu_w": 0.530891089108911, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.17291497975708492, "calib/std_conf": 0.37932970175825215, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5571863799283153, "calib/step_q_c_n": 558.0, "calib/step_q_gap": 0.16211850662514787, "calib/step_q_w": 0.39506787330316745, "calib/step_q_w_n": 442.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 443.15234375, "completions/mean_terminated_length": 444.8902282714844, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.1184, "grad_norm": 0.04155619442462921, "kl": 0.06301116943359375, "learning_rate": 2.4722222222222226e-06, "loss": -0.0501, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.04270726814866066, "mask/share_reasoning": 0.847565770149231, "mask/share_step_conf": 0.1058206781744957, "num_tokens": 26536203.0, "reward": 0.7658801674842834, "reward_std": 0.16763192415237427, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7185871005058289, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8131731748580933, "step": 111 }, { "adv/mean_abs_final_conf": 0.6597450971603394, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7538222074508667, "adv/std_final_conf": 0.8582755327224731, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354592561721802, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7894385496451408, "calib/avg_num_step_conf": 4.04296875, "calib/ece": 0.17149999999999999, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.4375, "calib/gap": 0.4543096057901766, "calib/mean_conf": 0.5293333333333334, "calib/mu_c": 0.7318796992481205, "calib/mu_w": 0.27757009345794387, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.0733333333333333, "calib/std_conf": 0.42730206593878706, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5136745886654479, "calib/step_q_c_n": 547.0, "calib/step_q_gap": 0.21873606407528395, "calib/step_q_w": 0.29493852459016395, "calib/step_q_w_n": 488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 546.83203125, "completions/mean_terminated_length": 548.9765014648438, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.11946666666666667, "grad_norm": 0.06552662700414658, "kl": 0.06233978271484375, "learning_rate": 2.4444444444444447e-06, "loss": -0.0972, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.0337638258934021, "mask/share_reasoning": 0.8707628846168518, "mask/share_step_conf": 0.0915670171380043, "num_tokens": 26784112.0, "reward": 0.7577349543571472, "reward_std": 0.21430005133152008, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7407132387161255, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.7747566103935242, "step": 112 }, { "adv/mean_abs_final_conf": 0.660697340965271, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7497421503067017, "adv/std_final_conf": 0.8753498196601868, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9342508912086487, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7964874691598495, "calib/avg_num_step_conf": 4.4296875, "calib/ece": 0.18213438735177867, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.34782608695652173, "calib/gap": 0.4065037008180757, "calib/mean_conf": 0.4911462450592886, "calib/mu_c": 0.6550331125827816, "calib/mu_w": 0.24852941176470594, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.03822134387351779, "calib/std_conf": 0.3982390466744529, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4483663220088626, "calib/step_q_c_n": 677.0, "calib/step_q_gap": 0.12703152988632432, "calib/step_q_w": 0.3213347921225383, "calib/step_q_w_n": 457.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 409.04296875, "completions/mean_terminated_length": 410.6470947265625, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.12053333333333334, "grad_norm": 0.04978412389755249, "kl": 0.09747314453125, "learning_rate": 2.4166666666666667e-06, "loss": -0.1001, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04079805687069893, "mask/share_reasoning": 0.8408189415931702, "mask/share_step_conf": 0.11447672545909882, "num_tokens": 26994027.0, "reward": 0.8243892192840576, "reward_std": 0.14668944478034973, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7760382890701294, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8727402687072754, "step": 113 }, { "adv/mean_abs_final_conf": 0.6948554515838623, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7704617977142334, "adv/std_final_conf": 0.8857570290565491, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9337559938430786, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7926232993197279, "calib/avg_num_step_conf": 4.3671875, "calib/ece": 0.16404761904761905, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.47875, "calib/mean_conf": 0.6176190476190476, "calib/mu_c": 0.7772023809523809, "calib/mu_w": 0.2984523809523809, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.057499999999999996, "calib/std_conf": 0.41437068094218465, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.48268125854993166, "calib/step_q_c_n": 731.0, "calib/step_q_gap": 0.23211278309773525, "calib/step_q_w": 0.2505684754521964, "calib/step_q_w_n": 387.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1833.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 427.68359375, "completions/mean_terminated_length": 429.3608093261719, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.1216, "grad_norm": 0.07593639940023422, "kl": 0.0800933837890625, "learning_rate": 2.388888888888889e-06, "loss": -0.0511, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.042328353971242905, "mask/share_reasoning": 0.8368827104568481, "mask/share_step_conf": 0.11688268184661865, "num_tokens": 27208538.0, "reward": 0.834463894367218, "reward_std": 0.16004055738449097, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7997835874557495, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8691442012786865, "step": 114 }, { "adv/mean_abs_final_conf": 0.7465718984603882, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7727314233779907, "adv/std_final_conf": 0.9083997011184692, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353749752044678, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7171296889002688, "calib/avg_num_step_conf": 4.15234375, "calib/ece": 0.23284584980237144, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3952569169960474, "calib/gap": 0.34606324414287537, "calib/mean_conf": 0.4921343873517787, "calib/mu_c": 0.6384931506849315, "calib/mu_w": 0.29242990654205614, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.07395256916996037, "calib/std_conf": 0.42591008083843535, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4040268987341772, "calib/step_q_c_n": 632.0, "calib/step_q_gap": 0.07579255998707746, "calib/step_q_w": 0.32823433874709973, "calib/step_q_w_n": 431.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2273.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 400.41796875, "completions/mean_terminated_length": 400.41796875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.12266666666666666, "grad_norm": 0.06251219660043716, "kl": 0.09407806396484375, "learning_rate": 2.361111111111111e-06, "loss": -0.105, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.040772609412670135, "mask/share_reasoning": 0.8457399606704712, "mask/share_step_conf": 0.11348745226860046, "num_tokens": 27416309.0, "reward": 0.7573823928833008, "reward_std": 0.22537976503372192, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7236734628677368, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7910914421081543, "step": 115 }, { "adv/mean_abs_final_conf": 0.6926306486129761, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7407446503639221, "adv/std_final_conf": 0.88161700963974, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343625903129578, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7317920918367348, "calib/avg_num_step_conf": 4.44140625, "calib/ece": 0.23539682539682544, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.40476190476190477, "calib/gap": 0.36535714285714294, "calib/mean_conf": 0.5276190476190477, "calib/mu_c": 0.6900000000000001, "calib/mu_w": 0.3246428571428571, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10373015873015878, "calib/std_conf": 0.4323705945895947, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4250610820244328, "calib/step_q_c_n": 573.0, "calib/step_q_gap": 0.1750078905350711, "calib/step_q_w": 0.2500531914893617, "calib/step_q_w_n": 564.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2619.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 470.4765625, "completions/mean_terminated_length": 470.4765625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.12373333333333333, "grad_norm": 0.07935766875743866, "kl": 0.0879974365234375, "learning_rate": 2.3333333333333336e-06, "loss": -0.0241, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03920193389058113, "mask/share_reasoning": 0.8547663688659668, "mask/share_step_conf": 0.10603173077106476, "num_tokens": 27641271.0, "reward": 0.7947357296943665, "reward_std": 0.1647808700799942, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7312819957733154, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8581894040107727, "step": 116 }, { "adv/mean_abs_final_conf": 0.678554892539978, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7296802997589111, "adv/std_final_conf": 0.8839506506919861, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351888298988342, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7323589648110221, "calib/avg_num_step_conf": 4.30078125, "calib/ece": 0.2168070866141732, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.468503937007874, "calib/gap": 0.3472533358158008, "calib/mean_conf": 0.5983031496062992, "calib/mu_c": 0.7773983739837398, "calib/mu_w": 0.430145038167939, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.16542913385826766, "calib/std_conf": 0.4125910525014487, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.47146185567010307, "calib/step_q_c_n": 485.0, "calib/step_q_gap": 0.13851542709867448, "calib/step_q_w": 0.3329464285714286, "calib/step_q_w_n": 616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2298.0, "completions/max_terminated_length": 2298.0, "completions/mean_length": 452.59765625, "completions/mean_terminated_length": 452.59765625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.1248, "grad_norm": 0.04363831505179405, "kl": 0.0821533203125, "learning_rate": 2.305555555555556e-06, "loss": -0.1175, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0384393110871315, "mask/share_reasoning": 0.8544684648513794, "mask/share_step_conf": 0.10709226131439209, "num_tokens": 27863736.0, "reward": 0.7832147479057312, "reward_std": 0.18473175168037415, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7263078093528748, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8401218056678772, "step": 117 }, { "adv/mean_abs_final_conf": 0.5879744291305542, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7720198035240173, "adv/std_final_conf": 0.7948587536811829, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346683621406555, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7168510084580351, "calib/avg_num_step_conf": 4.984375, "calib/ece": 0.26019920318725087, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6254980079681275, "calib/gap": 0.28801821730644106, "calib/mean_conf": 0.726573705179283, "calib/mu_c": 0.8482068965517241, "calib/mu_w": 0.560188679245283, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.20454183266932263, "calib/std_conf": 0.3862087321728891, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4322253315758167, "calib/step_q_c_n": 672.0, "calib/step_q_gap": 0.11845049713873068, "calib/step_q_w": 0.31377483443708604, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 485.765625, "completions/mean_terminated_length": 487.6706237792969, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.12586666666666665, "grad_norm": 0.037945739924907684, "kl": 0.0703125, "learning_rate": 2.277777777777778e-06, "loss": -0.0543, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036695875227451324, "mask/share_reasoning": 0.845686674118042, "mask/share_step_conf": 0.1137111485004425, "num_tokens": 28092100.0, "reward": 0.7686008810997009, "reward_std": 0.19075638055801392, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7060140371322632, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8311876058578491, "step": 118 }, { "adv/mean_abs_final_conf": 0.6270201802253723, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7426635026931763, "adv/std_final_conf": 0.8438059687614441, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.929425835609436, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7711167352103765, "calib/avg_num_step_conf": 4.48046875, "calib/ece": 0.21615748031496046, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6141732283464567, "calib/gap": 0.40604340398608035, "calib/mean_conf": 0.6786771653543308, "calib/mu_c": 0.8529241379310345, "calib/mu_w": 0.44688073394495414, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.16198425196850375, "calib/std_conf": 0.41690956896091674, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.504838953488372, "calib/step_q_c_n": 516.0, "calib/step_q_gap": 0.27353942892418825, "calib/step_q_w": 0.23129952456418382, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2586.0, "completions/max_terminated_length": 2586.0, "completions/mean_length": 502.93359375, "completions/mean_terminated_length": 502.93359375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.12693333333333334, "grad_norm": 0.05070538446307182, "kl": 0.07465362548828125, "learning_rate": 2.25e-06, "loss": -0.0048, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.036642082035541534, "mask/share_reasoning": 0.8676231503486633, "mask/share_step_conf": 0.09573476016521454, "num_tokens": 28325915.0, "reward": 0.8076252937316895, "reward_std": 0.20259562134742737, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7510377764701843, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8642127513885498, "step": 119 }, { "adv/mean_abs_final_conf": 0.5917130708694458, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7489132881164551, "adv/std_final_conf": 0.8114206790924072, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350486993789673, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7984442237669647, "calib/avg_num_step_conf": 4.14453125, "calib/ece": 0.1860236220472442, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6299212598425197, "calib/gap": 0.48683813306852053, "calib/mean_conf": 0.6788582677165355, "calib/mu_c": 0.8609433962264154, "calib/mu_w": 0.37410526315789483, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11944881889763792, "calib/std_conf": 0.4254833698538593, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.44790935672514615, "calib/step_q_c_n": 684.0, "calib/step_q_gap": 0.12578734081002674, "calib/step_q_w": 0.3221220159151194, "calib/step_q_w_n": 377.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2058.0, "completions/max_terminated_length": 2058.0, "completions/mean_length": 414.00390625, "completions/mean_terminated_length": 414.00390625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.128, "grad_norm": 0.04255034402012825, "kl": 0.0883026123046875, "learning_rate": 2.222222222222222e-06, "loss": -0.0381, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04009437561035156, "mask/share_reasoning": 0.8499317765235901, "mask/share_step_conf": 0.10997384041547775, "num_tokens": 28538588.0, "reward": 0.8278021812438965, "reward_std": 0.17815116047859192, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.803676962852478, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8519275188446045, "step": 120 }, { "adv/mean_abs_final_conf": 0.5864112377166748, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7474852800369263, "adv/std_final_conf": 0.8054826259613037, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345322847366333, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6949251789199737, "calib/avg_num_step_conf": 5.1015625, "calib/ece": 0.27322310756972107, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7330677290836654, "calib/gap": 0.2780366948601172, "calib/mean_conf": 0.8007131474103586, "calib/mu_c": 0.9181310344827588, "calib/mu_w": 0.6400943396226416, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2481235059760956, "calib/std_conf": 0.3492089714818343, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4264383808095952, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.09734604904120703, "calib/step_q_w": 0.32909233176838815, "calib/step_q_w_n": 639.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 505.58203125, "completions/mean_terminated_length": 507.5647277832031, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.12906666666666666, "grad_norm": 0.04652104154229164, "kl": 0.06574249267578125, "learning_rate": 2.1944444444444445e-06, "loss": -0.0736, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035734884440898895, "mask/share_reasoning": 0.8535595536231995, "mask/share_step_conf": 0.10679930448532104, "num_tokens": 28773073.0, "reward": 0.7692856788635254, "reward_std": 0.19570022821426392, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7019648551940918, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.836606502532959, "step": 121 }, { "adv/mean_abs_final_conf": 0.5605442523956299, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7494107484817505, "adv/std_final_conf": 0.7812092304229736, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9340590834617615, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7425825593395254, "calib/avg_num_step_conf": 4.8125, "calib/ece": 0.22480314960629924, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7440944881889764, "calib/gap": 0.3767298761609905, "calib/mean_conf": 0.7833858267716536, "calib/mu_c": 0.9346710526315789, "calib/mu_w": 0.5579411764705884, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20488188976377955, "calib/std_conf": 0.37253699382426914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.578234328358209, "calib/step_q_c_n": 670.0, "calib/step_q_gap": 0.27546564508418764, "calib/step_q_w": 0.30276868327402134, "calib/step_q_w_n": 562.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2436.0, "completions/max_terminated_length": 2436.0, "completions/mean_length": 451.31640625, "completions/mean_terminated_length": 451.31640625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.13013333333333332, "grad_norm": 0.041189853101968765, "kl": 0.07921600341796875, "learning_rate": 2.166666666666667e-06, "loss": -0.0203, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03811654448509216, "mask/share_reasoning": 0.8494611978530884, "mask/share_step_conf": 0.11242222785949707, "num_tokens": 28995954.0, "reward": 0.8048402667045593, "reward_std": 0.17405804991722107, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.761760950088501, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8479195833206177, "step": 122 }, { "adv/mean_abs_final_conf": 0.6796097755432129, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7547227740287781, "adv/std_final_conf": 0.8744975924491882, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348601698875427, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6908712343028174, "calib/avg_num_step_conf": 5.16015625, "calib/ece": 0.2880400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7, "calib/gap": 0.2701782809551696, "calib/mean_conf": 0.7597200000000001, "calib/mu_c": 0.8775177304964539, "calib/mu_w": 0.6073394495412843, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.24188000000000004, "calib/std_conf": 0.38318601435856187, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.40191558441558445, "calib/step_q_c_n": 616.0, "calib/step_q_gap": 0.1488362936354426, "calib/step_q_w": 0.25307929078014185, "calib/step_q_w_n": 705.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 541.19140625, "completions/mean_terminated_length": 543.3137817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.1312, "grad_norm": 0.04248799383640289, "kl": 0.0621795654296875, "learning_rate": 2.138888888888889e-06, "loss": -0.0327, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.034889135509729385, "mask/share_reasoning": 0.8600262999534607, "mask/share_step_conf": 0.10117833316326141, "num_tokens": 29239787.0, "reward": 0.7408413887023926, "reward_std": 0.22834190726280212, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6735906600952148, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8080922365188599, "step": 123 }, { "adv/mean_abs_final_conf": 0.5587438344955444, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7677050828933716, "adv/std_final_conf": 0.8090704679489136, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350267052650452, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6976533690915186, "calib/avg_num_step_conf": 4.6171875, "calib/ece": 0.2434444444444443, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.746031746031746, "calib/gap": 0.2915293328863561, "calib/mean_conf": 0.7932063492063494, "calib/mu_c": 0.9031082802547772, "calib/mu_w": 0.6115789473684211, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20681746031746015, "calib/std_conf": 0.3628922249701894, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5021832386363637, "calib/step_q_c_n": 704.0, "calib/step_q_gap": 0.1642752888455688, "calib/step_q_w": 0.3379079497907949, "calib/step_q_w_n": 478.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2170.0, "completions/max_terminated_length": 2170.0, "completions/mean_length": 486.1328125, "completions/mean_terminated_length": 486.1328125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.13226666666666667, "grad_norm": 0.06367672979831696, "kl": 0.0710906982421875, "learning_rate": 2.1111111111111114e-06, "loss": -0.0453, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03547365963459015, "mask/share_reasoning": 0.8601535558700562, "mask/share_step_conf": 0.10437280684709549, "num_tokens": 29471053.0, "reward": 0.7744747996330261, "reward_std": 0.180791974067688, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7260277271270752, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.822921872138977, "step": 124 }, { "adv/mean_abs_final_conf": 0.6150028109550476, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7723146080970764, "adv/std_final_conf": 0.8241979479789734, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9339441061019897, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6313887793982393, "calib/avg_num_step_conf": 4.73046875, "calib/ece": 0.35696356275303653, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7206477732793523, "calib/gap": 0.17255551175929595, "calib/mean_conf": 0.7746963562753035, "calib/mu_c": 0.8571317829457364, "calib/mu_w": 0.6845762711864405, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3046963562753037, "calib/std_conf": 0.3729559797901317, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5079631782945737, "calib/step_q_c_n": 516.0, "calib/step_q_gap": 0.1762365595895377, "calib/step_q_w": 0.331726618705036, "calib/step_q_w_n": 695.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2824.0, "completions/max_terminated_length": 2824.0, "completions/mean_length": 523.515625, "completions/mean_terminated_length": 525.5686645507812, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.13333333333333333, "grad_norm": 0.04430394247174263, "kl": 0.06725311279296875, "learning_rate": 2.0833333333333334e-06, "loss": -0.0845, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03517274558544159, "mask/share_reasoning": 0.8596398830413818, "mask/share_step_conf": 0.10128115117549896, "num_tokens": 29709881.0, "reward": 0.7005262970924377, "reward_std": 0.18713583052158356, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.611504316329956, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7895482778549194, "step": 125 }, { "adv/mean_abs_final_conf": 0.5662118196487427, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7217642068862915, "adv/std_final_conf": 0.7941136956214905, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9217687249183655, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7514391583405016, "calib/avg_num_step_conf": 4.63671875, "calib/ece": 0.2498979674796747, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6707317073170732, "calib/gap": 0.40203121153973354, "calib/mean_conf": 0.7146947154471545, "calib/mu_c": 0.9091732283464564, "calib/mu_w": 0.5071420168067229, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.22416626016260152, "calib/std_conf": 0.4150856757635778, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5724484052532833, "calib/step_q_c_n": 533.0, "calib/step_q_gap": 0.3069042156508369, "calib/step_q_w": 0.2655441896024464, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2757.0, "completions/max_terminated_length": 2757.0, "completions/mean_length": 493.0703125, "completions/mean_terminated_length": 498.9170227050781, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.1344, "grad_norm": 0.04063578322529793, "kl": 0.08344650268554688, "learning_rate": 2.0555555555555555e-06, "loss": -0.1481, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03813691437244415, "mask/share_reasoning": 0.8446433544158936, "mask/share_step_conf": 0.10550101101398468, "num_tokens": 29941571.0, "reward": 0.7483680844306946, "reward_std": 0.21162956953048706, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6991752982139587, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7975609302520752, "step": 126 }, { "adv/mean_abs_final_conf": 0.5917777419090271, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7527652978897095, "adv/std_final_conf": 0.7995696067810059, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351709485054016, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7078985602958658, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.27238866396761124, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.659919028340081, "calib/gap": 0.3514172500330208, "calib/mean_conf": 0.6968421052631579, "calib/mu_c": 0.8576119402985076, "calib/mu_w": 0.5061946902654868, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21336032388663959, "calib/std_conf": 0.42501410713479154, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5186550632911392, "calib/step_q_c_n": 632.0, "calib/step_q_gap": 0.22045269992334005, "calib/step_q_w": 0.29820236336779915, "calib/step_q_w_n": 677.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2662.0, "completions/max_terminated_length": 2662.0, "completions/mean_length": 524.87890625, "completions/mean_terminated_length": 524.87890625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.13546666666666668, "grad_norm": 0.05239735171198845, "kl": 0.06211090087890625, "learning_rate": 2.027777777777778e-06, "loss": -0.0059, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.037127815186977386, "mask/share_reasoning": 0.8489212989807129, "mask/share_step_conf": 0.1139509379863739, "num_tokens": 30179612.0, "reward": 0.7568548917770386, "reward_std": 0.22605092823505402, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6964148283004761, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8172948360443115, "step": 127 }, { "adv/mean_abs_final_conf": 0.6582461595535278, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.781390905380249, "adv/std_final_conf": 0.8269768357276917, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352853894233704, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6791420515574651, "calib/avg_num_step_conf": 4.265625, "calib/ece": 0.31408163265306127, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7142857142857143, "calib/gap": 0.258388157894737, "calib/mean_conf": 0.7728571428571429, "calib/mu_c": 0.8909774436090228, "calib/mu_w": 0.6325892857142857, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.27204081632653065, "calib/std_conf": 0.38220413393892005, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.56017578125, "calib/step_q_c_n": 512.0, "calib/step_q_gap": 0.24564302262931037, "calib/step_q_w": 0.31453275862068963, "calib/step_q_w_n": 580.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2670.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 490.71484375, "completions/mean_terminated_length": 496.53363037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.13653333333333334, "grad_norm": 0.06866651773452759, "kl": 0.066314697265625, "learning_rate": 2.0000000000000003e-06, "loss": -0.0601, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.04000090807676315, "mask/share_reasoning": 0.8460492491722107, "mask/share_step_conf": 0.10223108530044556, "num_tokens": 30411899.0, "reward": 0.7223763465881348, "reward_std": 0.24244219064712524, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.647929310798645, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7968234419822693, "step": 128 }, { "adv/mean_abs_final_conf": 0.6572911143302917, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7690467834472656, "adv/std_final_conf": 0.8524337410926819, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346084594726562, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6090235256203675, "calib/avg_num_step_conf": 4.7265625, "calib/ece": 0.3544801587301588, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7738095238095238, "calib/gap": 0.15475494682565272, "calib/mean_conf": 0.8045595238095239, "calib/mu_c": 0.8702689655172414, "calib/mu_w": 0.7155140186915887, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2918214285714287, "calib/std_conf": 0.3597985663696813, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5545014492753623, "calib/step_q_c_n": 690.0, "calib/step_q_gap": 0.11338606465997764, "calib/step_q_w": 0.44111538461538463, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2840.0, "completions/max_terminated_length": 2840.0, "completions/mean_length": 443.8984375, "completions/mean_terminated_length": 443.8984375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.1376, "grad_norm": 0.0634075254201889, "kl": 0.0722503662109375, "learning_rate": 1.9722222222222224e-06, "loss": 0.0209, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03835558146238327, "mask/share_reasoning": 0.8439362049102783, "mask/share_step_conf": 0.11770817637443542, "num_tokens": 30627921.0, "reward": 0.7297005653381348, "reward_std": 0.20747110247612, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6390316486358643, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8203694820404053, "step": 129 }, { "adv/mean_abs_final_conf": 0.5336794853210449, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7394791841506958, "adv/std_final_conf": 0.7902102470397949, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9340017437934875, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6969356486210418, "calib/avg_num_step_conf": 4.05078125, "calib/ece": 0.22712598425196848, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7362204724409449, "calib/gap": 0.3584725910793326, "calib/mean_conf": 0.7657874015748032, "calib/mu_c": 0.8913939393939393, "calib/mu_w": 0.5329213483146067, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.17165354330708657, "calib/std_conf": 0.39243331606505466, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5255920000000001, "calib/step_q_c_n": 625.0, "calib/step_q_gap": 0.10928132038834965, "calib/step_q_w": 0.4163106796116504, "calib/step_q_w_n": 412.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1738.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 407.18359375, "completions/mean_terminated_length": 408.7804260253906, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.13866666666666666, "grad_norm": 0.04523677006363869, "kl": 0.07286834716796875, "learning_rate": 1.944444444444445e-06, "loss": -0.0415, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.042861148715019226, "mask/share_reasoning": 0.8466463088989258, "mask/share_step_conf": 0.10658632218837738, "num_tokens": 30837448.0, "reward": 0.7943114042282104, "reward_std": 0.16882899403572083, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7463679313659668, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8422548770904541, "step": 130 }, { "adv/mean_abs_final_conf": 0.6024599671363831, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7664506435394287, "adv/std_final_conf": 0.8044640421867371, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934328556060791, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7555746687054026, "calib/avg_num_step_conf": 4.625, "calib/ece": 0.2511462450592885, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5454545454545454, "calib/gap": 0.4317214576962283, "calib/mean_conf": 0.5933596837944665, "calib/mu_c": 0.8390825688073394, "calib/mu_w": 0.4073611111111112, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2068379446640316, "calib/std_conf": 0.45306790959702653, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5259521829521829, "calib/step_q_c_n": 481.0, "calib/step_q_gap": 0.18824236787394683, "calib/step_q_w": 0.3377098150782361, "calib/step_q_w_n": 703.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2256.0, "completions/max_terminated_length": 2256.0, "completions/mean_length": 446.3671875, "completions/mean_terminated_length": 446.3671875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.13973333333333332, "grad_norm": 0.04830202832818031, "kl": 0.07916259765625, "learning_rate": 1.916666666666667e-06, "loss": -0.0219, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03808034956455231, "mask/share_reasoning": 0.8501123189926147, "mask/share_step_conf": 0.11180734634399414, "num_tokens": 31057926.0, "reward": 0.7807968258857727, "reward_std": 0.1836085319519043, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.7262164354324341, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8353773355484009, "step": 131 }, { "adv/mean_abs_final_conf": 0.532532811164856, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7684825658798218, "adv/std_final_conf": 0.7594923377037048, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9334070682525635, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7674441654552663, "calib/avg_num_step_conf": 4.6328125, "calib/ece": 0.21380392156862743, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6588235294117647, "calib/gap": 0.4566994846042024, "calib/mean_conf": 0.6786666666666666, "calib/mu_c": 0.8470186335403727, "calib/mu_w": 0.39031914893617026, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13054901960784313, "calib/std_conf": 0.438481992237241, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5852209469153515, "calib/step_q_c_n": 697.0, "calib/step_q_gap": 0.2458119489603413, "calib/step_q_w": 0.3394089979550102, "calib/step_q_w_n": 489.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 450.15625, "completions/mean_terminated_length": 451.9216003417969, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.1408, "grad_norm": 0.06343978643417358, "kl": 0.071075439453125, "learning_rate": 1.888888888888889e-06, "loss": -0.0479, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03952445834875107, "mask/share_reasoning": 0.8402537107467651, "mask/share_step_conf": 0.11631553620100021, "num_tokens": 31278758.0, "reward": 0.8163669109344482, "reward_std": 0.17662331461906433, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7783671617507935, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8543666005134583, "step": 132 }, { "adv/mean_abs_final_conf": 0.6812657117843628, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7408666610717773, "adv/std_final_conf": 0.8782057166099548, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352087378501892, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7184826950914593, "calib/avg_num_step_conf": 5.7421875, "calib/ece": 0.27525896414342643, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.38645418326693226, "calib/gap": 0.37108501118568243, "calib/mean_conf": 0.4380478087649402, "calib/mu_c": 0.6583333333333334, "calib/mu_w": 0.287248322147651, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1534661354581674, "calib/std_conf": 0.4592274103833413, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43027338129496406, "calib/step_q_c_n": 556.0, "calib/step_q_gap": 0.13507753884419815, "calib/step_q_w": 0.2951958424507659, "calib/step_q_w_n": 914.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3002.0, "completions/max_terminated_length": 3002.0, "completions/mean_length": 577.0546875, "completions/mean_terminated_length": 579.3176879882812, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.14186666666666667, "grad_norm": 0.043094903230667114, "kl": 0.0680389404296875, "learning_rate": 1.8611111111111113e-06, "loss": -0.0796, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03033336251974106, "mask/share_reasoning": 0.8618874549865723, "mask/share_step_conf": 0.10387295484542847, "num_tokens": 31532828.0, "reward": 0.7787151336669922, "reward_std": 0.19636546075344086, "rewards/accuracy_reward_step": 0.3984375, "rewards/final_brier_reward_step": 0.7078253626823425, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8496048450469971, "step": 133 }, { "adv/mean_abs_final_conf": 0.7835479378700256, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7636693716049194, "adv/std_final_conf": 0.915665328502655, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.93454509973526, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6835707502374169, "calib/avg_num_step_conf": 4.62890625, "calib/ece": 0.3326984126984126, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.40476190476190477, "calib/gap": 0.2778803418803419, "calib/mean_conf": 0.4380952380952381, "calib/mu_c": 0.5671111111111111, "calib/mu_w": 0.28923076923076924, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.1175396825396824, "calib/std_conf": 0.4661344146456114, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.43978489736070386, "calib/step_q_c_n": 682.0, "calib/step_q_gap": 0.14602744209231422, "calib/step_q_w": 0.29375745526838964, "calib/step_q_w_n": 503.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2680.0, "completions/max_terminated_length": 2680.0, "completions/mean_length": 550.03515625, "completions/mean_terminated_length": 550.03515625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.14293333333333333, "grad_norm": 0.08178877085447311, "kl": 0.08254241943359375, "learning_rate": 1.8333333333333333e-06, "loss": -0.0811, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03342124819755554, "mask/share_reasoning": 0.8704813718795776, "mask/share_step_conf": 0.09609738737344742, "num_tokens": 31782589.0, "reward": 0.718065619468689, "reward_std": 0.24075214564800262, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6291484236717224, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8069828152656555, "step": 134 }, { "adv/mean_abs_final_conf": 0.6751786470413208, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7397767305374146, "adv/std_final_conf": 0.8756873607635498, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341914057731628, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6747403748733536, "calib/avg_num_step_conf": 5.06640625, "calib/ece": 0.32762845849802363, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3952569169960474, "calib/gap": 0.3019674518743669, "calib/mean_conf": 0.4383794466403162, "calib/mu_c": 0.5720567375886526, "calib/mu_w": 0.2700892857142857, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10434782608695645, "calib/std_conf": 0.46559085489573315, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4196435185185185, "calib/step_q_c_n": 648.0, "calib/step_q_gap": 0.11991778662329505, "calib/step_q_w": 0.29972573189522345, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2447.0, "completions/max_terminated_length": 2447.0, "completions/mean_length": 507.140625, "completions/mean_terminated_length": 509.12945556640625, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.144, "grad_norm": 0.06607788056135178, "kl": 0.06987762451171875, "learning_rate": 1.8055555555555557e-06, "loss": -0.0134, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03679756075143814, "mask/share_reasoning": 0.8516077995300293, "mask/share_step_conf": 0.10768839716911316, "num_tokens": 32018297.0, "reward": 0.7513500452041626, "reward_std": 0.21713411808013916, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6563273072242737, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8463728427886963, "step": 135 }, { "adv/mean_abs_final_conf": 0.5889009237289429, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7550169229507446, "adv/std_final_conf": 0.8102096915245056, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9327640533447266, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7787312484108824, "calib/avg_num_step_conf": 5.25, "calib/ece": 0.23396825396825383, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.34523809523809523, "calib/gap": 0.450045766590389, "calib/mean_conf": 0.37047619047619046, "calib/mu_c": 0.6169298245614034, "calib/mu_w": 0.1668840579710145, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0760317460317459, "calib/std_conf": 0.45675156865067534, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4747941747572816, "calib/step_q_c_n": 515.0, "calib/step_q_gap": 0.22916329417827072, "calib/step_q_w": 0.24563088057901086, "calib/step_q_w_n": 829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 505.6171875, "completions/mean_terminated_length": 505.6171875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.14506666666666668, "grad_norm": 0.03182613477110863, "kl": 0.0801849365234375, "learning_rate": 1.777777777777778e-06, "loss": -0.0865, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03527706116437912, "mask/share_reasoning": 0.8453294634819031, "mask/share_step_conf": 0.1193934828042984, "num_tokens": 32256223.0, "reward": 0.8134219646453857, "reward_std": 0.16991086304187775, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7480453252792358, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8787985444068909, "step": 136 }, { "adv/mean_abs_final_conf": 0.6382880210876465, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7336064577102661, "adv/std_final_conf": 0.8308758735656738, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345904588699341, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7714179958274446, "calib/avg_num_step_conf": 6.16796875, "calib/ece": 0.2379098360655738, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.38934426229508196, "calib/gap": 0.46010969782623334, "calib/mean_conf": 0.43709016393442623, "calib/mu_c": 0.657716535433071, "calib/mu_w": 0.1976068376068376, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.07725409836065578, "calib/std_conf": 0.46236222589657855, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4219350710900474, "calib/step_q_c_n": 633.0, "calib/step_q_gap": 0.19407037764395862, "calib/step_q_w": 0.22786469344608878, "calib/step_q_w_n": 946.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2909.0, "completions/max_terminated_length": 2909.0, "completions/mean_length": 532.28515625, "completions/mean_terminated_length": 536.4763793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.14613333333333334, "grad_norm": 0.07119835913181305, "kl": 0.07416915893554688, "learning_rate": 1.75e-06, "loss": -0.0667, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03368588909506798, "mask/share_reasoning": 0.8412888050079346, "mask/share_step_conf": 0.11721283197402954, "num_tokens": 32499472.0, "reward": 0.780731737613678, "reward_std": 0.22005677223205566, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7236812114715576, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8377822637557983, "step": 137 }, { "adv/mean_abs_final_conf": 0.6097594499588013, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.768215537071228, "adv/std_final_conf": 0.8194006085395813, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343672394752502, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7697385881104034, "calib/avg_num_step_conf": 5.09765625, "calib/ece": 0.23857707509881432, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.44664031620553357, "calib/gap": 0.48071191613588116, "calib/mean_conf": 0.4977865612648221, "calib/mu_c": 0.6801910828025478, "calib/mu_w": 0.19947916666666665, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05790513833992102, "calib/std_conf": 0.4651250861308752, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4216188679245283, "calib/step_q_c_n": 795.0, "calib/step_q_gap": 0.13706984831668517, "calib/step_q_w": 0.2845490196078431, "calib/step_q_w_n": 510.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2764.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 480.7265625, "completions/mean_terminated_length": 480.7265625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1472, "grad_norm": 0.08591213822364807, "kl": 0.08078765869140625, "learning_rate": 1.7222222222222224e-06, "loss": -0.0342, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03658757731318474, "mask/share_reasoning": 0.847145676612854, "mask/share_step_conf": 0.11626674234867096, "num_tokens": 32726874.0, "reward": 0.8037103414535522, "reward_std": 0.18178507685661316, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7506031394004822, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8568174839019775, "step": 138 }, { "adv/mean_abs_final_conf": 0.6061331033706665, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7485789060592651, "adv/std_final_conf": 0.8287302255630493, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.933654248714447, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7235772357723577, "calib/avg_num_step_conf": 4.9453125, "calib/ece": 0.29862204724409447, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4409448818897638, "calib/gap": 0.3605257452574526, "calib/mean_conf": 0.5006692913385827, "calib/mu_c": 0.6284146341463415, "calib/mu_w": 0.26788888888888884, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07681102362204724, "calib/std_conf": 0.460937021593922, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4152287581699346, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.1379692771320104, "calib/step_q_w": 0.2772594810379242, "calib/step_q_w_n": 501.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1586.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 446.5078125, "completions/mean_terminated_length": 448.25885009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.14826666666666666, "grad_norm": 0.06311675906181335, "kl": 0.084686279296875, "learning_rate": 1.6944444444444446e-06, "loss": -0.0278, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03723848611116409, "mask/share_reasoning": 0.8437467813491821, "mask/share_step_conf": 0.11510850489139557, "num_tokens": 32944276.0, "reward": 0.7825126051902771, "reward_std": 0.15829968452453613, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6972042918205261, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8678209781646729, "step": 139 }, { "adv/mean_abs_final_conf": 0.5985676646232605, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7452235221862793, "adv/std_final_conf": 0.8259685039520264, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9336854219436646, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7688974991194083, "calib/avg_num_step_conf": 4.80078125, "calib/ece": 0.18932539682539679, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6587301587301587, "calib/gap": 0.4551680169073617, "calib/mean_conf": 0.6959920634920636, "calib/mu_c": 0.8495209580838322, "calib/mu_w": 0.3943529411764705, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11130952380952376, "calib/std_conf": 0.4273592406534957, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5014871794871795, "calib/step_q_c_n": 780.0, "calib/step_q_gap": 0.19293484095711266, "calib/step_q_w": 0.3085523385300668, "calib/step_q_w_n": 449.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2404.0, "completions/max_terminated_length": 2404.0, "completions/mean_length": 459.90625, "completions/mean_terminated_length": 461.7098388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.14933333333333335, "grad_norm": 0.09437122941017151, "kl": 0.15157318115234375, "learning_rate": 1.6666666666666667e-06, "loss": -0.0518, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.036479588598012924, "mask/share_reasoning": 0.8461323380470276, "mask/share_step_conf": 0.11348183453083038, "num_tokens": 33167028.0, "reward": 0.8225182890892029, "reward_std": 0.19686385989189148, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7759605646133423, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8690760731697083, "step": 140 }, { "adv/mean_abs_final_conf": 0.5633289813995361, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7426817417144775, "adv/std_final_conf": 0.8024340271949768, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9339341521263123, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8573802740469407, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.12928853754940706, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5849802371541502, "calib/gap": 0.6341113824447158, "calib/mean_conf": 0.6238339920948617, "calib/mu_c": 0.8519135802469135, "calib/mu_w": 0.2178021978021978, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.056403162055335895, "calib/std_conf": 0.45360732887048, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.47482968369829687, "calib/step_q_c_n": 822.0, "calib/step_q_gap": 0.212613997423787, "calib/step_q_w": 0.26221568627450986, "calib/step_q_w_n": 510.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 512.6796875, "completions/mean_terminated_length": 512.6796875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.1504, "grad_norm": 0.04484057426452637, "kl": 0.06557464599609375, "learning_rate": 1.638888888888889e-06, "loss": -0.0214, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03456936031579971, "mask/share_reasoning": 0.8581772446632385, "mask/share_step_conf": 0.10725339502096176, "num_tokens": 33405370.0, "reward": 0.8522858023643494, "reward_std": 0.16684626042842865, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.8418089747428894, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8627626299858093, "step": 141 }, { "adv/mean_abs_final_conf": 0.584743320941925, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7665185332298279, "adv/std_final_conf": 0.8165276050567627, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341987371444702, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7693518518518518, "calib/avg_num_step_conf": 5.62109375, "calib/ece": 0.22447058823529412, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5686274509803921, "calib/gap": 0.476101851851852, "calib/mean_conf": 0.6058039215686275, "calib/mu_c": 0.829851851851852, "calib/mu_w": 0.35374999999999995, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1504313725490196, "calib/std_conf": 0.46283698987905836, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5230730337078652, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.21269614237361484, "calib/step_q_w": 0.31037689133425034, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 508.7734375, "completions/mean_terminated_length": 510.7686462402344, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.15146666666666667, "grad_norm": 0.044705916196107864, "kl": 0.06641387939453125, "learning_rate": 1.6111111111111113e-06, "loss": 0.0199, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03387663513422012, "mask/share_reasoning": 0.8435350060462952, "mask/share_step_conf": 0.11868210136890411, "num_tokens": 33640776.0, "reward": 0.8184232711791992, "reward_std": 0.16215607523918152, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.765038251876831, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8718082904815674, "step": 142 }, { "adv/mean_abs_final_conf": 0.622256875038147, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7556319236755371, "adv/std_final_conf": 0.8251966834068298, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9331852793693542, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.790186673580503, "calib/avg_num_step_conf": 5.2109375, "calib/ece": 0.2248152610441768, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.606425702811245, "calib/gap": 0.41621947109152185, "calib/mean_conf": 0.6765421686746989, "calib/mu_c": 0.8704436090225565, "calib/mu_w": 0.4542241379310346, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.18361044176706837, "calib/std_conf": 0.4200678077182785, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4771777251184835, "calib/step_q_c_n": 633.0, "calib/step_q_gap": 0.16781181926969607, "calib/step_q_w": 0.3093659058487874, "calib/step_q_w_n": 701.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 507.43359375, "completions/mean_terminated_length": 511.42913818359375, "completions/min_length": 0.0, "completions/min_terminated_length": 67.0, "epoch": 0.15253333333333333, "grad_norm": 0.042031846940517426, "kl": 0.0727386474609375, "learning_rate": 1.5833333333333333e-06, "loss": -0.1356, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03369224816560745, "mask/share_reasoning": 0.8472211360931396, "mask/share_step_conf": 0.1112741082906723, "num_tokens": 33878015.0, "reward": 0.786507785320282, "reward_std": 0.19533541798591614, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7297797203063965, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.843235969543457, "step": 143 }, { "adv/mean_abs_final_conf": 0.5098259449005127, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7644120454788208, "adv/std_final_conf": 0.7566507458686829, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9335551261901855, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7450912129229912, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.2165217391304348, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6047430830039525, "calib/gap": 0.423139534883721, "calib/mean_conf": 0.6461660079051385, "calib/mu_c": 0.79, "calib/mu_w": 0.366860465116279, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10130434782608694, "calib/std_conf": 0.44501001910676913, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49962261904761907, "calib/step_q_c_n": 840.0, "calib/step_q_gap": 0.17035817460317465, "calib/step_q_w": 0.3292644444444444, "calib/step_q_w_n": 450.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2623.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 491.984375, "completions/mean_terminated_length": 491.984375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.1536, "grad_norm": 0.03865968808531761, "kl": 0.07563018798828125, "learning_rate": 1.5555555555555558e-06, "loss": -0.0603, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036693550646305084, "mask/share_reasoning": 0.8460485935211182, "mask/share_step_conf": 0.11725786328315735, "num_tokens": 34108091.0, "reward": 0.8114147186279297, "reward_std": 0.16153846681118011, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7582898139953613, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.864539623260498, "step": 144 }, { "adv/mean_abs_final_conf": 0.5739729404449463, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7518049478530884, "adv/std_final_conf": 0.808935821056366, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343997240066528, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6671747967479674, "calib/avg_num_step_conf": 5.796875, "calib/ece": 0.2733267716535432, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7440944881889764, "calib/gap": 0.21942615176151747, "calib/mean_conf": 0.809232283464567, "calib/mu_c": 0.886981707317073, "calib/mu_w": 0.6675555555555556, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21844488188976366, "calib/std_conf": 0.34307591556340367, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5095975473801562, "calib/step_q_c_n": 897.0, "calib/step_q_gap": 0.14406943835119534, "calib/step_q_w": 0.3655281090289608, "calib/step_q_w_n": 587.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2726.0, "completions/max_terminated_length": 2726.0, "completions/mean_length": 478.62890625, "completions/mean_terminated_length": 478.62890625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.15466666666666667, "grad_norm": 0.06480003148317337, "kl": 0.1052093505859375, "learning_rate": 1.527777777777778e-06, "loss": 0.0093, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.037643514573574066, "mask/share_reasoning": 0.8297535181045532, "mask/share_step_conf": 0.1326029896736145, "num_tokens": 34333324.0, "reward": 0.7896007299423218, "reward_std": 0.17306607961654663, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.721485435962677, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.857715904712677, "step": 145 }, { "adv/mean_abs_final_conf": 0.6691650152206421, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7208544015884399, "adv/std_final_conf": 0.8580728769302368, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354173541069031, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7247138047138046, "calib/avg_num_step_conf": 5.6328125, "calib/ece": 0.2909387755102042, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6775510204081633, "calib/gap": 0.4084545454545453, "calib/mean_conf": 0.7133877551020407, "calib/mu_c": 0.9384545454545453, "calib/mu_w": 0.53, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2776734693877552, "calib/std_conf": 0.4278884470459194, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5301604278074865, "calib/step_q_c_n": 561.0, "calib/step_q_gap": 0.18630798739886, "calib/step_q_w": 0.3438524404086265, "calib/step_q_w_n": 881.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 552.32421875, "completions/mean_terminated_length": 554.490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.15573333333333333, "grad_norm": 0.03820987418293953, "kl": 0.0627593994140625, "learning_rate": 1.5e-06, "loss": -0.0077, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03305232897400856, "mask/share_reasoning": 0.851200520992279, "mask/share_step_conf": 0.11184092611074448, "num_tokens": 34581935.0, "reward": 0.7427989840507507, "reward_std": 0.2363566756248474, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.667646050453186, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8179517984390259, "step": 146 }, { "adv/mean_abs_final_conf": 0.6376939415931702, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7715792655944824, "adv/std_final_conf": 0.8261988162994385, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9342957139015198, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.67941405237231, "calib/avg_num_step_conf": 5.41015625, "calib/ece": 0.3275100401606425, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6947791164658634, "calib/gap": 0.3267571947109149, "calib/mean_conf": 0.7224497991967871, "calib/mu_c": 0.8969827586206895, "calib/mu_w": 0.5702255639097746, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.29204819277108424, "calib/std_conf": 0.42023901031809546, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5681996726677578, "calib/step_q_c_n": 611.0, "calib/step_q_gap": 0.18967254088481206, "calib/step_q_w": 0.3785271317829458, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2450.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 506.42578125, "completions/mean_terminated_length": 510.41339111328125, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.1568, "grad_norm": 0.06945173442363739, "kl": 0.0741424560546875, "learning_rate": 1.4722222222222225e-06, "loss": -0.0975, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033999308943748474, "mask/share_reasoning": 0.8442288637161255, "mask/share_step_conf": 0.11395932734012604, "num_tokens": 34815260.0, "reward": 0.7255409955978394, "reward_std": 0.21569323539733887, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6502288579940796, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8008532524108887, "step": 147 }, { "adv/mean_abs_final_conf": 0.557876467704773, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7473621368408203, "adv/std_final_conf": 0.8042396306991577, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9342260360717773, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7970934256055363, "calib/avg_num_step_conf": 5.23828125, "calib/ece": 0.18360784313725492, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6627450980392157, "calib/gap": 0.4832352941176471, "calib/mean_conf": 0.7098039215686275, "calib/mu_c": 0.8708823529411766, "calib/mu_w": 0.38764705882352946, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11337254901960786, "calib/std_conf": 0.4221173993193727, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5242913385826772, "calib/step_q_c_n": 889.0, "calib/step_q_gap": 0.15986655982161524, "calib/step_q_w": 0.36442477876106194, "calib/step_q_w_n": 452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2764.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 481.53125, "completions/mean_terminated_length": 481.53125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.15786666666666666, "grad_norm": 0.03422103449702263, "kl": 0.07001495361328125, "learning_rate": 1.4444444444444445e-06, "loss": -0.022, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03782453387975693, "mask/share_reasoning": 0.8377889394760132, "mask/share_step_conf": 0.1243865117430687, "num_tokens": 35043644.0, "reward": 0.829301118850708, "reward_std": 0.18109026551246643, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.8058764934539795, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8527256846427917, "step": 148 }, { "adv/mean_abs_final_conf": 0.6228979825973511, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7397005558013916, "adv/std_final_conf": 0.8394556045532227, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934718668460846, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7786641689813318, "calib/avg_num_step_conf": 5.68359375, "calib/ece": 0.22928000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.704, "calib/gap": 0.4324539758413537, "calib/mean_conf": 0.7425600000000002, "calib/mu_c": 0.9380291970802919, "calib/mu_w": 0.5055752212389382, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21192000000000003, "calib/std_conf": 0.4013694637114288, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5624447949526814, "calib/step_q_c_n": 634.0, "calib/step_q_gap": 0.20254710920359492, "calib/step_q_w": 0.3598976857490865, "calib/step_q_w_n": 821.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3030.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 535.34765625, "completions/mean_terminated_length": 537.4470825195312, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.15893333333333334, "grad_norm": 0.08719879388809204, "kl": 0.06774139404296875, "learning_rate": 1.4166666666666667e-06, "loss": -0.0734, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0348748117685318, "mask/share_reasoning": 0.8489192724227905, "mask/share_step_conf": 0.11229971051216125, "num_tokens": 35285149.0, "reward": 0.7994047999382019, "reward_std": 0.21757547557353973, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7456910610198975, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8531185388565063, "step": 149 }, { "adv/mean_abs_final_conf": 0.6043723821640015, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.765477180480957, "adv/std_final_conf": 0.8277738094329834, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9337379336357117, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6841843501326259, "calib/avg_num_step_conf": 4.91015625, "calib/ece": 0.26562248995983945, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7309236947791165, "calib/gap": 0.31072015915119355, "calib/mean_conf": 0.7849799196787149, "calib/mu_c": 0.9147586206896551, "calib/mu_w": 0.6040384615384615, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.23413654618473906, "calib/std_conf": 0.37364494077326327, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5627369207772795, "calib/step_q_c_n": 669.0, "calib/step_q_gap": 0.1477709343827217, "calib/step_q_w": 0.41496598639455784, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2276.0, "completions/max_terminated_length": 2276.0, "completions/mean_length": 446.51171875, "completions/mean_terminated_length": 448.2627868652344, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.16, "grad_norm": 0.054330550134181976, "kl": 0.07338714599609375, "learning_rate": 1.3888888888888892e-06, "loss": -0.0385, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.04155741631984711, "mask/share_reasoning": 0.8253986835479736, "mask/share_step_conf": 0.12913760542869568, "num_tokens": 35504416.0, "reward": 0.772121012210846, "reward_std": 0.19345763325691223, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7073625326156616, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8368796110153198, "step": 150 }, { "adv/mean_abs_final_conf": 0.6646082401275635, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7533245086669922, "adv/std_final_conf": 0.8691293597221375, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349757432937622, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7634751773049645, "calib/avg_num_step_conf": 5.0625, "calib/ece": 0.24605577689243022, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5617529880478087, "calib/gap": 0.4519587362991618, "calib/mean_conf": 0.6250199203187251, "calib/mu_c": 0.8789090909090909, "calib/mu_w": 0.4269503546099291, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.216414342629482, "calib/std_conf": 0.4448226957776302, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4954858757062146, "calib/step_q_c_n": 531.0, "calib/step_q_gap": 0.1185368560983715, "calib/step_q_w": 0.3769490196078431, "calib/step_q_w_n": 765.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 494.80859375, "completions/mean_terminated_length": 496.7490539550781, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.16106666666666666, "grad_norm": 0.05440312251448631, "kl": 0.0706787109375, "learning_rate": 1.3611111111111112e-06, "loss": -0.041, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033775001764297485, "mask/share_reasoning": 0.8540380597114563, "mask/share_step_conf": 0.10828067362308502, "num_tokens": 35738111.0, "reward": 0.773845911026001, "reward_std": 0.2120620310306549, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.72899329662323, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.818698525428772, "step": 151 }, { "adv/mean_abs_final_conf": 0.7054992914199829, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7607833743095398, "adv/std_final_conf": 0.9004095196723938, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348529577255249, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7035671519505893, "calib/avg_num_step_conf": 5.15625, "calib/ece": 0.3051190476190476, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5277777777777778, "calib/gap": 0.301888825865003, "calib/mean_conf": 0.6210714285714285, "calib/mu_c": 0.7756097560975611, "calib/mu_w": 0.4737209302325581, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.219047619047619, "calib/std_conf": 0.4392129085363309, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4830955414012739, "calib/step_q_c_n": 628.0, "calib/step_q_gap": 0.12309120614115832, "calib/step_q_w": 0.3600043352601156, "calib/step_q_w_n": 692.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 488.7265625, "completions/mean_terminated_length": 488.7265625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.16213333333333332, "grad_norm": 0.04921410232782364, "kl": 0.074859619140625, "learning_rate": 1.3333333333333334e-06, "loss": -0.073, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03677456080913544, "mask/share_reasoning": 0.8412362933158875, "mask/share_step_conf": 0.1219891905784607, "num_tokens": 35968617.0, "reward": 0.7600628137588501, "reward_std": 0.1953212320804596, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.679622232913971, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.840503454208374, "step": 152 }, { "adv/mean_abs_final_conf": 0.7010814547538757, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7753474712371826, "adv/std_final_conf": 0.9040634632110596, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346261620521545, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6131807927811795, "calib/avg_num_step_conf": 4.796875, "calib/ece": 0.3587301587301588, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.44047619047619047, "calib/gap": 0.17447631324524665, "calib/mean_conf": 0.5193650793650795, "calib/mu_c": 0.593448275862069, "calib/mu_w": 0.4189719626168224, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15134920634920643, "calib/std_conf": 0.455563049548449, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.468414442700157, "calib/step_q_c_n": 637.0, "calib/step_q_gap": 0.13057180310624833, "calib/step_q_w": 0.33784263959390864, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2253.0, "completions/max_terminated_length": 2253.0, "completions/mean_length": 466.2109375, "completions/mean_terminated_length": 468.03924560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.1632, "grad_norm": 0.04134545847773552, "kl": 0.0821990966796875, "learning_rate": 1.3055555555555556e-06, "loss": -0.0493, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035583529621362686, "mask/share_reasoning": 0.8485206961631775, "mask/share_step_conf": 0.11198952794075012, "num_tokens": 36195287.0, "reward": 0.7215287089347839, "reward_std": 0.19878242909908295, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6165081858634949, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8265491724014282, "step": 153 }, { "adv/mean_abs_final_conf": 0.6727048754692078, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7404109239578247, "adv/std_final_conf": 0.8876039981842041, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341225028038025, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7406015037593985, "calib/avg_num_step_conf": 4.84765625, "calib/ece": 0.24614173228346453, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.452755905511811, "calib/gap": 0.4022581246504691, "calib/mean_conf": 0.5460629921259843, "calib/mu_c": 0.7566942148760331, "calib/mu_w": 0.35443609022556394, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15791338582677159, "calib/std_conf": 0.4529286284710704, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4864110535405871, "calib/step_q_c_n": 579.0, "calib/step_q_gap": 0.14947147650131215, "calib/step_q_w": 0.33693957703927496, "calib/step_q_w_n": 662.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 452.73046875, "completions/mean_terminated_length": 452.73046875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.16426666666666667, "grad_norm": 0.04027803987264633, "kl": 0.08528900146484375, "learning_rate": 1.2777777777777779e-06, "loss": -0.0591, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03721775487065315, "mask/share_reasoning": 0.845561146736145, "mask/share_step_conf": 0.11722112447023392, "num_tokens": 36415626.0, "reward": 0.7952397465705872, "reward_std": 0.1660701334476471, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7354468703269958, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8550325632095337, "step": 154 }, { "adv/mean_abs_final_conf": 0.7376800775527954, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7585830688476562, "adv/std_final_conf": 0.8894400000572205, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341614246368408, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7306756503836797, "calib/avg_num_step_conf": 4.8125, "calib/ece": 0.26125984251968504, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.3464566929133858, "calib/gap": 0.36348243808097813, "calib/mean_conf": 0.4148031496062992, "calib/mu_c": 0.6108547008547007, "calib/mu_w": 0.24737226277372262, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10771653543307089, "calib/std_conf": 0.4486152805147634, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4349628252788104, "calib/step_q_c_n": 538.0, "calib/step_q_gap": 0.11674956879466053, "calib/step_q_w": 0.3182132564841499, "calib/step_q_w_n": 694.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 420.0, "completions/mean_terminated_length": 421.6470947265625, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.16533333333333333, "grad_norm": 0.059748951345682144, "kl": 0.0921630859375, "learning_rate": 1.25e-06, "loss": -0.0633, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03900737315416336, "mask/share_reasoning": 0.8342753052711487, "mask/share_step_conf": 0.12281106412410736, "num_tokens": 36630362.0, "reward": 0.7750540971755981, "reward_std": 0.19510385394096375, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.7151566743850708, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8349515199661255, "step": 155 }, { "adv/mean_abs_final_conf": 0.6774182319641113, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7479729056358337, "adv/std_final_conf": 0.8583827018737793, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9322972297668457, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6803336092188197, "calib/avg_num_step_conf": 5.5703125, "calib/ece": 0.3261904761904762, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.45634920634920634, "calib/gap": 0.29275864264340745, "calib/mean_conf": 0.5063492063492064, "calib/mu_c": 0.6376258992805756, "calib/mu_w": 0.3448672566371681, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14047619047619042, "calib/std_conf": 0.4690070618729229, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3990714477211797, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.08732880066235615, "calib/step_q_w": 0.31174264705882354, "calib/step_q_w_n": 680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 501.6015625, "completions/mean_terminated_length": 501.6015625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.1664, "grad_norm": 0.0492730550467968, "kl": 0.08318328857421875, "learning_rate": 1.2222222222222223e-06, "loss": 0.0033, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03676208108663559, "mask/share_reasoning": 0.8379840850830078, "mask/share_step_conf": 0.1252538412809372, "num_tokens": 36863532.0, "reward": 0.7542530298233032, "reward_std": 0.18153919279575348, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6649140119552612, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8435919284820557, "step": 156 }, { "adv/mean_abs_final_conf": 0.5950201153755188, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7625734210014343, "adv/std_final_conf": 0.8319129943847656, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9340895414352417, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8078882868937047, "calib/avg_num_step_conf": 5.6015625, "calib/ece": 0.20484251968503933, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4645669291338583, "calib/gap": 0.5139125386996902, "calib/mean_conf": 0.532244094488189, "calib/mu_c": 0.7386184210526314, "calib/mu_w": 0.22470588235294117, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06933070866141736, "calib/std_conf": 0.46117711794783167, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4866437571592211, "calib/step_q_c_n": 873.0, "calib/step_q_gap": 0.15837281241768814, "calib/step_q_w": 0.32827094474153296, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 460.09765625, "completions/mean_terminated_length": 461.9019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.16746666666666668, "grad_norm": 0.03275960311293602, "kl": 0.08441925048828125, "learning_rate": 1.1944444444444446e-06, "loss": -0.0446, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03766844421625137, "mask/share_reasoning": 0.8255374431610107, "mask/share_step_conf": 0.1328878104686737, "num_tokens": 37085045.0, "reward": 0.8159181475639343, "reward_std": 0.1707736998796463, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7823695540428162, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8494666814804077, "step": 157 }, { "adv/mean_abs_final_conf": 0.6904779672622681, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.754570484161377, "adv/std_final_conf": 0.8861827254295349, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348433017730713, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6587600139324278, "calib/avg_num_step_conf": 5.50390625, "calib/ece": 0.2991269841269841, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.42063492063492064, "calib/gap": 0.2818557993730407, "calib/mean_conf": 0.5152380952380952, "calib/mu_c": 0.6125454545454545, "calib/mu_w": 0.3306896551724138, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.07980158730158732, "calib/std_conf": 0.44961220377882105, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4330023923444976, "calib/step_q_c_n": 836.0, "calib/step_q_gap": 0.10898843073891301, "calib/step_q_w": 0.3240139616055846, "calib/step_q_w_n": 573.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 481.2265625, "completions/mean_terminated_length": 481.2265625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.16853333333333334, "grad_norm": 0.06318576633930206, "kl": 0.0872955322265625, "learning_rate": 1.1666666666666668e-06, "loss": -0.0824, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0401587039232254, "mask/share_reasoning": 0.8288679122924805, "mask/share_step_conf": 0.13097335398197174, "num_tokens": 37313479.0, "reward": 0.7451825141906738, "reward_std": 0.2053423523902893, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6652324199676514, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8251326084136963, "step": 158 }, { "adv/mean_abs_final_conf": 0.630803108215332, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7304748296737671, "adv/std_final_conf": 0.8433563113212585, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9339118599891663, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6708116883116884, "calib/avg_num_step_conf": 4.97265625, "calib/ece": 0.33316, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.444, "calib/gap": 0.26135064935064933, "calib/mean_conf": 0.5157200000000001, "calib/mu_c": 0.6307142857142857, "calib/mu_w": 0.36936363636363634, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14444000000000004, "calib/std_conf": 0.4637211248153355, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47020600858369094, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.13975304691121704, "calib/step_q_w": 0.3304529616724739, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2884.0, "completions/max_terminated_length": 2884.0, "completions/mean_length": 454.875, "completions/mean_terminated_length": 458.4566955566406, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.1696, "grad_norm": 0.06016838923096657, "kl": 0.08831024169921875, "learning_rate": 1.138888888888889e-06, "loss": -0.0159, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.038090698421001434, "mask/share_reasoning": 0.8347824215888977, "mask/share_step_conf": 0.11931438744068146, "num_tokens": 37534711.0, "reward": 0.7516264915466309, "reward_std": 0.19282618165016174, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6458941698074341, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8573589324951172, "step": 159 }, { "adv/mean_abs_final_conf": 0.6906576156616211, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7718011736869812, "adv/std_final_conf": 0.8629789352416992, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352890849113464, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7526358475263585, "calib/avg_num_step_conf": 5.29296875, "calib/ece": 0.26812244897959187, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.3795918367346939, "calib/gap": 0.40404028115706936, "calib/mean_conf": 0.44167346938775515, "calib/mu_c": 0.6197810218978101, "calib/mu_w": 0.21574074074074076, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.07530612244897958, "calib/std_conf": 0.4569946807958693, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.44134020618556696, "calib/step_q_c_n": 679.0, "calib/step_q_gap": 0.1309851765997681, "calib/step_q_w": 0.31035502958579886, "calib/step_q_w_n": 676.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 513.6171875, "completions/mean_terminated_length": 515.6314086914062, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.17066666666666666, "grad_norm": 0.03252209350466728, "kl": 0.085205078125, "learning_rate": 1.111111111111111e-06, "loss": -0.0741, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.034298818558454514, "mask/share_reasoning": 0.8432285785675049, "mask/share_step_conf": 0.1185663640499115, "num_tokens": 37771037.0, "reward": 0.7507309913635254, "reward_std": 0.23102092742919922, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6908586025238037, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8106033802032471, "step": 160 }, { "adv/mean_abs_final_conf": 0.5563451051712036, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7617762088775635, "adv/std_final_conf": 0.7966976165771484, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341981410980225, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7169324577861163, "calib/avg_num_step_conf": 5.26953125, "calib/ece": 0.2749411764705882, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.42745098039215684, "calib/gap": 0.3759762798177432, "calib/mean_conf": 0.49125490196078425, "calib/mu_c": 0.6254268292682926, "calib/mu_w": 0.24945054945054945, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.06152941176470587, "calib/std_conf": 0.46211703140142424, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.43188876013904987, "calib/step_q_c_n": 863.0, "calib/step_q_gap": 0.11476941857526385, "calib/step_q_w": 0.317119341563786, "calib/step_q_w_n": 486.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2411.0, "completions/max_terminated_length": 2411.0, "completions/mean_length": 442.19140625, "completions/mean_terminated_length": 442.19140625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.17173333333333332, "grad_norm": 0.05513952672481537, "kl": 0.09521484375, "learning_rate": 1.0833333333333335e-06, "loss": -0.0572, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03878117725253105, "mask/share_reasoning": 0.8327029943466187, "mask/share_step_conf": 0.12851576507091522, "num_tokens": 37988158.0, "reward": 0.7575365900993347, "reward_std": 0.14906707406044006, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6898187398910522, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8252544403076172, "step": 161 }, { "adv/mean_abs_final_conf": 0.6162930727005005, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7637404203414917, "adv/std_final_conf": 0.8308175802230835, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9338586926460266, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7578620464203989, "calib/avg_num_step_conf": 5.00390625, "calib/ece": 0.2686328125000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.51171875, "calib/gap": 0.3779320039228506, "calib/mean_conf": 0.5837890625000001, "calib/mu_c": 0.7240372670807453, "calib/mu_w": 0.34610526315789475, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11175781250000004, "calib/std_conf": 0.45333410954876435, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4803115577889447, "calib/step_q_c_n": 796.0, "calib/step_q_gap": 0.12179609387141888, "calib/step_q_w": 0.3585154639175258, "calib/step_q_w_n": 485.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 419.67578125, "completions/mean_terminated_length": 421.32159423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.1728, "grad_norm": 0.06877302378416061, "kl": 0.0907135009765625, "learning_rate": 1.0555555555555557e-06, "loss": 0.0355, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03779768943786621, "mask/share_reasoning": 0.8321254253387451, "mask/share_step_conf": 0.12617066502571106, "num_tokens": 38199739.0, "reward": 0.7955017685890198, "reward_std": 0.1514347791671753, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7354754209518433, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8555281162261963, "step": 162 }, { "adv/mean_abs_final_conf": 0.6081020832061768, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7466784119606018, "adv/std_final_conf": 0.8114362359046936, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9338409900665283, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7797653768876791, "calib/avg_num_step_conf": 5.12890625, "calib/ece": 0.19672000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.444, "calib/gap": 0.4820027221466071, "calib/mean_conf": 0.51448, "calib/mu_c": 0.7284892086330936, "calib/mu_w": 0.24648648648648652, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07760000000000006, "calib/std_conf": 0.4561420059586707, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5087028657616892, "calib/step_q_c_n": 663.0, "calib/step_q_gap": 0.1811028657616892, "calib/step_q_w": 0.3276, "calib/step_q_w_n": 650.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2439.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 492.984375, "completions/mean_terminated_length": 496.86614990234375, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.17386666666666667, "grad_norm": 0.059631235897541046, "kl": 0.0717620849609375, "learning_rate": 1.0277777777777777e-06, "loss": -0.0608, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.036981552839279175, "mask/share_reasoning": 0.8366219401359558, "mask/share_step_conf": 0.11858400702476501, "num_tokens": 38430775.0, "reward": 0.8108958601951599, "reward_std": 0.185151606798172, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7628577947616577, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8589339256286621, "step": 163 }, { "adv/mean_abs_final_conf": 0.6528012156486511, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7602795362472534, "adv/std_final_conf": 0.8653119206428528, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9333934783935547, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7601586687306501, "calib/avg_num_step_conf": 5.27734375, "calib/ece": 0.21524000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.484, "calib/gap": 0.42793343653250765, "calib/mean_conf": 0.58148, "calib/mu_c": 0.7766176470588235, "calib/mu_w": 0.3486842105263159, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12636000000000003, "calib/std_conf": 0.4410839031295519, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4856828885400314, "calib/step_q_c_n": 637.0, "calib/step_q_gap": 0.1675082386800874, "calib/step_q_w": 0.318174649859944, "calib/step_q_w_n": 714.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 508.0390625, "completions/mean_terminated_length": 510.0314025878906, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.17493333333333333, "grad_norm": 0.0942251980304718, "kl": 0.1238250732421875, "learning_rate": 1.0000000000000002e-06, "loss": 0.0056, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.034159790724515915, "mask/share_reasoning": 0.8542314171791077, "mask/share_step_conf": 0.10770251601934433, "num_tokens": 38666969.0, "reward": 0.8032075762748718, "reward_std": 0.16987591981887817, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7502793073654175, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.856135904788971, "step": 164 }, { "adv/mean_abs_final_conf": 0.5365115404129028, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.738160252571106, "adv/std_final_conf": 0.7709892988204956, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9342562556266785, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7521900800403354, "calib/avg_num_step_conf": 5.328125, "calib/ece": 0.24726190476190468, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.49206349206349204, "calib/gap": 0.4190754395916051, "calib/mean_conf": 0.5639285714285714, "calib/mu_c": 0.7784552845528454, "calib/mu_w": 0.3593798449612403, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.161547619047619, "calib/std_conf": 0.45445855938638285, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.46590625, "calib/step_q_c_n": 640.0, "calib/step_q_gap": 0.12072669198895025, "calib/step_q_w": 0.34517955801104977, "calib/step_q_w_n": 724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2536.0, "completions/max_terminated_length": 2536.0, "completions/mean_length": 512.60546875, "completions/mean_terminated_length": 512.60546875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.176, "grad_norm": 0.03339356184005737, "kl": 0.076690673828125, "learning_rate": 9.722222222222224e-07, "loss": -0.0881, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03438538312911987, "mask/share_reasoning": 0.8467801809310913, "mask/share_step_conf": 0.11883437633514404, "num_tokens": 38903772.0, "reward": 0.7943029999732971, "reward_std": 0.1667207032442093, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7338140606880188, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8547918796539307, "step": 165 }, { "adv/mean_abs_final_conf": 0.5652823448181152, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7558987140655518, "adv/std_final_conf": 0.8116322159767151, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9338290095329285, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8307422969187674, "calib/avg_num_step_conf": 5.6875, "calib/ece": 0.1436614173228345, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5787401574803149, "calib/gap": 0.5618151260504202, "calib/mean_conf": 0.646732283464567, "calib/mu_c": 0.8325294117647059, "calib/mu_w": 0.2707142857142857, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.06055118110236205, "calib/std_conf": 0.4369948728292332, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4977944382647386, "calib/step_q_c_n": 899.0, "calib/step_q_gap": 0.1694156231839487, "calib/step_q_w": 0.3283788150807899, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 491.43359375, "completions/mean_terminated_length": 493.3608093261719, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.17706666666666668, "grad_norm": 0.03352775797247887, "kl": 0.07314300537109375, "learning_rate": 9.444444444444445e-07, "loss": -0.0695, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03505018725991249, "mask/share_reasoning": 0.8370988368988037, "mask/share_step_conf": 0.1239447072148323, "num_tokens": 39135763.0, "reward": 0.8427225351333618, "reward_std": 0.1596928834915161, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.8177686929702759, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8676763772964478, "step": 166 }, { "adv/mean_abs_final_conf": 0.5534255504608154, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.745092511177063, "adv/std_final_conf": 0.795141875743866, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934163510799408, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6478658536585366, "calib/avg_num_step_conf": 5.109375, "calib/ece": 0.23289682539682524, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7023809523809523, "calib/gap": 0.2849002217294899, "calib/mean_conf": 0.7765476190476192, "calib/mu_c": 0.8760365853658536, "calib/mu_w": 0.5911363636363637, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1793253968253967, "calib/std_conf": 0.3697108288091863, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5175310734463276, "calib/step_q_c_n": 885.0, "calib/step_q_gap": 0.022448331129542753, "calib/step_q_w": 0.4950827423167849, "calib/step_q_w_n": 423.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1950.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 451.62109375, "completions/mean_terminated_length": 451.62109375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.17813333333333334, "grad_norm": 0.030953530222177505, "kl": 0.078460693359375, "learning_rate": 9.166666666666666e-07, "loss": -0.022, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.037154220044612885, "mask/share_reasoning": 0.8443901538848877, "mask/share_step_conf": 0.11845562607049942, "num_tokens": 39356986.0, "reward": 0.7836979627609253, "reward_std": 0.15814566612243652, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.734974205493927, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8324216604232788, "step": 167 }, { "adv/mean_abs_final_conf": 0.6224509477615356, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7292006611824036, "adv/std_final_conf": 0.8391119837760925, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9339708089828491, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7255116959064327, "calib/avg_num_step_conf": 5.51953125, "calib/ece": 0.23704860557768925, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6533864541832669, "calib/gap": 0.34066127060074425, "calib/mean_conf": 0.7330645418326693, "calib/mu_c": 0.867428947368421, "calib/mu_w": 0.5267676767676768, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18226772908366537, "calib/std_conf": 0.39824627205395124, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5217144754316069, "calib/step_q_c_n": 753.0, "calib/step_q_gap": 0.20040992997706147, "calib/step_q_w": 0.3213045454545454, "calib/step_q_w_n": 660.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 533.8125, "completions/mean_terminated_length": 533.8125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.1792, "grad_norm": 0.06673835963010788, "kl": 0.09064483642578125, "learning_rate": 8.88888888888889e-07, "loss": 0.0176, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03487485274672508, "mask/share_reasoning": 0.8501797914505005, "mask/share_step_conf": 0.11494536697864532, "num_tokens": 39598314.0, "reward": 0.79753577709198, "reward_std": 0.1876133680343628, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7344002723693848, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8606711626052856, "step": 168 }, { "adv/mean_abs_final_conf": 0.5384137630462646, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7396717071533203, "adv/std_final_conf": 0.7766261696815491, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9322170615196228, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7509878819810327, "calib/avg_num_step_conf": 4.6328125, "calib/ece": 0.2440399999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.656, "calib/gap": 0.3588935721812435, "calib/mean_conf": 0.7184400000000001, "calib/mu_c": 0.8677397260273974, "calib/mu_w": 0.5088461538461538, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.18923999999999988, "calib/std_conf": 0.4037984229785946, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6070416024653313, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.20452763598488438, "calib/step_q_w": 0.40251396648044696, "calib/step_q_w_n": 537.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 472.47265625, "completions/mean_terminated_length": 476.1929016113281, "completions/min_length": 0.0, "completions/min_terminated_length": 36.0, "epoch": 0.18026666666666666, "grad_norm": 0.055275809019804, "kl": 0.080841064453125, "learning_rate": 8.611111111111112e-07, "loss": -0.1261, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035248324275016785, "mask/share_reasoning": 0.8490923643112183, "mask/share_step_conf": 0.10784684866666794, "num_tokens": 39823451.0, "reward": 0.7923622131347656, "reward_std": 0.18291811645030975, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7293832302093506, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8553411960601807, "step": 169 }, { "adv/mean_abs_final_conf": 0.6205732822418213, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7723401784896851, "adv/std_final_conf": 0.844386637210846, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9344258308410645, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7086206896551723, "calib/avg_num_step_conf": 5.56640625, "calib/ece": 0.272730923694779, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7068273092369478, "calib/gap": 0.31264389920424396, "calib/mean_conf": 0.761004016064257, "calib/mu_c": 0.8915862068965518, "calib/mu_w": 0.5789423076923078, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.22570281124497985, "calib/std_conf": 0.39005534050769003, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5763198959687906, "calib/step_q_c_n": 769.0, "calib/step_q_gap": 0.20108818865171746, "calib/step_q_w": 0.37523170731707317, "calib/step_q_w_n": 656.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2945.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 508.46875, "completions/mean_terminated_length": 510.4627685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.18133333333333335, "grad_norm": 0.042287420481443405, "kl": 0.0757293701171875, "learning_rate": 8.333333333333333e-07, "loss": -0.0018, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03357735648751259, "mask/share_reasoning": 0.8427532911300659, "mask/share_step_conf": 0.11976308375597, "num_tokens": 40057771.0, "reward": 0.770465612411499, "reward_std": 0.2155264914035797, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6976511478424072, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8432800769805908, "step": 170 }, { "adv/mean_abs_final_conf": 0.665807843208313, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7515328526496887, "adv/std_final_conf": 0.8829014301300049, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9342430233955383, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.711092509920635, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.3056299212598425, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.594488188976378, "calib/gap": 0.29211929563492056, "calib/mean_conf": 0.6565748031496065, "calib/mu_c": 0.801484375, "calib/mu_w": 0.5093650793650795, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22913385826771654, "calib/std_conf": 0.42909224857868916, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.54494227994228, "calib/step_q_c_n": 693.0, "calib/step_q_gap": 0.14006921492680008, "calib/step_q_w": 0.40487306501547987, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2779.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 477.64453125, "completions/mean_terminated_length": 477.64453125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.1824, "grad_norm": 0.0317617803812027, "kl": 0.06999969482421875, "learning_rate": 8.055555555555557e-07, "loss": -0.0782, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03570724278688431, "mask/share_reasoning": 0.8483747243881226, "mask/share_step_conf": 0.11591806262731552, "num_tokens": 40286944.0, "reward": 0.7655474543571472, "reward_std": 0.19288235902786255, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6793617010116577, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8517332077026367, "step": 171 }, { "adv/mean_abs_final_conf": 0.6147419214248657, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7623175382614136, "adv/std_final_conf": 0.8438313603401184, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9339761137962341, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7227416570771, "calib/avg_num_step_conf": 5.1171875, "calib/ece": 0.23525609756097562, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7804878048780488, "calib/gap": 0.27829789988492515, "calib/mean_conf": 0.8279146341463414, "calib/mu_c": 0.9274683544303797, "calib/mu_w": 0.6491704545454545, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21044715447154475, "calib/std_conf": 0.3294144793671585, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5150586701434159, "calib/step_q_c_n": 767.0, "calib/step_q_gap": 0.10485240863328693, "calib/step_q_w": 0.41020626151012896, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2678.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 486.46875, "completions/mean_terminated_length": 486.46875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.18346666666666667, "grad_norm": 0.0316920168697834, "kl": 0.07049560546875, "learning_rate": 7.777777777777779e-07, "loss": -0.0391, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03700914978981018, "mask/share_reasoning": 0.8399681448936462, "mask/share_step_conf": 0.12302268296480179, "num_tokens": 40514832.0, "reward": 0.7855240702629089, "reward_std": 0.20845410227775574, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7256511449813843, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8453969955444336, "step": 172 }, { "adv/mean_abs_final_conf": 0.5969452261924744, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7282835841178894, "adv/std_final_conf": 0.8132307529449463, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352379441261292, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6944219066937118, "calib/avg_num_step_conf": 5.34765625, "calib/ece": 0.3042914979757085, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.805668016194332, "calib/gap": 0.2197221095334685, "calib/mean_conf": 0.854574898785425, "calib/mu_c": 0.9453103448275861, "calib/mu_w": 0.7255882352941176, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.28591093117408906, "calib/std_conf": 0.3066399487270928, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5201515892420537, "calib/step_q_c_n": 818.0, "calib/step_q_gap": 0.01553452935094668, "calib/step_q_w": 0.5046170598911071, "calib/step_q_w_n": 551.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 489.921875, "completions/mean_terminated_length": 495.7312316894531, "completions/min_length": 0.0, "completions/min_terminated_length": 73.0, "epoch": 0.18453333333333333, "grad_norm": 0.03221871331334114, "kl": 0.07108306884765625, "learning_rate": 7.5e-07, "loss": -0.0237, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.036594174802303314, "mask/share_reasoning": 0.8292064070701599, "mask/share_step_conf": 0.12248068302869797, "num_tokens": 40743412.0, "reward": 0.7309811115264893, "reward_std": 0.20213276147842407, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.670396089553833, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7915661334991455, "step": 173 }, { "adv/mean_abs_final_conf": 0.6975492238998413, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7517229318618774, "adv/std_final_conf": 0.8743834495544434, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348888993263245, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6683354591836735, "calib/avg_num_step_conf": 5.31640625, "calib/ece": 0.37138888888888894, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5873015873015873, "calib/gap": 0.21378571428571425, "calib/mean_conf": 0.7062301587301588, "calib/mu_c": 0.8250000000000001, "calib/mu_w": 0.6112142857142858, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.31658730158730164, "calib/std_conf": 0.4008845311862497, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5150877192982457, "calib/step_q_c_n": 570.0, "calib/step_q_gap": 0.08085257391265782, "calib/step_q_w": 0.4342351453855879, "calib/step_q_w_n": 791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2395.0, "completions/max_terminated_length": 2395.0, "completions/mean_length": 517.17578125, "completions/mean_terminated_length": 517.17578125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.1856, "grad_norm": 0.0422983393073082, "kl": 0.06575775146484375, "learning_rate": 7.222222222222222e-07, "loss": -0.0301, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033281851559877396, "mask/share_reasoning": 0.8519580364227295, "mask/share_step_conf": 0.1147601306438446, "num_tokens": 40980041.0, "reward": 0.7034145593643188, "reward_std": 0.20858539640903473, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6095452904701233, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.797283947467804, "step": 174 }, { "adv/mean_abs_final_conf": 0.625144362449646, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7550072073936462, "adv/std_final_conf": 0.8507199883460999, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350666999816895, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7613138686131387, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.24036437246963566, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5182186234817814, "calib/gap": 0.41225414731254134, "calib/mean_conf": 0.6087044534412956, "calib/mu_c": 0.8373636363636363, "calib/mu_w": 0.42510948905109497, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2018623481781377, "calib/std_conf": 0.43674442180772827, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5487040280210157, "calib/step_q_c_n": 571.0, "calib/step_q_gap": 0.1899890692459863, "calib/step_q_w": 0.3587149587750294, "calib/step_q_w_n": 849.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2944.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 537.10546875, "completions/mean_terminated_length": 537.10546875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.18666666666666668, "grad_norm": 0.02944857068359852, "kl": 0.0681304931640625, "learning_rate": 6.944444444444446e-07, "loss": -0.063, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033590368926525116, "mask/share_reasoning": 0.8462784290313721, "mask/share_step_conf": 0.12013113498687744, "num_tokens": 41223364.0, "reward": 0.7608479857444763, "reward_std": 0.20738910138607025, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.7097246050834656, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8119714260101318, "step": 175 }, { "adv/mean_abs_final_conf": 0.6091908812522888, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7560907006263733, "adv/std_final_conf": 0.8196773529052734, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.933989405632019, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7352638352638352, "calib/avg_num_step_conf": 5.2578125, "calib/ece": 0.23609163346613551, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6175298804780877, "calib/gap": 0.37361383526383535, "calib/mean_conf": 0.6888406374501992, "calib/mu_c": 0.8540642857142858, "calib/mu_w": 0.4804504504504505, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18358167330677297, "calib/std_conf": 0.4162838042481054, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.533147102526003, "calib/step_q_c_n": 673.0, "calib/step_q_gap": 0.1934888558692422, "calib/step_q_w": 0.33965824665676075, "calib/step_q_w_n": 673.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 487.98046875, "completions/mean_terminated_length": 487.98046875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.18773333333333334, "grad_norm": 0.02829635515809059, "kl": 0.07366561889648438, "learning_rate": 6.666666666666667e-07, "loss": -0.0086, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03810054063796997, "mask/share_reasoning": 0.837080180644989, "mask/share_step_conf": 0.12481928616762161, "num_tokens": 41452351.0, "reward": 0.7918046712875366, "reward_std": 0.19538137316703796, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7325851321220398, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8510242700576782, "step": 176 }, { "adv/mean_abs_final_conf": 0.6599198579788208, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7703679800033569, "adv/std_final_conf": 0.8570880889892578, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9340048432350159, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7609843546284225, "calib/avg_num_step_conf": 5.515625, "calib/ece": 0.251366935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5725806451612904, "calib/gap": 0.37065202086049526, "calib/mean_conf": 0.6704798387096775, "calib/mu_c": 0.8468384615384615, "calib/mu_w": 0.4761864406779663, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19882661290322587, "calib/std_conf": 0.41608720733717763, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5185956112852664, "calib/step_q_c_n": 638.0, "calib/step_q_gap": 0.1675077559881088, "calib/step_q_w": 0.35108785529715764, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2388.0, "completions/max_terminated_length": 2388.0, "completions/mean_length": 501.27734375, "completions/mean_terminated_length": 507.22137451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.1888, "grad_norm": 0.06848868727684021, "kl": 0.1034698486328125, "learning_rate": 6.388888888888889e-07, "loss": -0.1139, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03480444848537445, "mask/share_reasoning": 0.8327597379684448, "mask/share_step_conf": 0.12071707844734192, "num_tokens": 41684510.0, "reward": 0.76722252368927, "reward_std": 0.18885210156440735, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7177945375442505, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8166505694389343, "step": 177 }, { "adv/mean_abs_final_conf": 0.6745896339416504, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7268320322036743, "adv/std_final_conf": 0.8851636648178101, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9334161877632141, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7961158613375755, "calib/avg_num_step_conf": 4.91796875, "calib/ece": 0.1793951612903225, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6290322580645161, "calib/gap": 0.4568800954464108, "calib/mean_conf": 0.7118145161290322, "calib/mu_c": 0.908936170212766, "calib/mu_w": 0.4520560747663552, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.16133064516129025, "calib/std_conf": 0.3976578783029185, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5568945686900958, "calib/step_q_c_n": 626.0, "calib/step_q_gap": 0.2220762432556566, "calib/step_q_w": 0.3348183254344392, "calib/step_q_w_n": 633.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 471.46875, "completions/mean_terminated_length": 473.31768798828125, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.18986666666666666, "grad_norm": 0.04752740263938904, "kl": 0.0688934326171875, "learning_rate": 6.111111111111112e-07, "loss": -0.1167, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.037102676928043365, "mask/share_reasoning": 0.8386009335517883, "mask/share_step_conf": 0.12039016932249069, "num_tokens": 41911278.0, "reward": 0.8107980489730835, "reward_std": 0.19798102974891663, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7743703126907349, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8472259044647217, "step": 178 }, { "adv/mean_abs_final_conf": 0.7388206124305725, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7421302795410156, "adv/std_final_conf": 0.9340841770172119, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341541528701782, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7627878074306645, "calib/avg_num_step_conf": 5.54296875, "calib/ece": 0.19992031872509955, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5418326693227091, "calib/gap": 0.3899116954474098, "calib/mean_conf": 0.6619123505976097, "calib/mu_c": 0.8234693877551021, "calib/mu_w": 0.43355769230769237, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.13808764940239038, "calib/std_conf": 0.4118904143496424, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5138596491228069, "calib/step_q_c_n": 798.0, "calib/step_q_gap": 0.147195719976269, "calib/step_q_w": 0.3666639291465379, "calib/step_q_w_n": 621.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2301.0, "completions/max_terminated_length": 2301.0, "completions/mean_length": 473.6875, "completions/mean_terminated_length": 477.4173278808594, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.19093333333333334, "grad_norm": 0.04047829657793045, "kl": 0.07904815673828125, "learning_rate": 5.833333333333334e-07, "loss": -0.0288, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034941166639328, "mask/share_reasoning": 0.8364151120185852, "mask/share_step_conf": 0.12083126604557037, "num_tokens": 42138806.0, "reward": 0.8064365386962891, "reward_std": 0.19271725416183472, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7548269033432007, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8580461144447327, "step": 179 }, { "adv/mean_abs_final_conf": 0.6840267181396484, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.768051028251648, "adv/std_final_conf": 0.8719254732131958, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345057606697083, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7068607068607068, "calib/avg_num_step_conf": 5.44921875, "calib/ece": 0.2423412698412697, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5952380952380952, "calib/gap": 0.33203482328482326, "calib/mean_conf": 0.6820238095238096, "calib/mu_c": 0.8190540540540541, "calib/mu_w": 0.4870192307692308, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1685317460317459, "calib/std_conf": 0.4115373223500814, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.479326431181486, "calib/step_q_c_n": 821.0, "calib/step_q_gap": 0.06518008971807138, "calib/step_q_w": 0.4141463414634146, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 536.5859375, "completions/mean_terminated_length": 538.6902465820312, "completions/min_length": 0.0, "completions/min_terminated_length": 42.0, "epoch": 0.192, "grad_norm": 0.057193268090486526, "kl": 0.08069610595703125, "learning_rate": 5.555555555555555e-07, "loss": -0.0664, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032121311873197556, "mask/share_reasoning": 0.8479681015014648, "mask/share_step_conf": 0.11600431054830551, "num_tokens": 42380028.0, "reward": 0.7702518701553345, "reward_std": 0.20088596642017365, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7169581651687622, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.823545515537262, "step": 180 }, { "adv/mean_abs_final_conf": 0.7163760662078857, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7323043942451477, "adv/std_final_conf": 0.9061623215675354, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349632859230042, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7605711683580536, "calib/avg_num_step_conf": 5.12109375, "calib/ece": 0.24403225806451612, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5604838709677419, "calib/gap": 0.36672911787665885, "calib/mean_conf": 0.6743548387096775, "calib/mu_c": 0.8547619047619047, "calib/mu_w": 0.4880327868852459, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20516129032258063, "calib/std_conf": 0.41098375671434817, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5386635944700461, "calib/step_q_c_n": 651.0, "calib/step_q_gap": 0.14001010962156119, "calib/step_q_w": 0.3986534848484849, "calib/step_q_w_n": 660.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2974.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 479.37109375, "completions/mean_terminated_length": 479.37109375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.19306666666666666, "grad_norm": 0.03152427449822426, "kl": 0.075347900390625, "learning_rate": 5.277777777777779e-07, "loss": -0.0389, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03595711290836334, "mask/share_reasoning": 0.841357946395874, "mask/share_step_conf": 0.12268491089344025, "num_tokens": 42609011.0, "reward": 0.772752046585083, "reward_std": 0.23635204136371613, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7130539417266846, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8324500918388367, "step": 181 }, { "adv/mean_abs_final_conf": 0.6520901918411255, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7489023804664612, "adv/std_final_conf": 0.8481500744819641, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345879554748535, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7276109307359306, "calib/avg_num_step_conf": 5.3203125, "calib/ece": 0.24531599999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.684, "calib/gap": 0.270469696969697, "calib/mean_conf": 0.7782760000000001, "calib/mu_c": 0.8821363636363637, "calib/mu_w": 0.6116666666666667, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20379599999999998, "calib/std_conf": 0.3521563173137747, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5275940959409594, "calib/step_q_c_n": 813.0, "calib/step_q_gap": 0.15863234730707965, "calib/step_q_w": 0.36896174863387976, "calib/step_q_w_n": 549.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 505.16796875, "completions/mean_terminated_length": 507.1490478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.19413333333333332, "grad_norm": 0.038688868284225464, "kl": 0.06845474243164062, "learning_rate": 5.000000000000001e-07, "loss": -0.0739, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03429903835058212, "mask/share_reasoning": 0.840556263923645, "mask/share_step_conf": 0.12123845517635345, "num_tokens": 42844494.0, "reward": 0.7823052406311035, "reward_std": 0.19520391523838043, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7236179709434509, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8409925699234009, "step": 182 }, { "adv/mean_abs_final_conf": 0.6207889318466187, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7769383788108826, "adv/std_final_conf": 0.8270253539085388, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348618984222412, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.689572192513369, "calib/avg_num_step_conf": 4.83203125, "calib/ece": 0.2885772357723577, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5487804878048781, "calib/gap": 0.2507139037433156, "calib/mean_conf": 0.6483333333333331, "calib/mu_c": 0.7604411764705883, "calib/mu_w": 0.5097272727272727, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.1920325203252032, "calib/std_conf": 0.41847291423197214, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5079073033707865, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.11369397003745313, "calib/step_q_w": 0.39421333333333336, "calib/step_q_w_n": 525.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 510.3359375, "completions/mean_terminated_length": 514.3543090820312, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.1952, "grad_norm": 0.04733794927597046, "kl": 0.07053375244140625, "learning_rate": 4.7222222222222226e-07, "loss": -0.134, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03429635614156723, "mask/share_reasoning": 0.8464791774749756, "mask/share_step_conf": 0.11141195148229599, "num_tokens": 43081820.0, "reward": 0.7362129092216492, "reward_std": 0.19574016332626343, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6615542769432068, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8108716011047363, "step": 183 }, { "adv/mean_abs_final_conf": 0.6285792589187622, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7463577389717102, "adv/std_final_conf": 0.8468300104141235, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343642592430115, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7495745887691434, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.1942000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.672, "calib/gap": 0.3661500283607485, "calib/mean_conf": 0.7625200000000001, "calib/mu_c": 0.8884756097560974, "calib/mu_w": 0.5223255813953489, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1503600000000001, "calib/std_conf": 0.37392572738446334, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.506656942823804, "calib/step_q_c_n": 857.0, "calib/step_q_gap": 0.16948271414866795, "calib/step_q_w": 0.3371742286751361, "calib/step_q_w_n": 551.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 498.54296875, "completions/mean_terminated_length": 498.54296875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.19626666666666667, "grad_norm": 0.0312087070196867, "kl": 0.0698089599609375, "learning_rate": 4.444444444444445e-07, "loss": -0.0037, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03531767427921295, "mask/share_reasoning": 0.8455306887626648, "mask/share_step_conf": 0.11915168166160583, "num_tokens": 43314727.0, "reward": 0.8005227446556091, "reward_std": 0.18379950523376465, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7651569843292236, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8358885049819946, "step": 184 }, { "adv/mean_abs_final_conf": 0.5852677822113037, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7569340467453003, "adv/std_final_conf": 0.7941950559616089, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345303773880005, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8296296296296296, "calib/avg_num_step_conf": 5.6640625, "calib/ece": 0.21619433198380572, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6396761133603239, "calib/gap": 0.4553373015873017, "calib/mean_conf": 0.7179757085020242, "calib/mu_c": 0.9244444444444445, "calib/mu_w": 0.46910714285714283, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1938056680161944, "calib/std_conf": 0.40363323560332265, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5647240356083085, "calib/step_q_c_n": 674.0, "calib/step_q_gap": 0.2594624376701642, "calib/step_q_w": 0.30526159793814434, "calib/step_q_w_n": 776.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2673.0, "completions/max_terminated_length": 2673.0, "completions/mean_length": 484.98046875, "completions/mean_terminated_length": 494.6414489746094, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.19733333333333333, "grad_norm": 0.04685940220952034, "kl": 0.06523895263671875, "learning_rate": 4.1666666666666667e-07, "loss": -0.1251, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03478875011205673, "mask/share_reasoning": 0.8340494632720947, "mask/share_step_conf": 0.11163052171468735, "num_tokens": 43545802.0, "reward": 0.7864350080490112, "reward_std": 0.1933988332748413, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7540351748466492, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8188347220420837, "step": 185 }, { "adv/mean_abs_final_conf": 0.58069908618927, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7645832300186157, "adv/std_final_conf": 0.7975696921348572, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9337515830993652, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.779527048914804, "calib/avg_num_step_conf": 5.71875, "calib/ece": 0.21015873015873013, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6111111111111112, "calib/gap": 0.4250068027210885, "calib/mean_conf": 0.6784920634920635, "calib/mu_c": 0.8555782312925171, "calib/mu_w": 0.4305714285714286, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.15265873015873013, "calib/std_conf": 0.4219758276766112, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5433672086720868, "calib/step_q_c_n": 738.0, "calib/step_q_gap": 0.2078892472395799, "calib/step_q_w": 0.33547796143250685, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 499.56640625, "completions/mean_terminated_length": 499.56640625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.1984, "grad_norm": 0.05687674134969711, "kl": 0.067626953125, "learning_rate": 3.8888888888888895e-07, "loss": -0.0359, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03511364758014679, "mask/share_reasoning": 0.8423900008201599, "mask/share_step_conf": 0.12249638140201569, "num_tokens": 43778731.0, "reward": 0.8191449046134949, "reward_std": 0.16841217875480652, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7642945051193237, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8739952445030212, "step": 186 }, { "adv/mean_abs_final_conf": 0.6501079797744751, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7731779217720032, "adv/std_final_conf": 0.8458935618400574, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9339480996131897, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6654769630110318, "calib/avg_num_step_conf": 5.8359375, "calib/ece": 0.2888353413654618, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6024096385542169, "calib/gap": 0.23776638546398454, "calib/mean_conf": 0.7121285140562249, "calib/mu_c": 0.8219402985074628, "calib/mu_w": 0.5841739130434782, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.23140562248995977, "calib/std_conf": 0.3897047172112078, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5273561643835617, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.1337697769490067, "calib/step_q_w": 0.393586387434555, "calib/step_q_w_n": 764.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 525.34765625, "completions/mean_terminated_length": 527.4078979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.19946666666666665, "grad_norm": 0.033688999712467194, "kl": 0.06855010986328125, "learning_rate": 3.611111111111111e-07, "loss": -0.0526, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03296507149934769, "mask/share_reasoning": 0.8472602367401123, "mask/share_step_conf": 0.1158684566617012, "num_tokens": 44014764.0, "reward": 0.7461711168289185, "reward_std": 0.20649182796478271, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6634917855262756, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8288504481315613, "step": 187 }, { "adv/mean_abs_final_conf": 0.6807498335838318, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7415391206741333, "adv/std_final_conf": 0.8884475827217102, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349989891052246, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7320123124278569, "calib/avg_num_step_conf": 5.3046875, "calib/ece": 0.23581673306772918, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5816733067729084, "calib/gap": 0.35707836347313066, "calib/mean_conf": 0.6776494023904384, "calib/mu_c": 0.8384057971014492, "calib/mu_w": 0.4813274336283186, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.18183266932270925, "calib/std_conf": 0.40719482081828656, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4818503401360544, "calib/step_q_c_n": 735.0, "calib/step_q_gap": 0.06637682488725838, "calib/step_q_w": 0.415473515248796, "calib/step_q_w_n": 623.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2450.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 525.28125, "completions/mean_terminated_length": 527.3411865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.20053333333333334, "grad_norm": 0.030032223090529442, "kl": 0.0639801025390625, "learning_rate": 3.3333333333333335e-07, "loss": -0.074, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0333237498998642, "mask/share_reasoning": 0.8468795418739319, "mask/share_step_conf": 0.11589042842388153, "num_tokens": 44253308.0, "reward": 0.7769525051116943, "reward_std": 0.22362953424453735, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7210996150970459, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8328053951263428, "step": 188 }, { "adv/mean_abs_final_conf": 0.6388331651687622, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7636686563491821, "adv/std_final_conf": 0.8439778685569763, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9342179298400879, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7751247090123046, "calib/avg_num_step_conf": 4.953125, "calib/ece": 0.2163095238095238, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.44047619047619047, "calib/gap": 0.4233182573994014, "calib/mean_conf": 0.5520238095238096, "calib/mu_c": 0.7149677419354838, "calib/mu_w": 0.29164948453608247, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07662698412698413, "calib/std_conf": 0.44296470467954524, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5057278911564627, "calib/step_q_c_n": 735.0, "calib/step_q_gap": 0.19395866038723186, "calib/step_q_w": 0.3117692307692308, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2642.0, "completions/max_terminated_length": 2642.0, "completions/mean_length": 496.68359375, "completions/mean_terminated_length": 500.594482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.2016, "grad_norm": 0.03743334114551544, "kl": 0.07477569580078125, "learning_rate": 3.055555555555556e-07, "loss": -0.004, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03554224595427513, "mask/share_reasoning": 0.8450006246566772, "mask/share_step_conf": 0.11164465546607971, "num_tokens": 44488227.0, "reward": 0.8015495538711548, "reward_std": 0.1766367256641388, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7476855516433716, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8554134964942932, "step": 189 }, { "adv/mean_abs_final_conf": 0.6988445520401001, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7438215613365173, "adv/std_final_conf": 0.9049623012542725, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9344231486320496, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.753372061965812, "calib/avg_num_step_conf": 5.52734375, "calib/ece": 0.23603174603174604, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5396825396825397, "calib/gap": 0.35544871794871796, "calib/mean_conf": 0.6442063492063492, "calib/mu_c": 0.7796153846153846, "calib/mu_w": 0.42416666666666664, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13059523809523807, "calib/std_conf": 0.4238941788705319, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47418057142857145, "calib/step_q_c_n": 875.0, "calib/step_q_gap": 0.10364353439153445, "calib/step_q_w": 0.370537037037037, "calib/step_q_w_n": 540.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2652.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 537.078125, "completions/mean_terminated_length": 537.078125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.20266666666666666, "grad_norm": 0.033774811774492264, "kl": 0.06618499755859375, "learning_rate": 2.7777777777777776e-07, "loss": -0.0439, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03249313682317734, "mask/share_reasoning": 0.853238046169281, "mask/share_step_conf": 0.11426880955696106, "num_tokens": 44731327.0, "reward": 0.7934824228286743, "reward_std": 0.18484032154083252, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7358730435371399, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8510918021202087, "step": 190 }, { "adv/mean_abs_final_conf": 0.6249486804008484, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7530295848846436, "adv/std_final_conf": 0.8457292318344116, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347898364067078, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6813389227642277, "calib/avg_num_step_conf": 5.9375, "calib/ece": 0.3143027888446215, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6294820717131474, "calib/gap": 0.2817003302845529, "calib/mean_conf": 0.7018725099601594, "calib/mu_c": 0.8455284552845529, "calib/mu_w": 0.563828125, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.26306772908366527, "calib/std_conf": 0.41427236609269846, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5229447852760737, "calib/step_q_c_n": 652.0, "calib/step_q_gap": 0.12827888665856219, "calib/step_q_w": 0.3946658986175115, "calib/step_q_w_n": 868.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2774.0, "completions/max_terminated_length": 2774.0, "completions/mean_length": 483.26171875, "completions/mean_terminated_length": 483.26171875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.20373333333333332, "grad_norm": 0.04842465743422508, "kl": 0.0717315673828125, "learning_rate": 2.5000000000000004e-07, "loss": 0.017, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03717897832393646, "mask/share_reasoning": 0.8303982019424438, "mask/share_step_conf": 0.1324227899312973, "num_tokens": 44959210.0, "reward": 0.7430580258369446, "reward_std": 0.19263812899589539, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6573207378387451, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.828795313835144, "step": 191 }, { "adv/mean_abs_final_conf": 0.6029754877090454, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7458151578903198, "adv/std_final_conf": 0.822044312953949, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9339615106582642, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.802116935483871, "calib/avg_num_step_conf": 4.9140625, "calib/ece": 0.15169960474308294, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5217391304347826, "calib/gap": 0.4747728494623658, "calib/mean_conf": 0.6448537549407115, "calib/mu_c": 0.8193750000000002, "calib/mu_w": 0.3446021505376344, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08207114624505922, "calib/std_conf": 0.41213908196282156, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.47786852085967124, "calib/step_q_c_n": 791.0, "calib/step_q_gap": 0.11938029816159845, "calib/step_q_w": 0.3584882226980728, "calib/step_q_w_n": 467.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 496.9609375, "completions/mean_terminated_length": 496.9609375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.2048, "grad_norm": 0.040831126272678375, "kl": 0.0685577392578125, "learning_rate": 2.2222222222222224e-07, "loss": -0.0166, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03833800554275513, "mask/share_reasoning": 0.8447378873825073, "mask/share_step_conf": 0.11692406237125397, "num_tokens": 45191408.0, "reward": 0.8363451957702637, "reward_std": 0.1784243881702423, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.800956130027771, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8717343211174011, "step": 192 }, { "adv/mean_abs_final_conf": 0.7382488250732422, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7463130950927734, "adv/std_final_conf": 0.904547393321991, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350979328155518, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7253340184994861, "calib/avg_num_step_conf": 5.17578125, "calib/ece": 0.23023904382470117, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4940239043824701, "calib/gap": 0.3053943987667011, "calib/mean_conf": 0.6239442231075698, "calib/mu_c": 0.7602158273381296, "calib/mu_w": 0.45482142857142854, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15019920318725097, "calib/std_conf": 0.4082905536156902, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.47446372239747636, "calib/step_q_c_n": 634.0, "calib/step_q_gap": 0.11466632731788157, "calib/step_q_w": 0.3597973950795948, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 486.83203125, "completions/mean_terminated_length": 490.66534423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.20586666666666667, "grad_norm": 0.02862304076552391, "kl": 0.0765533447265625, "learning_rate": 1.9444444444444447e-07, "loss": -0.0848, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03511703014373779, "mask/share_reasoning": 0.8438476324081421, "mask/share_step_conf": 0.11322282254695892, "num_tokens": 45421749.0, "reward": 0.7781213521957397, "reward_std": 0.2304728627204895, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7139929533004761, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8422497510910034, "step": 193 }, { "adv/mean_abs_final_conf": 0.5932942628860474, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7308098673820496, "adv/std_final_conf": 0.8161402940750122, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343204498291016, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8292651593011305, "calib/avg_num_step_conf": 5.00390625, "calib/ece": 0.15593625498007982, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5776892430278885, "calib/gap": 0.5534686536485098, "calib/mean_conf": 0.6386454183266933, "calib/mu_c": 0.885611510791367, "calib/mu_w": 0.3321428571428572, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12039840637450214, "calib/std_conf": 0.4369896086825614, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5464127423822714, "calib/step_q_c_n": 722.0, "calib/step_q_gap": 0.19464887833933758, "calib/step_q_w": 0.35176386404293386, "calib/step_q_w_n": 559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2596.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 470.05859375, "completions/mean_terminated_length": 473.75982666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.20693333333333333, "grad_norm": 0.061810459941625595, "kl": 0.071075439453125, "learning_rate": 1.6666666666666668e-07, "loss": 0.002, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03704763948917389, "mask/share_reasoning": 0.8432860374450684, "mask/share_step_conf": 0.11185386776924133, "num_tokens": 45648028.0, "reward": 0.8355604410171509, "reward_std": 0.17185883224010468, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.8120867013931274, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8590341806411743, "step": 194 }, { "adv/mean_abs_final_conf": 0.6721044182777405, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7346723079681396, "adv/std_final_conf": 0.8902400135993958, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346536993980408, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7843706293706294, "calib/avg_num_step_conf": 5.3671875, "calib/ece": 0.18090534979423875, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5267489711934157, "calib/gap": 0.4724454545454546, "calib/mean_conf": 0.6101234567901234, "calib/mu_c": 0.8045454545454547, "calib/mu_w": 0.33210000000000006, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1012757201646091, "calib/std_conf": 0.4387228766957551, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.523060606060606, "calib/step_q_c_n": 693.0, "calib/step_q_gap": 0.20959070885062064, "calib/step_q_w": 0.31346989720998536, "calib/step_q_w_n": 681.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2952.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 514.64453125, "completions/mean_terminated_length": 516.6627807617188, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.208, "grad_norm": 0.04098103940486908, "kl": 0.07430267333984375, "learning_rate": 1.3888888888888888e-07, "loss": -0.0417, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03490000218153, "mask/share_reasoning": 0.8435318470001221, "mask/share_step_conf": 0.11766190826892853, "num_tokens": 45885761.0, "reward": 0.7753250002861023, "reward_std": 0.21513205766677856, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7455902099609375, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8050597906112671, "step": 195 }, { "adv/mean_abs_final_conf": 0.5985764265060425, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7298129796981812, "adv/std_final_conf": 0.8426317572593689, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353283047676086, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7379987129987129, "calib/avg_num_step_conf": 4.90625, "calib/ece": 0.22306772908366523, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6613545816733067, "calib/gap": 0.3861351351351352, "calib/mean_conf": 0.7302390438247013, "calib/mu_c": 0.9010000000000001, "calib/mu_w": 0.5148648648648649, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19776892430278875, "calib/std_conf": 0.3945898352430957, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.548481089258699, "calib/step_q_c_n": 661.0, "calib/step_q_gap": 0.13627940858643012, "calib/step_q_w": 0.41220168067226887, "calib/step_q_w_n": 595.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 402.6015625, "completions/mean_terminated_length": 407.3755187988281, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.20906666666666668, "grad_norm": 0.04002168029546738, "kl": 0.08524322509765625, "learning_rate": 1.1111111111111112e-07, "loss": -0.0302, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03996644169092178, "mask/share_reasoning": 0.824980616569519, "mask/share_step_conf": 0.12333419919013977, "num_tokens": 46091371.0, "reward": 0.7817299365997314, "reward_std": 0.2106354832649231, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7396625280380249, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8237974047660828, "step": 196 }, { "adv/mean_abs_final_conf": 0.675631046295166, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.738574743270874, "adv/std_final_conf": 0.8583427667617798, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9342004060745239, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8050909090909091, "calib/avg_num_step_conf": 5.6640625, "calib/ece": 0.1962560975609756, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.47560975609756095, "calib/gap": 0.44174671074380156, "calib/mean_conf": 0.6146300813008131, "calib/mu_c": 0.831912, "calib/mu_w": 0.3901652892561984, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15137804878048777, "calib/std_conf": 0.42352444635002, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4821104387291981, "calib/step_q_c_n": 661.0, "calib/step_q_gap": 0.1184095515302121, "calib/step_q_w": 0.363700887198986, "calib/step_q_w_n": 789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 508.63671875, "completions/mean_terminated_length": 512.6417236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.21013333333333334, "grad_norm": 0.038508955389261246, "kl": 0.07263565063476562, "learning_rate": 8.333333333333334e-08, "loss": -0.0688, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03412006050348282, "mask/share_reasoning": 0.8306388854980469, "mask/share_step_conf": 0.1274285465478897, "num_tokens": 46326638.0, "reward": 0.7938829064369202, "reward_std": 0.19586849212646484, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7496906518936157, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8380751609802246, "step": 197 }, { "adv/mean_abs_final_conf": 0.6216574907302856, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7498562335968018, "adv/std_final_conf": 0.848364531993866, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9340881109237671, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8403380102040816, "calib/avg_num_step_conf": 5.9921875, "calib/ece": 0.15583333333333332, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.49206349206349204, "calib/gap": 0.5348571428571429, "calib/mean_conf": 0.5803571428571429, "calib/mu_c": 0.8180714285714286, "calib/mu_w": 0.2832142857142857, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09031746031746035, "calib/std_conf": 0.4407932417503575, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49728750000000005, "calib/step_q_c_n": 800.0, "calib/step_q_gap": 0.15315943460490467, "calib/step_q_w": 0.3441280653950954, "calib/step_q_w_n": 734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 471.89453125, "completions/mean_terminated_length": 473.7451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.2112, "grad_norm": 0.04384619742631912, "kl": 0.08667755126953125, "learning_rate": 5.555555555555556e-08, "loss": 0.0126, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03809526562690735, "mask/share_reasoning": 0.82027268409729, "mask/share_step_conf": 0.13772578537464142, "num_tokens": 46552827.0, "reward": 0.8343632221221924, "reward_std": 0.15485632419586182, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.809451162815094, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8592753410339355, "step": 198 }, { "adv/mean_abs_final_conf": 0.6543087959289551, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7627103328704834, "adv/std_final_conf": 0.8575649261474609, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934026300907135, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7413175213396754, "calib/avg_num_step_conf": 5.3515625, "calib/ece": 0.21360714285714302, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5674603174603174, "calib/gap": 0.35828025021176774, "calib/mean_conf": 0.6854325396825396, "calib/mu_c": 0.8318724832214766, "calib/mu_w": 0.4735922330097088, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1538849206349208, "calib/std_conf": 0.4021402259437888, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5151649076517151, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.1486289599392968, "calib/step_q_w": 0.36653594771241826, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2310.0, "completions/max_terminated_length": 2310.0, "completions/mean_length": 504.8828125, "completions/mean_terminated_length": 504.8828125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.21226666666666666, "grad_norm": 0.05067715048789978, "kl": 0.0714569091796875, "learning_rate": 2.777777777777778e-08, "loss": -0.027, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03707854449748993, "mask/share_reasoning": 0.8380366563796997, "mask/share_step_conf": 0.12488484382629395, "num_tokens": 46786277.0, "reward": 0.8067346811294556, "reward_std": 0.18213021755218506, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7490285038948059, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8644406795501709, "step": 199 }, { "adv/mean_abs_final_conf": 0.5960859060287476, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7385691404342651, "adv/std_final_conf": 0.8230341672897339, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934851348400116, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8526765188834153, "calib/avg_num_step_conf": 4.93359375, "calib/ece": 0.1406, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.576, "calib/gap": 0.5908275862068966, "calib/mean_conf": 0.63268, "calib/mu_c": 0.8808275862068965, "calib/mu_w": 0.29, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.09664, "calib/std_conf": 0.44494181372399694, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.55555587808418, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.23116214986118355, "calib/step_q_w": 0.3243937282229965, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 471.0859375, "completions/mean_terminated_length": 472.933349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.21333333333333335, "grad_norm": 0.033435944467782974, "kl": 0.08514785766601562, "learning_rate": 0.0, "loss": -0.0914, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03687979280948639, "mask/share_reasoning": 0.8445696234703064, "mask/share_step_conf": 0.11464434862136841, "num_tokens": 47014923.0, "reward": 0.8379338383674622, "reward_std": 0.18549615144729614, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.8198585510253906, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8560090661048889, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 2.6277464703912847, "train_runtime": 14184.3282, "train_samples_per_second": 3.61, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 47014923, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }