{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.7557821869850159, "adv/mean_abs_reasoning": 0.28040462732315063, "adv/mean_abs_step_conf": 0.7450977563858032, "adv/ratio_final_to_reasoning": 2.69532708571895, "adv/ratio_step_to_reasoning": 2.657223468452679, "adv/std_final_conf": 0.9257818460464478, "adv/std_reasoning": 0.5727222561836243, "adv/std_step_conf": 0.9239600300788879, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.3273979591836735, "calib/avg_num_step_conf": 14.59765625, "calib/ece": 0.23243902439024394, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.008130081300813009, "calib/gap": -0.04614489795918364, "calib/mean_conf": 0.6646341463414636, "calib/mu_c": 0.6552551020408164, "calib/mu_w": 0.7014, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05016260162601624, "calib/std_conf": 0.05917169015101882, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.583372, "calib/step_q_c_n": 2500.0, "calib/step_q_gap": -0.0778082748585287, "calib/step_q_w": 0.6611802748585287, "calib/step_q_w_n": 1237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 750.2265625, "completions/mean_terminated_length": 780.7235717773438, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.0010666666666666667, "grad_norm": 0.3164731562137604, "kl": 0.00047022104263305664, "learning_rate": 0.0, "loss": -0.1563, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01929234340786934, "mask/share_reasoning": 0.7498296499252319, "mask/share_step_conf": 0.19181546568870544, "num_tokens": 299642.0, "reward": 0.972690999507904, "reward_std": 0.10009393095970154, "rewards/accuracy_reward_step": 0.765625, "rewards/final_brier_reward_step": 0.7708241939544678, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8292452096939087, "step": 1 }, { "adv/mean_abs_final_conf": 0.7929245233535767, "adv/mean_abs_reasoning": 0.4050842523574829, "adv/mean_abs_step_conf": 0.743950366973877, "adv/ratio_final_to_reasoning": 1.9574311238685933, "adv/ratio_step_to_reasoning": 1.8365324315726497, "adv/std_final_conf": 0.9301473498344421, "adv/std_reasoning": 0.6612725853919983, "adv/std_step_conf": 0.9270046949386597, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5763110307414105, "calib/avg_num_step_conf": 14.078125, "calib/ece": 0.04704724409448811, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": 0.008169981916817282, "calib/mean_conf": 0.6691732283464566, "calib/mu_c": 0.6717142857142857, "calib/mu_w": 0.6635443037974684, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013622047244094477, "calib/std_conf": 0.060200661111313364, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5911686697057605, "calib/step_q_c_n": 2413.0, "calib/step_q_gap": -0.011375410898773475, "calib/step_q_w": 0.602544080604534, "calib/step_q_w_n": 1191.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 867.8828125, "completions/mean_terminated_length": 871.2863159179688, "completions/min_length": 0.0, "completions/min_terminated_length": 375.0, "epoch": 0.0021333333333333334, "grad_norm": 0.5657607316970825, "kl": 0.0006206929683685303, "learning_rate": 2.5000000000000004e-07, "loss": -0.0461, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01878987066447735, "mask/share_reasoning": 0.7856365442276001, "mask/share_step_conf": 0.1916673481464386, "num_tokens": 625108.0, "reward": 0.9864938259124756, "reward_std": 0.08656304329633713, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7790628671646118, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8587685227394104, "step": 2 }, { "adv/mean_abs_final_conf": 0.7545986175537109, "adv/mean_abs_reasoning": 0.35498642921447754, "adv/mean_abs_step_conf": 0.76069575548172, "adv/ratio_final_to_reasoning": 2.125711169363586, "adv/ratio_step_to_reasoning": 2.142886862365431, "adv/std_final_conf": 0.928034245967865, "adv/std_reasoning": 0.6401829719543457, "adv/std_step_conf": 0.9276528358459473, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.37104406980183374, "calib/avg_num_step_conf": 13.0546875, "calib/ece": 0.14906249999999985, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.019274376417233396, "calib/mean_conf": 0.662578125, "calib/mu_c": 0.6588888888888891, "calib/mu_w": 0.6781632653061225, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0015234375000000126, "calib/std_conf": 0.0476764173516045, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5831838905775076, "calib/step_q_c_n": 2632.0, "calib/step_q_gap": -0.01008371505629524, "calib/step_q_w": 0.5932676056338029, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 792.18359375, "completions/mean_terminated_length": 798.4212646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.0032, "grad_norm": 0.46344953775405884, "kl": 0.0004182755947113037, "learning_rate": 5.000000000000001e-07, "loss": -0.0502, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019262004643678665, "mask/share_reasoning": 0.7811808586120605, "mask/share_step_conf": 0.19174468517303467, "num_tokens": 933163.0, "reward": 1.0210323333740234, "reward_std": 0.07212791591882706, "rewards/accuracy_reward_step": 0.80859375, "rewards/final_brier_reward_step": 0.815670371055603, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8646755218505859, "step": 3 }, { "adv/mean_abs_final_conf": 0.7179367542266846, "adv/mean_abs_reasoning": 0.345045268535614, "adv/mean_abs_step_conf": 0.7400221824645996, "adv/ratio_final_to_reasoning": 2.0807030836088174, "adv/ratio_step_to_reasoning": 2.144710419027867, "adv/std_final_conf": 0.9140108823776245, "adv/std_reasoning": 0.6401801109313965, "adv/std_step_conf": 0.9261416792869568, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.42529143037177064, "calib/avg_num_step_conf": 14.08203125, "calib/ece": 0.09102766798418971, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.015810276679841896, "calib/gap": -0.019239130434782648, "calib/mean_conf": 0.6725296442687746, "calib/mu_c": 0.6672826086956521, "calib/mu_w": 0.6865217391304348, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.018142292490118568, "calib/std_conf": 0.06134142858091719, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5873686371100164, "calib/step_q_c_n": 2436.0, "calib/step_q_gap": -0.027370456132070786, "calib/step_q_w": 0.6147390932420872, "calib/step_q_w_n": 1169.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2522.0, "completions/max_terminated_length": 2522.0, "completions/mean_length": 858.5703125, "completions/mean_terminated_length": 872.198486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 375.0, "epoch": 0.004266666666666667, "grad_norm": 0.35170620679855347, "kl": 0.000628054141998291, "learning_rate": 7.5e-07, "loss": -0.0719, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018083900213241577, "mask/share_reasoning": 0.7784430980682373, "mask/share_step_conf": 0.18784795701503754, "num_tokens": 1259125.0, "reward": 0.9762076139450073, "reward_std": 0.08280573785305023, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7780355215072632, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8329733610153198, "step": 4 }, { "adv/mean_abs_final_conf": 0.7614573836326599, "adv/mean_abs_reasoning": 0.2771398425102234, "adv/mean_abs_step_conf": 0.7567918300628662, "adv/ratio_final_to_reasoning": 2.747556528630742, "adv/ratio_step_to_reasoning": 2.7307218738675187, "adv/std_final_conf": 0.9287629723548889, "adv/std_reasoning": 0.5725516080856323, "adv/std_step_conf": 0.9243844151496887, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.3000189945548943, "calib/avg_num_step_conf": 13.95703125, "calib/ece": 0.14501960784313725, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": -0.05246296061795597, "calib/mean_conf": 0.6776470588235295, "calib/mu_c": 0.6558389261744967, "calib/mu_w": 0.7083018867924527, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1191764705882353, "calib/std_conf": 0.07599247215798953, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5817224355458036, "calib/step_q_c_n": 1823.0, "calib/step_q_gap": -0.03756327873991072, "calib/step_q_w": 0.6192857142857143, "calib/step_q_w_n": 1750.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2204.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 880.78125, "completions/mean_terminated_length": 887.716552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 403.0, "epoch": 0.005333333333333333, "grad_norm": 0.3038475811481476, "kl": 0.000532984733581543, "learning_rate": 1.0000000000000002e-06, "loss": -0.0171, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019166070967912674, "mask/share_reasoning": 0.7803407311439514, "mask/share_step_conf": 0.19268068671226501, "num_tokens": 1591293.0, "reward": 0.9236041307449341, "reward_std": 0.07161444425582886, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7143359184265137, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8172474503517151, "step": 5 }, { "adv/mean_abs_final_conf": 0.6889976859092712, "adv/mean_abs_reasoning": 0.28010645508766174, "adv/mean_abs_step_conf": 0.7499129772186279, "adv/ratio_final_to_reasoning": 2.4597708242519563, "adv/ratio_step_to_reasoning": 2.677242753952015, "adv/std_final_conf": 0.9128808379173279, "adv/std_reasoning": 0.5958722829818726, "adv/std_step_conf": 0.9258525371551514, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5150865051903114, "calib/avg_num_step_conf": 12.91015625, "calib/ece": 0.05305882352941173, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00784313725490196, "calib/gap": 0.002882352941176558, "calib/mean_conf": 0.657921568627451, "calib/mu_c": 0.6588823529411765, "calib/mu_w": 0.6559999999999999, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02215686274509806, "calib/std_conf": 0.060384338684163145, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5877802995914663, "calib/step_q_c_n": 2203.0, "calib/step_q_gap": 0.001128756941738529, "calib/step_q_w": 0.5866515426497277, "calib/step_q_w_n": 1102.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2119.0, "completions/max_terminated_length": 2119.0, "completions/mean_length": 761.68359375, "completions/mean_terminated_length": 767.6810913085938, "completions/min_length": 0.0, "completions/min_terminated_length": 341.0, "epoch": 0.0064, "grad_norm": 0.46888935565948486, "kl": 0.0006018877029418945, "learning_rate": 1.25e-06, "loss": -0.0189, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02118426002562046, "mask/share_reasoning": 0.7746874690055847, "mask/share_step_conf": 0.19631579518318176, "num_tokens": 1892236.0, "reward": 0.9766287803649902, "reward_std": 0.0638386458158493, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7723073959350586, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8489189147949219, "step": 6 }, { "adv/mean_abs_final_conf": 0.7676518559455872, "adv/mean_abs_reasoning": 0.391650527715683, "adv/mean_abs_step_conf": 0.7619491815567017, "adv/ratio_final_to_reasoning": 1.960042950594109, "adv/ratio_step_to_reasoning": 1.9454823308953522, "adv/std_final_conf": 0.9287219643592834, "adv/std_reasoning": 0.6614062786102295, "adv/std_step_conf": 0.9284455180168152, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.445343137254902, "calib/avg_num_step_conf": 15.23828125, "calib/ece": 0.11024193548387104, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.004032258064516129, "calib/gap": -0.008529411764705896, "calib/mean_conf": 0.6723387096774194, "calib/mu_c": 0.6699999999999999, "calib/mu_w": 0.6785294117647058, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.028387096774193578, "calib/std_conf": 0.05420404804201914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5904171680705977, "calib/step_q_c_n": 2493.0, "calib/step_q_gap": -0.05832572965667504, "calib/step_q_w": 0.6487428977272728, "calib/step_q_w_n": 1408.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 859.6171875, "completions/mean_terminated_length": 887.3467407226562, "completions/min_length": 0.0, "completions/min_terminated_length": 326.0, "epoch": 0.007466666666666667, "grad_norm": 0.28746259212493896, "kl": 0.0005101561546325684, "learning_rate": 1.5e-06, "loss": -0.1686, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01740226149559021, "mask/share_reasoning": 0.7679719924926758, "mask/share_step_conf": 0.183375746011734, "num_tokens": 2219722.0, "reward": 0.9667080640792847, "reward_std": 0.1161876767873764, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7670531272888184, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8319879770278931, "step": 7 }, { "adv/mean_abs_final_conf": 0.7538819313049316, "adv/mean_abs_reasoning": 0.3192325234413147, "adv/mean_abs_step_conf": 0.754244863986969, "adv/ratio_final_to_reasoning": 2.3615448801334917, "adv/ratio_step_to_reasoning": 2.36268177144433, "adv/std_final_conf": 0.9285581111907959, "adv/std_reasoning": 0.59605473279953, "adv/std_step_conf": 0.9257758855819702, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.3724663359319631, "calib/avg_num_step_conf": 14.99609375, "calib/ece": 0.07892430278884457, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.00796812749003984, "calib/gap": -0.02692345854004241, "calib/mean_conf": 0.6673705179282868, "calib/mu_c": 0.6582530120481929, "calib/mu_w": 0.6851764705882353, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.042470119521912354, "calib/std_conf": 0.06068876061242304, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5855540101993509, "calib/step_q_c_n": 2157.0, "calib/step_q_gap": -0.04563820145344344, "calib/step_q_w": 0.6311922116527944, "calib/step_q_w_n": 1682.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2923.0, "completions/max_terminated_length": 2923.0, "completions/mean_length": 853.9609375, "completions/mean_terminated_length": 864.0869750976562, "completions/min_length": 0.0, "completions/min_terminated_length": 331.0, "epoch": 0.008533333333333334, "grad_norm": 0.3648718595504761, "kl": 0.0007760524749755859, "learning_rate": 1.75e-06, "loss": -0.0928, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01900193654000759, "mask/share_reasoning": 0.7783862352371216, "mask/share_step_conf": 0.19089308381080627, "num_tokens": 2544848.0, "reward": 0.9505767822265625, "reward_std": 0.0890641063451767, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7454074025154114, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8299646973609924, "step": 8 }, { "adv/mean_abs_final_conf": 0.7502264976501465, "adv/mean_abs_reasoning": 0.27437955141067505, "adv/mean_abs_step_conf": 0.7304888963699341, "adv/ratio_final_to_reasoning": 2.7342653408134336, "adv/ratio_step_to_reasoning": 2.662329946288824, "adv/std_final_conf": 0.9264436364173889, "adv/std_reasoning": 0.5727154016494751, "adv/std_step_conf": 0.9253325462341309, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5530455828963292, "calib/avg_num_step_conf": 14.61328125, "calib/ece": 0.11448412698412688, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.031746031746031744, "calib/gap": 0.008864058087938553, "calib/mean_conf": 0.6733730158730159, "calib/mu_c": 0.6757297297297297, "calib/mu_w": 0.6668656716417911, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.026865079365079356, "calib/std_conf": 0.07309574435824197, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6033937823834197, "calib/step_q_c_n": 2702.0, "calib/step_q_gap": -0.0029584794067631925, "calib/step_q_w": 0.6063522617901829, "calib/step_q_w_n": 1039.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 878.859375, "completions/mean_terminated_length": 885.779541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 364.0, "epoch": 0.0096, "grad_norm": 0.5035936236381531, "kl": 0.0025275349617004395, "learning_rate": 2.0000000000000003e-06, "loss": -0.065, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018660465255379677, "mask/share_reasoning": 0.7826741933822632, "mask/share_step_conf": 0.1908527910709381, "num_tokens": 2877372.0, "reward": 0.9792848229408264, "reward_std": 0.09020000696182251, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7867535352706909, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8304097652435303, "step": 9 }, { "adv/mean_abs_final_conf": 0.7672736644744873, "adv/mean_abs_reasoning": 0.2947046160697937, "adv/mean_abs_step_conf": 0.7513777017593384, "adv/ratio_final_to_reasoning": 2.6035345991756604, "adv/ratio_step_to_reasoning": 2.549595970975197, "adv/std_final_conf": 0.9274757504463196, "adv/std_reasoning": 0.5728297233581543, "adv/std_step_conf": 0.9279826879501343, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.3941049842689187, "calib/avg_num_step_conf": 15.6484375, "calib/ece": 0.13285140562248995, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.008032128514056224, "calib/gap": -0.02286636860407354, "calib/mean_conf": 0.6695582329317268, "calib/mu_c": 0.6634972677595627, "calib/mu_w": 0.6863636363636363, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0337349397590361, "calib/std_conf": 0.06462904359181194, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5924566246056782, "calib/step_q_c_n": 2536.0, "calib/step_q_gap": -0.03517194682289326, "calib/step_q_w": 0.6276285714285714, "calib/step_q_w_n": 1470.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2522.0, "completions/max_terminated_length": 2522.0, "completions/mean_length": 870.2109375, "completions/mean_terminated_length": 887.5458374023438, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.010666666666666666, "grad_norm": 1.4716449975967407, "kl": 0.0009799003601074219, "learning_rate": 2.25e-06, "loss": -0.0546, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018368223682045937, "mask/share_reasoning": 0.7772632837295532, "mask/share_step_conf": 0.184837207198143, "num_tokens": 3206946.0, "reward": 0.9648333191871643, "reward_std": 0.09700104594230652, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7662937641143799, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8258729577064514, "step": 10 }, { "adv/mean_abs_final_conf": 0.7241643667221069, "adv/mean_abs_reasoning": 0.2893093228340149, "adv/mean_abs_step_conf": 0.7398701906204224, "adv/ratio_final_to_reasoning": 2.503079954798349, "adv/ratio_step_to_reasoning": 2.557367261354752, "adv/std_final_conf": 0.9281130433082581, "adv/std_reasoning": 0.5960187911987305, "adv/std_step_conf": 0.927370011806488, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.30027993109388457, "calib/avg_num_step_conf": 14.37890625, "calib/ece": 0.20280632411067206, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": -0.052347832328452526, "calib/mean_conf": 0.6737944664031621, "calib/mu_c": 0.6570348837209302, "calib/mu_w": 0.7093827160493827, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09837944664031616, "calib/std_conf": 0.06373603271786303, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5814794267221451, "calib/step_q_c_n": 2163.0, "calib/step_q_gap": -0.04432426234241349, "calib/step_q_w": 0.6258036890645586, "calib/step_q_w_n": 1518.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2404.0, "completions/max_terminated_length": 2404.0, "completions/mean_length": 862.87109375, "completions/mean_terminated_length": 873.102783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 334.0, "epoch": 0.011733333333333333, "grad_norm": 0.28209924697875977, "kl": 0.001111447811126709, "learning_rate": 2.5e-06, "loss": -0.0369, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018755219876766205, "mask/share_reasoning": 0.7777705192565918, "mask/share_step_conf": 0.19175554811954498, "num_tokens": 3532321.0, "reward": 0.9579741954803467, "reward_std": 0.09043378382921219, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7466034889221191, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8373136520385742, "step": 11 }, { "adv/mean_abs_final_conf": 0.758876621723175, "adv/mean_abs_reasoning": 0.3993332087993622, "adv/mean_abs_step_conf": 0.7696208953857422, "adv/ratio_final_to_reasoning": 1.9003594116422684, "adv/ratio_step_to_reasoning": 1.927264946733805, "adv/std_final_conf": 0.9296309351921082, "adv/std_reasoning": 0.6818552017211914, "adv/std_step_conf": 0.9270579814910889, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.4729411764705882, "calib/avg_num_step_conf": 15.2109375, "calib/ece": 0.10591666666666667, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.020833333333333332, "calib/gap": -0.024739495798319244, "calib/mean_conf": 0.6693333333333332, "calib/mu_c": 0.6621176470588236, "calib/mu_w": 0.6868571428571428, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03345833333333333, "calib/std_conf": 0.07183584682377517, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5856936936936938, "calib/step_q_c_n": 2220.0, "calib/step_q_gap": -0.06974836126449013, "calib/step_q_w": 0.655442054958184, "calib/step_q_w_n": 1674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 788.79296875, "completions/mean_terminated_length": 827.5859985351562, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.0128, "grad_norm": 1.20479416847229, "kl": 0.0018391609191894531, "learning_rate": 2.7500000000000004e-06, "loss": -0.282, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.01954209804534912, "mask/share_reasoning": 0.7416031956672668, "mask/share_step_conf": 0.19197970628738403, "num_tokens": 3838428.0, "reward": 0.9281233549118042, "reward_std": 0.15242928266525269, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7253308296203613, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8113845586776733, "step": 12 }, { "adv/mean_abs_final_conf": 0.7560792565345764, "adv/mean_abs_reasoning": 0.3063671588897705, "adv/mean_abs_step_conf": 0.7653182744979858, "adv/ratio_final_to_reasoning": 2.46788611179637, "adv/ratio_step_to_reasoning": 2.498042796987075, "adv/std_final_conf": 0.9291857481002808, "adv/std_reasoning": 0.5960760712623596, "adv/std_step_conf": 0.9274040460586548, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4347240330291178, "calib/avg_num_step_conf": 13.59375, "calib/ece": 0.10771653543307086, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.007647109952194753, "calib/mean_conf": 0.6614173228346457, "calib/mu_c": 0.6596410256410257, "calib/mu_w": 0.6672881355932204, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.000708661417322833, "calib/std_conf": 0.051987648585324675, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.584913112164297, "calib/step_q_c_n": 2532.0, "calib/step_q_gap": -0.01700671905933171, "calib/step_q_w": 0.6019198312236287, "calib/step_q_w_n": 948.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 799.1640625, "completions/mean_terminated_length": 808.6403198242188, "completions/min_length": 0.0, "completions/min_terminated_length": 230.0, "epoch": 0.013866666666666666, "grad_norm": 0.842889666557312, "kl": 0.006323099136352539, "learning_rate": 3e-06, "loss": -0.0192, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019865388050675392, "mask/share_reasoning": 0.7777748703956604, "mask/share_step_conf": 0.19064095616340637, "num_tokens": 4147606.0, "reward": 1.002134084701538, "reward_std": 0.07658325135707855, "rewards/accuracy_reward_step": 0.76171875, "rewards/final_brier_reward_step": 0.7986539006233215, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8548331260681152, "step": 13 }, { "adv/mean_abs_final_conf": 0.7556318044662476, "adv/mean_abs_reasoning": 0.39360201358795166, "adv/mean_abs_step_conf": 0.7404717803001404, "adv/ratio_final_to_reasoning": 1.919786429896907, "adv/ratio_step_to_reasoning": 1.8812703053783528, "adv/std_final_conf": 0.9296389818191528, "adv/std_reasoning": 0.6613546013832092, "adv/std_step_conf": 0.928806483745575, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5233257308729007, "calib/avg_num_step_conf": 14.515625, "calib/ece": 0.08031999999999993, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.008, "calib/gap": -0.0016186329393874654, "calib/mean_conf": 0.66952, "calib/mu_c": 0.668930817610063, "calib/mu_w": 0.6705494505494505, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05691999999999995, "calib/std_conf": 0.06258889358344658, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5909621523027816, "calib/step_q_c_n": 2193.0, "calib/step_q_gap": -0.021506659253357596, "calib/step_q_w": 0.6124688115561392, "calib/step_q_w_n": 1523.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 894.11328125, "completions/mean_terminated_length": 915.572021484375, "completions/min_length": 0.0, "completions/min_terminated_length": 299.0, "epoch": 0.014933333333333333, "grad_norm": 1.3575252294540405, "kl": 0.0033931732177734375, "learning_rate": 3.2500000000000002e-06, "loss": -0.1371, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018348682671785355, "mask/share_reasoning": 0.7763521671295166, "mask/share_step_conf": 0.18186160922050476, "num_tokens": 4481899.0, "reward": 0.9478188157081604, "reward_std": 0.10849031805992126, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7448296546936035, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8312766551971436, "step": 14 }, { "adv/mean_abs_final_conf": 0.766565203666687, "adv/mean_abs_reasoning": 0.34837883710861206, "adv/mean_abs_step_conf": 0.7703143358230591, "adv/ratio_final_to_reasoning": 2.2003782147871958, "adv/ratio_step_to_reasoning": 2.2111398677839396, "adv/std_final_conf": 0.9282213449478149, "adv/std_reasoning": 0.6185281276702881, "adv/std_step_conf": 0.9270145297050476, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4305745554035568, "calib/avg_num_step_conf": 14.1484375, "calib/ece": 0.022851562499999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0078125, "calib/gap": -0.0032407660738713595, "calib/mean_conf": 0.6673828125000001, "calib/mu_c": 0.6662941176470588, "calib/mu_w": 0.6695348837209302, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013085937499999986, "calib/std_conf": 0.06358884300401953, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5945896003437904, "calib/step_q_c_n": 2327.0, "calib/step_q_gap": 0.0011610289152189646, "calib/step_q_w": 0.5934285714285714, "calib/step_q_w_n": 1295.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 780.1875, "completions/mean_terminated_length": 789.4387817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 309.0, "epoch": 0.016, "grad_norm": 1.614044189453125, "kl": 0.005997180938720703, "learning_rate": 3.5e-06, "loss": 0.0244, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019837863743305206, "mask/share_reasoning": 0.7707360982894897, "mask/share_step_conf": 0.19770731031894684, "num_tokens": 4789507.0, "reward": 0.9701565504074097, "reward_std": 0.07043170928955078, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7714160680770874, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8360846042633057, "step": 15 }, { "adv/mean_abs_final_conf": 0.7369842529296875, "adv/mean_abs_reasoning": 0.29876458644866943, "adv/mean_abs_step_conf": 0.7509787082672119, "adv/ratio_final_to_reasoning": 2.466772456836374, "adv/ratio_step_to_reasoning": 2.5136135349704074, "adv/std_final_conf": 0.9282191395759583, "adv/std_reasoning": 0.6184563040733337, "adv/std_step_conf": 0.928688645362854, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.47979353393085783, "calib/avg_num_step_conf": 16.7265625, "calib/ece": 0.09012429149797564, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.008097165991902834, "calib/gap": 0.0033440140845071298, "calib/mean_conf": 0.6859137651821863, "calib/mu_c": 0.686875, "calib/mu_w": 0.6835309859154929, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03174372469635628, "calib/std_conf": 0.07859145252322318, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6029948491537895, "calib/step_q_c_n": 2718.0, "calib/step_q_gap": -0.022205086907591487, "calib/step_q_w": 0.625199936061381, "calib/step_q_w_n": 1564.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 1002.15234375, "completions/mean_terminated_length": 1038.6680908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 341.0, "epoch": 0.017066666666666667, "grad_norm": 1.8644980192184448, "kl": 0.0037343502044677734, "learning_rate": 3.7500000000000005e-06, "loss": -0.1494, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.015444772318005562, "mask/share_reasoning": 0.770982563495636, "mask/share_step_conf": 0.1784164011478424, "num_tokens": 5154906.0, "reward": 0.9652820825576782, "reward_std": 0.09969577193260193, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7618999481201172, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8381954431533813, "step": 16 }, { "adv/mean_abs_final_conf": 0.7771555185317993, "adv/mean_abs_reasoning": 0.30706483125686646, "adv/mean_abs_step_conf": 0.7563447952270508, "adv/ratio_final_to_reasoning": 2.530916729704196, "adv/ratio_step_to_reasoning": 2.4631436694694346, "adv/std_final_conf": 0.9285280108451843, "adv/std_reasoning": 0.5727678537368774, "adv/std_step_conf": 0.9267243146896362, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5275313954509528, "calib/avg_num_step_conf": 15.15625, "calib/ece": 0.11527777777777774, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.007936507936507936, "calib/gap": -0.0006911390181786325, "calib/mean_conf": 0.6821825396825397, "calib/mu_c": 0.682020725388601, "calib/mu_w": 0.6827118644067797, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0157936507936508, "calib/std_conf": 0.06813559086671489, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6004387504387505, "calib/step_q_c_n": 2849.0, "calib/step_q_gap": -0.020104411539910938, "calib/step_q_w": 0.6205431619786614, "calib/step_q_w_n": 1031.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2076.0, "completions/max_terminated_length": 2076.0, "completions/mean_length": 865.37109375, "completions/mean_terminated_length": 882.6095581054688, "completions/min_length": 0.0, "completions/min_terminated_length": 334.0, "epoch": 0.018133333333333335, "grad_norm": 1.9307397603988647, "kl": 0.004225015640258789, "learning_rate": 4.000000000000001e-06, "loss": -0.1216, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018658708781003952, "mask/share_reasoning": 0.7641027569770813, "mask/share_step_conf": 0.19770726561546326, "num_tokens": 5479969.0, "reward": 0.99280846118927, "reward_std": 0.08533492684364319, "rewards/accuracy_reward_step": 0.75390625, "rewards/final_brier_reward_step": 0.796156644821167, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8418039083480835, "step": 17 }, { "adv/mean_abs_final_conf": 0.7508172988891602, "adv/mean_abs_reasoning": 0.3334640860557556, "adv/mean_abs_step_conf": 0.7501944303512573, "adv/ratio_final_to_reasoning": 2.2515687004555645, "adv/ratio_step_to_reasoning": 2.2497008275302663, "adv/std_final_conf": 0.9283583760261536, "adv/std_reasoning": 0.6185499429702759, "adv/std_step_conf": 0.9289292693138123, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.3601041815327529, "calib/avg_num_step_conf": 16.375, "calib/ece": 0.138734693877551, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.00816326530612245, "calib/gap": -0.03346653346653328, "calib/mean_conf": 0.6777551020408162, "calib/mu_c": 0.6653246753246754, "calib/mu_w": 0.6987912087912087, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09395918367346939, "calib/std_conf": 0.0701329730051465, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.591326385599242, "calib/step_q_c_n": 2111.0, "calib/step_q_gap": -0.053623446212387016, "calib/step_q_w": 0.644949831811629, "calib/step_q_w_n": 2081.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3052.0, "completions/max_terminated_length": 3052.0, "completions/mean_length": 927.08203125, "completions/mean_terminated_length": 953.14453125, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.0192, "grad_norm": 1.95222806930542, "kl": 0.005169868469238281, "learning_rate": 4.25e-06, "loss": -0.0423, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01695508509874344, "mask/share_reasoning": 0.7752803564071655, "mask/share_step_conf": 0.18042084574699402, "num_tokens": 5828022.0, "reward": 0.908229649066925, "reward_std": 0.09579773992300034, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.711616039276123, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7931245565414429, "step": 18 }, { "adv/mean_abs_final_conf": 0.7445905804634094, "adv/mean_abs_reasoning": 0.326733261346817, "adv/mean_abs_step_conf": 0.7477931976318359, "adv/ratio_final_to_reasoning": 2.278894341500941, "adv/ratio_step_to_reasoning": 2.2886962733741303, "adv/std_final_conf": 0.9265520572662354, "adv/std_reasoning": 0.6184476613998413, "adv/std_step_conf": 0.9268099665641785, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.39840538625088595, "calib/avg_num_step_conf": 13.5078125, "calib/ece": 0.06415019762845847, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.014172927002126134, "calib/mean_conf": 0.6578260869565218, "calib/mu_c": 0.6531764705882352, "calib/mu_w": 0.6673493975903614, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.025019762845849815, "calib/std_conf": 0.04024158016301241, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5784865366759517, "calib/step_q_c_n": 2154.0, "calib/step_q_gap": -0.021904567618526882, "calib/step_q_w": 0.6003911042944786, "calib/step_q_w_n": 1304.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 777.23046875, "completions/mean_terminated_length": 789.5675048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 401.0, "epoch": 0.020266666666666665, "grad_norm": 1.3248672485351562, "kl": 0.006027698516845703, "learning_rate": 4.5e-06, "loss": -0.0865, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019088495522737503, "mask/share_reasoning": 0.7767906188964844, "mask/share_step_conf": 0.18849590420722961, "num_tokens": 6131753.0, "reward": 0.9724036455154419, "reward_std": 0.07303591072559357, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7624542713165283, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.851884126663208, "step": 19 }, { "adv/mean_abs_final_conf": 0.7116622924804688, "adv/mean_abs_reasoning": 0.28604555130004883, "adv/mean_abs_step_conf": 0.7420024871826172, "adv/ratio_final_to_reasoning": 2.487933440132293, "adv/ratio_step_to_reasoning": 2.5940011435601393, "adv/std_final_conf": 0.9248640537261963, "adv/std_reasoning": 0.5961463451385498, "adv/std_step_conf": 0.925365686416626, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.38771802325581395, "calib/avg_num_step_conf": 16.34765625, "calib/ece": 0.19411016949152546, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.00423728813559322, "calib/gap": -0.03371366279069765, "calib/mean_conf": 0.6669915254237287, "calib/mu_c": 0.6578488372093023, "calib/mu_w": 0.6915625, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06614406779661017, "calib/std_conf": 0.06487158054248303, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5862242663162506, "calib/step_q_c_n": 2283.0, "calib/step_q_gap": -0.08511490297922786, "calib/step_q_w": 0.6713391692954784, "calib/step_q_w_n": 1902.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 759.03125, "completions/mean_terminated_length": 819.8818359375, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.021333333333333333, "grad_norm": 0.6182474493980408, "kl": 0.0067653656005859375, "learning_rate": 4.75e-06, "loss": -0.3376, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.018506959080696106, "mask/share_reasoning": 0.7251446843147278, "mask/share_step_conf": 0.18212959170341492, "num_tokens": 6430937.0, "reward": 0.9154574275016785, "reward_std": 0.12602384388446808, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7199832201004028, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7921817302703857, "step": 20 }, { "adv/mean_abs_final_conf": 0.7544095516204834, "adv/mean_abs_reasoning": 0.2885745167732239, "adv/mean_abs_step_conf": 0.7639347314834595, "adv/ratio_final_to_reasoning": 2.614262548391741, "adv/ratio_step_to_reasoning": 2.647270244183748, "adv/std_final_conf": 0.9284787178039551, "adv/std_reasoning": 0.5727882385253906, "adv/std_step_conf": 0.927897572517395, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.35599377270368443, "calib/avg_num_step_conf": 16.4765625, "calib/ece": 0.16662142857142861, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.023809523809523808, "calib/gap": -0.032942833419823625, "calib/mean_conf": 0.6717119047619048, "calib/mu_c": 0.6655678048780487, "calib/mu_w": 0.6985106382978723, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.012420634920634922, "calib/std_conf": 0.08072379530298308, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5942708798424163, "calib/step_q_c_n": 3046.0, "calib/step_q_gap": -0.08038824985041648, "calib/step_q_w": 0.6746591296928328, "calib/step_q_w_n": 1172.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 850.6875, "completions/mean_terminated_length": 864.1905517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 219.0, "epoch": 0.0224, "grad_norm": 0.7119145393371582, "kl": 0.013216972351074219, "learning_rate": 5e-06, "loss": -0.1552, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018894609063863754, "mask/share_reasoning": 0.7712918519973755, "mask/share_step_conf": 0.19418853521347046, "num_tokens": 6751673.0, "reward": 0.997853696346283, "reward_std": 0.0898774042725563, "rewards/accuracy_reward_step": 0.80078125, "rewards/final_brier_reward_step": 0.798980712890625, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8396954536437988, "step": 21 }, { "adv/mean_abs_final_conf": 0.7490301728248596, "adv/mean_abs_reasoning": 0.23687519133090973, "adv/mean_abs_step_conf": 0.7512524127960205, "adv/ratio_final_to_reasoning": 3.162130101579443, "adv/ratio_step_to_reasoning": 3.171511582006646, "adv/std_final_conf": 0.9280193448066711, "adv/std_reasoning": 0.5227688550949097, "adv/std_step_conf": 0.9256880283355713, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5853698273053112, "calib/avg_num_step_conf": 14.37109375, "calib/ece": 0.07611111111111112, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.003968253968253968, "calib/gap": 0.013714565004887547, "calib/mean_conf": 0.6693650793650794, "calib/mu_c": 0.6729569892473117, "calib/mu_w": 0.6592424242424242, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0036904761904761915, "calib/std_conf": 0.05536668575651217, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5891939252336449, "calib/step_q_c_n": 2568.0, "calib/step_q_gap": -0.024955489707849332, "calib/step_q_w": 0.6141494149414942, "calib/step_q_w_n": 1111.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 833.0, "completions/mean_terminated_length": 849.5936279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 362.0, "epoch": 0.023466666666666667, "grad_norm": 0.9455334544181824, "kl": 0.012845993041992188, "learning_rate": 4.9722222222222224e-06, "loss": -0.1131, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018307868391275406, "mask/share_reasoning": 0.7735457420349121, "mask/share_step_conf": 0.18861517310142517, "num_tokens": 7066737.0, "reward": 0.9924610257148743, "reward_std": 0.06731672585010529, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7916367650032043, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8510977029800415, "step": 22 }, { "adv/mean_abs_final_conf": 0.7527889013290405, "adv/mean_abs_reasoning": 0.32413190603256226, "adv/mean_abs_step_conf": 0.7550621032714844, "adv/ratio_final_to_reasoning": 2.3224770141987054, "adv/ratio_step_to_reasoning": 2.3294902143809035, "adv/std_final_conf": 0.9277175068855286, "adv/std_reasoning": 0.6185213327407837, "adv/std_step_conf": 0.9261959195137024, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.46720343531937736, "calib/avg_num_step_conf": 17.26953125, "calib/ece": 0.14980158730158727, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.015873015873015872, "calib/gap": -0.011314009661835822, "calib/mean_conf": 0.6911507936507936, "calib/mu_c": 0.6891304347826086, "calib/mu_w": 0.7004444444444444, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.009761904761904772, "calib/std_conf": 0.08101455556661305, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6082226792009401, "calib/step_q_c_n": 3404.0, "calib/step_q_gap": -0.05040072296228504, "calib/step_q_w": 0.6586234021632251, "calib/step_q_w_n": 1017.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2728.0, "completions/max_terminated_length": 2728.0, "completions/mean_length": 933.78515625, "completions/mean_terminated_length": 952.386474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 337.0, "epoch": 0.024533333333333334, "grad_norm": 0.7370911240577698, "kl": 0.013940811157226562, "learning_rate": 4.944444444444445e-06, "loss": -0.0585, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018075132742524147, "mask/share_reasoning": 0.7694632411003113, "mask/share_step_conf": 0.19293037056922913, "num_tokens": 7409722.0, "reward": 1.0178022384643555, "reward_std": 0.09153546392917633, "rewards/accuracy_reward_step": 0.80859375, "rewards/final_brier_reward_step": 0.8135480284690857, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8634628057479858, "step": 23 }, { "adv/mean_abs_final_conf": 0.7669250965118408, "adv/mean_abs_reasoning": 0.39520037174224854, "adv/mean_abs_step_conf": 0.7460485696792603, "adv/ratio_final_to_reasoning": 1.9405981151557035, "adv/ratio_step_to_reasoning": 1.8877729451272796, "adv/std_final_conf": 0.9306222200393677, "adv/std_reasoning": 0.661401629447937, "adv/std_step_conf": 0.9280813336372375, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.46803350970017643, "calib/avg_num_step_conf": 17.97265625, "calib/ece": 0.07293654618473899, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.040160642570281124, "calib/gap": -0.00945925925925939, "calib/mean_conf": 0.6914104417670683, "calib/mu_c": 0.6883333333333332, "calib/mu_w": 0.6977925925925926, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04482409638554215, "calib/std_conf": 0.09198773475189376, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6068624478969306, "calib/step_q_c_n": 2639.0, "calib/step_q_gap": -0.053195554141805546, "calib/step_q_w": 0.6600580020387361, "calib/step_q_w_n": 1962.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 952.109375, "completions/mean_terminated_length": 974.9600219726562, "completions/min_length": 0.0, "completions/min_terminated_length": 320.0, "epoch": 0.0256, "grad_norm": 0.2945130467414856, "kl": 0.01755523681640625, "learning_rate": 4.9166666666666665e-06, "loss": -0.1351, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017136435955762863, "mask/share_reasoning": 0.7662916779518127, "mask/share_step_conf": 0.1931343972682953, "num_tokens": 7757974.0, "reward": 0.9411197900772095, "reward_std": 0.12054760009050369, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7466366291046143, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.809821605682373, "step": 24 }, { "adv/mean_abs_final_conf": 0.7522770762443542, "adv/mean_abs_reasoning": 0.21244873106479645, "adv/mean_abs_step_conf": 0.7597434520721436, "adv/ratio_final_to_reasoning": 3.540981734623358, "adv/ratio_step_to_reasoning": 3.5761260999973836, "adv/std_final_conf": 0.9289838671684265, "adv/std_reasoning": 0.4960183799266815, "adv/std_step_conf": 0.9283822178840637, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.40092658588738417, "calib/avg_num_step_conf": 17.33984375, "calib/ece": 0.09539682539682537, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.011904761904761904, "calib/gap": -0.03323354716084559, "calib/mean_conf": 0.6941269841269841, "calib/mu_c": 0.6850273224043717, "calib/mu_w": 0.7182608695652173, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03166666666666665, "calib/std_conf": 0.07158378198506601, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6001742767514814, "calib/step_q_c_n": 2869.0, "calib/step_q_gap": -0.05252636019119361, "calib/step_q_w": 0.652700636942675, "calib/step_q_w_n": 1570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2469.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 910.984375, "completions/mean_terminated_length": 925.4445190429688, "completions/min_length": 0.0, "completions/min_terminated_length": 300.0, "epoch": 0.02666666666666667, "grad_norm": 0.1930381953716278, "kl": 0.017154693603515625, "learning_rate": 4.888888888888889e-06, "loss": -0.0763, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.016981422901153564, "mask/share_reasoning": 0.7668448090553284, "mask/share_step_conf": 0.20054879784584045, "num_tokens": 8094410.0, "reward": 0.971386194229126, "reward_std": 0.08087246119976044, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7695780992507935, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8333505988121033, "step": 25 }, { "adv/mean_abs_final_conf": 0.7301466464996338, "adv/mean_abs_reasoning": 0.3386092185974121, "adv/mean_abs_step_conf": 0.7286078333854675, "adv/ratio_final_to_reasoning": 2.156310597579265, "adv/ratio_step_to_reasoning": 2.151766087183062, "adv/std_final_conf": 0.9304450154304504, "adv/std_reasoning": 0.640184223651886, "adv/std_step_conf": 0.9266344904899597, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.38496472663139325, "calib/avg_num_step_conf": 16.765625, "calib/ece": 0.14566265060240963, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.004016064257028112, "calib/gap": -0.021521164021163997, "calib/mean_conf": 0.6744979919678714, "calib/mu_c": 0.6693121693121694, "calib/mu_w": 0.6908333333333334, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.030562248995983927, "calib/std_conf": 0.052723505841863014, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5905247706422019, "calib/step_q_c_n": 2725.0, "calib/step_q_gap": -0.09112168755818095, "calib/step_q_w": 0.6816464582003828, "calib/step_q_w_n": 1567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2512.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 845.87109375, "completions/mean_terminated_length": 866.1720581054688, "completions/min_length": 0.0, "completions/min_terminated_length": 404.0, "epoch": 0.027733333333333332, "grad_norm": 0.3324378728866577, "kl": 0.022998809814453125, "learning_rate": 4.861111111111111e-06, "loss": -0.0824, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0170967485755682, "mask/share_reasoning": 0.7706181406974792, "mask/share_step_conf": 0.1888476014137268, "num_tokens": 8416193.0, "reward": 0.9643614888191223, "reward_std": 0.10628978908061981, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7751891016960144, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8121277093887329, "step": 26 }, { "adv/mean_abs_final_conf": 0.7062721252441406, "adv/mean_abs_reasoning": 0.4295037090778351, "adv/mean_abs_step_conf": 0.7095715999603271, "adv/ratio_final_to_reasoning": 1.6443912131993936, "adv/ratio_step_to_reasoning": 1.6520732765819675, "adv/std_final_conf": 0.9321457743644714, "adv/std_reasoning": 0.7393494248390198, "adv/std_step_conf": 0.9296938180923462, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4772676868485252, "calib/avg_num_step_conf": 16.69140625, "calib/ece": 0.04967741935483873, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.008064516129032258, "calib/gap": -0.005013676351001806, "calib/mean_conf": 0.6900806451612904, "calib/mu_c": 0.688443113772455, "calib/mu_w": 0.6934567901234568, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.033185483870967786, "calib/std_conf": 0.0672081020430493, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6008618012422361, "calib/step_q_c_n": 2576.0, "calib/step_q_gap": -0.02853713806241931, "calib/step_q_w": 0.6293989393046554, "calib/step_q_w_n": 1697.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 948.578125, "completions/mean_terminated_length": 975.2449340820312, "completions/min_length": 0.0, "completions/min_terminated_length": 389.0, "epoch": 0.0288, "grad_norm": 0.4145672023296356, "kl": 0.023395538330078125, "learning_rate": 4.833333333333333e-06, "loss": -0.1215, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01612701267004013, "mask/share_reasoning": 0.7714564204216003, "mask/share_step_conf": 0.18507282435894012, "num_tokens": 8764245.0, "reward": 0.9498916864395142, "reward_std": 0.14259380102157593, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7489039301872253, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8266607522964478, "step": 27 }, { "adv/mean_abs_final_conf": 0.7357367277145386, "adv/mean_abs_reasoning": 0.2043769657611847, "adv/mean_abs_step_conf": 0.746915340423584, "adv/ratio_final_to_reasoning": 3.5999004338593124, "adv/ratio_step_to_reasoning": 3.6545964837170426, "adv/std_final_conf": 0.9269275665283203, "adv/std_reasoning": 0.4959013760089874, "adv/std_step_conf": 0.9273102879524231, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.2549955396966994, "calib/avg_num_step_conf": 17.80078125, "calib/ece": 0.2018473895582329, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.04819277108433735, "calib/gap": -0.053265834076717034, "calib/mean_conf": 0.6947791164658634, "calib/mu_c": 0.6821578947368422, "calib/mu_w": 0.7354237288135592, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06678714859437751, "calib/std_conf": 0.08246729548689093, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6043547273982057, "calib/step_q_c_n": 2898.0, "calib/step_q_gap": -0.0652203178097509, "calib/step_q_w": 0.6695750452079566, "calib/step_q_w_n": 1659.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 967.8515625, "completions/mean_terminated_length": 995.0601806640625, "completions/min_length": 0.0, "completions/min_terminated_length": 349.0, "epoch": 0.029866666666666666, "grad_norm": 0.1708788424730301, "kl": 0.020660400390625, "learning_rate": 4.805555555555556e-06, "loss": -0.1527, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01645374484360218, "mask/share_reasoning": 0.7701396942138672, "mask/share_step_conf": 0.1860627979040146, "num_tokens": 9118959.0, "reward": 0.9653811454772949, "reward_std": 0.080658920109272, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.7669132947921753, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8208802938461304, "step": 28 }, { "adv/mean_abs_final_conf": 0.7823663949966431, "adv/mean_abs_reasoning": 0.37815457582473755, "adv/mean_abs_step_conf": 0.7526275515556335, "adv/ratio_final_to_reasoning": 2.068906328292705, "adv/ratio_step_to_reasoning": 1.990264298439832, "adv/std_final_conf": 0.9295948147773743, "adv/std_reasoning": 0.640355110168457, "adv/std_step_conf": 0.9275586605072021, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.3082565556343019, "calib/avg_num_step_conf": 18.53125, "calib/ece": 0.17231075697211154, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.03187250996015936, "calib/gap": -0.043673281360737115, "calib/mean_conf": 0.706175298804781, "calib/mu_c": 0.6913855421686748, "calib/mu_w": 0.7350588235294119, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10856573705179282, "calib/std_conf": 0.08572931288108, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6146497498213009, "calib/step_q_c_n": 2798.0, "calib/step_q_gap": -0.02035025017869907, "calib/step_q_w": 0.635, "calib/step_q_w_n": 1946.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 1028.03125, "completions/mean_terminated_length": 1040.221435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 338.0, "epoch": 0.030933333333333334, "grad_norm": 0.22465556859970093, "kl": 0.024017333984375, "learning_rate": 4.777777777777778e-06, "loss": -0.0608, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01575513184070587, "mask/share_reasoning": 0.7790378928184509, "mask/share_step_conf": 0.193488210439682, "num_tokens": 9489263.0, "reward": 0.941707968711853, "reward_std": 0.10860035568475723, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7325222492218018, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8251124620437622, "step": 29 }, { "adv/mean_abs_final_conf": 0.76378333568573, "adv/mean_abs_reasoning": 0.38688045740127563, "adv/mean_abs_step_conf": 0.7751642465591431, "adv/ratio_final_to_reasoning": 1.9742101754535706, "adv/ratio_step_to_reasoning": 2.003627300706834, "adv/std_final_conf": 0.9292889833450317, "adv/std_reasoning": 0.6613550782203674, "adv/std_step_conf": 0.9297365546226501, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.4981684981684982, "calib/avg_num_step_conf": 18.73046875, "calib/ece": 0.11599173553719015, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.02066115702479339, "calib/gap": -0.010727106227106264, "calib/mean_conf": 0.703099173553719, "calib/mu_c": 0.7004395604395603, "calib/mu_w": 0.7111666666666666, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03351239669421487, "calib/std_conf": 0.07115271725570785, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6083764412589592, "calib/step_q_c_n": 3209.0, "calib/step_q_gap": -0.05755987652162087, "calib/step_q_w": 0.66593631778058, "calib/step_q_w_n": 1586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 958.73046875, "completions/mean_terminated_length": 1010.0205688476562, "completions/min_length": 0.0, "completions/min_terminated_length": 457.0, "epoch": 0.032, "grad_norm": 0.36320260167121887, "kl": 0.0264892578125, "learning_rate": 4.75e-06, "loss": -0.2031, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.015068121254444122, "mask/share_reasoning": 0.7489598989486694, "mask/share_step_conf": 0.18519069254398346, "num_tokens": 9841682.0, "reward": 0.9350690841674805, "reward_std": 0.13166502118110657, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7582129240036011, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7806752920150757, "step": 30 }, { "adv/mean_abs_final_conf": 0.737023115158081, "adv/mean_abs_reasoning": 0.27113649249076843, "adv/mean_abs_step_conf": 0.764258623123169, "adv/ratio_final_to_reasoning": 2.7182733994509243, "adv/ratio_step_to_reasoning": 2.8187228362452545, "adv/std_final_conf": 0.9162626266479492, "adv/std_reasoning": 0.5484534502029419, "adv/std_step_conf": 0.9284743666648865, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.43510135135135136, "calib/avg_num_step_conf": 18.671875, "calib/ece": 0.11072580645161291, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.020161290322580645, "calib/gap": -0.00763243243243239, "calib/mean_conf": 0.7006451612903225, "calib/mu_c": 0.6975675675675677, "calib/mu_w": 0.7052, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10729838709677418, "calib/std_conf": 0.0760967331701403, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6141433635289484, "calib/step_q_c_n": 2539.0, "calib/step_q_gap": -0.022264490107820945, "calib/step_q_w": 0.6364078536367693, "calib/step_q_w_n": 2241.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 997.203125, "completions/mean_terminated_length": 1025.2369384765625, "completions/min_length": 0.0, "completions/min_terminated_length": 459.0, "epoch": 0.03306666666666667, "grad_norm": 0.17617161571979523, "kl": 0.025751113891601562, "learning_rate": 4.722222222222222e-06, "loss": -0.0917, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.015319299884140491, "mask/share_reasoning": 0.7717986106872559, "mask/share_step_conf": 0.18553832173347473, "num_tokens": 10202878.0, "reward": 0.9131944179534912, "reward_std": 0.1015782430768013, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7160148620605469, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8009990453720093, "step": 31 }, { "adv/mean_abs_final_conf": 0.7535610198974609, "adv/mean_abs_reasoning": 0.3606758713722229, "adv/mean_abs_step_conf": 0.741081953048706, "adv/ratio_final_to_reasoning": 2.0893025558667735, "adv/ratio_step_to_reasoning": 2.0547034383786054, "adv/std_final_conf": 0.9300407767295837, "adv/std_reasoning": 0.66114741563797, "adv/std_step_conf": 0.9290763139724731, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.29699424287053156, "calib/avg_num_step_conf": 17.16015625, "calib/ece": 0.13645418326693223, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.01195219123505976, "calib/gap": -0.040182755388941116, "calib/mean_conf": 0.6868924302788846, "calib/mu_c": 0.6713636363636363, "calib/mu_w": 0.7115463917525774, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1049003984063745, "calib/std_conf": 0.05819927627928366, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5917901498929335, "calib/step_q_c_n": 2335.0, "calib/step_q_gap": -0.03574143416926279, "calib/step_q_w": 0.6275315840621963, "calib/step_q_w_n": 2058.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2557.0, "completions/max_terminated_length": 2557.0, "completions/mean_length": 931.29296875, "completions/mean_terminated_length": 946.075439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.034133333333333335, "grad_norm": 0.2636437714099884, "kl": 0.035003662109375, "learning_rate": 4.694444444444445e-06, "loss": -0.0706, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.016750000417232513, "mask/share_reasoning": 0.7754542827606201, "mask/share_step_conf": 0.19217070937156677, "num_tokens": 10547993.0, "reward": 0.9230811595916748, "reward_std": 0.09608988463878632, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7207136750221252, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8090424537658691, "step": 32 }, { "adv/mean_abs_final_conf": 0.7888538837432861, "adv/mean_abs_reasoning": 0.39104142785072327, "adv/mean_abs_step_conf": 0.7540225386619568, "adv/ratio_final_to_reasoning": 2.01731537264237, "adv/ratio_step_to_reasoning": 1.928242086282987, "adv/std_final_conf": 0.9316420555114746, "adv/std_reasoning": 0.6403576135635376, "adv/std_step_conf": 0.9295898079872131, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4358139228845901, "calib/avg_num_step_conf": 16.6328125, "calib/ece": 0.08342629482071709, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.00398406374501992, "calib/gap": -0.013058955125674343, "calib/mean_conf": 0.6770517928286852, "calib/mu_c": 0.671744966442953, "calib/mu_w": 0.6848039215686273, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08342629482071709, "calib/std_conf": 0.059285867207713394, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5933526526972804, "calib/step_q_c_n": 2243.0, "calib/step_q_gap": -0.046984816285349784, "calib/step_q_w": 0.6403374689826302, "calib/step_q_w_n": 2015.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2559.0, "completions/max_terminated_length": 2559.0, "completions/mean_length": 844.57421875, "completions/mean_terminated_length": 861.3984375, "completions/min_length": 0.0, "completions/min_terminated_length": 288.0, "epoch": 0.0352, "grad_norm": 1.4770957231521606, "kl": 0.035400390625, "learning_rate": 4.666666666666667e-06, "loss": -0.0499, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01838311180472374, "mask/share_reasoning": 0.7677148580551147, "mask/share_step_conf": 0.19437071681022644, "num_tokens": 10871076.0, "reward": 0.9298532009124756, "reward_std": 0.10431680083274841, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.727498471736908, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8197079300880432, "step": 33 }, { "adv/mean_abs_final_conf": 0.7429213523864746, "adv/mean_abs_reasoning": 0.5263911485671997, "adv/mean_abs_step_conf": 0.7568153738975525, "adv/ratio_final_to_reasoning": 1.411348489442984, "adv/ratio_step_to_reasoning": 1.4377433510376296, "adv/std_final_conf": 0.9325934648513794, "adv/std_reasoning": 0.7926983833312988, "adv/std_step_conf": 0.9275981187820435, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.49484620418848174, "calib/avg_num_step_conf": 16.3046875, "calib/ece": 0.09945098039215682, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.011764705882352941, "calib/gap": -0.0034260471204190512, "calib/mean_conf": 0.6830588235294118, "calib/mu_c": 0.6821989528795811, "calib/mu_w": 0.6856250000000002, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.016745098039215693, "calib/std_conf": 0.0643889564775123, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6000424559111691, "calib/step_q_c_n": 3062.0, "calib/step_q_gap": -0.009157184376600735, "calib/step_q_w": 0.6091996402877699, "calib/step_q_w_n": 1112.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2329.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 860.80078125, "completions/mean_terminated_length": 867.5787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 376.0, "epoch": 0.03626666666666667, "grad_norm": 0.36523711681365967, "kl": 0.038021087646484375, "learning_rate": 4.638888888888889e-06, "loss": -0.0161, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01807224377989769, "mask/share_reasoning": 0.7730226516723633, "mask/share_step_conf": 0.20109258592128754, "num_tokens": 11196553.0, "reward": 0.9977847337722778, "reward_std": 0.10451152920722961, "rewards/accuracy_reward_step": 0.74609375, "rewards/final_brier_reward_step": 0.799092173576355, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8480398058891296, "step": 34 }, { "adv/mean_abs_final_conf": 0.7598680257797241, "adv/mean_abs_reasoning": 0.3720818758010864, "adv/mean_abs_step_conf": 0.7630703449249268, "adv/ratio_final_to_reasoning": 2.04220650130765, "adv/ratio_step_to_reasoning": 2.050812991850378, "adv/std_final_conf": 0.9306283593177795, "adv/std_reasoning": 0.6612749099731445, "adv/std_step_conf": 0.9308457970619202, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.3112910008410429, "calib/avg_num_step_conf": 17.57421875, "calib/ece": 0.1734980079681275, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.03187250996015936, "calib/gap": -0.048416456405943276, "calib/mean_conf": 0.6981354581673307, "calib/mu_c": 0.6813536585365854, "calib/mu_w": 0.7297701149425286, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10912350597609563, "calib/std_conf": 0.08101427237453744, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.597978184651344, "calib/step_q_c_n": 2567.0, "calib/step_q_gap": -0.03350369940662712, "calib/step_q_w": 0.6314818840579711, "calib/step_q_w_n": 1932.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2959.0, "completions/max_terminated_length": 2959.0, "completions/mean_length": 1029.19140625, "completions/mean_terminated_length": 1045.52783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.037333333333333336, "grad_norm": 1.7280222177505493, "kl": 0.032344818115234375, "learning_rate": 4.611111111111112e-06, "loss": -0.0996, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.015513326972723007, "mask/share_reasoning": 0.7886403799057007, "mask/share_step_conf": 0.180221289396286, "num_tokens": 11569282.0, "reward": 0.9292683601379395, "reward_std": 0.10767196118831635, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7285193204879761, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8057986497879028, "step": 35 }, { "adv/mean_abs_final_conf": 0.7362346649169922, "adv/mean_abs_reasoning": 0.24333158135414124, "adv/mean_abs_step_conf": 0.7633366584777832, "adv/ratio_final_to_reasoning": 3.0256436949936516, "adv/ratio_step_to_reasoning": 3.1370225526411804, "adv/std_final_conf": 0.9270575642585754, "adv/std_reasoning": 0.5483153462409973, "adv/std_step_conf": 0.927507221698761, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.38246225319396054, "calib/avg_num_step_conf": 15.5, "calib/ece": 0.17529521912350596, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.01195219123505976, "calib/gap": -0.03209443670150991, "calib/mean_conf": 0.677294422310757, "calib/mu_c": 0.6720519047619048, "calib/mu_w": 0.7041463414634147, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.007968127490039842, "calib/std_conf": 0.08193762694833942, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5975192802056555, "calib/step_q_c_n": 3112.0, "calib/step_q_gap": -0.02646436465415769, "calib/step_q_w": 0.6239836448598132, "calib/step_q_w_n": 856.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 804.93359375, "completions/mean_terminated_length": 824.2520141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 331.0, "epoch": 0.0384, "grad_norm": 1.4742515087127686, "kl": 0.046176910400390625, "learning_rate": 4.583333333333333e-06, "loss": -0.1172, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01938263140618801, "mask/share_reasoning": 0.7564852237701416, "mask/share_step_conf": 0.20069465041160583, "num_tokens": 11878057.0, "reward": 1.009666085243225, "reward_std": 0.07966910302639008, "rewards/accuracy_reward_step": 0.8203125, "rewards/final_brier_reward_step": 0.8063905239105225, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8527854084968567, "step": 36 }, { "adv/mean_abs_final_conf": 0.7378990650177002, "adv/mean_abs_reasoning": 0.3196714222431183, "adv/mean_abs_step_conf": 0.7179653644561768, "adv/ratio_final_to_reasoning": 2.3083047581791942, "adv/ratio_step_to_reasoning": 2.2459479155760933, "adv/std_final_conf": 0.930656373500824, "adv/std_reasoning": 0.6402551531791687, "adv/std_step_conf": 0.9292874336242676, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.30667789001122336, "calib/avg_num_step_conf": 16.09765625, "calib/ece": 0.21304000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.116, "calib/gap": -0.07526374859708185, "calib/mean_conf": 0.7103200000000001, "calib/mu_c": 0.6838271604938272, "calib/mu_w": 0.759090909090909, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13768, "calib/std_conf": 0.11493605874572173, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5974634146341464, "calib/step_q_c_n": 2255.0, "calib/step_q_gap": -0.04583776435835096, "calib/step_q_w": 0.6433011789924974, "calib/step_q_w_n": 1866.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2564.0, "completions/max_terminated_length": 2564.0, "completions/mean_length": 878.13671875, "completions/mean_terminated_length": 892.075439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 222.0, "epoch": 0.039466666666666664, "grad_norm": 0.6386725902557373, "kl": 0.0490875244140625, "learning_rate": 4.555555555555556e-06, "loss": -0.0729, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017722753807902336, "mask/share_reasoning": 0.7773354053497314, "mask/share_step_conf": 0.18931682407855988, "num_tokens": 12209956.0, "reward": 0.9249991774559021, "reward_std": 0.1157519668340683, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7035890817642212, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8245342969894409, "step": 37 }, { "adv/mean_abs_final_conf": 0.763925313949585, "adv/mean_abs_reasoning": 0.40792927145957947, "adv/mean_abs_step_conf": 0.7357197999954224, "adv/ratio_final_to_reasoning": 1.8726906044673974, "adv/ratio_step_to_reasoning": 1.8035474565554013, "adv/std_final_conf": 0.9305187463760376, "adv/std_reasoning": 0.6816024780273438, "adv/std_step_conf": 0.9308106899261475, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.39997705018359847, "calib/avg_num_step_conf": 18.3203125, "calib/ece": 0.14907258064516135, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.2056451612903226, "calib/gap": -0.046398408812729275, "calib/mean_conf": 0.7708467741935484, "calib/mu_c": 0.7566279069767443, "calib/mu_w": 0.8030263157894736, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11318548387096779, "calib/std_conf": 0.12580582480199515, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6426657112146184, "calib/step_q_c_n": 2791.0, "calib/step_q_gap": -0.03901090805868335, "calib/step_q_w": 0.6816766192733017, "calib/step_q_w_n": 1899.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 930.3828125, "completions/mean_terminated_length": 956.5381469726562, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.04053333333333333, "grad_norm": 8.489635467529297, "kl": 0.06583404541015625, "learning_rate": 4.527777777777778e-06, "loss": -0.0696, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01635890081524849, "mask/share_reasoning": 0.7625895738601685, "mask/share_step_conf": 0.19370779395103455, "num_tokens": 12555022.0, "reward": 0.9193906188011169, "reward_std": 0.13446089625358582, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7226253747940063, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7880308628082275, "step": 38 }, { "adv/mean_abs_final_conf": 0.7229318022727966, "adv/mean_abs_reasoning": 0.4046805799007416, "adv/mean_abs_step_conf": 0.7568466663360596, "adv/ratio_final_to_reasoning": 1.7864257347118422, "adv/ratio_step_to_reasoning": 1.8702322372911886, "adv/std_final_conf": 0.9026696085929871, "adv/std_reasoning": 0.6816485524177551, "adv/std_step_conf": 0.9291907548904419, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.3354502688172043, "calib/avg_num_step_conf": 17.78125, "calib/ece": 0.2783266932270917, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.38645418326693226, "calib/gap": -0.08987163978494594, "calib/mean_conf": 0.8131474103585659, "calib/mu_c": 0.7787741935483873, "calib/mu_w": 0.8686458333333332, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2369721115537849, "calib/std_conf": 0.15061115266155678, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6284839136183341, "calib/step_q_c_n": 2269.0, "calib/step_q_gap": -0.06271626158972543, "calib/step_q_w": 0.6912001752080595, "calib/step_q_w_n": 2283.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 914.67578125, "completions/mean_terminated_length": 932.8964233398438, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.0416, "grad_norm": 2.350914478302002, "kl": 0.076629638671875, "learning_rate": 4.5e-06, "loss": -0.0759, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017323926091194153, "mask/share_reasoning": 0.7676963806152344, "mask/share_step_conf": 0.19544848799705505, "num_tokens": 12895267.0, "reward": 0.8789174556732178, "reward_std": 0.15501967072486877, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6475117206573486, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7939168810844421, "step": 39 }, { "adv/mean_abs_final_conf": 0.6927260160446167, "adv/mean_abs_reasoning": 0.3538864254951477, "adv/mean_abs_step_conf": 0.7473255395889282, "adv/ratio_final_to_reasoning": 1.957481175140794, "adv/ratio_step_to_reasoning": 2.111766617053174, "adv/std_final_conf": 0.9034165740013123, "adv/std_reasoning": 0.640332043170929, "adv/std_step_conf": 0.9314160346984863, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5539537350036455, "calib/avg_num_step_conf": 19.25390625, "calib/ece": 0.31169354838709673, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5403225806451613, "calib/gap": 0.031000198846689142, "calib/mean_conf": 0.880241935483871, "calib/mu_c": 0.8936170212765957, "calib/mu_w": 0.8626168224299066, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31169354838709673, "calib/std_conf": 0.13433806482441984, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6769523457702253, "calib/step_q_c_n": 2707.0, "calib/step_q_gap": -0.01144099356370809, "calib/step_q_w": 0.6883933393339334, "calib/step_q_w_n": 2222.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2566.0, "completions/max_terminated_length": 2566.0, "completions/mean_length": 1026.09765625, "completions/mean_terminated_length": 1042.385009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 314.0, "epoch": 0.042666666666666665, "grad_norm": 6.329873085021973, "kl": 0.078125, "learning_rate": 4.472222222222223e-06, "loss": -0.0576, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.015485573559999466, "mask/share_reasoning": 0.7712819576263428, "mask/share_step_conf": 0.19760745763778687, "num_tokens": 13264708.0, "reward": 0.8414897918701172, "reward_std": 0.15075916051864624, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6342484354972839, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7448247671127319, "step": 40 }, { "adv/mean_abs_final_conf": 0.4674037992954254, "adv/mean_abs_reasoning": 0.2338797003030777, "adv/mean_abs_step_conf": 0.7664213180541992, "adv/ratio_final_to_reasoning": 1.9984795546160306, "adv/ratio_step_to_reasoning": 3.276989482460499, "adv/std_final_conf": 0.7384918332099915, "adv/std_reasoning": 0.5227841734886169, "adv/std_step_conf": 0.9294553399085999, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4487588138703757, "calib/avg_num_step_conf": 16.54296875, "calib/ece": 0.21744094488188975, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8818897637795275, "calib/gap": -0.02974693325606126, "calib/mean_conf": 0.9675984251968504, "calib/mu_c": 0.9616256157635467, "calib/mu_w": 0.9913725490196079, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19291338582677164, "calib/std_conf": 0.09021922205947056, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6751368970013036, "calib/step_q_c_n": 3068.0, "calib/step_q_gap": -0.03461460257024729, "calib/step_q_w": 0.7097514995715509, "calib/step_q_w_n": 1167.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2516.0, "completions/max_terminated_length": 2516.0, "completions/mean_length": 882.9921875, "completions/mean_terminated_length": 886.4549560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 365.0, "epoch": 0.04373333333333333, "grad_norm": 5.853041648864746, "kl": 0.1062164306640625, "learning_rate": 4.444444444444444e-06, "loss": 0.0229, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01867281273007393, "mask/share_reasoning": 0.7770313620567322, "mask/share_step_conf": 0.200389564037323, "num_tokens": 13598002.0, "reward": 0.9842904806137085, "reward_std": 0.10741116106510162, "rewards/accuracy_reward_step": 0.79296875, "rewards/final_brier_reward_step": 0.7872886657714844, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8242610692977905, "step": 41 }, { "adv/mean_abs_final_conf": 0.24544203281402588, "adv/mean_abs_reasoning": 0.24031004309654236, "adv/mean_abs_step_conf": 0.7314047813415527, "adv/ratio_final_to_reasoning": 1.0213557022060114, "adv/ratio_step_to_reasoning": 3.0435880744597825, "adv/std_final_conf": 0.5492023825645447, "adv/std_reasoning": 0.5482551455497742, "adv/std_step_conf": 0.9319241046905518, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5061728395061729, "calib/avg_num_step_conf": 16.0703125, "calib/ece": 0.3190909090909091, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": 0.0033333333333332993, "calib/mean_conf": 0.9989328063241107, "calib/mu_c": 1.0, "calib/mu_w": 0.9966666666666667, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3190909090909091, "calib/std_conf": 0.016941174402864172, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6768776210446055, "calib/step_q_c_n": 2623.0, "calib/step_q_gap": -0.019963425233060383, "calib/step_q_w": 0.6968410462776659, "calib/step_q_w_n": 1491.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2526.0, "completions/max_terminated_length": 2526.0, "completions/mean_length": 816.77734375, "completions/mean_terminated_length": 829.7421264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 421.0, "epoch": 0.0448, "grad_norm": 0.8582086563110352, "kl": 0.12103271484375, "learning_rate": 4.416666666666667e-06, "loss": -0.0349, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01830482669174671, "mask/share_reasoning": 0.767654299736023, "mask/share_step_conf": 0.1984158754348755, "num_tokens": 13911465.0, "reward": 0.8988593816757202, "reward_std": 0.11435660719871521, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6736996173858643, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7919878959655762, "step": 42 }, { "adv/mean_abs_final_conf": 0.31151294708251953, "adv/mean_abs_reasoning": 0.3551718294620514, "adv/mean_abs_step_conf": 0.7394903898239136, "adv/ratio_final_to_reasoning": 0.8770767308723266, "adv/ratio_step_to_reasoning": 2.08206374628291, "adv/std_final_conf": 0.5970984697341919, "adv/std_reasoning": 0.6402140259742737, "adv/std_step_conf": 0.9295834302902222, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5062528115159695, "calib/avg_num_step_conf": 18.03125, "calib/ece": 0.22428571428571428, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.003433198380566771, "calib/mean_conf": 0.9980952380952381, "calib/mu_c": 0.9988717948717949, "calib/mu_w": 0.9954385964912281, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22428571428571428, "calib/std_conf": 0.021370291116492263, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6324468514612834, "calib/step_q_c_n": 3319.0, "calib/step_q_gap": -0.03133109765205511, "calib/step_q_w": 0.6637779491133385, "calib/step_q_w_n": 1297.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2503.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 924.08203125, "completions/mean_terminated_length": 938.7500610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.04586666666666667, "grad_norm": 0.5675400495529175, "kl": 0.0981597900390625, "learning_rate": 4.388888888888889e-06, "loss": -0.0448, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.017082808539271355, "mask/share_reasoning": 0.769527792930603, "mask/share_step_conf": 0.1977643519639969, "num_tokens": 14253254.0, "reward": 0.9661224484443665, "reward_std": 0.13642850518226624, "rewards/accuracy_reward_step": 0.76171875, "rewards/final_brier_reward_step": 0.7632968425750732, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.819729208946228, "step": 43 }, { "adv/mean_abs_final_conf": 0.40735363960266113, "adv/mean_abs_reasoning": 0.39606475830078125, "adv/mean_abs_step_conf": 0.7465857267379761, "adv/ratio_final_to_reasoning": 1.0285026149519387, "adv/ratio_step_to_reasoning": 1.8850092341995262, "adv/std_final_conf": 0.7025128602981567, "adv/std_reasoning": 0.701213002204895, "adv/std_step_conf": 0.9330065846443176, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.496235387358827, "calib/avg_num_step_conf": 18.87890625, "calib/ece": 0.41003999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.976, "calib/gap": -0.0013598837593288282, "calib/mean_conf": 0.9898800000000001, "calib/mu_c": 0.9893197278911565, "calib/mu_w": 0.9906796116504853, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40596, "calib/std_conf": 0.0660695512320161, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5696090003688676, "calib/step_q_c_n": 2711.0, "calib/step_q_gap": -0.015537088226796891, "calib/step_q_w": 0.5851460885956645, "calib/step_q_w_n": 2122.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2650.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 964.74609375, "completions/mean_terminated_length": 987.9000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 0.046933333333333334, "grad_norm": 1.150726556777954, "kl": 0.0948333740234375, "learning_rate": 4.361111111111112e-06, "loss": -0.0994, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.015860669314861298, "mask/share_reasoning": 0.765245258808136, "mask/share_step_conf": 0.1954565942287445, "num_tokens": 14606549.0, "reward": 0.8504647016525269, "reward_std": 0.1694660782814026, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.5773558616638184, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8134173154830933, "step": 44 }, { "adv/mean_abs_final_conf": 0.6760562062263489, "adv/mean_abs_reasoning": 0.36660242080688477, "adv/mean_abs_step_conf": 0.7361263036727905, "adv/ratio_final_to_reasoning": 1.8441127713733105, "adv/ratio_step_to_reasoning": 2.0079690200970055, "adv/std_final_conf": 0.8596835136413574, "adv/std_reasoning": 0.6612597107887268, "adv/std_step_conf": 0.9321467280387878, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6382594021215043, "calib/avg_num_step_conf": 18.43359375, "calib/ece": 0.17344621513944228, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6055776892430279, "calib/gap": 0.1140340726454514, "calib/mean_conf": 0.825199203187251, "calib/mu_c": 0.8560928961748632, "calib/mu_w": 0.7420588235294118, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13478087649402393, "calib/std_conf": 0.22110677811872337, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4913256907701352, "calib/step_q_c_n": 3402.0, "calib/step_q_gap": -0.0069431019405709615, "calib/step_q_w": 0.4982687927107062, "calib/step_q_w_n": 1317.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 931.734375, "completions/mean_terminated_length": 946.5238647460938, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 0.048, "grad_norm": 9.781808853149414, "kl": 0.0993499755859375, "learning_rate": 4.333333333333334e-06, "loss": -0.0653, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017006918787956238, "mask/share_reasoning": 0.7669587731361389, "mask/share_step_conf": 0.20040932297706604, "num_tokens": 14950121.0, "reward": 0.979068398475647, "reward_std": 0.14598211646080017, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7739831209182739, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8450911045074463, "step": 45 }, { "adv/mean_abs_final_conf": 0.6329466104507446, "adv/mean_abs_reasoning": 0.4576505124568939, "adv/mean_abs_step_conf": 0.755608320236206, "adv/ratio_final_to_reasoning": 1.3830348556866563, "adv/ratio_step_to_reasoning": 1.6510597053190819, "adv/std_final_conf": 0.8438161611557007, "adv/std_reasoning": 0.7206340432167053, "adv/std_step_conf": 0.9321249127388, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5197687138615408, "calib/avg_num_step_conf": 18.9453125, "calib/ece": 0.24536307053941905, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5311203319502075, "calib/gap": 0.014447569932802051, "calib/mean_conf": 0.767085062240664, "calib/mu_c": 0.771820987654321, "calib/mu_w": 0.7573734177215189, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17012448132780084, "calib/std_conf": 0.2530070110899711, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44540912449205755, "calib/step_q_c_n": 2707.0, "calib/step_q_gap": -0.04950429128022438, "calib/step_q_w": 0.49491341577228193, "calib/step_q_w_n": 2143.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 904.48046875, "completions/mean_terminated_length": 956.8057250976562, "completions/min_length": 0.0, "completions/min_terminated_length": 316.0, "epoch": 0.04906666666666667, "grad_norm": 2.4032022953033447, "kl": 0.0972747802734375, "learning_rate": 4.305555555555556e-06, "loss": -0.1157, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017247095704078674, "mask/share_reasoning": 0.7377867698669434, "mask/share_step_conf": 0.19027858972549438, "num_tokens": 15286436.0, "reward": 0.8975282907485962, "reward_std": 0.16433261334896088, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6712260842323303, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8089866638183594, "step": 46 }, { "adv/mean_abs_final_conf": 0.7495410442352295, "adv/mean_abs_reasoning": 0.2339608073234558, "adv/mean_abs_step_conf": 0.7660993337631226, "adv/ratio_final_to_reasoning": 3.203703444222489, "adv/ratio_step_to_reasoning": 3.2744772191864335, "adv/std_final_conf": 0.9358264207839966, "adv/std_reasoning": 0.5227507948875427, "adv/std_step_conf": 0.9310525059700012, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.42580899763220204, "calib/avg_num_step_conf": 17.2265625, "calib/ece": 0.298126812749004, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.26294820717131473, "calib/gap": -0.06323209155485376, "calib/mean_conf": 0.6142237848605577, "calib/mu_c": 0.5965893370165747, "calib/mu_w": 0.6598214285714284, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09561752988047809, "calib/std_conf": 0.23729989030730314, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41470974134917865, "calib/step_q_c_n": 2861.0, "calib/step_q_gap": -0.027415994932293297, "calib/step_q_w": 0.44212573628147195, "calib/step_q_w_n": 1549.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2970.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 897.2109375, "completions/mean_terminated_length": 915.0836791992188, "completions/min_length": 0.0, "completions/min_terminated_length": 367.0, "epoch": 0.050133333333333335, "grad_norm": 6.013477325439453, "kl": 0.1095428466796875, "learning_rate": 4.277777777777778e-06, "loss": -0.0614, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017619537189602852, "mask/share_reasoning": 0.7625261545181274, "mask/share_step_conf": 0.20032307505607605, "num_tokens": 15622098.0, "reward": 0.9354605674743652, "reward_std": 0.1320100575685501, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.6919384002685547, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8414826989173889, "step": 47 }, { "adv/mean_abs_final_conf": 0.6579278111457825, "adv/mean_abs_reasoning": 0.3850702941417694, "adv/mean_abs_step_conf": 0.7593048810958862, "adv/ratio_final_to_reasoning": 1.7085914472113408, "adv/ratio_step_to_reasoning": 1.9718604437877951, "adv/std_final_conf": 0.8437376618385315, "adv/std_reasoning": 0.6612684726715088, "adv/std_step_conf": 0.9322788715362549, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5509906759906761, "calib/avg_num_step_conf": 16.45703125, "calib/ece": 0.23585887096774188, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5967741935483871, "calib/gap": 0.046279886779886925, "calib/mean_conf": 0.7883346774193548, "calib/mu_c": 0.800651098901099, "calib/mu_w": 0.754371212121212, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14516129032258063, "calib/std_conf": 0.2623918803585998, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4058072773352643, "calib/step_q_c_n": 2762.0, "calib/step_q_gap": -0.0073858548494360465, "calib/step_q_w": 0.41319313218470033, "calib/step_q_w_n": 1451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2644.0, "completions/max_terminated_length": 2644.0, "completions/mean_length": 827.75390625, "completions/mean_terminated_length": 844.2430419921875, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.0512, "grad_norm": 16.120351791381836, "kl": 0.1216888427734375, "learning_rate": 4.25e-06, "loss": -0.0668, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019641082733869553, "mask/share_reasoning": 0.758228063583374, "mask/share_step_conf": 0.20259957015514374, "num_tokens": 15937691.0, "reward": 0.9367126822471619, "reward_std": 0.12111753970384598, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.727489709854126, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8107792735099792, "step": 48 }, { "adv/mean_abs_final_conf": 0.49926644563674927, "adv/mean_abs_reasoning": 0.4738897979259491, "adv/mean_abs_step_conf": 0.7442777752876282, "adv/ratio_final_to_reasoning": 1.05354968142776, "adv/ratio_step_to_reasoning": 1.5705714251395013, "adv/std_final_conf": 0.7588150501251221, "adv/std_reasoning": 0.7393327951431274, "adv/std_step_conf": 0.9322012066841125, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5067469295410472, "calib/avg_num_step_conf": 15.37890625, "calib/ece": 0.26702, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.952, "calib/gap": 0.004473173884938397, "calib/mean_conf": 0.97458, "calib/mu_c": 0.9757967032967031, "calib/mu_w": 0.9713235294117647, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25680000000000003, "calib/std_conf": 0.11425901977524575, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40485632148040634, "calib/step_q_c_n": 2756.0, "calib/step_q_gap": -0.0040649316948688785, "calib/step_q_w": 0.4089212531752752, "calib/step_q_w_n": 1181.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 794.8515625, "completions/mean_terminated_length": 810.685302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 0.05226666666666667, "grad_norm": 8.985692024230957, "kl": 0.121917724609375, "learning_rate": 4.222222222222223e-06, "loss": -0.1164, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018479930236935616, "mask/share_reasoning": 0.7629247903823853, "mask/share_step_conf": 0.19906404614448547, "num_tokens": 16245709.0, "reward": 0.951441764831543, "reward_std": 0.18734773993492126, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7127916812896729, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8525917530059814, "step": 49 }, { "adv/mean_abs_final_conf": 0.238711878657341, "adv/mean_abs_reasoning": 0.23853182792663574, "adv/mean_abs_step_conf": 0.7555795907974243, "adv/ratio_final_to_reasoning": 1.0007548289562458, "adv/ratio_step_to_reasoning": 3.1676258777080886, "adv/std_final_conf": 0.523689866065979, "adv/std_reasoning": 0.5228677988052368, "adv/std_step_conf": 0.931075394153595, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.506578947368421, "calib/avg_num_step_conf": 16.234375, "calib/ece": 0.303012048192771, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.007236842105263097, "calib/mean_conf": 0.9977911646586345, "calib/mu_c": 1.0, "calib/mu_w": 0.9927631578947369, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.303012048192771, "calib/std_conf": 0.03478477374061442, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39436594846156864, "calib/step_q_c_n": 2550.0, "calib/step_q_gap": -0.0184179301187552, "calib/step_q_w": 0.41278387858032384, "calib/step_q_w_n": 1606.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2710.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 819.47265625, "completions/mean_terminated_length": 835.7968139648438, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.05333333333333334, "grad_norm": 0.6560844779014587, "kl": 0.1348114013671875, "learning_rate": 4.194444444444445e-06, "loss": -0.0714, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019871089607477188, "mask/share_reasoning": 0.7585107088088989, "mask/share_step_conf": 0.20208695530891418, "num_tokens": 16560854.0, "reward": 0.9203869104385376, "reward_std": 0.10696984082460403, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6788964867591858, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8314086198806763, "step": 50 }, { "adv/mean_abs_final_conf": 0.38873496651649475, "adv/mean_abs_reasoning": 0.3800598084926605, "adv/mean_abs_step_conf": 0.762474000453949, "adv/ratio_final_to_reasoning": 1.0228257706549935, "adv/ratio_step_to_reasoning": 2.0061947709702994, "adv/std_final_conf": 0.662423849105835, "adv/std_reasoning": 0.6613155603408813, "adv/std_step_conf": 0.931913435459137, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5347456267051838, "calib/avg_num_step_conf": 16.8515625, "calib/ece": 0.25079051383399215, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9762845849802372, "calib/gap": 0.042631198844487295, "calib/mean_conf": 0.9859683794466403, "calib/mu_c": 0.9972580645161291, "calib/mu_w": 0.9546268656716418, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25079051383399215, "calib/std_conf": 0.09131641006509278, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38147910707933125, "calib/step_q_c_n": 2811.0, "calib/step_q_gap": 0.004981652467580311, "calib/step_q_w": 0.37649745461175094, "calib/step_q_w_n": 1503.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2213.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 832.00390625, "completions/mean_terminated_length": 845.2103881835938, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.0544, "grad_norm": 1.00384521484375, "kl": 0.131195068359375, "learning_rate": 4.166666666666667e-06, "loss": 0.0174, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019142724573612213, "mask/share_reasoning": 0.7567824125289917, "mask/share_step_conf": 0.20844991505146027, "num_tokens": 16883143.0, "reward": 0.9582247734069824, "reward_std": 0.15678593516349792, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7418769598007202, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8323850631713867, "step": 51 }, { "adv/mean_abs_final_conf": 0.3104902505874634, "adv/mean_abs_reasoning": 0.27709537744522095, "adv/mean_abs_step_conf": 0.7418975830078125, "adv/ratio_final_to_reasoning": 1.1205176118423132, "adv/ratio_step_to_reasoning": 2.677408731419485, "adv/std_final_conf": 0.5970803499221802, "adv/std_reasoning": 0.5725939869880676, "adv/std_step_conf": 0.9316956400871277, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.48947368421052634, "calib/avg_num_step_conf": 16.734375, "calib/ece": 0.2659607843137255, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.984313725490196, "calib/gap": -0.014842105263157879, "calib/mean_conf": 0.9889411764705882, "calib/mu_c": 0.9851578947368421, "calib/mu_w": 1.0, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2549019607843137, "calib/std_conf": 0.08772090585548072, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38782724006644514, "calib/step_q_c_n": 3010.0, "calib/step_q_gap": 0.04162455081605265, "calib/step_q_w": 0.3462026892503925, "calib/step_q_w_n": 1274.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2302.0, "completions/max_terminated_length": 2302.0, "completions/mean_length": 858.3046875, "completions/mean_terminated_length": 865.06298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 319.0, "epoch": 0.055466666666666664, "grad_norm": 0.48756399750709534, "kl": 0.135009765625, "learning_rate": 4.138888888888889e-06, "loss": -0.0203, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0193092729896307, "mask/share_reasoning": 0.7665286660194397, "mask/share_step_conf": 0.20634955167770386, "num_tokens": 17210821.0, "reward": 0.9706714153289795, "reward_std": 0.11463622748851776, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.734400749206543, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8592857122421265, "step": 52 }, { "adv/mean_abs_final_conf": 0.3958958089351654, "adv/mean_abs_reasoning": 0.3979988098144531, "adv/mean_abs_step_conf": 0.7480939030647278, "adv/ratio_final_to_reasoning": 0.9947160623915732, "adv/ratio_step_to_reasoning": 1.8796385431742593, "adv/std_final_conf": 0.6624305248260498, "adv/std_reasoning": 0.661271870136261, "adv/std_step_conf": 0.9320031404495239, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5079365079365079, "calib/avg_num_step_conf": 16.515625, "calib/ece": 0.2447450980392157, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.009365079365079465, "calib/mean_conf": 0.997686274509804, "calib/mu_c": 1.0, "calib/mu_w": 0.9906349206349205, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2447450980392157, "calib/std_conf": 0.03687471645411939, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36148193094462544, "calib/step_q_c_n": 3070.0, "calib/step_q_gap": -0.018805372164182876, "calib/step_q_w": 0.3802873031088083, "calib/step_q_w_n": 1158.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2077.0, "completions/max_terminated_length": 2077.0, "completions/mean_length": 861.42578125, "completions/mean_terminated_length": 868.2086791992188, "completions/min_length": 0.0, "completions/min_terminated_length": 316.0, "epoch": 0.05653333333333333, "grad_norm": 0.7224613428115845, "kl": 0.1280517578125, "learning_rate": 4.111111111111111e-06, "loss": -0.0055, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018144361674785614, "mask/share_reasoning": 0.7759116888046265, "mask/share_step_conf": 0.1981314867734909, "num_tokens": 17537170.0, "reward": 0.971244215965271, "reward_std": 0.14458441734313965, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7532496452331543, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8400200009346008, "step": 53 }, { "adv/mean_abs_final_conf": 0.2834058105945587, "adv/mean_abs_reasoning": 0.2771291732788086, "adv/mean_abs_step_conf": 0.7611951231956482, "adv/ratio_final_to_reasoning": 1.0226487786958303, "adv/ratio_step_to_reasoning": 2.7467159598886406, "adv/std_final_conf": 0.5969284772872925, "adv/std_reasoning": 0.5959136486053467, "adv/std_step_conf": 0.9312698245048523, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5777011494252873, "calib/avg_num_step_conf": 14.87109375, "calib/ece": 0.1039370078740157, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9645669291338582, "calib/gap": 0.08632950191570865, "calib/mean_conf": 0.9799212598425195, "calib/mu_c": 0.9897777777777778, "calib/mu_w": 0.9034482758620691, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09901574803149601, "calib/std_conf": 0.10625327091207667, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3776230558096981, "calib/step_q_c_n": 3279.0, "calib/step_q_gap": 0.008109798233940535, "calib/step_q_w": 0.3695132575757576, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2520.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 779.32421875, "completions/mean_terminated_length": 785.4606323242188, "completions/min_length": 0.0, "completions/min_terminated_length": 259.0, "epoch": 0.0576, "grad_norm": 1003.9580078125, "kl": 7.28546142578125, "learning_rate": 4.083333333333334e-06, "loss": 0.0699, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02094903402030468, "mask/share_reasoning": 0.7728541493415833, "mask/share_step_conf": 0.19838431477546692, "num_tokens": 17842909.0, "reward": 1.042170763015747, "reward_std": 0.10559922456741333, "rewards/accuracy_reward_step": 0.87890625, "rewards/final_brier_reward_step": 0.8891797065734863, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8209431171417236, "step": 54 }, { "adv/mean_abs_final_conf": 0.42238491773605347, "adv/mean_abs_reasoning": 0.35225415229797363, "adv/mean_abs_step_conf": 0.729461669921875, "adv/ratio_final_to_reasoning": 1.199091380415456, "adv/ratio_step_to_reasoning": 2.0708390949067352, "adv/std_final_conf": 0.702567994594574, "adv/std_reasoning": 0.6611537933349609, "adv/std_step_conf": 0.9324063062667847, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6214314570428225, "calib/avg_num_step_conf": 17.1875, "calib/ece": 0.3098809523809524, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.876984126984127, "calib/gap": 0.1343105305682738, "calib/mean_conf": 0.9309126984126984, "calib/mu_c": 0.9810126582278481, "calib/mu_w": 0.8467021276595743, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30690476190476196, "calib/std_conf": 0.1857213529722994, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3926400347372992, "calib/step_q_c_n": 2303.0, "calib/step_q_gap": -0.009996922821117582, "calib/step_q_w": 0.4026369575584168, "calib/step_q_w_n": 2097.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 837.421875, "completions/mean_terminated_length": 850.71435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 236.0, "epoch": 0.058666666666666666, "grad_norm": 1.3093879222869873, "kl": 0.125213623046875, "learning_rate": 4.055555555555556e-06, "loss": -0.0721, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018828079104423523, "mask/share_reasoning": 0.760090708732605, "mask/share_step_conf": 0.20545616745948792, "num_tokens": 18165113.0, "reward": 0.9341639280319214, "reward_std": 0.148420050740242, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6911137104034424, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8569015264511108, "step": 55 }, { "adv/mean_abs_final_conf": 0.47167450189590454, "adv/mean_abs_reasoning": 0.3771626353263855, "adv/mean_abs_step_conf": 0.7823553085327148, "adv/ratio_final_to_reasoning": 1.250586504911154, "adv/ratio_step_to_reasoning": 2.074318172731208, "adv/std_final_conf": 0.7215442657470703, "adv/std_reasoning": 0.6613060235977173, "adv/std_step_conf": 0.93223637342453, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5537507171543317, "calib/avg_num_step_conf": 16.3828125, "calib/ece": 0.25859229747675966, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7529880478087649, "calib/gap": 0.06329867087397223, "calib/mean_conf": 0.8569455511288181, "calib/mu_c": 0.8778769841269842, "calib/mu_w": 0.814578313253012, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22310756972111553, "calib/std_conf": 0.2527051731099045, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3889309542280838, "calib/step_q_c_n": 2578.0, "calib/step_q_gap": -0.012464466563995358, "calib/step_q_w": 0.4013954207920792, "calib/step_q_w_n": 1616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 819.80078125, "completions/mean_terminated_length": 836.1314697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 324.0, "epoch": 0.05973333333333333, "grad_norm": 1.1368886232376099, "kl": 0.13116455078125, "learning_rate": 4.027777777777779e-06, "loss": -0.0813, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019105419516563416, "mask/share_reasoning": 0.764003574848175, "mask/share_step_conf": 0.19735974073410034, "num_tokens": 18481822.0, "reward": 0.9351576566696167, "reward_std": 0.13738873600959778, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6938068866729736, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8491647243499756, "step": 56 }, { "adv/mean_abs_final_conf": 0.5013920664787292, "adv/mean_abs_reasoning": 0.2982490062713623, "adv/mean_abs_step_conf": 0.7509977221488953, "adv/ratio_final_to_reasoning": 1.6811189842575265, "adv/ratio_step_to_reasoning": 2.5180225461190604, "adv/std_final_conf": 0.7581309080123901, "adv/std_reasoning": 0.5727874636650085, "adv/std_step_conf": 0.9307642579078674, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7988700564971751, "calib/avg_num_step_conf": 16.10546875, "calib/ece": 0.07724409448818895, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6614173228346457, "calib/gap": 0.32819643633202944, "calib/mean_conf": 0.8109448818897639, "calib/mu_c": 0.8871794871794871, "calib/mu_w": 0.5589830508474577, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06023622047244091, "calib/std_conf": 0.2692609689556561, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4002271791352094, "calib/step_q_c_n": 2914.0, "calib/step_q_gap": -0.006001108705981695, "calib/step_q_w": 0.4062282878411911, "calib/step_q_w_n": 1209.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2518.0, "completions/max_terminated_length": 2518.0, "completions/mean_length": 813.8046875, "completions/mean_terminated_length": 820.2125854492188, "completions/min_length": 0.0, "completions/min_terminated_length": 379.0, "epoch": 0.0608, "grad_norm": 1.7037533521652222, "kl": 0.127044677734375, "learning_rate": 4.000000000000001e-06, "loss": -0.0952, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018915927037596703, "mask/share_reasoning": 0.7732715606689453, "mask/share_step_conf": 0.20000000298023224, "num_tokens": 18796948.0, "reward": 1.0256168842315674, "reward_std": 0.09571676701307297, "rewards/accuracy_reward_step": 0.76171875, "rewards/final_brier_reward_step": 0.8576023578643799, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8428501486778259, "step": 57 }, { "adv/mean_abs_final_conf": 0.6737596988677979, "adv/mean_abs_reasoning": 0.4155024290084839, "adv/mean_abs_step_conf": 0.7450528740882874, "adv/ratio_final_to_reasoning": 1.621554175930078, "adv/ratio_step_to_reasoning": 1.793137228743071, "adv/std_final_conf": 0.874968409538269, "adv/std_reasoning": 0.7205427289009094, "adv/std_step_conf": 0.9317278265953064, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6546511627906976, "calib/avg_num_step_conf": 17.87890625, "calib/ece": 0.18950199203187246, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4820717131474104, "calib/gap": 0.1429517265680057, "calib/mean_conf": 0.7192629482071714, "calib/mu_c": 0.7682424242424244, "calib/mu_w": 0.6252906976744187, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1256972111553785, "calib/std_conf": 0.27699072032306954, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4372982394366198, "calib/step_q_c_n": 2840.0, "calib/step_q_gap": 0.011571123719866794, "calib/step_q_w": 0.425727115716753, "calib/step_q_w_n": 1737.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 895.7734375, "completions/mean_terminated_length": 913.6175537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 382.0, "epoch": 0.06186666666666667, "grad_norm": 2.793691396713257, "kl": 0.1250762939453125, "learning_rate": 3.972222222222223e-06, "loss": -0.1449, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017459027469158173, "mask/share_reasoning": 0.771247148513794, "mask/share_step_conf": 0.19176256656646729, "num_tokens": 19132586.0, "reward": 0.9520823359489441, "reward_std": 0.1335805356502533, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7437897324562073, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8353749513626099, "step": 58 }, { "adv/mean_abs_final_conf": 0.4890376329421997, "adv/mean_abs_reasoning": 0.4541805386543274, "adv/mean_abs_step_conf": 0.772358775138855, "adv/ratio_final_to_reasoning": 1.0767472212507143, "adv/ratio_step_to_reasoning": 1.7005545359280356, "adv/std_final_conf": 0.7584484219551086, "adv/std_reasoning": 0.7205294966697693, "adv/std_step_conf": 0.9323009252548218, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6304761904761904, "calib/avg_num_step_conf": 15.953125, "calib/ece": 0.18223999999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.812, "calib/gap": 0.14259523809523822, "calib/mean_conf": 0.90224, "calib/mu_c": 0.9421666666666668, "calib/mu_w": 0.7995714285714286, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18223999999999999, "calib/std_conf": 0.2077300710056202, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47640015034836813, "calib/step_q_c_n": 2727.0, "calib/step_q_gap": 0.024071484172981217, "calib/step_q_w": 0.4523286661753869, "calib/step_q_w_n": 1357.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 833.0390625, "completions/mean_terminated_length": 849.6334838867188, "completions/min_length": 0.0, "completions/min_terminated_length": 282.0, "epoch": 0.06293333333333333, "grad_norm": 0.9994844794273376, "kl": 0.14361572265625, "learning_rate": 3.944444444444445e-06, "loss": -0.0986, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019324954599142075, "mask/share_reasoning": 0.7716541290283203, "mask/share_step_conf": 0.1894896924495697, "num_tokens": 19452092.0, "reward": 0.972926139831543, "reward_std": 0.1648274064064026, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.761260986328125, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8478724956512451, "step": 59 }, { "adv/mean_abs_final_conf": 0.528482973575592, "adv/mean_abs_reasoning": 0.4964033365249634, "adv/mean_abs_step_conf": 0.7247074842453003, "adv/ratio_final_to_reasoning": 1.064624136645011, "adv/ratio_step_to_reasoning": 1.459916626102, "adv/std_final_conf": 0.7766311764717102, "adv/std_reasoning": 0.7752928137779236, "adv/std_step_conf": 0.933074414730072, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.612676597051597, "calib/avg_num_step_conf": 15.51171875, "calib/ece": 0.24340000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.876, "calib/gap": 0.11940264127764122, "calib/mean_conf": 0.9346, "calib/mu_c": 0.9699431818181817, "calib/mu_w": 0.8505405405405405, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23700000000000004, "calib/std_conf": 0.17584322563010496, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5086385461689588, "calib/step_q_c_n": 2545.0, "calib/step_q_gap": 0.05656280984357309, "calib/step_q_w": 0.45207573632538567, "calib/step_q_w_n": 1426.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2532.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 819.41015625, "completions/mean_terminated_length": 822.6235961914062, "completions/min_length": 0.0, "completions/min_terminated_length": 310.0, "epoch": 0.064, "grad_norm": 0.8521646857261658, "kl": 0.1455841064453125, "learning_rate": 3.916666666666667e-06, "loss": 0.0068, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01989622786641121, "mask/share_reasoning": 0.7782717347145081, "mask/share_step_conf": 0.19792579114437103, "num_tokens": 19770717.0, "reward": 0.967574417591095, "reward_std": 0.19966405630111694, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7395331859588623, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.862021803855896, "step": 60 }, { "adv/mean_abs_final_conf": 0.20381410419940948, "adv/mean_abs_reasoning": 0.20344872772693634, "adv/mean_abs_step_conf": 0.7607905864715576, "adv/ratio_final_to_reasoning": 1.0017959142657482, "adv/ratio_step_to_reasoning": 3.7394708483636783, "adv/std_final_conf": 0.4968036413192749, "adv/std_reasoning": 0.49589797854423523, "adv/std_step_conf": 0.9322981834411621, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5, "calib/avg_num_step_conf": 13.79296875, "calib/ece": 0.21513944223107573, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 1.0, "calib/mu_c": 1.0, "calib/mu_w": 1.0, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21513944223107573, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5691638418079097, "calib/step_q_c_n": 2655.0, "calib/step_q_gap": -0.020185473260583398, "calib/step_q_w": 0.5893493150684931, "calib/step_q_w_n": 876.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 709.79296875, "completions/mean_terminated_length": 712.5765380859375, "completions/min_length": 0.0, "completions/min_terminated_length": 263.0, "epoch": 0.06506666666666666, "grad_norm": 0.6356127858161926, "kl": 0.1813812255859375, "learning_rate": 3.88888888888889e-06, "loss": 0.0078, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02276683785021305, "mask/share_reasoning": 0.7717860341072083, "mask/share_step_conf": 0.20154087245464325, "num_tokens": 20056488.0, "reward": 0.973097562789917, "reward_std": 0.10957294702529907, "rewards/accuracy_reward_step": 0.78515625, "rewards/final_brier_reward_step": 0.76953125, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8235387802124023, "step": 61 }, { "adv/mean_abs_final_conf": 0.4145433008670807, "adv/mean_abs_reasoning": 0.40846771001815796, "adv/mean_abs_step_conf": 0.7656244039535522, "adv/ratio_final_to_reasoning": 1.0148741031418436, "adv/ratio_step_to_reasoning": 1.8743817079678522, "adv/std_final_conf": 0.662446141242981, "adv/std_reasoning": 0.6613765954971313, "adv/std_step_conf": 0.9333126544952393, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 15.23828125, "calib/ece": 0.24193548387096775, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 1.0, "calib/mu_c": 1.0, "calib/mu_w": 1.0, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24193548387096775, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5913802622498275, "calib/step_q_c_n": 2898.0, "calib/step_q_gap": 0.01769102994673677, "calib/step_q_w": 0.5736892323030908, "calib/step_q_w_n": 1003.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 794.70703125, "completions/mean_terminated_length": 800.9645385742188, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.06613333333333334, "grad_norm": 0.6255844235420227, "kl": 0.154815673828125, "learning_rate": 3.861111111111112e-06, "loss": 0.0408, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02088448777794838, "mask/share_reasoning": 0.7749018669128418, "mask/share_step_conf": 0.1964011788368225, "num_tokens": 20367013.0, "reward": 0.9518564939498901, "reward_std": 0.182759091258049, "rewards/accuracy_reward_step": 0.74609375, "rewards/final_brier_reward_step": 0.734375, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8263691663742065, "step": 62 }, { "adv/mean_abs_final_conf": 0.4214579463005066, "adv/mean_abs_reasoning": 0.43693113327026367, "adv/mean_abs_step_conf": 0.7564267516136169, "adv/ratio_final_to_reasoning": 0.9645866687183718, "adv/ratio_step_to_reasoning": 1.7312264886048514, "adv/std_final_conf": 0.7218169569969177, "adv/std_reasoning": 0.7205101251602173, "adv/std_step_conf": 0.932865560054779, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.572463768115942, "calib/avg_num_step_conf": 15.7265625, "calib/ece": 0.2667226890756303, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.957983193277311, "calib/gap": 0.08000000000000007, "calib/mean_conf": 0.9768067226890758, "calib/mu_c": 1.0, "calib/mu_w": 0.9199999999999999, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2667226890756303, "calib/std_conf": 0.11492367048976056, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5731888667992049, "calib/step_q_c_n": 2515.0, "calib/step_q_gap": 0.020158423384247892, "calib/step_q_w": 0.553030443414957, "calib/step_q_w_n": 1511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 848.71484375, "completions/mean_terminated_length": 862.1865844726562, "completions/min_length": 0.0, "completions/min_terminated_length": 242.0, "epoch": 0.0672, "grad_norm": 0.714672863483429, "kl": 0.145233154296875, "learning_rate": 3.833333333333334e-06, "loss": -0.0452, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.01956631988286972, "mask/share_reasoning": 0.7764766216278076, "mask/share_step_conf": 0.18833208084106445, "num_tokens": 20692924.0, "reward": 0.9051201343536377, "reward_std": 0.19580185413360596, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6905023455619812, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7978629469871521, "step": 63 }, { "adv/mean_abs_final_conf": 0.4704277515411377, "adv/mean_abs_reasoning": 0.4621776342391968, "adv/mean_abs_step_conf": 0.7612091302871704, "adv/ratio_final_to_reasoning": 1.0178505334112968, "adv/ratio_step_to_reasoning": 1.6470055534820882, "adv/std_final_conf": 0.7218660712242126, "adv/std_reasoning": 0.720646321773529, "adv/std_step_conf": 0.9323759078979492, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5319602272727272, "calib/avg_num_step_conf": 14.328125, "calib/ece": 0.27399193548387096, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9758064516129032, "calib/gap": 0.04646464646464643, "calib/mean_conf": 0.9836693548387097, "calib/mu_c": 0.9971590909090909, "calib/mu_w": 0.9506944444444445, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27399193548387096, "calib/std_conf": 0.11110081311033378, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5757563694267516, "calib/step_q_c_n": 2512.0, "calib/step_q_gap": 0.0056462915720801865, "calib/step_q_w": 0.5701100778546714, "calib/step_q_w_n": 1156.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 741.96484375, "completions/mean_terminated_length": 750.7628784179688, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.06826666666666667, "grad_norm": 0.5652779340744019, "kl": 0.16583251953125, "learning_rate": 3.8055555555555556e-06, "loss": -0.0414, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02139584720134735, "mask/share_reasoning": 0.7729315161705017, "mask/share_step_conf": 0.19395390152931213, "num_tokens": 20986643.0, "reward": 0.9350217580795288, "reward_std": 0.19865821301937103, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7030183672904968, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8334312438964844, "step": 64 }, { "adv/mean_abs_final_conf": 0.36213234066963196, "adv/mean_abs_reasoning": 0.3751865029335022, "adv/mean_abs_step_conf": 0.7652068138122559, "adv/ratio_final_to_reasoning": 0.9652062050158986, "adv/ratio_step_to_reasoning": 2.0395371577316057, "adv/std_final_conf": 0.640243649482727, "adv/std_reasoning": 0.6611436009407043, "adv/std_step_conf": 0.9320917129516602, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5317092309887288, "calib/avg_num_step_conf": 13.81640625, "calib/ece": 0.3333734939759037, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9759036144578314, "calib/gap": 0.040260379512055855, "calib/mean_conf": 0.9799598393574297, "calib/mu_c": 0.9938650306748467, "calib/mu_w": 0.9536046511627908, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3293574297188756, "calib/std_conf": 0.1326362238041872, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5978841296928328, "calib/step_q_c_n": 2344.0, "calib/step_q_gap": 0.03175923447070372, "calib/step_q_w": 0.566124895222129, "calib/step_q_w_n": 1193.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1992.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 722.6171875, "completions/mean_terminated_length": 725.4510498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 257.0, "epoch": 0.06933333333333333, "grad_norm": 0.8071674108505249, "kl": 0.1625518798828125, "learning_rate": 3.777777777777778e-06, "loss": -0.0419, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02218158356845379, "mask/share_reasoning": 0.7751785516738892, "mask/share_step_conf": 0.1987336277961731, "num_tokens": 21276657.0, "reward": 0.896655261516571, "reward_std": 0.15456606447696686, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6503886580467224, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8171405792236328, "step": 65 }, { "adv/mean_abs_final_conf": 0.45162153244018555, "adv/mean_abs_reasoning": 0.38963302969932556, "adv/mean_abs_step_conf": 0.7725710272789001, "adv/ratio_final_to_reasoning": 1.1590945787853142, "adv/ratio_step_to_reasoning": 1.982817082717763, "adv/std_final_conf": 0.7025941610336304, "adv/std_reasoning": 0.6613808274269104, "adv/std_step_conf": 0.9321988224983215, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5274750525210083, "calib/avg_num_step_conf": 16.69140625, "calib/ece": 0.4310887096774193, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9153225806451613, "calib/gap": 0.02890231092436968, "calib/mean_conf": 0.9391532258064516, "calib/mu_c": 0.9522058823529411, "calib/mu_w": 0.9233035714285714, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4109274193548386, "calib/std_conf": 0.21163078562694523, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5852220595412396, "calib/step_q_c_n": 2049.0, "calib/step_q_gap": 0.04228321062037632, "calib/step_q_w": 0.5429388489208633, "calib/step_q_w_n": 2224.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 867.703125, "completions/mean_terminated_length": 881.4762573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 248.0, "epoch": 0.0704, "grad_norm": 1.3051528930664062, "kl": 0.13916015625, "learning_rate": 3.7500000000000005e-06, "loss": -0.0313, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019024869427084923, "mask/share_reasoning": 0.7749911546707153, "mask/share_step_conf": 0.1903589814901352, "num_tokens": 21605141.0, "reward": 0.8392418622970581, "reward_std": 0.17693649232387543, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5513848066329956, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8270989656448364, "step": 66 }, { "adv/mean_abs_final_conf": 0.4753112494945526, "adv/mean_abs_reasoning": 0.3199561834335327, "adv/mean_abs_step_conf": 0.7562891244888306, "adv/ratio_final_to_reasoning": 1.4855510663799787, "adv/ratio_step_to_reasoning": 2.3637271715548547, "adv/std_final_conf": 0.7217549085617065, "adv/std_reasoning": 0.5961703062057495, "adv/std_step_conf": 0.9313390851020813, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5917937545388525, "calib/avg_num_step_conf": 15.83984375, "calib/ece": 0.27569721115537854, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8804780876494024, "calib/gap": 0.08962962962962961, "calib/mean_conf": 0.9310756972111554, "calib/mu_c": 0.96, "calib/mu_w": 0.8703703703703703, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2647410358565737, "calib/std_conf": 0.20245908112232547, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5699514955537591, "calib/step_q_c_n": 2474.0, "calib/step_q_gap": 0.04286990162586535, "calib/step_q_w": 0.5270815939278938, "calib/step_q_w_n": 1581.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2502.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 809.48828125, "completions/mean_terminated_length": 822.3373413085938, "completions/min_length": 0.0, "completions/min_terminated_length": 323.0, "epoch": 0.07146666666666666, "grad_norm": 1.0130443572998047, "kl": 0.1458282470703125, "learning_rate": 3.7222222222222225e-06, "loss": -0.0599, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019511142745614052, "mask/share_reasoning": 0.7761760950088501, "mask/share_step_conf": 0.18868783116340637, "num_tokens": 21917378.0, "reward": 0.942968487739563, "reward_std": 0.16495318710803986, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7012468576431274, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8550026416778564, "step": 67 }, { "adv/mean_abs_final_conf": 0.6102041006088257, "adv/mean_abs_reasoning": 0.3448832631111145, "adv/mean_abs_step_conf": 0.7249223589897156, "adv/ratio_final_to_reasoning": 1.7693062142369897, "adv/ratio_step_to_reasoning": 2.1019354562188775, "adv/std_final_conf": 0.8273938298225403, "adv/std_reasoning": 0.6403563618659973, "adv/std_step_conf": 0.9326390027999878, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6122399106270074, "calib/avg_num_step_conf": 16.171875, "calib/ece": 0.2532874493927125, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7368421052631579, "calib/gap": 0.10864306661080847, "calib/mean_conf": 0.8767692307692309, "calib/mu_c": 0.9176753246753246, "calib/mu_w": 0.8090322580645162, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2532874493927125, "calib/std_conf": 0.21778779039318882, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5420821256038648, "calib/step_q_c_n": 2070.0, "calib/step_q_gap": -0.042884057971014466, "calib/step_q_w": 0.5849661835748793, "calib/step_q_w_n": 2070.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2927.0, "completions/max_terminated_length": 2927.0, "completions/mean_length": 784.62109375, "completions/mean_terminated_length": 800.2510375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 255.0, "epoch": 0.07253333333333334, "grad_norm": 1.7984700202941895, "kl": 0.1521453857421875, "learning_rate": 3.694444444444445e-06, "loss": -0.0989, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.020881060510873795, "mask/share_reasoning": 0.7615818977355957, "mask/share_step_conf": 0.1980057805776596, "num_tokens": 22222329.0, "reward": 0.9141995906829834, "reward_std": 0.17229121923446655, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6798965334892273, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8352213501930237, "step": 68 }, { "adv/mean_abs_final_conf": 0.6957080364227295, "adv/mean_abs_reasoning": 0.5500853657722473, "adv/mean_abs_step_conf": 0.7283504605293274, "adv/ratio_final_to_reasoning": 1.264727403620431, "adv/ratio_step_to_reasoning": 1.3240680553405004, "adv/std_final_conf": 0.9360483288764954, "adv/std_reasoning": 0.8099032640457153, "adv/std_step_conf": 0.9317101240158081, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5718556701030929, "calib/avg_num_step_conf": 17.14453125, "calib/ece": 0.10623481781376522, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.14979757085020243, "calib/gap": 0.05692371134020613, "calib/mean_conf": 0.5904453441295546, "calib/mu_c": 0.6127999999999999, "calib/mu_w": 0.5558762886597938, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.044696356275303654, "calib/std_conf": 0.23181078256968513, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5406031468531469, "calib/step_q_c_n": 2288.0, "calib/step_q_gap": -0.017797614688975893, "calib/step_q_w": 0.5584007615421228, "calib/step_q_w_n": 2101.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 868.8515625, "completions/mean_terminated_length": 893.277099609375, "completions/min_length": 0.0, "completions/min_terminated_length": 362.0, "epoch": 0.0736, "grad_norm": 1.9360079765319824, "kl": 0.1445465087890625, "learning_rate": 3.6666666666666666e-06, "loss": -0.1238, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01783462055027485, "mask/share_reasoning": 0.771956205368042, "mask/share_step_conf": 0.1828654259443283, "num_tokens": 22549251.0, "reward": 0.9289262890815735, "reward_std": 0.17833179235458374, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7088148593902588, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8381001353263855, "step": 69 }, { "adv/mean_abs_final_conf": 0.7357186675071716, "adv/mean_abs_reasoning": 0.44369590282440186, "adv/mean_abs_step_conf": 0.7311370372772217, "adv/ratio_final_to_reasoning": 1.6581597053834896, "adv/ratio_step_to_reasoning": 1.6478336460243994, "adv/std_final_conf": 0.9344125390052795, "adv/std_reasoning": 0.7206258773803711, "adv/std_step_conf": 0.931718647480011, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6739236111111111, "calib/avg_num_step_conf": 17.375, "calib/ece": 0.04636065573770485, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.11885245901639344, "calib/gap": 0.10216944444444453, "calib/mean_conf": 0.5961967213114753, "calib/mu_c": 0.6380694444444445, "calib/mu_w": 0.5358999999999999, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02619672131147535, "calib/std_conf": 0.18601708185902863, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5241969273743017, "calib/step_q_c_n": 2148.0, "calib/step_q_gap": 0.01965779693951908, "calib/step_q_w": 0.5045391304347826, "calib/step_q_w_n": 2300.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 848.48046875, "completions/mean_terminated_length": 886.5755004882812, "completions/min_length": 0.0, "completions/min_terminated_length": 241.0, "epoch": 0.07466666666666667, "grad_norm": 2.3967182636260986, "kl": 0.1437835693359375, "learning_rate": 3.638888888888889e-06, "loss": -0.2364, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018349867314100266, "mask/share_reasoning": 0.748029351234436, "mask/share_step_conf": 0.19065198302268982, "num_tokens": 22873454.0, "reward": 0.9337574243545532, "reward_std": 0.15358775854110718, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7366839647293091, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8269246816635132, "step": 70 }, { "adv/mean_abs_final_conf": 0.7590868473052979, "adv/mean_abs_reasoning": 0.34748631715774536, "adv/mean_abs_step_conf": 0.7451059222221375, "adv/ratio_final_to_reasoning": 2.184508597386589, "adv/ratio_step_to_reasoning": 2.144274135214044, "adv/std_final_conf": 0.9346693754196167, "adv/std_reasoning": 0.6403198838233948, "adv/std_step_conf": 0.9316022396087646, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6670933606417477, "calib/avg_num_step_conf": 17.1953125, "calib/ece": 0.13208835341365463, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.1686746987951807, "calib/gap": 0.11900409626216091, "calib/mean_conf": 0.6304819277108433, "calib/mu_c": 0.6605913978494624, "calib/mu_w": 0.5415873015873015, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.007791164658634543, "calib/std_conf": 0.19300979196370918, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5177360477360478, "calib/step_q_c_n": 2849.0, "calib/step_q_gap": -0.028735298046309032, "calib/step_q_w": 0.5464713457823568, "calib/step_q_w_n": 1553.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2765.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 853.96875, "completions/mean_terminated_length": 867.5238647460938, "completions/min_length": 0.0, "completions/min_terminated_length": 308.0, "epoch": 0.07573333333333333, "grad_norm": 2.86549711227417, "kl": 0.1335296630859375, "learning_rate": 3.6111111111111115e-06, "loss": -0.098, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019371051341295242, "mask/share_reasoning": 0.7695331573486328, "mask/share_step_conf": 0.19547083973884583, "num_tokens": 23196478.0, "reward": 0.9787261486053467, "reward_std": 0.11370344460010529, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7831434011459351, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8344652056694031, "step": 71 }, { "adv/mean_abs_final_conf": 0.591888427734375, "adv/mean_abs_reasoning": 0.35237473249435425, "adv/mean_abs_step_conf": 0.730226993560791, "adv/ratio_final_to_reasoning": 1.6797130246671652, "adv/ratio_step_to_reasoning": 2.072302370807733, "adv/std_final_conf": 0.8264250159263611, "adv/std_reasoning": 0.661247193813324, "adv/std_step_conf": 0.9301158785820007, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7797329253025455, "calib/avg_num_step_conf": 16.62109375, "calib/ece": 0.13269076305220884, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.46586345381526106, "calib/gap": 0.24635623869801082, "calib/mean_conf": 0.7577510040160643, "calib/mu_c": 0.8477848101265822, "calib/mu_w": 0.6014285714285714, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1279518072289157, "calib/std_conf": 0.2399426015888485, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4751867049651211, "calib/step_q_c_n": 2437.0, "calib/step_q_gap": -0.005553669072282663, "calib/step_q_w": 0.4807403740374038, "calib/step_q_w_n": 1818.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 761.93359375, "completions/mean_terminated_length": 777.111572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 355.0, "epoch": 0.0768, "grad_norm": 1.9492751359939575, "kl": 0.1450653076171875, "learning_rate": 3.5833333333333335e-06, "loss": -0.0597, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019757341593503952, "mask/share_reasoning": 0.7533541321754456, "mask/share_step_conf": 0.20735730230808258, "num_tokens": 23495941.0, "reward": 0.9815877676010132, "reward_std": 0.12433907389640808, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7874687910079956, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8577379584312439, "step": 72 }, { "adv/mean_abs_final_conf": 0.5162774920463562, "adv/mean_abs_reasoning": 0.41008317470550537, "adv/mean_abs_step_conf": 0.7600690722465515, "adv/ratio_final_to_reasoning": 1.2589579965506084, "adv/ratio_step_to_reasoning": 1.8534510048903685, "adv/std_final_conf": 0.7756823897361755, "adv/std_reasoning": 0.6816674470901489, "adv/std_step_conf": 0.9316983222961426, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6724709784411276, "calib/avg_num_step_conf": 16.4921875, "calib/ece": 0.11714859437751006, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7911646586345381, "calib/gap": 0.18342039800995025, "calib/mean_conf": 0.8972289156626507, "calib/mu_c": 0.9325870646766169, "calib/mu_w": 0.7491666666666666, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10357429718875504, "calib/std_conf": 0.20616587807968217, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4244396835860251, "calib/step_q_c_n": 3034.0, "calib/step_q_gap": -0.008691629545288015, "calib/step_q_w": 0.4331313131313131, "calib/step_q_w_n": 1188.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 792.33984375, "completions/mean_terminated_length": 808.12353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.07786666666666667, "grad_norm": 0.9351516366004944, "kl": 0.145599365234375, "learning_rate": 3.555555555555556e-06, "loss": -0.092, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01906701549887657, "mask/share_reasoning": 0.7661653161048889, "mask/share_step_conf": 0.1952364146709442, "num_tokens": 23805812.0, "reward": 1.0169777870178223, "reward_std": 0.14627042412757874, "rewards/accuracy_reward_step": 0.78515625, "rewards/final_brier_reward_step": 0.8276035189628601, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8547895550727844, "step": 73 }, { "adv/mean_abs_final_conf": 0.547152042388916, "adv/mean_abs_reasoning": 0.4900486171245575, "adv/mean_abs_step_conf": 0.7392736673355103, "adv/ratio_final_to_reasoning": 1.1165260410271587, "adv/ratio_step_to_reasoning": 1.508572091629036, "adv/std_final_conf": 0.8277955651283264, "adv/std_reasoning": 0.7753596305847168, "adv/std_step_conf": 0.9327936172485352, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.610968660968661, "calib/avg_num_step_conf": 17.609375, "calib/ece": 0.2772764227642276, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8008130081300813, "calib/gap": 0.10952136752136743, "calib/mean_conf": 0.9013414634146341, "calib/mu_c": 0.9414102564102563, "calib/mu_w": 0.8318888888888889, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27223577235772356, "calib/std_conf": 0.20449280223754604, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3839217354317312, "calib/step_q_c_n": 2351.0, "calib/step_q_gap": -0.030018922890011956, "calib/step_q_w": 0.41394065832174315, "calib/step_q_w_n": 2157.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3036.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 839.4296875, "completions/mean_terminated_length": 866.508056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 305.0, "epoch": 0.07893333333333333, "grad_norm": 1.0575493574142456, "kl": 0.1515045166015625, "learning_rate": 3.5277777777777784e-06, "loss": -0.2192, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019271448254585266, "mask/share_reasoning": 0.7547069787979126, "mask/share_step_conf": 0.19477157294750214, "num_tokens": 24124634.0, "reward": 0.915574312210083, "reward_std": 0.1927521973848343, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6780409812927246, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.838263750076294, "step": 74 }, { "adv/mean_abs_final_conf": 0.3390395939350128, "adv/mean_abs_reasoning": 0.29136961698532104, "adv/mean_abs_step_conf": 0.7484970092773438, "adv/ratio_final_to_reasoning": 1.163606547048086, "adv/ratio_step_to_reasoning": 2.5688917637388817, "adv/std_final_conf": 0.6404313445091248, "adv/std_reasoning": 0.5959947109222412, "adv/std_step_conf": 0.930900514125824, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5580176440062273, "calib/avg_num_step_conf": 15.3046875, "calib/ece": 0.17166666666666672, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9246031746031746, "calib/gap": 0.05934613388687082, "calib/mean_conf": 0.9638095238095238, "calib/mu_c": 0.9748780487804879, "calib/mu_w": 0.9155319148936171, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16099206349206355, "calib/std_conf": 0.14033891597795844, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4058226997985225, "calib/step_q_c_n": 2978.0, "calib/step_q_gap": 0.0060886572453309196, "calib/step_q_w": 0.39973404255319156, "calib/step_q_w_n": 940.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 760.12109375, "completions/mean_terminated_length": 769.1343994140625, "completions/min_length": 0.0, "completions/min_terminated_length": 296.0, "epoch": 0.08, "grad_norm": 0.9821433424949646, "kl": 0.1571044921875, "learning_rate": 3.5e-06, "loss": -0.0248, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.020810868591070175, "mask/share_reasoning": 0.7695268392562866, "mask/share_step_conf": 0.1979435831308365, "num_tokens": 24423977.0, "reward": 1.003063678741455, "reward_std": 0.12076610326766968, "rewards/accuracy_reward_step": 0.8046875, "rewards/final_brier_reward_step": 0.8111202716827393, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8371946811676025, "step": 75 }, { "adv/mean_abs_final_conf": 0.3958325982093811, "adv/mean_abs_reasoning": 0.34861207008361816, "adv/mean_abs_step_conf": 0.7318915128707886, "adv/ratio_final_to_reasoning": 1.1354529351621032, "adv/ratio_step_to_reasoning": 2.0994439828065534, "adv/std_final_conf": 0.6825776100158691, "adv/std_reasoning": 0.6403353214263916, "adv/std_step_conf": 0.9318379163742065, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.565868263473054, "calib/avg_num_step_conf": 16.515625, "calib/ece": 0.27987854251012145, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8906882591093117, "calib/gap": 0.0534580838323353, "calib/mean_conf": 0.9536437246963563, "calib/mu_c": 0.9709580838323352, "calib/mu_w": 0.9174999999999999, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2787044534412956, "calib/std_conf": 0.1368044889965546, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5230705736690054, "calib/step_q_c_n": 2423.0, "calib/step_q_gap": -0.04025574212046834, "calib/step_q_w": 0.5633263157894738, "calib/step_q_w_n": 1805.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 816.15625, "completions/mean_terminated_length": 839.100341796875, "completions/min_length": 0.0, "completions/min_terminated_length": 343.0, "epoch": 0.08106666666666666, "grad_norm": 0.7638458013534546, "kl": 0.1407928466796875, "learning_rate": 3.4722222222222224e-06, "loss": -0.0638, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019383957609534264, "mask/share_reasoning": 0.7647703886032104, "mask/share_step_conf": 0.18850192427635193, "num_tokens": 24735969.0, "reward": 0.9202332496643066, "reward_std": 0.1503058522939682, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6837754249572754, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8332536816596985, "step": 76 }, { "adv/mean_abs_final_conf": 0.4167235791683197, "adv/mean_abs_reasoning": 0.31452637910842896, "adv/mean_abs_step_conf": 0.7387938499450684, "adv/ratio_final_to_reasoning": 1.3249240980981745, "adv/ratio_step_to_reasoning": 2.348909023272667, "adv/std_final_conf": 0.6861057281494141, "adv/std_reasoning": 0.6185916066169739, "adv/std_step_conf": 0.9307155013084412, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6515749721053987, "calib/avg_num_step_conf": 16.17578125, "calib/ece": 0.19257936507936516, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8928571428571429, "calib/gap": 0.1380525276800274, "calib/mean_conf": 0.9498809523809524, "calib/mu_c": 0.9832984293193717, "calib/mu_w": 0.8452459016393443, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19226190476190483, "calib/std_conf": 0.15676664019636977, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5262242725337118, "calib/step_q_c_n": 2818.0, "calib/step_q_gap": -0.03107429133628059, "calib/step_q_w": 0.5572985638699924, "calib/step_q_w_n": 1323.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 780.83203125, "completions/mean_terminated_length": 793.2262573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 251.0, "epoch": 0.08213333333333334, "grad_norm": 0.6315996050834656, "kl": 0.15972900390625, "learning_rate": 3.444444444444445e-06, "loss": -0.0529, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0212685689330101, "mask/share_reasoning": 0.7615213394165039, "mask/share_step_conf": 0.2015850841999054, "num_tokens": 25040526.0, "reward": 0.9884750843048096, "reward_std": 0.1469959169626236, "rewards/accuracy_reward_step": 0.74609375, "rewards/final_brier_reward_step": 0.7931792736053467, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8376772403717041, "step": 77 }, { "adv/mean_abs_final_conf": 0.3757950961589813, "adv/mean_abs_reasoning": 0.24786332249641418, "adv/mean_abs_step_conf": 0.7679344415664673, "adv/ratio_final_to_reasoning": 1.5161383797089136, "adv/ratio_step_to_reasoning": 3.098217331358402, "adv/std_final_conf": 0.6416763067245483, "adv/std_reasoning": 0.5228817462921143, "adv/std_step_conf": 0.9308434724807739, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5914634146341463, "calib/avg_num_step_conf": 16.95703125, "calib/ece": 0.32464566929133865, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9251968503937008, "calib/gap": 0.07087127371273705, "calib/mean_conf": 0.9703149606299213, "calib/mu_c": 0.9954268292682927, "calib/mu_w": 0.9245555555555557, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32464566929133865, "calib/std_conf": 0.11836109841793138, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5713968871595331, "calib/step_q_c_n": 2570.0, "calib/step_q_gap": -0.013001757673894332, "calib/step_q_w": 0.5843986448334274, "calib/step_q_w_n": 1771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2276.0, "completions/max_terminated_length": 2276.0, "completions/mean_length": 913.40625, "completions/mean_terminated_length": 920.5984497070312, "completions/min_length": 0.0, "completions/min_terminated_length": 421.0, "epoch": 0.0832, "grad_norm": 0.69254070520401, "kl": 0.132720947265625, "learning_rate": 3.416666666666667e-06, "loss": 0.002, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01735468953847885, "mask/share_reasoning": 0.7867718935012817, "mask/share_step_conf": 0.18806087970733643, "num_tokens": 25382382.0, "reward": 0.9259521961212158, "reward_std": 0.11798139661550522, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6788976788520813, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8464441895484924, "step": 78 }, { "adv/mean_abs_final_conf": 0.46276575326919556, "adv/mean_abs_reasoning": 0.36419200897216797, "adv/mean_abs_step_conf": 0.7574458718299866, "adv/ratio_final_to_reasoning": 1.2706642152177499, "adv/ratio_step_to_reasoning": 2.079798164621101, "adv/std_final_conf": 0.7214287519454956, "adv/std_reasoning": 0.6404659748077393, "adv/std_step_conf": 0.930621862411499, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5591424968474148, "calib/avg_num_step_conf": 16.24609375, "calib/ece": 0.23822580645161295, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9395161290322581, "calib/gap": 0.05521647751155945, "calib/mean_conf": 0.9761290322580645, "calib/mu_c": 0.9906010928961748, "calib/mu_w": 0.9353846153846154, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23822580645161295, "calib/std_conf": 0.10713806044463223, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5847711711711712, "calib/step_q_c_n": 2775.0, "calib/step_q_gap": 0.011636055564234749, "calib/step_q_w": 0.5731351156069364, "calib/step_q_w_n": 1384.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2779.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 814.19921875, "completions/mean_terminated_length": 837.0883178710938, "completions/min_length": 0.0, "completions/min_terminated_length": 272.0, "epoch": 0.08426666666666667, "grad_norm": 1.2054603099822998, "kl": 0.1497344970703125, "learning_rate": 3.3888888888888893e-06, "loss": -0.0978, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0185125432908535, "mask/share_reasoning": 0.7671641111373901, "mask/share_step_conf": 0.18697963654994965, "num_tokens": 25697193.0, "reward": 0.961531937122345, "reward_std": 0.17448094487190247, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7359843850135803, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8503606915473938, "step": 79 }, { "adv/mean_abs_final_conf": 0.35745030641555786, "adv/mean_abs_reasoning": 0.3114376664161682, "adv/mean_abs_step_conf": 0.7303011417388916, "adv/ratio_final_to_reasoning": 1.1477426944816105, "adv/ratio_step_to_reasoning": 2.3449351844391364, "adv/std_final_conf": 0.66139817237854, "adv/std_reasoning": 0.6185581088066101, "adv/std_step_conf": 0.9301923513412476, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5443126022913256, "calib/avg_num_step_conf": 14.71875, "calib/ece": 0.23785375494071154, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9446640316205533, "calib/gap": 0.06889067103109647, "calib/mean_conf": 0.9686837944664032, "calib/mu_c": 0.9863829787234042, "calib/mu_w": 0.9174923076923077, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23172727272727278, "calib/std_conf": 0.1424637906087504, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5588358110582204, "calib/step_q_c_n": 2731.0, "calib/step_q_gap": 0.03245201163681266, "calib/step_q_w": 0.5263837994214078, "calib/step_q_w_n": 1037.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2521.0, "completions/max_terminated_length": 2521.0, "completions/mean_length": 726.515625, "completions/mean_terminated_length": 735.1304931640625, "completions/min_length": 0.0, "completions/min_terminated_length": 314.0, "epoch": 0.08533333333333333, "grad_norm": 1.05507230758667, "kl": 0.1724853515625, "learning_rate": 3.3611111111111117e-06, "loss": -0.0478, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.021705832332372665, "mask/share_reasoning": 0.7650576829910278, "mask/share_step_conf": 0.2015177607536316, "num_tokens": 25985341.0, "reward": 0.9841961860656738, "reward_std": 0.13544291257858276, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7552461624145508, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8686149716377258, "step": 80 }, { "adv/mean_abs_final_conf": 0.41031861305236816, "adv/mean_abs_reasoning": 0.34500330686569214, "adv/mean_abs_step_conf": 0.7481362223625183, "adv/ratio_final_to_reasoning": 1.1893179134427918, "adv/ratio_step_to_reasoning": 2.1684900042241146, "adv/std_final_conf": 0.6827117204666138, "adv/std_reasoning": 0.6404272317886353, "adv/std_step_conf": 0.9312711954116821, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.700248262164846, "calib/avg_num_step_conf": 15.98828125, "calib/ece": 0.16539094650205766, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9053497942386831, "calib/gap": 0.23204766633565055, "calib/mean_conf": 0.9427572016460906, "calib/mu_c": 0.9933684210526317, "calib/mu_w": 0.7613207547169811, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16312757201646094, "calib/std_conf": 0.1869734326201171, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4652222222222222, "calib/step_q_c_n": 2700.0, "calib/step_q_gap": -0.004946478423865308, "calib/step_q_w": 0.4701687006460875, "calib/step_q_w_n": 1393.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 776.0546875, "completions/mean_terminated_length": 807.6016235351562, "completions/min_length": 0.0, "completions/min_terminated_length": 325.0, "epoch": 0.0864, "grad_norm": 0.6449930667877197, "kl": 0.168121337890625, "learning_rate": 3.3333333333333333e-06, "loss": -0.1816, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01991911232471466, "mask/share_reasoning": 0.7525591850280762, "mask/share_step_conf": 0.18845918774604797, "num_tokens": 26290259.0, "reward": 0.9914293885231018, "reward_std": 0.17866012454032898, "rewards/accuracy_reward_step": 0.74609375, "rewards/final_brier_reward_step": 0.804721474647522, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8390746712684631, "step": 81 }, { "adv/mean_abs_final_conf": 0.44825103878974915, "adv/mean_abs_reasoning": 0.42008984088897705, "adv/mean_abs_step_conf": 0.7493448257446289, "adv/ratio_final_to_reasoning": 1.0670361317026342, "adv/ratio_step_to_reasoning": 1.78377278574244, "adv/std_final_conf": 0.7218263745307922, "adv/std_reasoning": 0.7205085158348083, "adv/std_step_conf": 0.9325801730155945, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.720112517580872, "calib/avg_num_step_conf": 14.9140625, "calib/ece": 0.23392000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.836, "calib/gap": 0.2186631134799023, "calib/mean_conf": 0.9179200000000001, "calib/mu_c": 0.9870175438596491, "calib/mu_w": 0.7683544303797468, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23392000000000002, "calib/std_conf": 0.20773366024792422, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42461879432624117, "calib/step_q_c_n": 2256.0, "calib/step_q_gap": 0.033178333378737934, "calib/step_q_w": 0.39144046094750323, "calib/step_q_w_n": 1562.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2677.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 734.7578125, "completions/mean_terminated_length": 746.420654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 348.0, "epoch": 0.08746666666666666, "grad_norm": 1.2045516967773438, "kl": 0.187896728515625, "learning_rate": 3.3055555555555558e-06, "loss": -0.0592, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.020980939269065857, "mask/share_reasoning": 0.7644993662834167, "mask/share_step_conf": 0.1988946795463562, "num_tokens": 26583909.0, "reward": 0.9693397283554077, "reward_std": 0.1750274896621704, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7622164487838745, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8475567102432251, "step": 82 }, { "adv/mean_abs_final_conf": 0.43114012479782104, "adv/mean_abs_reasoning": 0.27020975947380066, "adv/mean_abs_step_conf": 0.7526513338088989, "adv/ratio_final_to_reasoning": 1.595575695109651, "adv/ratio_step_to_reasoning": 2.7854335656661413, "adv/std_final_conf": 0.7021347284317017, "adv/std_reasoning": 0.5483786463737488, "adv/std_step_conf": 0.9318440556526184, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6169209152950622, "calib/avg_num_step_conf": 16.18359375, "calib/ece": 0.25766798418972325, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7391304347826086, "calib/gap": 0.1499023150006692, "calib/mean_conf": 0.8522924901185771, "calib/mu_c": 0.9079874213836479, "calib/mu_w": 0.7580851063829787, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24075098814229245, "calib/std_conf": 0.27601247577156063, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37846669459572685, "calib/step_q_c_n": 2387.0, "calib/step_q_gap": 0.013193346076364632, "calib/step_q_w": 0.3652733485193622, "calib/step_q_w_n": 1756.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2489.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 871.6171875, "completions/mean_terminated_length": 875.0353393554688, "completions/min_length": 0.0, "completions/min_terminated_length": 319.0, "epoch": 0.08853333333333334, "grad_norm": 1.04400634765625, "kl": 0.163818359375, "learning_rate": 3.277777777777778e-06, "loss": -0.0302, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019192636013031006, "mask/share_reasoning": 0.7826995849609375, "mask/share_step_conf": 0.19420155882835388, "num_tokens": 26914307.0, "reward": 0.9339853525161743, "reward_std": 0.12328419089317322, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7018980383872986, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8441977500915527, "step": 83 }, { "adv/mean_abs_final_conf": 0.48722946643829346, "adv/mean_abs_reasoning": 0.36283811926841736, "adv/mean_abs_step_conf": 0.7674565315246582, "adv/ratio_final_to_reasoning": 1.3428287728441644, "adv/ratio_step_to_reasoning": 2.1151485766491795, "adv/std_final_conf": 0.7576949596405029, "adv/std_reasoning": 0.6612746119499207, "adv/std_step_conf": 0.9317241907119751, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7398956496178718, "calib/avg_num_step_conf": 14.86328125, "calib/ece": 0.22236947791164657, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7429718875502008, "calib/gap": 0.23302469135802462, "calib/mean_conf": 0.8691967871485944, "calib/mu_c": 0.945, "calib/mu_w": 0.7119753086419753, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.208433734939759, "calib/std_conf": 0.24694957745446464, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40114668974469925, "calib/step_q_c_n": 2311.0, "calib/step_q_gap": 0.04260585975808617, "calib/step_q_w": 0.3585408299866131, "calib/step_q_w_n": 1494.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 733.73828125, "completions/mean_terminated_length": 754.3654174804688, "completions/min_length": 0.0, "completions/min_terminated_length": 295.0, "epoch": 0.0896, "grad_norm": 1.3086735010147095, "kl": 0.195068359375, "learning_rate": 3.2500000000000002e-06, "loss": -0.1138, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.021002911031246185, "mask/share_reasoning": 0.7597129344940186, "mask/share_step_conf": 0.19194041192531586, "num_tokens": 27208064.0, "reward": 0.9720569252967834, "reward_std": 0.13980735838413239, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7625573873519897, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8557752370834351, "step": 84 }, { "adv/mean_abs_final_conf": 0.5669814348220825, "adv/mean_abs_reasoning": 0.3699195981025696, "adv/mean_abs_step_conf": 0.7619590163230896, "adv/ratio_final_to_reasoning": 1.5327153190323064, "adv/ratio_step_to_reasoning": 2.0597962914952594, "adv/std_final_conf": 0.8277752995491028, "adv/std_reasoning": 0.6613036394119263, "adv/std_step_conf": 0.9325735569000244, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8462388440288993, "calib/avg_num_step_conf": 15.921875, "calib/ece": 0.061097560975609776, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5040650406504065, "calib/gap": 0.4110148746281343, "calib/mean_conf": 0.7341056910569106, "calib/mu_c": 0.842707182320442, "calib/mu_w": 0.43169230769230765, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02971544715447154, "calib/std_conf": 0.30742143719804693, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.407486144101346, "calib/step_q_c_n": 2526.0, "calib/step_q_gap": 0.027860337649733058, "calib/step_q_w": 0.3796258064516129, "calib/step_q_w_n": 1550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2597.0, "completions/max_terminated_length": 2597.0, "completions/mean_length": 778.14453125, "completions/mean_terminated_length": 803.2459716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 319.0, "epoch": 0.09066666666666667, "grad_norm": 2.0298848152160645, "kl": 0.199371337890625, "learning_rate": 3.2222222222222227e-06, "loss": -0.2, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019832279533147812, "mask/share_reasoning": 0.7580169439315796, "mask/share_step_conf": 0.19090081751346588, "num_tokens": 27515093.0, "reward": 1.00211501121521, "reward_std": 0.16773498058319092, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.8339410424232483, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8374766111373901, "step": 85 }, { "adv/mean_abs_final_conf": 0.6240010261535645, "adv/mean_abs_reasoning": 0.4230995178222656, "adv/mean_abs_step_conf": 0.7713678479194641, "adv/ratio_final_to_reasoning": 1.474832751796453, "adv/ratio_step_to_reasoning": 1.823135729130039, "adv/std_final_conf": 0.8579559326171875, "adv/std_reasoning": 0.7013693451881409, "adv/std_step_conf": 0.9327153563499451, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7724427372314695, "calib/avg_num_step_conf": 16.203125, "calib/ece": 0.12267219917012452, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.37344398340248963, "calib/gap": 0.3106485986626832, "calib/mean_conf": 0.6493609958506225, "calib/mu_c": 0.7769718309859155, "calib/mu_w": 0.4663232323232323, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09141078838174278, "calib/std_conf": 0.3149915206219662, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44261421319796956, "calib/step_q_c_n": 1970.0, "calib/step_q_gap": 0.011503102086858452, "calib/step_q_w": 0.4311111111111111, "calib/step_q_w_n": 2178.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2953.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 814.15234375, "completions/mean_terminated_length": 847.2479248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 284.0, "epoch": 0.09173333333333333, "grad_norm": 1.5091356039047241, "kl": 0.189422607421875, "learning_rate": 3.1944444444444443e-06, "loss": -0.1741, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.019832249730825424, "mask/share_reasoning": 0.7581910490989685, "mask/share_step_conf": 0.18291422724723816, "num_tokens": 27829028.0, "reward": 0.9400574564933777, "reward_std": 0.15439200401306152, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7583033442497253, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8225927352905273, "step": 86 }, { "adv/mean_abs_final_conf": 0.5351973176002502, "adv/mean_abs_reasoning": 0.3062160313129425, "adv/mean_abs_step_conf": 0.7412095665931702, "adv/ratio_final_to_reasoning": 1.7477769380836123, "adv/ratio_step_to_reasoning": 2.4205446181740853, "adv/std_final_conf": 0.7931978106498718, "adv/std_reasoning": 0.5963323712348938, "adv/std_step_conf": 0.931337833404541, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6943032296650717, "calib/avg_num_step_conf": 14.328125, "calib/ece": 0.13917012448132784, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5684647302904564, "calib/gap": 0.1794706937799042, "calib/mean_conf": 0.796265560165975, "calib/mu_c": 0.8200956937799042, "calib/mu_w": 0.640625, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03410788381742739, "calib/std_conf": 0.25204095534405574, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.495296786389414, "calib/step_q_c_n": 2645.0, "calib/step_q_gap": -0.029991581157018055, "calib/step_q_w": 0.525288367546432, "calib/step_q_w_n": 1023.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 698.7421875, "completions/mean_terminated_length": 727.1463012695312, "completions/min_length": 0.0, "completions/min_terminated_length": 225.0, "epoch": 0.0928, "grad_norm": 1.6742327213287354, "kl": 0.235260009765625, "learning_rate": 3.1666666666666667e-06, "loss": -0.1237, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.02258254587650299, "mask/share_reasoning": 0.7534446120262146, "mask/share_step_conf": 0.18491032719612122, "num_tokens": 28113402.0, "reward": 1.0020322799682617, "reward_std": 0.159156933426857, "rewards/accuracy_reward_step": 0.81640625, "rewards/final_brier_reward_step": 0.8073718547821045, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8451303243637085, "step": 87 }, { "adv/mean_abs_final_conf": 0.5548301935195923, "adv/mean_abs_reasoning": 0.34698277711868286, "adv/mean_abs_step_conf": 0.7414649724960327, "adv/ratio_final_to_reasoning": 1.5990136401779296, "adv/ratio_step_to_reasoning": 2.136892726068707, "adv/std_final_conf": 0.8096156716346741, "adv/std_reasoning": 0.6403238773345947, "adv/std_step_conf": 0.932481050491333, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7586206896551725, "calib/avg_num_step_conf": 14.76953125, "calib/ece": 0.11999196787148603, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6345381526104418, "calib/gap": 0.2752393933923091, "calib/mean_conf": 0.8214377510040161, "calib/mu_c": 0.8855497382198952, "calib/mu_w": 0.6103103448275862, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08718072289156635, "calib/std_conf": 0.2611107666689446, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5174226006191951, "calib/step_q_c_n": 2584.0, "calib/step_q_gap": 0.019980662440414798, "calib/step_q_w": 0.4974419381787803, "calib/step_q_w_n": 1197.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 755.17578125, "completions/mean_terminated_length": 770.2191162109375, "completions/min_length": 0.0, "completions/min_terminated_length": 312.0, "epoch": 0.09386666666666667, "grad_norm": 1.514581322669983, "kl": 0.211181640625, "learning_rate": 3.138888888888889e-06, "loss": -0.0679, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02104560285806656, "mask/share_reasoning": 0.7749693393707275, "mask/share_step_conf": 0.1844537854194641, "num_tokens": 28416575.0, "reward": 1.0080301761627197, "reward_std": 0.13474811613559723, "rewards/accuracy_reward_step": 0.74609375, "rewards/final_brier_reward_step": 0.8253446817398071, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8469657301902771, "step": 88 }, { "adv/mean_abs_final_conf": 0.5520856380462646, "adv/mean_abs_reasoning": 0.396220862865448, "adv/mean_abs_step_conf": 0.7405757904052734, "adv/ratio_final_to_reasoning": 1.393378516349722, "adv/ratio_step_to_reasoning": 1.8690984241704718, "adv/std_final_conf": 0.7762579917907715, "adv/std_reasoning": 0.661486804485321, "adv/std_step_conf": 0.9330097436904907, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.65588668138337, "calib/avg_num_step_conf": 14.40625, "calib/ece": 0.23485477178423236, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6929460580912863, "calib/gap": 0.17630389992641637, "calib/mean_conf": 0.8419087136929461, "calib/mu_c": 0.9077483443708608, "calib/mu_w": 0.7314444444444445, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22510373443983403, "calib/std_conf": 0.26856559188397267, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5550863422291994, "calib/step_q_c_n": 1911.0, "calib/step_q_gap": 0.044512341103707054, "calib/step_q_w": 0.5105740011254923, "calib/step_q_w_n": 1777.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 729.68359375, "completions/mean_terminated_length": 768.7201538085938, "completions/min_length": 0.0, "completions/min_terminated_length": 244.0, "epoch": 0.09493333333333333, "grad_norm": 1.3917933702468872, "kl": 0.20709228515625, "learning_rate": 3.1111111111111116e-06, "loss": -0.2533, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.020858295261859894, "mask/share_reasoning": 0.7529197931289673, "mask/share_step_conf": 0.17544062435626984, "num_tokens": 28712262.0, "reward": 0.9003409743309021, "reward_std": 0.184332937002182, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6872422099113464, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8071897029876709, "step": 89 }, { "adv/mean_abs_final_conf": 0.3925582766532898, "adv/mean_abs_reasoning": 0.3166203498840332, "adv/mean_abs_step_conf": 0.7366718053817749, "adv/ratio_final_to_reasoning": 1.2398390589773207, "adv/ratio_step_to_reasoning": 2.326672324288668, "adv/std_final_conf": 0.6615562438964844, "adv/std_reasoning": 0.5961743593215942, "adv/std_step_conf": 0.9308331608772278, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7850442710616289, "calib/avg_num_step_conf": 14.25390625, "calib/ece": 0.1648790322580646, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.3271719119838695, "calib/mean_conf": 0.8439112903225807, "calib/mu_c": 0.9243850267379679, "calib/mu_w": 0.5972131147540983, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1273790322580646, "calib/std_conf": 0.3018364893848017, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6031167763157895, "calib/step_q_c_n": 2432.0, "calib/step_q_gap": 0.10898366210050603, "calib/step_q_w": 0.4941331142152835, "calib/step_q_w_n": 1217.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 702.67578125, "completions/mean_terminated_length": 725.3427124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 0.096, "grad_norm": 0.7975932359695435, "kl": 0.211456298828125, "learning_rate": 3.0833333333333336e-06, "loss": -0.1296, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.022103622555732727, "mask/share_reasoning": 0.7583642601966858, "mask/share_step_conf": 0.18828216195106506, "num_tokens": 28995467.0, "reward": 1.0022435188293457, "reward_std": 0.13096696138381958, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.8105612993240356, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8540821075439453, "step": 90 }, { "adv/mean_abs_final_conf": 0.4347422122955322, "adv/mean_abs_reasoning": 0.39271825551986694, "adv/mean_abs_step_conf": 0.7518894076347351, "adv/ratio_final_to_reasoning": 1.1070079024465909, "adv/ratio_step_to_reasoning": 1.914577173499128, "adv/std_final_conf": 0.7024941444396973, "adv/std_reasoning": 0.661315381526947, "adv/std_step_conf": 0.9302010536193848, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5980769230769231, "calib/avg_num_step_conf": 12.73828125, "calib/ece": 0.21213438735177859, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.841897233201581, "calib/gap": 0.10378559738134208, "calib/mean_conf": 0.9145059288537549, "calib/mu_c": 0.9411702127659575, "calib/mu_w": 0.8373846153846154, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19177865612648215, "calib/std_conf": 0.22002931210680218, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5744874615046194, "calib/step_q_c_n": 2273.0, "calib/step_q_gap": 0.03976074085684611, "calib/step_q_w": 0.5347267206477733, "calib/step_q_w_n": 988.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2391.0, "completions/max_terminated_length": 2391.0, "completions/mean_length": 763.78515625, "completions/mean_terminated_length": 772.8419189453125, "completions/min_length": 0.0, "completions/min_terminated_length": 345.0, "epoch": 0.09706666666666666, "grad_norm": 0.8278793692588806, "kl": 0.203094482421875, "learning_rate": 3.055555555555556e-06, "loss": -0.0548, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.020584098994731903, "mask/share_reasoning": 0.7944842576980591, "mask/share_step_conf": 0.17321288585662842, "num_tokens": 29298708.0, "reward": 0.9739769697189331, "reward_std": 0.161073237657547, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7550480961799622, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8499372005462646, "step": 91 }, { "adv/mean_abs_final_conf": 0.4526572525501251, "adv/mean_abs_reasoning": 0.39416584372520447, "adv/mean_abs_step_conf": 0.7069941759109497, "adv/ratio_final_to_reasoning": 1.1483928903431277, "adv/ratio_step_to_reasoning": 1.7936464743602587, "adv/std_final_conf": 0.739734411239624, "adv/std_reasoning": 0.7014203071594238, "adv/std_step_conf": 0.9329286217689514, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.761889097744361, "calib/avg_num_step_conf": 14.53515625, "calib/ece": 0.1340243902439025, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7682926829268293, "calib/gap": 0.25142293233082713, "calib/mean_conf": 0.8950813008130081, "calib/mu_c": 0.9523157894736842, "calib/mu_w": 0.7008928571428571, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1283739837398375, "calib/std_conf": 0.22623417946739058, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5978565365025468, "calib/step_q_c_n": 2356.0, "calib/step_q_gap": 0.04762210426811453, "calib/step_q_w": 0.5502344322344322, "calib/step_q_w_n": 1365.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2833.0, "completions/max_terminated_length": 2833.0, "completions/mean_length": 722.02734375, "completions/mean_terminated_length": 739.3560180664062, "completions/min_length": 0.0, "completions/min_terminated_length": 258.0, "epoch": 0.09813333333333334, "grad_norm": 0.870351254940033, "kl": 0.21209716796875, "learning_rate": 3.0277777777777776e-06, "loss": -0.0898, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.022989589720964432, "mask/share_reasoning": 0.7676361203193665, "mask/share_step_conf": 0.1859368085861206, "num_tokens": 29590267.0, "reward": 1.0020456314086914, "reward_std": 0.16076776385307312, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.813286304473877, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8501797914505005, "step": 92 }, { "adv/mean_abs_final_conf": 0.3889133334159851, "adv/mean_abs_reasoning": 0.38567960262298584, "adv/mean_abs_step_conf": 0.7637486457824707, "adv/ratio_final_to_reasoning": 1.0083845004273155, "adv/ratio_step_to_reasoning": 1.9802671455484242, "adv/std_final_conf": 0.662729024887085, "adv/std_reasoning": 0.6614421606063843, "adv/std_step_conf": 0.9304633736610413, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7263586956521739, "calib/avg_num_step_conf": 14.453125, "calib/ece": 0.1716803278688525, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.819672131147541, "calib/gap": 0.24210507246376822, "calib/mean_conf": 0.9132377049180327, "calib/mu_c": 0.9727717391304348, "calib/mu_w": 0.7306666666666666, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1654098360655738, "calib/std_conf": 0.20747646729202443, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6043217665615142, "calib/step_q_c_n": 2219.0, "calib/step_q_gap": 0.05339671592005568, "calib/step_q_w": 0.5509250506414585, "calib/step_q_w_n": 1481.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 707.24609375, "completions/mean_terminated_length": 735.9959106445312, "completions/min_length": 0.0, "completions/min_terminated_length": 265.0, "epoch": 0.0992, "grad_norm": 0.595244288444519, "kl": 0.20147705078125, "learning_rate": 3e-06, "loss": -0.1013, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.022081824019551277, "mask/share_reasoning": 0.7556986808776855, "mask/share_step_conf": 0.18315701186656952, "num_tokens": 29877098.0, "reward": 0.9965525269508362, "reward_std": 0.15626981854438782, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.796796441078186, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8619334697723389, "step": 93 }, { "adv/mean_abs_final_conf": 0.4328933358192444, "adv/mean_abs_reasoning": 0.35787466168403625, "adv/mean_abs_step_conf": 0.766270637512207, "adv/ratio_final_to_reasoning": 1.2096227594940525, "adv/ratio_step_to_reasoning": 2.1411704139834837, "adv/std_final_conf": 0.7017487287521362, "adv/std_reasoning": 0.6611588001251221, "adv/std_step_conf": 0.9323412179946899, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6674933081404504, "calib/avg_num_step_conf": 12.29296875, "calib/ece": 0.23765182186234823, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8785425101214575, "calib/gap": 0.17484648086915444, "calib/mean_conf": 0.9259109311740891, "calib/mu_c": 0.9775862068965517, "calib/mu_w": 0.8027397260273973, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2295546558704454, "calib/std_conf": 0.22257755815598185, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6268849104859334, "calib/step_q_c_n": 1955.0, "calib/step_q_gap": 0.11863826619063134, "calib/step_q_w": 0.5082466442953021, "calib/step_q_w_n": 1192.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 662.48046875, "completions/mean_terminated_length": 681.1043701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.10026666666666667, "grad_norm": 0.9370195865631104, "kl": 0.212310791015625, "learning_rate": 2.9722222222222225e-06, "loss": -0.053, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.024116527289152145, "mask/share_reasoning": 0.771746039390564, "mask/share_step_conf": 0.17679369449615479, "num_tokens": 30155373.0, "reward": 0.9501739740371704, "reward_std": 0.14142459630966187, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7390921711921692, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8323494791984558, "step": 94 }, { "adv/mean_abs_final_conf": 0.4895930886268616, "adv/mean_abs_reasoning": 0.3745056092739105, "adv/mean_abs_step_conf": 0.7683069109916687, "adv/ratio_final_to_reasoning": 1.3073050883699233, "adv/ratio_step_to_reasoning": 2.051523106639866, "adv/std_final_conf": 0.7582565546035767, "adv/std_reasoning": 0.6613578200340271, "adv/std_step_conf": 0.9327183365821838, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6172413793103448, "calib/avg_num_step_conf": 13.578125, "calib/ece": 0.20120967741935483, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8467741935483871, "calib/gap": 0.15801088929219587, "calib/mean_conf": 0.9046774193548387, "calib/mu_c": 0.9416315789473684, "calib/mu_w": 0.7836206896551725, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1698790322580645, "calib/std_conf": 0.2566909007393377, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6179151627523692, "calib/step_q_c_n": 2427.0, "calib/step_q_gap": 0.08542231241871812, "calib/step_q_w": 0.5324928503336511, "calib/step_q_w_n": 1049.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2890.0, "completions/max_terminated_length": 2890.0, "completions/mean_length": 746.81640625, "completions/mean_terminated_length": 764.7400512695312, "completions/min_length": 0.0, "completions/min_terminated_length": 280.0, "epoch": 0.10133333333333333, "grad_norm": 0.6035562753677368, "kl": 0.204071044921875, "learning_rate": 2.944444444444445e-06, "loss": -0.0845, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.021579675376415253, "mask/share_reasoning": 0.7744049429893494, "mask/share_step_conf": 0.18057787418365479, "num_tokens": 30452686.0, "reward": 0.971251368522644, "reward_std": 0.1747710257768631, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.7676007747650146, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.832714319229126, "step": 95 }, { "adv/mean_abs_final_conf": 0.33149275183677673, "adv/mean_abs_reasoning": 0.24979457259178162, "adv/mean_abs_step_conf": 0.7619727253913879, "adv/ratio_final_to_reasoning": 1.3270614665375762, "adv/ratio_step_to_reasoning": 3.05039744252817, "adv/std_final_conf": 0.5985773801803589, "adv/std_reasoning": 0.5485026836395264, "adv/std_step_conf": 0.9320087432861328, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8070443238353687, "calib/avg_num_step_conf": 13.3984375, "calib/ece": 0.11546938775510214, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8693877551020408, "calib/gap": 0.30552238805970156, "calib/mean_conf": 0.9306530612244898, "calib/mu_c": 0.9855223880597016, "calib/mu_w": 0.68, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11285714285714296, "calib/std_conf": 0.20376953278306575, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6230241563055062, "calib/step_q_c_n": 2252.0, "calib/step_q_gap": 0.14354622761280672, "calib/step_q_w": 0.4794779286926995, "calib/step_q_w_n": 1178.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 629.6796875, "completions/mean_terminated_length": 657.9509887695312, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.1024, "grad_norm": 0.7102448344230652, "kl": 0.217926025390625, "learning_rate": 2.916666666666667e-06, "loss": -0.2076, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02291429042816162, "mask/share_reasoning": 0.7549576759338379, "mask/share_step_conf": 0.17915934324264526, "num_tokens": 30719700.0, "reward": 1.0202877521514893, "reward_std": 0.1436702311038971, "rewards/accuracy_reward_step": 0.78515625, "rewards/final_brier_reward_step": 0.850816011428833, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8413221836090088, "step": 96 }, { "adv/mean_abs_final_conf": 0.47722479701042175, "adv/mean_abs_reasoning": 0.3774394690990448, "adv/mean_abs_step_conf": 0.738217830657959, "adv/ratio_final_to_reasoning": 1.264374386042791, "adv/ratio_step_to_reasoning": 1.9558575376870337, "adv/std_final_conf": 0.7588989734649658, "adv/std_reasoning": 0.6816702485084534, "adv/std_step_conf": 0.9340404868125916, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6067873303167421, "calib/avg_num_step_conf": 12.84765625, "calib/ece": 0.2636693548387097, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8830645161290323, "calib/gap": 0.13605882352941168, "calib/mean_conf": 0.933266129032258, "calib/mu_c": 0.9760588235294118, "calib/mu_w": 0.8400000000000001, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2557258064516129, "calib/std_conf": 0.20840266579586272, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.629279604261796, "calib/step_q_c_n": 1971.0, "calib/step_q_gap": 0.09934030228911006, "calib/step_q_w": 0.5299393019726859, "calib/step_q_w_n": 1318.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 682.55078125, "completions/mean_terminated_length": 701.7389526367188, "completions/min_length": 0.0, "completions/min_terminated_length": 286.0, "epoch": 0.10346666666666667, "grad_norm": 1.224445104598999, "kl": 0.20086669921875, "learning_rate": 2.888888888888889e-06, "loss": -0.1223, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.022092588245868683, "mask/share_reasoning": 0.7687257528305054, "mask/share_step_conf": 0.18183787167072296, "num_tokens": 30999505.0, "reward": 0.9291366338729858, "reward_std": 0.19311800599098206, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7112675905227661, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8212244510650635, "step": 97 }, { "adv/mean_abs_final_conf": 0.5556801557540894, "adv/mean_abs_reasoning": 0.5108206868171692, "adv/mean_abs_step_conf": 0.763264536857605, "adv/ratio_final_to_reasoning": 1.0878184264941018, "adv/ratio_step_to_reasoning": 1.4941926913989476, "adv/std_final_conf": 0.7944974899291992, "adv/std_reasoning": 0.7754715085029602, "adv/std_step_conf": 0.9323673844337463, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6304736024844719, "calib/avg_num_step_conf": 14.6796875, "calib/ece": 0.27290456431535265, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8381742738589212, "calib/gap": 0.1587585403726708, "calib/mean_conf": 0.9059336099585062, "calib/mu_c": 0.9586335403726708, "calib/mu_w": 0.799875, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2553941908713692, "calib/std_conf": 0.24793600213064737, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6270970863683664, "calib/step_q_c_n": 1922.0, "calib/step_q_gap": 0.0971624458454905, "calib/step_q_w": 0.5299346405228759, "calib/step_q_w_n": 1836.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 681.73828125, "completions/mean_terminated_length": 724.170166015625, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 0.10453333333333334, "grad_norm": 0.8766622543334961, "kl": 0.2059326171875, "learning_rate": 2.861111111111111e-06, "loss": -0.246, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.021250521764159203, "mask/share_reasoning": 0.745991587638855, "mask/share_step_conf": 0.17416416108608246, "num_tokens": 31280214.0, "reward": 0.9029910564422607, "reward_std": 0.21328818798065186, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.68778395652771, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8041356205940247, "step": 98 }, { "adv/mean_abs_final_conf": 0.5864074230194092, "adv/mean_abs_reasoning": 0.5255153775215149, "adv/mean_abs_step_conf": 0.7783534526824951, "adv/ratio_final_to_reasoning": 1.115871101213211, "adv/ratio_step_to_reasoning": 1.4811240279084485, "adv/std_final_conf": 0.7936495542526245, "adv/std_reasoning": 0.7756103873252869, "adv/std_step_conf": 0.9340755939483643, "calib/answer_extract_rate": 0.8828125, "calib/auroc": 0.7115127175368139, "calib/avg_num_step_conf": 17.7578125, "calib/ece": 0.22687224669603534, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.8828125, "calib/frac_conf_gt_0.9": 0.6828193832599119, "calib/gap": 0.28313755020080345, "calib/mean_conf": 0.801057268722467, "calib/mu_c": 0.9045833333333335, "calib/mu_w": 0.6214457831325301, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19678414096916308, "calib/std_conf": 0.33488472772173805, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.552300163132137, "calib/step_q_c_n": 1839.0, "calib/step_q_gap": 0.08282435965965823, "calib/step_q_w": 0.4694758034724788, "calib/step_q_w_n": 2707.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 734.40234375, "completions/mean_terminated_length": 820.9912719726562, "completions/min_length": 0.0, "completions/min_terminated_length": 224.0, "epoch": 0.1056, "grad_norm": 1.0238356590270996, "kl": 0.181793212890625, "learning_rate": 2.8333333333333335e-06, "loss": -0.4596, "mask/has_final_conf_rate": 0.88671875, "mask/share_final_conf": 0.01949150674045086, "mask/share_reasoning": 0.7137925028800964, "mask/share_step_conf": 0.16124722361564636, "num_tokens": 31574021.0, "reward": 0.8591356873512268, "reward_std": 0.24496795237064362, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6734304428100586, "rewards/format_reward_step": 0.8828125, "rewards/step_l2_reward": 0.7557783722877502, "step": 99 }, { "adv/mean_abs_final_conf": 0.49218031764030457, "adv/mean_abs_reasoning": 0.38254693150520325, "adv/mean_abs_step_conf": 0.7464720010757446, "adv/ratio_final_to_reasoning": 1.2865880683024382, "adv/ratio_step_to_reasoning": 1.9513213663448021, "adv/std_final_conf": 0.7524144053459167, "adv/std_reasoning": 0.6613397002220154, "adv/std_step_conf": 0.9313260316848755, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.8291703507220749, "calib/avg_num_step_conf": 14.35546875, "calib/ece": 0.18193415637860075, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6625514403292181, "calib/gap": 0.4215473032714412, "calib/mean_conf": 0.7829218106995884, "calib/mu_c": 0.9338461538461539, "calib/mu_w": 0.5122988505747127, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16144032921810691, "calib/std_conf": 0.34799499496426406, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6065625529661018, "calib/step_q_c_n": 1888.0, "calib/step_q_gap": 0.1626677572190397, "calib/step_q_w": 0.44389479574706214, "calib/step_q_w_n": 1787.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3051.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 775.57421875, "completions/mean_terminated_length": 800.5927124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 247.0, "epoch": 0.10666666666666667, "grad_norm": 0.7905797362327576, "kl": 0.181121826171875, "learning_rate": 2.805555555555556e-06, "loss": -0.0445, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.02066512033343315, "mask/share_reasoning": 0.7727394104003906, "mask/share_step_conf": 0.17534548044204712, "num_tokens": 31879976.0, "reward": 0.9744864702224731, "reward_std": 0.16301025450229645, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7811784744262695, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8560757040977478, "step": 100 }, { "adv/mean_abs_final_conf": 0.5227075219154358, "adv/mean_abs_reasoning": 0.43173086643218994, "adv/mean_abs_step_conf": 0.7493784427642822, "adv/ratio_final_to_reasoning": 1.2107253906468953, "adv/ratio_step_to_reasoning": 1.735753685987573, "adv/std_final_conf": 0.7767247557640076, "adv/std_reasoning": 0.7015949487686157, "adv/std_step_conf": 0.9333872199058533, "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.7633287764866712, "calib/avg_num_step_conf": 18.46484375, "calib/ece": 0.18497816593886465, "calib/final_conf_rate": 0.89453125, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.6943231441048034, "calib/gap": 0.3077947710184554, "calib/mean_conf": 0.8317030567685589, "calib/mu_c": 0.9351973684210527, "calib/mu_w": 0.6274025974025973, "calib/nonempty_final_conf_rate": 0.89453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1764628820960699, "calib/std_conf": 0.3007809036481368, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5562262798634812, "calib/step_q_c_n": 2051.0, "calib/step_q_gap": 0.06015340990832424, "calib/step_q_w": 0.4960728699551569, "calib/step_q_w_n": 2676.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 2886.0, "completions/max_terminated_length": 2886.0, "completions/mean_length": 720.0703125, "completions/mean_terminated_length": 798.0, "completions/min_length": 0.0, "completions/min_terminated_length": 363.0, "epoch": 0.10773333333333333, "grad_norm": 0.9769598841667175, "kl": 0.170135498046875, "learning_rate": 2.7777777777777783e-06, "loss": -0.3476, "mask/has_final_conf_rate": 0.89453125, "mask/share_final_conf": 0.018404729664325714, "mask/share_reasoning": 0.7185856103897095, "mask/share_step_conf": 0.1653534173965454, "num_tokens": 32171306.0, "reward": 0.8911153078079224, "reward_std": 0.197750523686409, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7116265296936035, "rewards/format_reward_step": 0.89453125, "rewards/step_l2_reward": 0.7729478478431702, "step": 101 }, { "adv/mean_abs_final_conf": 0.5290060043334961, "adv/mean_abs_reasoning": 0.3234380781650543, "adv/mean_abs_step_conf": 0.7649726271629333, "adv/ratio_final_to_reasoning": 1.6355711959911474, "adv/ratio_step_to_reasoning": 2.365128532493192, "adv/std_final_conf": 0.7570039629936218, "adv/std_reasoning": 0.6187290549278259, "adv/std_step_conf": 0.9321916103363037, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.7868106617647059, "calib/avg_num_step_conf": 16.92578125, "calib/ece": 0.17418803418803414, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.7051282051282052, "calib/gap": 0.3493235294117646, "calib/mean_conf": 0.8112820512820513, "calib/mu_c": 0.9068235294117647, "calib/mu_w": 0.5575000000000001, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12948717948717942, "calib/std_conf": 0.3303379219364566, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5947416187739464, "calib/step_q_c_n": 2088.0, "calib/step_q_gap": 0.1707416187739464, "calib/step_q_w": 0.424, "calib/step_q_w_n": 2245.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 656.5703125, "completions/mean_terminated_length": 709.2067260742188, "completions/min_length": 0.0, "completions/min_terminated_length": 308.0, "epoch": 0.1088, "grad_norm": 1.2531845569610596, "kl": 0.205841064453125, "learning_rate": 2.7500000000000004e-06, "loss": -0.2938, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.021747365593910217, "mask/share_reasoning": 0.7253262400627136, "mask/share_step_conf": 0.17870762944221497, "num_tokens": 32446084.0, "reward": 0.9338048696517944, "reward_std": 0.20178765058517456, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7530132532119751, "rewards/format_reward_step": 0.9140625, "rewards/step_l2_reward": 0.79897141456604, "step": 102 }, { "adv/mean_abs_final_conf": 0.4550447463989258, "adv/mean_abs_reasoning": 0.3277204930782318, "adv/mean_abs_step_conf": 0.7122563123703003, "adv/ratio_final_to_reasoning": 1.388514774052594, "adv/ratio_step_to_reasoning": 2.173365192027445, "adv/std_final_conf": 0.7031997442245483, "adv/std_reasoning": 0.6404638290405273, "adv/std_step_conf": 0.9319030046463013, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.8430335097001763, "calib/avg_num_step_conf": 15.21875, "calib/ece": 0.14888429752066124, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6487603305785123, "calib/gap": 0.426429721647113, "calib/mean_conf": 0.7888842975206612, "calib/mu_c": 0.9316149068322982, "calib/mu_w": 0.5051851851851852, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13623966942148766, "calib/std_conf": 0.3297068492580144, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5680960443037975, "calib/step_q_c_n": 1896.0, "calib/step_q_gap": 0.10115854430379745, "calib/step_q_w": 0.4669375, "calib/step_q_w_n": 2000.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 801.2109375, "completions/mean_terminated_length": 840.6146850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 243.0, "epoch": 0.10986666666666667, "grad_norm": 1.0091633796691895, "kl": 0.167510986328125, "learning_rate": 2.7222222222222224e-06, "loss": -0.2265, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.019758710637688637, "mask/share_reasoning": 0.768470823764801, "mask/share_step_conf": 0.1648954451084137, "num_tokens": 32755746.0, "reward": 0.9756123423576355, "reward_std": 0.1721249520778656, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7971370816230774, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8392437696456909, "step": 103 }, { "adv/mean_abs_final_conf": 0.6239424347877502, "adv/mean_abs_reasoning": 0.5552152395248413, "adv/mean_abs_step_conf": 0.7641361355781555, "adv/ratio_final_to_reasoning": 1.1237847781731034, "adv/ratio_step_to_reasoning": 1.3762881152759978, "adv/std_final_conf": 0.8441416025161743, "adv/std_reasoning": 0.7931216955184937, "adv/std_step_conf": 0.9340603947639465, "calib/answer_extract_rate": 0.859375, "calib/auroc": 0.8823529411764706, "calib/avg_num_step_conf": 19.48828125, "calib/ece": 0.12918181818181818, "calib/final_conf_rate": 0.859375, "calib/format_rate": 0.859375, "calib/frac_conf_gt_0.9": 0.5590909090909091, "calib/gap": 0.4843522408963586, "calib/mean_conf": 0.7257272727272729, "calib/mu_c": 0.9106617647058823, "calib/mu_w": 0.42630952380952375, "calib/nonempty_final_conf_rate": 0.859375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11836363636363634, "calib/std_conf": 0.35999358350571925, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5315108359133127, "calib/step_q_c_n": 1615.0, "calib/step_q_gap": 0.07182171321028952, "calib/step_q_w": 0.4596891227030232, "calib/step_q_w_n": 3374.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 722.06640625, "completions/mean_terminated_length": 814.312744140625, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.11093333333333333, "grad_norm": 2.5542263984680176, "kl": 0.174072265625, "learning_rate": 2.6944444444444444e-06, "loss": -0.4348, "mask/has_final_conf_rate": 0.859375, "mask/share_final_conf": 0.018851863220334053, "mask/share_reasoning": 0.7025007009506226, "mask/share_step_conf": 0.16536614298820496, "num_tokens": 33047275.0, "reward": 0.8753511309623718, "reward_std": 0.28000450134277344, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7317163944244385, "rewards/format_reward_step": 0.859375, "rewards/step_l2_reward": 0.7408607602119446, "step": 104 }, { "adv/mean_abs_final_conf": 0.6069635152816772, "adv/mean_abs_reasoning": 0.48636770248413086, "adv/mean_abs_step_conf": 0.7328383922576904, "adv/ratio_final_to_reasoning": 1.2479519346815204, "adv/ratio_step_to_reasoning": 1.506757929267726, "adv/std_final_conf": 0.828083336353302, "adv/std_reasoning": 0.7577069997787476, "adv/std_step_conf": 0.9338470697402954, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.7116402116402116, "calib/avg_num_step_conf": 16.05859375, "calib/ece": 0.1609704641350211, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.6877637130801688, "calib/gap": 0.2955090311986862, "calib/mean_conf": 0.8121940928270043, "calib/mu_c": 0.8907471264367814, "calib/mu_w": 0.5952380952380952, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11949367088607596, "calib/std_conf": 0.3134055424811953, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5325445173383317, "calib/step_q_c_n": 2134.0, "calib/step_q_gap": 0.08896586281127056, "calib/step_q_w": 0.4435786545270612, "calib/step_q_w_n": 1977.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2273.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 737.23828125, "completions/mean_terminated_length": 792.995849609375, "completions/min_length": 0.0, "completions/min_terminated_length": 284.0, "epoch": 0.112, "grad_norm": 1.3168301582336426, "kl": 0.1668701171875, "learning_rate": 2.666666666666667e-06, "loss": -0.3545, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.019618578255176544, "mask/share_reasoning": 0.7424402236938477, "mask/share_step_conf": 0.16762866079807281, "num_tokens": 33341768.0, "reward": 0.9317725896835327, "reward_std": 0.23298463225364685, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7515015602111816, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7917311191558838, "step": 105 }, { "adv/mean_abs_final_conf": 0.4749845862388611, "adv/mean_abs_reasoning": 0.42615675926208496, "adv/mean_abs_step_conf": 0.7237845063209534, "adv/ratio_final_to_reasoning": 1.1145771501109694, "adv/ratio_step_to_reasoning": 1.698399686477408, "adv/std_final_conf": 0.7402500510215759, "adv/std_reasoning": 0.701504647731781, "adv/std_step_conf": 0.9320195913314819, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.8459119496855346, "calib/avg_num_step_conf": 16.1171875, "calib/ece": 0.15350000000000014, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6458333333333334, "calib/gap": 0.4023573258793385, "calib/mean_conf": 0.7815, "calib/mu_c": 0.9172955974842767, "calib/mu_w": 0.5149382716049382, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13625000000000012, "calib/std_conf": 0.34111740403171065, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5458628081457664, "calib/step_q_c_n": 1866.0, "calib/step_q_gap": 0.09151103823426193, "calib/step_q_w": 0.45435176991150444, "calib/step_q_w_n": 2260.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3004.0, "completions/max_terminated_length": 3004.0, "completions/mean_length": 704.2890625, "completions/mean_terminated_length": 748.12451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 314.0, "epoch": 0.11306666666666666, "grad_norm": 1.1325608491897583, "kl": 0.1807098388671875, "learning_rate": 2.6388888888888893e-06, "loss": -0.35, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.019888795912265778, "mask/share_reasoning": 0.750140368938446, "mask/share_step_conf": 0.17137709259986877, "num_tokens": 33626650.0, "reward": 0.9610485434532166, "reward_std": 0.19624879956245422, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7741999626159668, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8361783027648926, "step": 106 }, { "adv/mean_abs_final_conf": 0.46741652488708496, "adv/mean_abs_reasoning": 0.37776196002960205, "adv/mean_abs_step_conf": 0.7364166975021362, "adv/ratio_final_to_reasoning": 1.2373308441391437, "adv/ratio_step_to_reasoning": 1.9494199401242767, "adv/std_final_conf": 0.7404344081878662, "adv/std_reasoning": 0.6615345478057861, "adv/std_step_conf": 0.9301761984825134, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7613046701260192, "calib/avg_num_step_conf": 16.1328125, "calib/ece": 0.13954545454545458, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6818181818181818, "calib/gap": 0.34491804628943257, "calib/mean_conf": 0.8054132231404959, "calib/mu_c": 0.906608187134503, "calib/mu_w": 0.5616901408450704, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11917355371900828, "calib/std_conf": 0.3134400484261384, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5207115401301519, "calib/step_q_c_n": 2305.0, "calib/step_q_gap": 0.1041033209520697, "calib/step_q_w": 0.41660821917808216, "calib/step_q_w_n": 1825.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 715.06640625, "completions/mean_terminated_length": 753.3209838867188, "completions/min_length": 0.0, "completions/min_terminated_length": 278.0, "epoch": 0.11413333333333334, "grad_norm": 1.1405235528945923, "kl": 0.17919921875, "learning_rate": 2.6111111111111113e-06, "loss": -0.3163, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.019453587010502815, "mask/share_reasoning": 0.7481480836868286, "mask/share_step_conf": 0.18161706626415253, "num_tokens": 33914323.0, "reward": 0.9685600996017456, "reward_std": 0.1913624256849289, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.779498815536499, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8357464075088501, "step": 107 }, { "adv/mean_abs_final_conf": 0.4155716896057129, "adv/mean_abs_reasoning": 0.2890777587890625, "adv/mean_abs_step_conf": 0.7490019798278809, "adv/ratio_final_to_reasoning": 1.4375775270519926, "adv/ratio_step_to_reasoning": 2.591005212457113, "adv/std_final_conf": 0.70196133852005, "adv/std_reasoning": 0.5962141752243042, "adv/std_step_conf": 0.9318674802780151, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7755922165820643, "calib/avg_num_step_conf": 15.2421875, "calib/ece": 0.11927755102040816, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.763265306122449, "calib/gap": 0.44832445008460237, "calib/mean_conf": 0.8359061224489795, "calib/mu_c": 0.923741116751269, "calib/mu_w": 0.47541666666666665, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07555102040816326, "calib/std_conf": 0.32973906097073563, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5432281262485018, "calib/step_q_c_n": 2503.0, "calib/step_q_gap": 0.1963160461913181, "calib/step_q_w": 0.3469120800571837, "calib/step_q_w_n": 1399.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 731.33203125, "completions/mean_terminated_length": 764.1672973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.1152, "grad_norm": 1.5220770835876465, "kl": 0.1700439453125, "learning_rate": 2.5833333333333337e-06, "loss": -0.2687, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.020580247044563293, "mask/share_reasoning": 0.7575360536575317, "mask/share_step_conf": 0.17891497910022736, "num_tokens": 34204776.0, "reward": 1.006317138671875, "reward_std": 0.15224643051624298, "rewards/accuracy_reward_step": 0.76953125, "rewards/final_brier_reward_step": 0.8325179815292358, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8355849981307983, "step": 108 }, { "adv/mean_abs_final_conf": 0.5334233045578003, "adv/mean_abs_reasoning": 0.32670843601226807, "adv/mean_abs_step_conf": 0.7371534705162048, "adv/ratio_final_to_reasoning": 1.6327197150727077, "adv/ratio_step_to_reasoning": 2.256303753627361, "adv/std_final_conf": 0.7761994004249573, "adv/std_reasoning": 0.6188747882843018, "adv/std_step_conf": 0.9314670562744141, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.8270304951690821, "calib/avg_num_step_conf": 16.51953125, "calib/ece": 0.13906779661016955, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.5211864406779662, "calib/gap": 0.47595108695652183, "calib/mean_conf": 0.665084745762712, "calib/mu_c": 0.8506250000000001, "calib/mu_w": 0.37467391304347825, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09699152542372887, "calib/std_conf": 0.390418605772586, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5304288025889967, "calib/step_q_c_n": 1854.0, "calib/step_q_gap": 0.16112648679952302, "calib/step_q_w": 0.3693023157894737, "calib/step_q_w_n": 2375.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 734.88671875, "completions/mean_terminated_length": 793.8016357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 0.11626666666666667, "grad_norm": 1.3783597946166992, "kl": 0.18536376953125, "learning_rate": 2.5555555555555557e-06, "loss": -0.3752, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.01848272606730461, "mask/share_reasoning": 0.7364345192909241, "mask/share_step_conf": 0.1708640307188034, "num_tokens": 34497507.0, "reward": 0.9415305852890015, "reward_std": 0.19288137555122375, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7680297493934631, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.8181564807891846, "step": 109 }, { "adv/mean_abs_final_conf": 0.577642560005188, "adv/mean_abs_reasoning": 0.37749266624450684, "adv/mean_abs_step_conf": 0.7485542297363281, "adv/ratio_final_to_reasoning": 1.5302086945208135, "adv/ratio_step_to_reasoning": 1.9829636352497506, "adv/std_final_conf": 0.8023480176925659, "adv/std_reasoning": 0.6613473892211914, "adv/std_step_conf": 0.9315283894538879, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7073720397249808, "calib/avg_num_step_conf": 14.77734375, "calib/ece": 0.1866396761133603, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.631578947368421, "calib/gap": 0.30904812834224615, "calib/mean_conf": 0.7690688259109312, "calib/mu_c": 0.8654117647058823, "calib/mu_w": 0.5563636363636362, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1337246963562753, "calib/std_conf": 0.3502993851416319, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5100721265704979, "calib/step_q_c_n": 2149.0, "calib/step_q_gap": 0.1268162514174992, "calib/step_q_w": 0.38325587515299875, "calib/step_q_w_n": 1634.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2788.0, "completions/max_terminated_length": 2788.0, "completions/mean_length": 728.671875, "completions/mean_terminated_length": 752.1773681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.11733333333333333, "grad_norm": 2.002427816390991, "kl": 0.20111083984375, "learning_rate": 2.5277777777777778e-06, "loss": -0.1678, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.020799528807401657, "mask/share_reasoning": 0.7653565406799316, "mask/share_step_conf": 0.1825939416885376, "num_tokens": 34788967.0, "reward": 0.973604679107666, "reward_std": 0.1798972189426422, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7610874772071838, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8603405952453613, "step": 110 }, { "adv/mean_abs_final_conf": 0.4831022620201111, "adv/mean_abs_reasoning": 0.38877198100090027, "adv/mean_abs_step_conf": 0.7911495566368103, "adv/ratio_final_to_reasoning": 1.2426365212234582, "adv/ratio_step_to_reasoning": 2.0349963353840006, "adv/std_final_conf": 0.7388092279434204, "adv/std_reasoning": 0.6406905055046082, "adv/std_step_conf": 0.9329996109008789, "calib/answer_extract_rate": 0.88671875, "calib/auroc": 0.836324570273003, "calib/avg_num_step_conf": 16.53515625, "calib/ece": 0.09458149779735689, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.88671875, "calib/frac_conf_gt_0.9": 0.7136563876651982, "calib/gap": 0.4742985338725984, "calib/mean_conf": 0.8077092511013215, "calib/mu_c": 0.8975543478260868, "calib/mu_w": 0.4232558139534884, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.04585903083700447, "calib/std_conf": 0.3332499743751304, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5335155335628228, "calib/step_q_c_n": 2324.0, "calib/step_q_gap": 0.16686807415999405, "calib/step_q_w": 0.3666474594028287, "calib/step_q_w_n": 1909.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2841.0, "completions/max_terminated_length": 2841.0, "completions/mean_length": 726.84375, "completions/mean_terminated_length": 802.0344848632812, "completions/min_length": 0.0, "completions/min_terminated_length": 351.0, "epoch": 0.1184, "grad_norm": 1.5444231033325195, "kl": 0.183135986328125, "learning_rate": 2.5e-06, "loss": -0.3794, "mask/has_final_conf_rate": 0.88671875, "mask/share_final_conf": 0.01892288215458393, "mask/share_reasoning": 0.7185566425323486, "mask/share_step_conf": 0.16877049207687378, "num_tokens": 35082447.0, "reward": 0.9426184296607971, "reward_std": 0.21018102765083313, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7812378406524658, "rewards/format_reward_step": 0.88671875, "rewards/step_l2_reward": 0.782905101776123, "step": 111 }, { "adv/mean_abs_final_conf": 0.5017001628875732, "adv/mean_abs_reasoning": 0.4246975779533386, "adv/mean_abs_step_conf": 0.7408498525619507, "adv/ratio_final_to_reasoning": 1.1813115707071324, "adv/ratio_step_to_reasoning": 1.7444174184655878, "adv/std_final_conf": 0.7502984404563904, "adv/std_reasoning": 0.7014614343643188, "adv/std_step_conf": 0.9323014616966248, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.8076171875, "calib/avg_num_step_conf": 15.23046875, "calib/ece": 0.1792291666666667, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5541666666666667, "calib/gap": 0.46334374999999994, "calib/mean_conf": 0.6600208333333334, "calib/mu_c": 0.81446875, "calib/mu_w": 0.351125, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0862916666666667, "calib/std_conf": 0.4164073771352267, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5402286236854138, "calib/step_q_c_n": 2187.0, "calib/step_q_gap": 0.21655806293775026, "calib/step_q_w": 0.32367056074766354, "calib/step_q_w_n": 1712.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 837.93359375, "completions/mean_terminated_length": 879.1433715820312, "completions/min_length": 0.0, "completions/min_terminated_length": 394.0, "epoch": 0.11946666666666667, "grad_norm": 1.0276604890823364, "kl": 0.178070068359375, "learning_rate": 2.4722222222222226e-06, "loss": -0.1891, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.01743997260928154, "mask/share_reasoning": 0.7671045064926147, "mask/share_step_conf": 0.16858048737049103, "num_tokens": 35404878.0, "reward": 0.9420138597488403, "reward_std": 0.1948275864124298, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7521448135375977, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.8209453821182251, "step": 112 }, { "adv/mean_abs_final_conf": 0.4828791618347168, "adv/mean_abs_reasoning": 0.2895558476448059, "adv/mean_abs_step_conf": 0.7607555389404297, "adv/ratio_final_to_reasoning": 1.6676546709809774, "adv/ratio_step_to_reasoning": 2.627318857927669, "adv/std_final_conf": 0.7338624596595764, "adv/std_reasoning": 0.572844922542572, "adv/std_step_conf": 0.9308568239212036, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.8748693834900731, "calib/avg_num_step_conf": 16.25390625, "calib/ece": 0.09904166666666672, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6833333333333333, "calib/gap": 0.4958986415882968, "calib/mean_conf": 0.8160416666666667, "calib/mu_c": 0.9524137931034483, "calib/mu_w": 0.45651515151515154, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09504166666666672, "calib/std_conf": 0.31163239283471306, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.577732053040695, "calib/step_q_c_n": 2187.0, "calib/step_q_gap": 0.1598141199099959, "calib/step_q_w": 0.4179179331306991, "calib/step_q_w_n": 1974.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 704.10546875, "completions/mean_terminated_length": 744.8388061523438, "completions/min_length": 0.0, "completions/min_terminated_length": 300.0, "epoch": 0.12053333333333334, "grad_norm": 1.4608343839645386, "kl": 0.216827392578125, "learning_rate": 2.4444444444444447e-06, "loss": -0.1817, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.02076886221766472, "mask/share_reasoning": 0.7411940693855286, "mask/share_step_conf": 0.18334954977035522, "num_tokens": 35690329.0, "reward": 1.0036261081695557, "reward_std": 0.14372394979000092, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.8371511697769165, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8466634750366211, "step": 113 }, { "adv/mean_abs_final_conf": 0.37763625383377075, "adv/mean_abs_reasoning": 0.3088195323944092, "adv/mean_abs_step_conf": 0.7451033592224121, "adv/ratio_final_to_reasoning": 1.2228379821243698, "adv/ratio_step_to_reasoning": 2.412746866900901, "adv/std_final_conf": 0.6689955592155457, "adv/std_reasoning": 0.5960956811904907, "adv/std_step_conf": 0.9305745959281921, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.8789053323937045, "calib/avg_num_step_conf": 15.55078125, "calib/ece": 0.08394190871369295, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8298755186721992, "calib/gap": 0.5109290580220813, "calib/mean_conf": 0.89, "calib/mu_c": 0.9811616161616162, "calib/mu_w": 0.47023255813953485, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07618257261410788, "calib/std_conf": 0.2681285998510627, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6627369689542483, "calib/step_q_c_n": 2448.0, "calib/step_q_gap": 0.22417206354002783, "calib/step_q_w": 0.43856490541422044, "calib/step_q_w_n": 1533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3063.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 748.65234375, "completions/mean_terminated_length": 782.2652587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 288.0, "epoch": 0.1216, "grad_norm": 1.317616581916809, "kl": 0.196868896484375, "learning_rate": 2.4166666666666667e-06, "loss": -0.2015, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.02109391801059246, "mask/share_reasoning": 0.753338098526001, "mask/share_step_conf": 0.1825992465019226, "num_tokens": 35987008.0, "reward": 1.0221213102340698, "reward_std": 0.14799124002456665, "rewards/accuracy_reward_step": 0.7734375, "rewards/final_brier_reward_step": 0.8723347783088684, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8289391398429871, "step": 114 }, { "adv/mean_abs_final_conf": 0.5223723649978638, "adv/mean_abs_reasoning": 0.4652789831161499, "adv/mean_abs_step_conf": 0.7420525550842285, "adv/ratio_final_to_reasoning": 1.1227078461600346, "adv/ratio_step_to_reasoning": 1.5948550912710928, "adv/std_final_conf": 0.7592284083366394, "adv/std_reasoning": 0.7207779884338379, "adv/std_step_conf": 0.9330939650535583, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7567011823455974, "calib/avg_num_step_conf": 16.2109375, "calib/ece": 0.20657024793388418, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6694214876033058, "calib/gap": 0.32617977528089903, "calib/mean_conf": 0.7900413223140496, "calib/mu_c": 0.9100000000000001, "calib/mu_w": 0.5838202247191011, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.182190082644628, "calib/std_conf": 0.33950437616172524, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6168599257884972, "calib/step_q_c_n": 2156.0, "calib/step_q_gap": 0.13217085858689243, "calib/step_q_w": 0.4846890672016048, "calib/step_q_w_n": 1994.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 744.55859375, "completions/mean_terminated_length": 781.1762084960938, "completions/min_length": 0.0, "completions/min_terminated_length": 340.0, "epoch": 0.12266666666666666, "grad_norm": 1.631301999092102, "kl": 0.192962646484375, "learning_rate": 2.388888888888889e-06, "loss": -0.2281, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018878819420933723, "mask/share_reasoning": 0.7551947236061096, "mask/share_step_conf": 0.1790514886379242, "num_tokens": 36282879.0, "reward": 0.9186517596244812, "reward_std": 0.20043343305587769, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7363996505737305, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7915288805961609, "step": 115 }, { "adv/mean_abs_final_conf": 0.4976283311843872, "adv/mean_abs_reasoning": 0.3845769166946411, "adv/mean_abs_step_conf": 0.7298250198364258, "adv/ratio_final_to_reasoning": 1.2939630788592293, "adv/ratio_step_to_reasoning": 1.8977348565512475, "adv/std_final_conf": 0.7586896419525146, "adv/std_reasoning": 0.6816074848175049, "adv/std_step_conf": 0.932213544845581, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.722817879794624, "calib/avg_num_step_conf": 15.265625, "calib/ece": 0.16698795180722903, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6947791164658634, "calib/gap": 0.31264799154334033, "calib/mean_conf": 0.8223293172690763, "calib/mu_c": 0.9190116279069767, "calib/mu_w": 0.6063636363636363, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14927710843373507, "calib/std_conf": 0.30884841810408764, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6477947413061916, "calib/step_q_c_n": 2358.0, "calib/step_q_gap": 0.1277044187255465, "calib/step_q_w": 0.5200903225806451, "calib/step_q_w_n": 1550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2503.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 817.55078125, "completions/mean_terminated_length": 840.5341186523438, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.12373333333333333, "grad_norm": 1.0573772192001343, "kl": 0.17333984375, "learning_rate": 2.361111111111111e-06, "loss": -0.1528, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019139280542731285, "mask/share_reasoning": 0.7793089747428894, "mask/share_step_conf": 0.17420801520347595, "num_tokens": 36596692.0, "reward": 0.9798210263252258, "reward_std": 0.18024872243404388, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7851890921592712, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8455467224121094, "step": 116 }, { "adv/mean_abs_final_conf": 0.528212308883667, "adv/mean_abs_reasoning": 0.363384485244751, "adv/mean_abs_step_conf": 0.7528957724571228, "adv/ratio_final_to_reasoning": 1.453590701672085, "adv/ratio_step_to_reasoning": 2.071898507031811, "adv/std_final_conf": 0.7949268817901611, "adv/std_reasoning": 0.6612570285797119, "adv/std_step_conf": 0.9334501624107361, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7118725868725869, "calib/avg_num_step_conf": 16.19921875, "calib/ece": 0.23553784860557783, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6374501992031872, "calib/gap": 0.2977432432432433, "calib/mean_conf": 0.7728286852589641, "calib/mu_c": 0.9045, "calib/mu_w": 0.6067567567567567, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22529880478087663, "calib/std_conf": 0.3335233354246348, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6066803699897224, "calib/step_q_c_n": 1946.0, "calib/step_q_gap": 0.10854315963079458, "calib/step_q_w": 0.49813721035892783, "calib/step_q_w_n": 2201.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2708.0, "completions/max_terminated_length": 2708.0, "completions/mean_length": 811.0390625, "completions/mean_terminated_length": 827.1952514648438, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 0.1248, "grad_norm": 1.863582730293274, "kl": 0.1826171875, "learning_rate": 2.3333333333333336e-06, "loss": -0.0961, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019116554409265518, "mask/share_reasoning": 0.7717758417129517, "mask/share_step_conf": 0.18957631289958954, "num_tokens": 36910918.0, "reward": 0.925274670124054, "reward_std": 0.1669517159461975, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7245453596115112, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8213165998458862, "step": 117 }, { "adv/mean_abs_final_conf": 0.46183598041534424, "adv/mean_abs_reasoning": 0.2155253142118454, "adv/mean_abs_step_conf": 0.7451225519180298, "adv/ratio_final_to_reasoning": 2.142838682798039, "adv/ratio_step_to_reasoning": 3.45723913983315, "adv/std_final_conf": 0.7363441586494446, "adv/std_reasoning": 0.49626436829566956, "adv/std_step_conf": 0.9306170344352722, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7338340704590514, "calib/avg_num_step_conf": 16.546875, "calib/ece": 0.18885714285714295, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7510204081632653, "calib/gap": 0.3821244471557115, "calib/mean_conf": 0.8423265306122449, "calib/mu_c": 0.9655421686746988, "calib/mu_w": 0.5834177215189873, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17681632653061233, "calib/std_conf": 0.31773544301054324, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6898274374460742, "calib/step_q_c_n": 2318.0, "calib/step_q_gap": 0.1792226407828834, "calib/step_q_w": 0.5106047966631908, "calib/step_q_w_n": 1918.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 837.8828125, "completions/mean_terminated_length": 871.9430541992188, "completions/min_length": 0.0, "completions/min_terminated_length": 385.0, "epoch": 0.12586666666666665, "grad_norm": 1.3745417594909668, "kl": 0.1988067626953125, "learning_rate": 2.305555555555556e-06, "loss": -0.1859, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018030881881713867, "mask/share_reasoning": 0.7629454731941223, "mask/share_step_conf": 0.1799611747264862, "num_tokens": 37229424.0, "reward": 0.9692332148551941, "reward_std": 0.14571306109428406, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.785136342048645, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8322362899780273, "step": 118 }, { "adv/mean_abs_final_conf": 0.488694965839386, "adv/mean_abs_reasoning": 0.46582090854644775, "adv/mean_abs_step_conf": 0.7593731880187988, "adv/ratio_final_to_reasoning": 1.0491048316493878, "adv/ratio_step_to_reasoning": 1.6301827034521799, "adv/std_final_conf": 0.7184910178184509, "adv/std_reasoning": 0.720762312412262, "adv/std_step_conf": 0.9316381812095642, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.6617424242424241, "calib/avg_num_step_conf": 17.08203125, "calib/ece": 0.2053164556962026, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.7426160337552743, "calib/gap": 0.25903535353535356, "calib/mean_conf": 0.8250632911392406, "calib/mu_c": 0.9037575757575758, "calib/mu_w": 0.6447222222222222, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1670886075949368, "calib/std_conf": 0.3260097850686025, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5923070325900515, "calib/step_q_c_n": 2332.0, "calib/step_q_gap": 0.1461482868771657, "calib/step_q_w": 0.4461587457128858, "calib/step_q_w_n": 2041.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 807.94921875, "completions/mean_terminated_length": 869.0546875, "completions/min_length": 0.0, "completions/min_terminated_length": 281.0, "epoch": 0.12693333333333334, "grad_norm": 1.1281754970550537, "kl": 0.165008544921875, "learning_rate": 2.277777777777778e-06, "loss": -0.3661, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01740211620926857, "mask/share_reasoning": 0.737775444984436, "mask/share_step_conf": 0.17450997233390808, "num_tokens": 37541323.0, "reward": 0.9210110306739807, "reward_std": 0.20405098795890808, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7176492214202881, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.8103102445602417, "step": 119 }, { "adv/mean_abs_final_conf": 0.45271146297454834, "adv/mean_abs_reasoning": 0.34042036533355713, "adv/mean_abs_step_conf": 0.7365242838859558, "adv/ratio_final_to_reasoning": 1.32986010555204, "adv/ratio_step_to_reasoning": 2.1635729200991856, "adv/std_final_conf": 0.7090985178947449, "adv/std_reasoning": 0.6404178738594055, "adv/std_step_conf": 0.9312921166419983, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8437837837837838, "calib/avg_num_step_conf": 16.4609375, "calib/ece": 0.11310204081632655, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7346938775510204, "calib/gap": 0.5426891891891892, "calib/mean_conf": 0.7962857142857143, "calib/mu_c": 0.9291891891891892, "calib/mu_w": 0.3865, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07714285714285718, "calib/std_conf": 0.3663784148351849, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5460447185813416, "calib/step_q_c_n": 2594.0, "calib/step_q_gap": 0.1906064469764034, "calib/step_q_w": 0.35543827160493824, "calib/step_q_w_n": 1620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3028.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 807.69921875, "completions/mean_terminated_length": 837.1295776367188, "completions/min_length": 0.0, "completions/min_terminated_length": 335.0, "epoch": 0.128, "grad_norm": 1.5760127305984497, "kl": 0.175811767578125, "learning_rate": 2.25e-06, "loss": -0.2105, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01830362156033516, "mask/share_reasoning": 0.763595700263977, "mask/share_step_conf": 0.18294444680213928, "num_tokens": 37854782.0, "reward": 1.00954270362854, "reward_std": 0.15593905746936798, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.842052698135376, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.841094970703125, "step": 120 }, { "adv/mean_abs_final_conf": 0.6101598739624023, "adv/mean_abs_reasoning": 0.47542914748191833, "adv/mean_abs_step_conf": 0.7502731084823608, "adv/ratio_final_to_reasoning": 1.2833876029563545, "adv/ratio_step_to_reasoning": 1.5780965732878114, "adv/std_final_conf": 0.8184146285057068, "adv/std_reasoning": 0.7395377159118652, "adv/std_step_conf": 0.933059573173523, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7071428571428572, "calib/avg_num_step_conf": 17.65234375, "calib/ece": 0.16775933609958507, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8132780082987552, "calib/gap": 0.33441298701298705, "calib/mean_conf": 0.865103734439834, "calib/mu_c": 0.9566857142857144, "calib/mu_w": 0.6222727272727273, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15336099585062243, "calib/std_conf": 0.3078719786116783, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4796006028636022, "calib/step_q_c_n": 2654.0, "calib/step_q_gap": 0.08274805594671208, "calib/step_q_w": 0.3968525469168901, "calib/step_q_w_n": 1865.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2071.0, "completions/max_terminated_length": 2071.0, "completions/mean_length": 870.2421875, "completions/mean_terminated_length": 920.5867309570312, "completions/min_length": 0.0, "completions/min_terminated_length": 317.0, "epoch": 0.12906666666666666, "grad_norm": 1.001937985420227, "kl": 0.1642913818359375, "learning_rate": 2.222222222222222e-06, "loss": -0.3088, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.016683416441082954, "mask/share_reasoning": 0.7530420422554016, "mask/share_step_conf": 0.1755870282649994, "num_tokens": 38182620.0, "reward": 0.9672816395759583, "reward_std": 0.233428955078125, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7719972729682922, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8375660181045532, "step": 121 }, { "adv/mean_abs_final_conf": 0.47844454646110535, "adv/mean_abs_reasoning": 0.3753058910369873, "adv/mean_abs_step_conf": 0.7403842806816101, "adv/ratio_final_to_reasoning": 1.2748122475219912, "adv/ratio_step_to_reasoning": 1.972748891939571, "adv/std_final_conf": 0.7223483324050903, "adv/std_reasoning": 0.6613174676895142, "adv/std_step_conf": 0.9320122003555298, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.809391149870801, "calib/avg_num_step_conf": 16.609375, "calib/ece": 0.13643442622950835, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7418032786885246, "calib/gap": 0.49550064599483223, "calib/mean_conf": 0.8019262295081968, "calib/mu_c": 0.948139534883721, "calib/mu_w": 0.4526388888888888, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11672131147541, "calib/std_conf": 0.35996239940324887, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4943472956486376, "calib/step_q_c_n": 2459.0, "calib/step_q_gap": 0.1431203017836069, "calib/step_q_w": 0.3512269938650307, "calib/step_q_w_n": 1793.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 844.21484375, "completions/mean_terminated_length": 882.1183471679688, "completions/min_length": 0.0, "completions/min_terminated_length": 361.0, "epoch": 0.13013333333333332, "grad_norm": 1.3635586500167847, "kl": 0.168548583984375, "learning_rate": 2.1944444444444445e-06, "loss": -0.2296, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017064107581973076, "mask/share_reasoning": 0.7619980573654175, "mask/share_step_conf": 0.17796911299228668, "num_tokens": 38506083.0, "reward": 0.9945069551467896, "reward_std": 0.17891597747802734, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.8188722729682922, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8451414704322815, "step": 122 }, { "adv/mean_abs_final_conf": 0.5975226163864136, "adv/mean_abs_reasoning": 0.42298442125320435, "adv/mean_abs_step_conf": 0.7534896731376648, "adv/ratio_final_to_reasoning": 1.4126350436644763, "adv/ratio_step_to_reasoning": 1.7813650699126231, "adv/std_final_conf": 0.8297080397605896, "adv/std_reasoning": 0.7015055418014526, "adv/std_step_conf": 0.9317981600761414, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6623387228482771, "calib/avg_num_step_conf": 18.08984375, "calib/ece": 0.22391489361702122, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.7106382978723405, "calib/gap": 0.28907071696880604, "calib/mean_conf": 0.7890212765957446, "calib/mu_c": 0.884968152866242, "calib/mu_w": 0.5958974358974359, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17242553191489357, "calib/std_conf": 0.3619114438150965, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4730118365627633, "calib/step_q_c_n": 2374.0, "calib/step_q_gap": 0.1243011586717575, "calib/step_q_w": 0.3487106778910058, "calib/step_q_w_n": 2257.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2648.0, "completions/max_terminated_length": 2648.0, "completions/mean_length": 887.015625, "completions/mean_terminated_length": 958.1265258789062, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.1312, "grad_norm": 1.798366665840149, "kl": 0.157623291015625, "learning_rate": 2.166666666666667e-06, "loss": -0.2705, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.015909792855381966, "mask/share_reasoning": 0.7412210702896118, "mask/share_step_conf": 0.16865041851997375, "num_tokens": 38838447.0, "reward": 0.8978673219680786, "reward_std": 0.20159657299518585, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6984351873397827, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.7910494804382324, "step": 123 }, { "adv/mean_abs_final_conf": 0.4956919252872467, "adv/mean_abs_reasoning": 0.37638136744499207, "adv/mean_abs_step_conf": 0.7453024983406067, "adv/ratio_final_to_reasoning": 1.3169937944914125, "adv/ratio_step_to_reasoning": 1.9801790492445996, "adv/std_final_conf": 0.7436941266059875, "adv/std_reasoning": 0.6404518485069275, "adv/std_step_conf": 0.9319395422935486, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8204586852127835, "calib/avg_num_step_conf": 16.7109375, "calib/ece": 0.15329317269076298, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6907630522088354, "calib/gap": 0.50397913561848, "calib/mean_conf": 0.7516064257028111, "calib/mu_c": 0.8851912568306012, "calib/mu_w": 0.3812121212121212, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08497991967871475, "calib/std_conf": 0.39962812748853843, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45866718188353706, "calib/step_q_c_n": 2782.0, "calib/step_q_gap": 0.10763777011883113, "calib/step_q_w": 0.3510294117647059, "calib/step_q_w_n": 1496.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 891.6328125, "completions/mean_terminated_length": 913.0320434570312, "completions/min_length": 0.0, "completions/min_terminated_length": 330.0, "epoch": 0.13226666666666667, "grad_norm": 1.2462170124053955, "kl": 0.1632232666015625, "learning_rate": 2.138888888888889e-06, "loss": -0.136, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01659967750310898, "mask/share_reasoning": 0.783950924873352, "mask/share_step_conf": 0.17601194977760315, "num_tokens": 39173521.0, "reward": 1.0056735277175903, "reward_std": 0.1546945720911026, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.8185582160949707, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8552888631820679, "step": 124 }, { "adv/mean_abs_final_conf": 0.638321042060852, "adv/mean_abs_reasoning": 0.5520058870315552, "adv/mean_abs_step_conf": 0.7365944385528564, "adv/ratio_final_to_reasoning": 1.1563663668397846, "adv/ratio_step_to_reasoning": 1.3343959835536126, "adv/std_final_conf": 0.8369554877281189, "adv/std_reasoning": 0.7757068276405334, "adv/std_step_conf": 0.9327099919319153, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6364816149963951, "calib/avg_num_step_conf": 16.34375, "calib/ece": 0.3091697095435686, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6473029045643154, "calib/gap": 0.2068954217736122, "calib/mean_conf": 0.723444398340249, "calib/mu_c": 0.8050006849315069, "calib/mu_w": 0.5981052631578947, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2134024896265561, "calib/std_conf": 0.4091855021450746, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4394301288404361, "calib/step_q_c_n": 2018.0, "calib/step_q_gap": 0.060925881379678914, "calib/step_q_w": 0.37850424746075717, "calib/step_q_w_n": 2166.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2315.0, "completions/max_terminated_length": 2315.0, "completions/mean_length": 843.921875, "completions/mean_terminated_length": 889.0699462890625, "completions/min_length": 0.0, "completions/min_terminated_length": 307.0, "epoch": 0.13333333333333333, "grad_norm": 2.6262714862823486, "kl": 0.17041015625, "learning_rate": 2.1111111111111114e-06, "loss": -0.216, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017273977398872375, "mask/share_reasoning": 0.761811375617981, "mask/share_step_conf": 0.17013338208198547, "num_tokens": 39494373.0, "reward": 0.8694828152656555, "reward_std": 0.22389066219329834, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.638969898223877, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.797652006149292, "step": 125 }, { "adv/mean_abs_final_conf": 0.47462698817253113, "adv/mean_abs_reasoning": 0.48116013407707214, "adv/mean_abs_step_conf": 0.739993691444397, "adv/ratio_final_to_reasoning": 0.9864220964251903, "adv/ratio_step_to_reasoning": 1.5379364145863856, "adv/std_final_conf": 0.742769181728363, "adv/std_reasoning": 0.7394794821739197, "adv/std_step_conf": 0.9330347776412964, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.7944596995638831, "calib/avg_num_step_conf": 17.72265625, "calib/ece": 0.21266051502145927, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.5579399141630901, "calib/gap": 0.4827885801970603, "calib/mean_conf": 0.6214167381974248, "calib/mu_c": 0.7913251655629139, "calib/mu_w": 0.3085365853658536, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09300429184549364, "calib/std_conf": 0.45432358032857983, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44223505802707935, "calib/step_q_c_n": 2068.0, "calib/step_q_gap": 0.11264413052606675, "calib/step_q_w": 0.3295909275010126, "calib/step_q_w_n": 2469.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 802.32421875, "completions/mean_terminated_length": 870.3178100585938, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.1344, "grad_norm": 2.2203688621520996, "kl": 0.17095947265625, "learning_rate": 2.0833333333333334e-06, "loss": -0.3218, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.017456606030464172, "mask/share_reasoning": 0.7298016548156738, "mask/share_step_conf": 0.1746167689561844, "num_tokens": 39805232.0, "reward": 0.8997466564178467, "reward_std": 0.17906725406646729, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7144988775253296, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": 0.7849945425987244, "step": 126 }, { "adv/mean_abs_final_conf": 0.5340908765792847, "adv/mean_abs_reasoning": 0.4019251763820648, "adv/mean_abs_step_conf": 0.7468769550323486, "adv/ratio_final_to_reasoning": 1.328831603401686, "adv/ratio_step_to_reasoning": 1.858248746085956, "adv/std_final_conf": 0.7581247091293335, "adv/std_reasoning": 0.681617796421051, "adv/std_step_conf": 0.9322848320007324, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7414828975216767, "calib/avg_num_step_conf": 15.89453125, "calib/ece": 0.2650806451612904, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5443548387096774, "calib/gap": 0.3839270840445142, "calib/mean_conf": 0.6168548387096774, "calib/mu_c": 0.7670198675496688, "calib/mu_w": 0.3830927835051546, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13653225806451616, "calib/std_conf": 0.45962882453949144, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46720219078046554, "calib/step_q_c_n": 2191.0, "calib/step_q_gap": 0.1338315837517115, "calib/step_q_w": 0.33337060702875404, "calib/step_q_w_n": 1878.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 810.37890625, "completions/mean_terminated_length": 833.1605834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 310.0, "epoch": 0.13546666666666668, "grad_norm": 1.509108304977417, "kl": 0.1894378662109375, "learning_rate": 2.0555555555555555e-06, "loss": -0.0941, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018593881279230118, "mask/share_reasoning": 0.7689633965492249, "mask/share_step_conf": 0.18509897589683533, "num_tokens": 40116361.0, "reward": 0.9314602613449097, "reward_std": 0.17034628987312317, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7065680027008057, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8454150557518005, "step": 127 }, { "adv/mean_abs_final_conf": 0.5389367341995239, "adv/mean_abs_reasoning": 0.4364900588989258, "adv/mean_abs_step_conf": 0.7516690492630005, "adv/ratio_final_to_reasoning": 1.234705632378035, "adv/ratio_step_to_reasoning": 1.722075987616153, "adv/std_final_conf": 0.8001542687416077, "adv/std_reasoning": 0.7393285036087036, "adv/std_step_conf": 0.9328148365020752, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.8470835142567665, "calib/avg_num_step_conf": 16.65625, "calib/ece": 0.15161825726141076, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5933609958506224, "calib/gap": 0.6231278043132147, "calib/mean_conf": 0.6317842323651451, "calib/mu_c": 0.8748299319727891, "calib/mu_w": 0.25170212765957445, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08672199170124478, "calib/std_conf": 0.46266830639571904, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4711478800413651, "calib/step_q_c_n": 1934.0, "calib/step_q_gap": 0.14030667832462684, "calib/step_q_w": 0.33084120171673825, "calib/step_q_w_n": 2330.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 826.1484375, "completions/mean_terminated_length": 873.942138671875, "completions/min_length": 0.0, "completions/min_terminated_length": 352.0, "epoch": 0.13653333333333334, "grad_norm": 1.3248347043991089, "kl": 0.190185546875, "learning_rate": 2.027777777777778e-06, "loss": -0.3391, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017892982810735703, "mask/share_reasoning": 0.7522530555725098, "mask/share_step_conf": 0.17516639828681946, "num_tokens": 40434519.0, "reward": 0.9638656377792358, "reward_std": 0.21815454959869385, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7945922017097473, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8300141096115112, "step": 128 }, { "adv/mean_abs_final_conf": 0.5192984938621521, "adv/mean_abs_reasoning": 0.43999049067497253, "adv/mean_abs_step_conf": 0.7251843214035034, "adv/ratio_final_to_reasoning": 1.1802493573566015, "adv/ratio_step_to_reasoning": 1.6481818056818136, "adv/std_final_conf": 0.7576813101768494, "adv/std_reasoning": 0.7015329003334045, "adv/std_step_conf": 0.93047696352005, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6167287977632805, "calib/avg_num_step_conf": 14.92578125, "calib/ece": 0.2675806451612903, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8064516129032258, "calib/gap": 0.21778813296054644, "calib/mean_conf": 0.8364516129032258, "calib/mu_c": 0.9014367816091952, "calib/mu_w": 0.6836486486486487, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20120967741935486, "calib/std_conf": 0.35480274744451, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4805057096247961, "calib/step_q_c_n": 2452.0, "calib/step_q_gap": 0.047532736651823115, "calib/step_q_w": 0.432972972972973, "calib/step_q_w_n": 1369.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2411.0, "completions/max_terminated_length": 2411.0, "completions/mean_length": 777.91015625, "completions/mean_terminated_length": 803.0040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 313.0, "epoch": 0.1376, "grad_norm": 1.8996646404266357, "kl": 0.1866302490234375, "learning_rate": 2.0000000000000003e-06, "loss": -0.1629, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019872169941663742, "mask/share_reasoning": 0.7659735083580017, "mask/share_step_conf": 0.18290433287620544, "num_tokens": 40736048.0, "reward": 0.9422010183334351, "reward_std": 0.17796912789344788, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7147148251533508, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8399996161460876, "step": 129 }, { "adv/mean_abs_final_conf": 0.28744834661483765, "adv/mean_abs_reasoning": 0.23953956365585327, "adv/mean_abs_step_conf": 0.737176775932312, "adv/ratio_final_to_reasoning": 1.2000036329189234, "adv/ratio_step_to_reasoning": 3.077473986683113, "adv/std_final_conf": 0.5962389707565308, "adv/std_reasoning": 0.5482521653175354, "adv/std_step_conf": 0.9300318360328674, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7417146345717773, "calib/avg_num_step_conf": 15.22265625, "calib/ece": 0.15208163265306132, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8244897959183674, "calib/gap": 0.44973137973137955, "calib/mean_conf": 0.8471020408163263, "calib/mu_c": 0.9627472527472527, "calib/mu_w": 0.5130158730158731, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12816326530612254, "calib/std_conf": 0.3438986707012995, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5272486983842011, "calib/step_q_c_n": 2228.0, "calib/step_q_gap": 0.16815942336922207, "calib/step_q_w": 0.359089275014979, "calib/step_q_w_n": 1669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2722.0, "completions/max_terminated_length": 2722.0, "completions/mean_length": 699.94921875, "completions/mean_terminated_length": 725.4534301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.13866666666666666, "grad_norm": 2.384445905685425, "kl": 0.20538330078125, "learning_rate": 1.9722222222222224e-06, "loss": -0.1488, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02121722884476185, "mask/share_reasoning": 0.751331090927124, "mask/share_step_conf": 0.1922954022884369, "num_tokens": 41020523.0, "reward": 0.994517982006073, "reward_std": 0.11978757381439209, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.811162531375885, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8450608849525452, "step": 130 }, { "adv/mean_abs_final_conf": 0.5089925527572632, "adv/mean_abs_reasoning": 0.4016663730144501, "adv/mean_abs_step_conf": 0.7433037757873535, "adv/ratio_final_to_reasoning": 1.2672023025909416, "adv/ratio_step_to_reasoning": 1.8505501722958844, "adv/std_final_conf": 0.7455973029136658, "adv/std_reasoning": 0.6816700100898743, "adv/std_step_conf": 0.9319937229156494, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.8307886258579633, "calib/avg_num_step_conf": 15.96875, "calib/ece": 0.18472803347280334, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.502092050209205, "calib/gap": 0.586250175094551, "calib/mean_conf": 0.5586192468619247, "calib/mu_c": 0.8554237288135592, "calib/mu_w": 0.2691735537190082, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12481171548117156, "calib/std_conf": 0.47216870057776017, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5208772757475082, "calib/step_q_c_n": 1505.0, "calib/step_q_gap": 0.1422632610359325, "calib/step_q_w": 0.3786140147115757, "calib/step_q_w_n": 2583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2673.0, "completions/max_terminated_length": 2673.0, "completions/mean_length": 745.875, "completions/mean_terminated_length": 792.2987670898438, "completions/min_length": 0.0, "completions/min_terminated_length": 400.0, "epoch": 0.13973333333333332, "grad_norm": 1.7262725830078125, "kl": 0.208740234375, "learning_rate": 1.944444444444445e-06, "loss": -0.1946, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.018790874630212784, "mask/share_reasoning": 0.7444982528686523, "mask/share_step_conf": 0.1781170815229416, "num_tokens": 41317675.0, "reward": 0.9334285259246826, "reward_std": 0.18307605385780334, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7617784738540649, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8261722326278687, "step": 131 }, { "adv/mean_abs_final_conf": 0.4457632601261139, "adv/mean_abs_reasoning": 0.4157816767692566, "adv/mean_abs_step_conf": 0.744597315788269, "adv/ratio_final_to_reasoning": 1.072108957734316, "adv/ratio_step_to_reasoning": 1.7908372527957575, "adv/std_final_conf": 0.7231193780899048, "adv/std_reasoning": 0.7013997435569763, "adv/std_step_conf": 0.9315914511680603, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.8034650187596557, "calib/avg_num_step_conf": 15.1953125, "calib/ece": 0.11728395061728394, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7983539094650206, "calib/gap": 0.5668252041491944, "calib/mean_conf": 0.8214814814814814, "calib/mu_c": 0.9287817258883247, "calib/mu_w": 0.36195652173913045, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06403292181069958, "calib/std_conf": 0.3693931552627997, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5528804356595401, "calib/step_q_c_n": 2479.0, "calib/step_q_gap": 0.10895414225061034, "calib/step_q_w": 0.4439262934089298, "calib/step_q_w_n": 1411.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2057.0, "completions/max_terminated_length": 2057.0, "completions/mean_length": 751.87109375, "completions/mean_terminated_length": 785.6285400390625, "completions/min_length": 0.0, "completions/min_terminated_length": 281.0, "epoch": 0.1408, "grad_norm": 0.9695210456848145, "kl": 0.19598388671875, "learning_rate": 1.916666666666667e-06, "loss": -0.1617, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.020302962511777878, "mask/share_reasoning": 0.7580804824829102, "mask/share_step_conf": 0.17864784598350525, "num_tokens": 41615746.0, "reward": 1.008565068244934, "reward_std": 0.1856071650981903, "rewards/accuracy_reward_step": 0.76953125, "rewards/final_brier_reward_step": 0.8390554785728455, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8343245983123779, "step": 132 }, { "adv/mean_abs_final_conf": 0.586275041103363, "adv/mean_abs_reasoning": 0.5401917099952698, "adv/mean_abs_step_conf": 0.7389485836029053, "adv/ratio_final_to_reasoning": 1.0853092157013975, "adv/ratio_step_to_reasoning": 1.3679376597789255, "adv/std_final_conf": 0.8060058355331421, "adv/std_reasoning": 0.8099854588508606, "adv/std_step_conf": 0.9322542548179626, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6973301713586291, "calib/avg_num_step_conf": 17.890625, "calib/ece": 0.24563025210084036, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.7941176470588235, "calib/gap": 0.34347766217870257, "calib/mean_conf": 0.8152941176470588, "calib/mu_c": 0.939407894736842, "calib/mu_w": 0.5959302325581395, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21113445378151263, "calib/std_conf": 0.374132775445933, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5214712153518123, "calib/step_q_c_n": 2345.0, "calib/step_q_gap": 0.10588284846143198, "calib/step_q_w": 0.4155883668903803, "calib/step_q_w_n": 2235.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 869.65625, "completions/mean_terminated_length": 931.5146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 225.0, "epoch": 0.14186666666666667, "grad_norm": 1.4726386070251465, "kl": 0.1634674072265625, "learning_rate": 1.888888888888889e-06, "loss": -0.2602, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.016635891050100327, "mask/share_reasoning": 0.7523881793022156, "mask/share_step_conf": 0.164569690823555, "num_tokens": 41944722.0, "reward": 0.921579122543335, "reward_std": 0.24460284411907196, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7033835649490356, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.8350871801376343, "step": 133 }, { "adv/mean_abs_final_conf": 0.5112262964248657, "adv/mean_abs_reasoning": 0.4516318142414093, "adv/mean_abs_step_conf": 0.7432727813720703, "adv/ratio_final_to_reasoning": 1.1319536850687884, "adv/ratio_step_to_reasoning": 1.6457493868551323, "adv/std_final_conf": 0.7718311548233032, "adv/std_reasoning": 0.7393367290496826, "adv/std_step_conf": 0.9297084212303162, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7596343873517787, "calib/avg_num_step_conf": 15.45703125, "calib/ece": 0.19085365853658537, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7154471544715447, "calib/gap": 0.46503387916431393, "calib/mean_conf": 0.7632926829268292, "calib/mu_c": 0.9372077922077922, "calib/mu_w": 0.4721739130434782, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1640650406504065, "calib/std_conf": 0.4014345866325352, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5383115713642505, "calib/step_q_c_n": 2221.0, "calib/step_q_gap": 0.15309267735503385, "calib/step_q_w": 0.3852188940092166, "calib/step_q_w_n": 1736.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2695.0, "completions/max_terminated_length": 2695.0, "completions/mean_length": 862.15625, "completions/mean_terminated_length": 889.9677124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 331.0, "epoch": 0.14293333333333333, "grad_norm": 0.893550455570221, "kl": 0.17303466796875, "learning_rate": 1.8611111111111113e-06, "loss": -0.1254, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01787702739238739, "mask/share_reasoning": 0.7727956771850586, "mask/share_step_conf": 0.17807723581790924, "num_tokens": 42274386.0, "reward": 0.972447395324707, "reward_std": 0.19915707409381866, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7683347463607788, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8648412227630615, "step": 134 }, { "adv/mean_abs_final_conf": 0.47483566403388977, "adv/mean_abs_reasoning": 0.414731502532959, "adv/mean_abs_step_conf": 0.7407344579696655, "adv/ratio_final_to_reasoning": 1.1449230674155366, "adv/ratio_step_to_reasoning": 1.7860578553730648, "adv/std_final_conf": 0.7404298186302185, "adv/std_reasoning": 0.7014027237892151, "adv/std_step_conf": 0.9306936264038086, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6752712308267864, "calib/avg_num_step_conf": 16.171875, "calib/ece": 0.24105691056910575, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7845528455284553, "calib/gap": 0.25842424242424233, "calib/mean_conf": 0.8533333333333333, "calib/mu_c": 0.9384242424242423, "calib/mu_w": 0.6799999999999999, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.211829268292683, "calib/std_conf": 0.3164055406036077, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5287799657534247, "calib/step_q_c_n": 2336.0, "calib/step_q_gap": 0.09430103005497681, "calib/step_q_w": 0.43447893569844787, "calib/step_q_w_n": 1804.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2934.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 817.140625, "completions/mean_terminated_length": 846.9149780273438, "completions/min_length": 0.0, "completions/min_terminated_length": 330.0, "epoch": 0.144, "grad_norm": 1.0808688402175903, "kl": 0.17889404296875, "learning_rate": 1.8333333333333333e-06, "loss": -0.1611, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018543971702456474, "mask/share_reasoning": 0.7660702466964722, "mask/share_step_conf": 0.18022947013378143, "num_tokens": 42589454.0, "reward": 0.953700602054596, "reward_std": 0.17119142413139343, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7301585674285889, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8561487793922424, "step": 135 }, { "adv/mean_abs_final_conf": 0.5471542477607727, "adv/mean_abs_reasoning": 0.46313631534576416, "adv/mean_abs_step_conf": 0.7517925500869751, "adv/ratio_final_to_reasoning": 1.1814108063460391, "adv/ratio_step_to_reasoning": 1.623264091319871, "adv/std_final_conf": 0.7937067747116089, "adv/std_reasoning": 0.7575483918190002, "adv/std_step_conf": 0.9331446290016174, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.7184873949579831, "calib/avg_num_step_conf": 17.75, "calib/ece": 0.2541350210970465, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.6962025316455697, "calib/gap": 0.3129435107376284, "calib/mean_conf": 0.7740506329113923, "calib/mu_c": 0.884967320261438, "calib/mu_w": 0.5720238095238096, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1913080168776372, "calib/std_conf": 0.3850591022222962, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.538657156910969, "calib/step_q_c_n": 2033.0, "calib/step_q_gap": 0.15307531700654842, "calib/step_q_w": 0.38558183990442063, "calib/step_q_w_n": 2511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 767.2421875, "completions/mean_terminated_length": 825.2689208984375, "completions/min_length": 0.0, "completions/min_terminated_length": 260.0, "epoch": 0.14506666666666668, "grad_norm": 1.668074369430542, "kl": 0.194366455078125, "learning_rate": 1.8055555555555557e-06, "loss": -0.3404, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.018842482939362526, "mask/share_reasoning": 0.7297143936157227, "mask/share_step_conf": 0.18113064765930176, "num_tokens": 42894356.0, "reward": 0.9083887338638306, "reward_std": 0.21544815599918365, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6939855217933655, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.8181042671203613, "step": 136 }, { "adv/mean_abs_final_conf": 0.4304378926753998, "adv/mean_abs_reasoning": 0.3329238295555115, "adv/mean_abs_step_conf": 0.735514760017395, "adv/ratio_final_to_reasoning": 1.2929020228142873, "adv/ratio_step_to_reasoning": 2.2092583790093516, "adv/std_final_conf": 0.7025694847106934, "adv/std_reasoning": 0.6404469609260559, "adv/std_step_conf": 0.9302935600280762, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6655455351862805, "calib/avg_num_step_conf": 16.9375, "calib/ece": 0.27730290456431533, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8049792531120332, "calib/gap": 0.24703356002365473, "calib/mean_conf": 0.8542323651452283, "calib/mu_c": 0.9454605263157895, "calib/mu_w": 0.6984269662921347, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2504149377593361, "calib/std_conf": 0.32701645743574614, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.589626168224299, "calib/step_q_c_n": 2247.0, "calib/step_q_gap": 0.1252936646340645, "calib/step_q_w": 0.46433250359023454, "calib/step_q_w_n": 2089.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2455.0, "completions/max_terminated_length": 2455.0, "completions/mean_length": 759.49609375, "completions/mean_terminated_length": 806.7677001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 305.0, "epoch": 0.14613333333333334, "grad_norm": 1.3590768575668335, "kl": 0.18255615234375, "learning_rate": 1.777777777777778e-06, "loss": -0.2852, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018493767827749252, "mask/share_reasoning": 0.742875337600708, "mask/share_step_conf": 0.18003717064857483, "num_tokens": 43195771.0, "reward": 0.9076789617538452, "reward_std": 0.18388333916664124, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6827605962753296, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8255661129951477, "step": 137 }, { "adv/mean_abs_final_conf": 0.4995764195919037, "adv/mean_abs_reasoning": 0.36340421438217163, "adv/mean_abs_step_conf": 0.744683563709259, "adv/ratio_final_to_reasoning": 1.3747127848840175, "adv/ratio_step_to_reasoning": 2.049188023246526, "adv/std_final_conf": 0.7227240800857544, "adv/std_reasoning": 0.640505850315094, "adv/std_step_conf": 0.9312154054641724, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7903603603603604, "calib/avg_num_step_conf": 16.06640625, "calib/ece": 0.1607755102040816, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8040816326530612, "calib/gap": 0.41652702702702715, "calib/mean_conf": 0.8410204081632654, "calib/mu_c": 0.9430270270270271, "calib/mu_w": 0.5265, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12334693877551015, "calib/std_conf": 0.343259554903409, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5379794385132464, "calib/step_q_c_n": 2529.0, "calib/step_q_gap": 0.08227868093748875, "calib/step_q_w": 0.4557007575757576, "calib/step_q_w_n": 1584.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1985.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 750.390625, "completions/mean_terminated_length": 784.0816040039062, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.1472, "grad_norm": 0.9787538647651672, "kl": 0.197845458984375, "learning_rate": 1.75e-06, "loss": -0.2109, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.020196985453367233, "mask/share_reasoning": 0.7511271238327026, "mask/share_step_conf": 0.18570713698863983, "num_tokens": 43492207.0, "reward": 0.9933643341064453, "reward_std": 0.1851043999195099, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.8076566457748413, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8431345224380493, "step": 138 }, { "adv/mean_abs_final_conf": 0.47331178188323975, "adv/mean_abs_reasoning": 0.35503894090652466, "adv/mean_abs_step_conf": 0.7147395610809326, "adv/ratio_final_to_reasoning": 1.3331263907973807, "adv/ratio_step_to_reasoning": 2.013130050624815, "adv/std_final_conf": 0.7576723694801331, "adv/std_reasoning": 0.6612539887428284, "adv/std_step_conf": 0.9314609169960022, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.786339962121212, "calib/avg_num_step_conf": 14.28125, "calib/ece": 0.17427419354838708, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8064516129032258, "calib/gap": 0.3549621212121211, "calib/mean_conf": 0.8727419354838709, "calib/mu_c": 0.9757954545454545, "calib/mu_w": 0.6208333333333333, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16866935483870968, "calib/std_conf": 0.2950050265008548, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5645802583025831, "calib/step_q_c_n": 2168.0, "calib/step_q_gap": 0.12490283894774434, "calib/step_q_w": 0.4396774193548387, "calib/step_q_w_n": 1488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3037.0, "completions/max_terminated_length": 3037.0, "completions/mean_length": 717.12109375, "completions/mean_terminated_length": 737.2810668945312, "completions/min_length": 0.0, "completions/min_terminated_length": 280.0, "epoch": 0.14826666666666666, "grad_norm": 1.5027626752853394, "kl": 0.204742431640625, "learning_rate": 1.7222222222222224e-06, "loss": -0.1245, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.020879067480564117, "mask/share_reasoning": 0.7668943405151367, "mask/share_step_conf": 0.18488281965255737, "num_tokens": 43778886.0, "reward": 0.9996874928474426, "reward_std": 0.1625138521194458, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.8007844090461731, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8673404455184937, "step": 139 }, { "adv/mean_abs_final_conf": 0.3784880042076111, "adv/mean_abs_reasoning": 0.26932770013809204, "adv/mean_abs_step_conf": 0.7536041140556335, "adv/ratio_final_to_reasoning": 1.4053066357955362, "adv/ratio_step_to_reasoning": 2.7980935999870757, "adv/std_final_conf": 0.6626982688903809, "adv/std_reasoning": 0.5727473497390747, "adv/std_step_conf": 0.9322657585144043, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6601719967750604, "calib/avg_num_step_conf": 14.76953125, "calib/ece": 0.19098360655737703, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8688524590163934, "calib/gap": 0.23781420765027328, "calib/mean_conf": 0.9154098360655737, "calib/mu_c": 0.9748633879781421, "calib/mu_w": 0.7370491803278688, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1781967213114754, "calib/std_conf": 0.2487267496045119, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.605308095046315, "calib/step_q_c_n": 2483.0, "calib/step_q_gap": 0.17678729381364933, "calib/step_q_w": 0.42852080123266567, "calib/step_q_w_n": 1298.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 726.44921875, "completions/mean_terminated_length": 762.1762084960938, "completions/min_length": 0.0, "completions/min_terminated_length": 320.0, "epoch": 0.14933333333333335, "grad_norm": 1.572951078414917, "kl": 0.19830322265625, "learning_rate": 1.6944444444444446e-06, "loss": -0.1577, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01956244185566902, "mask/share_reasoning": 0.752668023109436, "mask/share_step_conf": 0.18089447915554047, "num_tokens": 44069873.0, "reward": 0.9776472449302673, "reward_std": 0.15357211232185364, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7743710279464722, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8473297357559204, "step": 140 }, { "adv/mean_abs_final_conf": 0.41001710295677185, "adv/mean_abs_reasoning": 0.33884507417678833, "adv/mean_abs_step_conf": 0.7443231344223022, "adv/ratio_final_to_reasoning": 1.2100429789422005, "adv/ratio_step_to_reasoning": 2.1966473505055606, "adv/std_final_conf": 0.6817522048950195, "adv/std_reasoning": 0.6402838230133057, "adv/std_step_conf": 0.9308294057846069, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7327779696132597, "calib/avg_num_step_conf": 15.7265625, "calib/ece": 0.1717551020408163, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.3450569751381215, "calib/mean_conf": 0.8908571428571428, "calib/mu_c": 0.9809944751381215, "calib/mu_w": 0.6359375, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16191836734693874, "calib/std_conf": 0.29237465481041647, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5821700310559006, "calib/step_q_c_n": 2576.0, "calib/step_q_gap": 0.19642520346969367, "calib/step_q_w": 0.38574482758620693, "calib/step_q_w_n": 1450.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3014.0, "completions/max_terminated_length": 3014.0, "completions/mean_length": 788.2265625, "completions/mean_terminated_length": 823.6162719726562, "completions/min_length": 0.0, "completions/min_terminated_length": 363.0, "epoch": 0.1504, "grad_norm": 1.1141630411148071, "kl": 0.174774169921875, "learning_rate": 1.6666666666666667e-06, "loss": -0.233, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018559856340289116, "mask/share_reasoning": 0.7591939568519592, "mask/share_step_conf": 0.1792774498462677, "num_tokens": 44378755.0, "reward": 0.9904670119285583, "reward_std": 0.16573883593082428, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7958523035049438, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8522689938545227, "step": 141 }, { "adv/mean_abs_final_conf": 0.4961957633495331, "adv/mean_abs_reasoning": 0.3493594527244568, "adv/mean_abs_step_conf": 0.7250053882598877, "adv/ratio_final_to_reasoning": 1.4203015246331048, "adv/ratio_step_to_reasoning": 2.0752419395152493, "adv/std_final_conf": 0.7523589134216309, "adv/std_reasoning": 0.6614307761192322, "adv/std_step_conf": 0.9321356415748596, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7254901960784313, "calib/avg_num_step_conf": 15.59375, "calib/ece": 0.2209128630705394, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.7344398340248963, "calib/gap": 0.35012594268476627, "calib/mean_conf": 0.8187551867219917, "calib/mu_c": 0.9422435897435898, "calib/mu_w": 0.5921176470588235, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19618257261410788, "calib/std_conf": 0.3494466672633746, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5328260869565218, "calib/step_q_c_n": 2070.0, "calib/step_q_gap": 0.1425711441885717, "calib/step_q_w": 0.3902549427679501, "calib/step_q_w_n": 1922.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2520.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 801.91015625, "completions/mean_terminated_length": 844.8106689453125, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.15146666666666667, "grad_norm": 1.2045674324035645, "kl": 0.1831817626953125, "learning_rate": 1.638888888888889e-06, "loss": -0.2896, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018864743411540985, "mask/share_reasoning": 0.7495563626289368, "mask/share_step_conf": 0.18079766631126404, "num_tokens": 44689204.0, "reward": 0.939654529094696, "reward_std": 0.17866742610931396, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7343515753746033, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.834801197052002, "step": 142 }, { "adv/mean_abs_final_conf": 0.4655070900917053, "adv/mean_abs_reasoning": 0.3487567603588104, "adv/mean_abs_step_conf": 0.7415541410446167, "adv/ratio_final_to_reasoning": 1.3347614813624802, "adv/ratio_step_to_reasoning": 2.126278900749295, "adv/std_final_conf": 0.7253050804138184, "adv/std_reasoning": 0.6404346227645874, "adv/std_step_conf": 0.9328659772872925, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7829540918163673, "calib/avg_num_step_conf": 15.8359375, "calib/ece": 0.1960330578512397, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.7272727272727273, "calib/gap": 0.3796630738522956, "calib/mean_conf": 0.8141322314049586, "calib/mu_c": 0.9317964071856288, "calib/mu_w": 0.5521333333333333, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1600413223140496, "calib/std_conf": 0.35013142614267967, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5567909391222275, "calib/step_q_c_n": 2119.0, "calib/step_q_gap": 0.1689563137992301, "calib/step_q_w": 0.38783462532299745, "calib/step_q_w_n": 1935.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2610.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 756.80859375, "completions/mean_terminated_length": 794.0286254882812, "completions/min_length": 0.0, "completions/min_terminated_length": 338.0, "epoch": 0.15253333333333333, "grad_norm": 1.2626639604568481, "kl": 0.1893310546875, "learning_rate": 1.6111111111111113e-06, "loss": -0.2919, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.019008919596672058, "mask/share_reasoning": 0.7597336173057556, "mask/share_step_conf": 0.17438244819641113, "num_tokens": 44990283.0, "reward": 0.9566188454627991, "reward_std": 0.17720302939414978, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7662203311920166, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8274861574172974, "step": 143 }, { "adv/mean_abs_final_conf": 0.4861205816268921, "adv/mean_abs_reasoning": 0.42723405361175537, "adv/mean_abs_step_conf": 0.7385736703872681, "adv/ratio_final_to_reasoning": 1.1378320092167775, "adv/ratio_step_to_reasoning": 1.7287331478928396, "adv/std_final_conf": 0.7436112761497498, "adv/std_reasoning": 0.7208815217018127, "adv/std_step_conf": 0.9324020743370056, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.7337199837199837, "calib/avg_num_step_conf": 15.44140625, "calib/ece": 0.1280932203389831, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.7330508474576272, "calib/gap": 0.3989397639397638, "calib/mean_conf": 0.8408050847457628, "calib/mu_c": 0.932087912087912, "calib/mu_w": 0.5331481481481481, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09885593220338987, "calib/std_conf": 0.31586810051554415, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5030563554092965, "calib/step_q_c_n": 2431.0, "calib/step_q_gap": 0.10362797170364602, "calib/step_q_w": 0.3994283837056505, "calib/step_q_w_n": 1522.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2099.0, "completions/max_terminated_length": 2099.0, "completions/mean_length": 717.25390625, "completions/mean_terminated_length": 765.0708618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.1536, "grad_norm": 1.177304744720459, "kl": 0.21185302734375, "learning_rate": 1.5833333333333333e-06, "loss": -0.4129, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.019997183233499527, "mask/share_reasoning": 0.7369341850280762, "mask/share_step_conf": 0.1805686056613922, "num_tokens": 45278028.0, "reward": 0.9652903079986572, "reward_std": 0.2277950793504715, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7925496101379395, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.8114685416221619, "step": 144 }, { "adv/mean_abs_final_conf": 0.5769491195678711, "adv/mean_abs_reasoning": 0.47640109062194824, "adv/mean_abs_step_conf": 0.7512475848197937, "adv/ratio_final_to_reasoning": 1.2110575120948106, "adv/ratio_step_to_reasoning": 1.5769224705993632, "adv/std_final_conf": 0.8105969429016113, "adv/std_reasoning": 0.7394625544548035, "adv/std_step_conf": 0.9313513040542603, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7123809523809523, "calib/avg_num_step_conf": 16.21484375, "calib/ece": 0.18113360323886635, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6923076923076923, "calib/gap": 0.29737142857142873, "calib/mean_conf": 0.8306882591093118, "calib/mu_c": 0.9173714285714287, "calib/mu_w": 0.62, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15165991902834003, "calib/std_conf": 0.31082483315637105, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.521555190808371, "calib/step_q_c_n": 2437.0, "calib/step_q_gap": 0.09775705778620059, "calib/step_q_w": 0.4237981330221704, "calib/step_q_w_n": 1714.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 797.68359375, "completions/mean_terminated_length": 823.415283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 259.0, "epoch": 0.15466666666666667, "grad_norm": 1.193318247795105, "kl": 0.202301025390625, "learning_rate": 1.5555555555555558e-06, "loss": -0.1414, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019369829446077347, "mask/share_reasoning": 0.7623429298400879, "mask/share_step_conf": 0.18703722953796387, "num_tokens": 45584939.0, "reward": 0.9841883778572083, "reward_std": 0.20110999047756195, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7764695286750793, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8622196912765503, "step": 145 }, { "adv/mean_abs_final_conf": 0.6755393743515015, "adv/mean_abs_reasoning": 0.5243810415267944, "adv/mean_abs_step_conf": 0.7434419393539429, "adv/ratio_final_to_reasoning": 1.2882604839881178, "adv/ratio_step_to_reasoning": 1.4177513687171603, "adv/std_final_conf": 0.8616426587104797, "adv/std_reasoning": 0.7756788730621338, "adv/std_step_conf": 0.9332654476165771, "calib/answer_extract_rate": 0.890625, "calib/auroc": 0.740364068064899, "calib/avg_num_step_conf": 20.47265625, "calib/ece": 0.21043859649122815, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.6096491228070176, "calib/gap": 0.3247218045112782, "calib/mean_conf": 0.7618421052631579, "calib/mu_c": 0.8971428571428571, "calib/mu_w": 0.5724210526315789, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19447368421052638, "calib/std_conf": 0.351004987692542, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4424310776942356, "calib/step_q_c_n": 1995.0, "calib/step_q_gap": 0.05988024590125973, "calib/step_q_w": 0.3825508317929759, "calib/step_q_w_n": 3246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2114.0, "completions/max_terminated_length": 2114.0, "completions/mean_length": 796.4453125, "completions/mean_terminated_length": 894.25439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 260.0, "epoch": 0.15573333333333333, "grad_norm": 1.6938319206237793, "kl": 0.184173583984375, "learning_rate": 1.527777777777778e-06, "loss": -0.5351, "mask/has_final_conf_rate": 0.890625, "mask/share_final_conf": 0.016806162893772125, "mask/share_reasoning": 0.7056422233581543, "mask/share_step_conf": 0.16817660629749298, "num_tokens": 45896045.0, "reward": 0.8753678798675537, "reward_std": 0.2581670880317688, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6766304969787598, "rewards/format_reward_step": 0.890625, "rewards/step_l2_reward": 0.7920740842819214, "step": 146 }, { "adv/mean_abs_final_conf": 0.49448519945144653, "adv/mean_abs_reasoning": 0.4241216778755188, "adv/mean_abs_step_conf": 0.7673935890197754, "adv/ratio_final_to_reasoning": 1.16590409131735, "adv/ratio_step_to_reasoning": 1.809371293784724, "adv/std_final_conf": 0.7421457171440125, "adv/std_reasoning": 0.7013562917709351, "adv/std_step_conf": 0.9325929880142212, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.8000789577575997, "calib/avg_num_step_conf": 17.5546875, "calib/ece": 0.17153846153846147, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.6623931623931624, "calib/gap": 0.44292459534149226, "calib/mean_conf": 0.778974358974359, "calib/mu_c": 0.9398657718120805, "calib/mu_w": 0.4969411764705883, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15688034188034183, "calib/std_conf": 0.3681772719982075, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.508916864608076, "calib/step_q_c_n": 2105.0, "calib/step_q_gap": 0.15171301362440082, "calib/step_q_w": 0.35720385098367513, "calib/step_q_w_n": 2389.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 760.4296875, "completions/mean_terminated_length": 828.3829345703125, "completions/min_length": 0.0, "completions/min_terminated_length": 303.0, "epoch": 0.1568, "grad_norm": 1.6760679483413696, "kl": 0.202850341796875, "learning_rate": 1.5e-06, "loss": -0.3291, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.018419239670038223, "mask/share_reasoning": 0.7257556915283203, "mask/share_step_conf": 0.17379385232925415, "num_tokens": 46194395.0, "reward": 0.9290902614593506, "reward_std": 0.19981816411018372, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7475343942642212, "rewards/format_reward_step": 0.9140625, "rewards/step_l2_reward": 0.8114272952079773, "step": 147 }, { "adv/mean_abs_final_conf": 0.4096311926841736, "adv/mean_abs_reasoning": 0.30846136808395386, "adv/mean_abs_step_conf": 0.6967654824256897, "adv/ratio_final_to_reasoning": 1.3279821561729064, "adv/ratio_step_to_reasoning": 2.2588419637562236, "adv/std_final_conf": 0.68379145860672, "adv/std_reasoning": 0.6187735199928284, "adv/std_step_conf": 0.9166436791419983, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.8216737689393939, "calib/avg_num_step_conf": 16.0078125, "calib/ece": 0.10741525423728815, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.8220338983050848, "calib/gap": 0.39049242424242436, "calib/mean_conf": 0.8992796610169492, "calib/mu_c": 0.9720833333333334, "calib/mu_w": 0.581590909090909, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09656779661016951, "calib/std_conf": 0.2564455334378512, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5034263754045307, "calib/step_q_c_n": 2472.0, "calib/step_q_gap": 0.14775601870096372, "calib/step_q_w": 0.355670356703567, "calib/step_q_w_n": 1626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 703.31640625, "completions/mean_terminated_length": 759.7003784179688, "completions/min_length": 0.0, "completions/min_terminated_length": 259.0, "epoch": 0.15786666666666666, "grad_norm": 1.8525220155715942, "kl": 0.207855224609375, "learning_rate": 1.4722222222222225e-06, "loss": -0.2657, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.020124338567256927, "mask/share_reasoning": 0.7255045175552368, "mask/share_step_conf": 0.18015241622924805, "num_tokens": 46479556.0, "reward": 0.994626522064209, "reward_std": 0.16123086214065552, "rewards/accuracy_reward_step": 0.75390625, "rewards/final_brier_reward_step": 0.823849618434906, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.8302472829818726, "step": 148 }, { "adv/mean_abs_final_conf": 0.5615799427032471, "adv/mean_abs_reasoning": 0.4756777286529541, "adv/mean_abs_step_conf": 0.7611234188079834, "adv/ratio_final_to_reasoning": 1.1805891024024917, "adv/ratio_step_to_reasoning": 1.6000820996252387, "adv/std_final_conf": 0.7934271097183228, "adv/std_reasoning": 0.7575734853744507, "adv/std_step_conf": 0.9321354627609253, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.8452263374485597, "calib/avg_num_step_conf": 16.22265625, "calib/ece": 0.13831223628691985, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.6624472573839663, "calib/gap": 0.48383456790123486, "calib/mean_conf": 0.8031223628691982, "calib/mu_c": 0.9562345679012347, "calib/mu_w": 0.4723999999999999, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1289451476793249, "calib/std_conf": 0.34784380710493085, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47241913439635536, "calib/step_q_c_n": 2195.0, "calib/step_q_gap": 0.08704068700105405, "calib/step_q_w": 0.3853784473953013, "calib/step_q_w_n": 1958.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 798.0859375, "completions/mean_terminated_length": 851.2916870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 339.0, "epoch": 0.15893333333333334, "grad_norm": 1.6723697185516357, "kl": 0.195953369140625, "learning_rate": 1.4444444444444445e-06, "loss": -0.2217, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01764771156013012, "mask/share_reasoning": 0.7466893792152405, "mask/share_step_conf": 0.17316293716430664, "num_tokens": 46788322.0, "reward": 0.9726622104644775, "reward_std": 0.20295557379722595, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.794053852558136, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.8395517468452454, "step": 149 }, { "adv/mean_abs_final_conf": 0.4634353518486023, "adv/mean_abs_reasoning": 0.33932197093963623, "adv/mean_abs_step_conf": 0.7568655014038086, "adv/ratio_final_to_reasoning": 1.365768772842137, "adv/ratio_step_to_reasoning": 2.230523120291705, "adv/std_final_conf": 0.7231396436691284, "adv/std_reasoning": 0.6402050256729126, "adv/std_step_conf": 0.9301274418830872, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7805030463990003, "calib/avg_num_step_conf": 14.36328125, "calib/ece": 0.2019433198380567, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8299595141700404, "calib/gap": 0.3053624433682238, "calib/mean_conf": 0.8907692307692306, "calib/mu_c": 0.9822543352601156, "calib/mu_w": 0.6768918918918918, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19615384615384618, "calib/std_conf": 0.27554549391198135, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5212333179935572, "calib/step_q_c_n": 2173.0, "calib/step_q_gap": 0.12877321161057848, "calib/step_q_w": 0.39246010638297874, "calib/step_q_w_n": 1504.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2763.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 717.453125, "completions/mean_terminated_length": 740.5967407226562, "completions/min_length": 0.0, "completions/min_terminated_length": 270.0, "epoch": 0.16, "grad_norm": 1.1799750328063965, "kl": 0.21728515625, "learning_rate": 1.4166666666666667e-06, "loss": -0.183, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.021657707169651985, "mask/share_reasoning": 0.7615066766738892, "mask/share_step_conf": 0.1855856478214264, "num_tokens": 47076950.0, "reward": 0.982369065284729, "reward_std": 0.1594199240207672, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7778101563453674, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8588029146194458, "step": 150 }, { "adv/mean_abs_final_conf": 0.4652855694293976, "adv/mean_abs_reasoning": 0.28941550850868225, "adv/mean_abs_step_conf": 0.7605600357055664, "adv/ratio_final_to_reasoning": 1.607673244004612, "adv/ratio_step_to_reasoning": 2.627917348398592, "adv/std_final_conf": 0.7397387027740479, "adv/std_reasoning": 0.5962411761283875, "adv/std_step_conf": 0.9323870539665222, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.808349146110057, "calib/avg_num_step_conf": 17.26953125, "calib/ece": 0.16074999999999995, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.44242884250474385, "calib/mean_conf": 0.7595, "calib/mu_c": 0.9161935483870968, "calib/mu_w": 0.4737647058823529, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13720833333333327, "calib/std_conf": 0.37644134115866357, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4616205357142857, "calib/step_q_c_n": 2240.0, "calib/step_q_gap": 0.09816340595729345, "calib/step_q_w": 0.36345712975699224, "calib/step_q_w_n": 2181.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 815.72265625, "completions/mean_terminated_length": 866.4938354492188, "completions/min_length": 0.0, "completions/min_terminated_length": 369.0, "epoch": 0.16106666666666666, "grad_norm": 1.327895998954773, "kl": 0.18670654296875, "learning_rate": 1.3888888888888892e-06, "loss": -0.3169, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017748720943927765, "mask/share_reasoning": 0.7482398748397827, "mask/share_step_conf": 0.1754177063703537, "num_tokens": 47392799.0, "reward": 0.9465281963348389, "reward_std": 0.1743602752685547, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7678452730178833, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8166172504425049, "step": 151 }, { "adv/mean_abs_final_conf": 0.5090725421905518, "adv/mean_abs_reasoning": 0.392426073551178, "adv/mean_abs_step_conf": 0.7673565149307251, "adv/ratio_final_to_reasoning": 1.2972444404211114, "adv/ratio_step_to_reasoning": 1.9554167438128975, "adv/std_final_conf": 0.7426143288612366, "adv/std_reasoning": 0.6614143252372742, "adv/std_step_conf": 0.9298550486564636, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7461066187481282, "calib/avg_num_step_conf": 15.6328125, "calib/ece": 0.20102880658436215, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7037037037037037, "calib/gap": 0.34817834681042226, "calib/mean_conf": 0.7967489711934156, "calib/mu_c": 0.9171069182389937, "calib/mu_w": 0.5689285714285715, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1717283950617284, "calib/std_conf": 0.349703764624816, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45090648854961835, "calib/step_q_c_n": 2096.0, "calib/step_q_gap": 0.00855601635654385, "calib/step_q_w": 0.4423504721930745, "calib/step_q_w_n": 1906.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 763.87890625, "completions/mean_terminated_length": 804.7448120117188, "completions/min_length": 0.0, "completions/min_terminated_length": 366.0, "epoch": 0.16213333333333332, "grad_norm": 2.4743285179138184, "kl": 0.199310302734375, "learning_rate": 1.3611111111111112e-06, "loss": -0.2462, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01930128037929535, "mask/share_reasoning": 0.7521140575408936, "mask/share_step_conf": 0.1778033971786499, "num_tokens": 47693744.0, "reward": 0.9490625262260437, "reward_std": 0.16226524114608765, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7486886978149414, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8353736996650696, "step": 152 }, { "adv/mean_abs_final_conf": 0.47801291942596436, "adv/mean_abs_reasoning": 0.30730733275413513, "adv/mean_abs_step_conf": 0.7640332579612732, "adv/ratio_final_to_reasoning": 1.5554881660061273, "adv/ratio_step_to_reasoning": 2.48621876710162, "adv/std_final_conf": 0.6935111880302429, "adv/std_reasoning": 0.5962584614753723, "adv/std_step_conf": 0.9324053525924683, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.8020631538163503, "calib/avg_num_step_conf": 15.8359375, "calib/ece": 0.16, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.7669491525423728, "calib/gap": 0.3534947694691979, "calib/mean_conf": 0.8511016949152543, "calib/mu_c": 0.9379775280898877, "calib/mu_w": 0.5844827586206898, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12843220338983052, "calib/std_conf": 0.3076917450909713, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4673752151462994, "calib/step_q_c_n": 2324.0, "calib/step_q_gap": 0.09412087988618378, "calib/step_q_w": 0.37325433526011564, "calib/step_q_w_n": 1730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3037.0, "completions/max_terminated_length": 3037.0, "completions/mean_length": 792.3984375, "completions/mean_terminated_length": 855.9240112304688, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.1632, "grad_norm": 1.8146662712097168, "kl": 0.183135986328125, "learning_rate": 1.3333333333333334e-06, "loss": -0.2822, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.01758674532175064, "mask/share_reasoning": 0.7406385540962219, "mask/share_step_conf": 0.16755595803260803, "num_tokens": 48003918.0, "reward": 0.9537002444267273, "reward_std": 0.17488151788711548, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7758773565292358, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.8080856800079346, "step": 153 }, { "adv/mean_abs_final_conf": 0.5142377614974976, "adv/mean_abs_reasoning": 0.33599966764450073, "adv/mean_abs_step_conf": 0.7726404666900635, "adv/ratio_final_to_reasoning": 1.5304710421368002, "adv/ratio_step_to_reasoning": 2.299527473067455, "adv/std_final_conf": 0.7494176030158997, "adv/std_reasoning": 0.6186435222625732, "adv/std_step_conf": 0.9310169816017151, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8091487107029318, "calib/avg_num_step_conf": 15.57421875, "calib/ece": 0.19286885245901636, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6885245901639344, "calib/gap": 0.4432553867891206, "calib/mean_conf": 0.778360655737705, "calib/mu_c": 0.9509395973154362, "calib/mu_w": 0.5076842105263156, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1802868852459016, "calib/std_conf": 0.37448841750509676, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4889015338941118, "calib/step_q_c_n": 2021.0, "calib/step_q_gap": 0.12712635586766213, "calib/step_q_w": 0.36177517802644965, "calib/step_q_w_n": 1966.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2066.0, "completions/max_terminated_length": 2066.0, "completions/mean_length": 771.22265625, "completions/mean_terminated_length": 805.8489379882812, "completions/min_length": 0.0, "completions/min_terminated_length": 335.0, "epoch": 0.16426666666666667, "grad_norm": 1.7066091299057007, "kl": 0.20452880859375, "learning_rate": 1.3055555555555556e-06, "loss": -0.1992, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018794827163219452, "mask/share_reasoning": 0.7621910572052002, "mask/share_step_conf": 0.17604538798332214, "num_tokens": 48305791.0, "reward": 0.9651025533676147, "reward_std": 0.14344683289527893, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7669328451156616, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.85624098777771, "step": 154 }, { "adv/mean_abs_final_conf": 0.5909117460250854, "adv/mean_abs_reasoning": 0.5282171368598938, "adv/mean_abs_step_conf": 0.7549092769622803, "adv/ratio_final_to_reasoning": 1.1186909791263, "adv/ratio_step_to_reasoning": 1.4291646830127647, "adv/std_final_conf": 0.7798319458961487, "adv/std_reasoning": 0.7577393651008606, "adv/std_step_conf": 0.9332888126373291, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8047785547785548, "calib/avg_num_step_conf": 15.34765625, "calib/ece": 0.2442213114754098, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7090163934426229, "calib/gap": 0.30337995337995316, "calib/mean_conf": 0.8178278688524591, "calib/mu_c": 0.9272435897435896, "calib/mu_w": 0.6238636363636364, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2113524590163934, "calib/std_conf": 0.3316547482099605, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45850907307503685, "calib/step_q_c_n": 2039.0, "calib/step_q_gap": 0.10640854397450772, "calib/step_q_w": 0.35210052910052914, "calib/step_q_w_n": 1890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 761.20703125, "completions/mean_terminated_length": 795.3836669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 343.0, "epoch": 0.16533333333333333, "grad_norm": 1.7521440982818604, "kl": 0.205413818359375, "learning_rate": 1.2777777777777779e-06, "loss": -0.1898, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01886514201760292, "mask/share_reasoning": 0.7580100297927856, "mask/share_step_conf": 0.18015605211257935, "num_tokens": 48607876.0, "reward": 0.9422191381454468, "reward_std": 0.20991146564483643, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7314988374710083, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8404393196105957, "step": 155 }, { "adv/mean_abs_final_conf": 0.5372999906539917, "adv/mean_abs_reasoning": 0.3968615233898163, "adv/mean_abs_step_conf": 0.7515037059783936, "adv/ratio_final_to_reasoning": 1.3538727213074522, "adv/ratio_step_to_reasoning": 1.89361694618158, "adv/std_final_conf": 0.7575808763504028, "adv/std_reasoning": 0.6818038821220398, "adv/std_step_conf": 0.9326856136322021, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7635087719298246, "calib/avg_num_step_conf": 15.37109375, "calib/ece": 0.18650406504065048, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6829268292682927, "calib/gap": 0.2947368421052631, "calib/mean_conf": 0.8015447154471546, "calib/mu_c": 0.8914035087719299, "calib/mu_w": 0.5966666666666668, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14646341463414642, "calib/std_conf": 0.33290183254880096, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46967656180770934, "calib/step_q_c_n": 2257.0, "calib/step_q_gap": 0.06360981568136848, "calib/step_q_w": 0.40606674612634086, "calib/step_q_w_n": 1678.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2240.0, "completions/max_terminated_length": 2240.0, "completions/mean_length": 748.78125, "completions/mean_terminated_length": 779.219482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 322.0, "epoch": 0.1664, "grad_norm": 1.54910147190094, "kl": 0.2027587890625, "learning_rate": 1.25e-06, "loss": -0.2438, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01979167014360428, "mask/share_reasoning": 0.7604625821113586, "mask/share_step_conf": 0.18068325519561768, "num_tokens": 48904324.0, "reward": 0.9566689729690552, "reward_std": 0.17362357676029205, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7599562406539917, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.827600359916687, "step": 156 }, { "adv/mean_abs_final_conf": 0.5187393426895142, "adv/mean_abs_reasoning": 0.4431944191455841, "adv/mean_abs_step_conf": 0.7509469985961914, "adv/ratio_final_to_reasoning": 1.1704554937527643, "adv/ratio_step_to_reasoning": 1.6943963329770952, "adv/std_final_conf": 0.7611110210418701, "adv/std_reasoning": 0.720543622970581, "adv/std_step_conf": 0.9303745627403259, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7757313109425785, "calib/avg_num_step_conf": 16.953125, "calib/ece": 0.18070833333333336, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.7166666666666667, "calib/gap": 0.36560130010834235, "calib/mean_conf": 0.8064583333333334, "calib/mu_c": 0.9146153846153846, "calib/mu_w": 0.5490140845070423, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14150000000000004, "calib/std_conf": 0.3458364834193884, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5982460850111857, "calib/step_q_c_n": 2235.0, "calib/step_q_gap": 0.24769501612757527, "calib/step_q_w": 0.35055106888361043, "calib/step_q_w_n": 2105.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 736.5625, "completions/mean_terminated_length": 782.4066772460938, "completions/min_length": 0.0, "completions/min_terminated_length": 217.0, "epoch": 0.16746666666666668, "grad_norm": 2.9453022480010986, "kl": 0.201568603515625, "learning_rate": 1.2222222222222223e-06, "loss": -0.2135, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.02099085971713066, "mask/share_reasoning": 0.7406794428825378, "mask/share_step_conf": 0.17973598837852478, "num_tokens": 49196612.0, "reward": 0.9562191963195801, "reward_std": 0.17957063019275665, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7630675435066223, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8298394680023193, "step": 157 }, { "adv/mean_abs_final_conf": 0.4423435628414154, "adv/mean_abs_reasoning": 0.3423171043395996, "adv/mean_abs_step_conf": 0.7591568827629089, "adv/ratio_final_to_reasoning": 1.2922040915682185, "adv/ratio_step_to_reasoning": 2.217700702474331, "adv/std_final_conf": 0.6848023533821106, "adv/std_reasoning": 0.6403859853744507, "adv/std_step_conf": 0.9318983554840088, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6693627450980393, "calib/avg_num_step_conf": 13.98828125, "calib/ece": 0.20366935483870974, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8225806451612904, "calib/gap": 0.2294803921568629, "calib/mean_conf": 0.8914112903225807, "calib/mu_c": 0.9543333333333335, "calib/mu_w": 0.7248529411764706, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1846370967741936, "calib/std_conf": 0.26610395470800186, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5386679087098276, "calib/step_q_c_n": 2147.0, "calib/step_q_gap": 0.12125507746854453, "calib/step_q_w": 0.4174128312412831, "calib/step_q_w_n": 1434.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 713.98046875, "completions/mean_terminated_length": 737.0120849609375, "completions/min_length": 0.0, "completions/min_terminated_length": 245.0, "epoch": 0.16853333333333334, "grad_norm": 2.30918550491333, "kl": 0.21026611328125, "learning_rate": 1.1944444444444446e-06, "loss": -0.1787, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02201329544186592, "mask/share_reasoning": 0.7656834125518799, "mask/share_step_conf": 0.1810533106327057, "num_tokens": 49484631.0, "reward": 0.9711004495620728, "reward_std": 0.17076840996742249, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7692753672599792, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8385504484176636, "step": 158 }, { "adv/mean_abs_final_conf": 0.516288161277771, "adv/mean_abs_reasoning": 0.37251782417297363, "adv/mean_abs_step_conf": 0.7694894671440125, "adv/ratio_final_to_reasoning": 1.3859421691404477, "adv/ratio_step_to_reasoning": 2.065644694592413, "adv/std_final_conf": 0.7581419348716736, "adv/std_reasoning": 0.6403560638427734, "adv/std_step_conf": 0.9302523732185364, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6553340687046959, "calib/avg_num_step_conf": 15.1796875, "calib/ece": 0.23337448559670787, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7489711934156379, "calib/gap": 0.20324141191301615, "calib/mean_conf": 0.8362551440329217, "calib/mu_c": 0.8998203592814372, "calib/mu_w": 0.6965789473684211, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19119341563786013, "calib/std_conf": 0.31445748030216003, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5401206636500754, "calib/step_q_c_n": 1989.0, "calib/step_q_gap": 0.18555028937490403, "calib/step_q_w": 0.35457037427517135, "calib/step_q_w_n": 1897.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 694.02734375, "completions/mean_terminated_length": 731.1563720703125, "completions/min_length": 0.0, "completions/min_terminated_length": 323.0, "epoch": 0.1696, "grad_norm": 1.6902092695236206, "kl": 0.22381591796875, "learning_rate": 1.1666666666666668e-06, "loss": -0.2319, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.021202802658081055, "mask/share_reasoning": 0.7504836320877075, "mask/share_step_conf": 0.1775323450565338, "num_tokens": 49767086.0, "reward": 0.9452986717224121, "reward_std": 0.16794392466545105, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7131870985031128, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8570977449417114, "step": 159 }, { "adv/mean_abs_final_conf": 0.5271440744400024, "adv/mean_abs_reasoning": 0.450272798538208, "adv/mean_abs_step_conf": 0.7533363103866577, "adv/ratio_final_to_reasoning": 1.1707215629088719, "adv/ratio_step_to_reasoning": 1.673066445124673, "adv/std_final_conf": 0.7414671182632446, "adv/std_reasoning": 0.7208443880081177, "adv/std_step_conf": 0.9328213930130005, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.7145422652856666, "calib/avg_num_step_conf": 14.625, "calib/ece": 0.18497890295358643, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.7426160337552743, "calib/gap": 0.36997160040093535, "calib/mean_conf": 0.8173839662447256, "calib/mu_c": 0.931341463414634, "calib/mu_w": 0.5613698630136986, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15518987341772145, "calib/std_conf": 0.34348043667118816, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5828970976253298, "calib/step_q_c_n": 1895.0, "calib/step_q_gap": 0.19918157572159806, "calib/step_q_w": 0.38371552190373176, "calib/step_q_w_n": 1849.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2601.0, "completions/max_terminated_length": 2601.0, "completions/mean_length": 683.99609375, "completions/mean_terminated_length": 732.6484985351562, "completions/min_length": 0.0, "completions/min_terminated_length": 327.0, "epoch": 0.17066666666666666, "grad_norm": 1.5047662258148193, "kl": 0.22320556640625, "learning_rate": 1.138888888888889e-06, "loss": -0.3172, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01983489654958248, "mask/share_reasoning": 0.7423506379127502, "mask/share_step_conf": 0.17140822112560272, "num_tokens": 50047029.0, "reward": 0.9287228584289551, "reward_std": 0.21447834372520447, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7506851553916931, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.7934792041778564, "step": 160 }, { "adv/mean_abs_final_conf": 0.3155440092086792, "adv/mean_abs_reasoning": 0.20698705315589905, "adv/mean_abs_step_conf": 0.7502477169036865, "adv/ratio_final_to_reasoning": 1.52446254196883, "adv/ratio_step_to_reasoning": 3.624611807669985, "adv/std_final_conf": 0.5859242677688599, "adv/std_reasoning": 0.4961038827896118, "adv/std_step_conf": 0.929783284664154, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7206486042692939, "calib/avg_num_step_conf": 12.8359375, "calib/ece": 0.12103585657370519, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8565737051792829, "calib/gap": 0.4051313628899835, "calib/mean_conf": 0.8893227091633468, "calib/mu_c": 0.9667980295566502, "calib/mu_w": 0.5616666666666666, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.100796812749004, "calib/std_conf": 0.28332252439738614, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6057334450963957, "calib/step_q_c_n": 2386.0, "calib/step_q_gap": 0.21162233398528468, "calib/step_q_w": 0.394111111111111, "calib/step_q_w_n": 900.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 686.609375, "completions/mean_terminated_length": 697.5079956054688, "completions/min_length": 0.0, "completions/min_terminated_length": 252.0, "epoch": 0.17173333333333332, "grad_norm": 1.3031444549560547, "kl": 0.220123291015625, "learning_rate": 1.111111111111111e-06, "loss": -0.082, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.022836053743958473, "mask/share_reasoning": 0.7714052796363831, "mask/share_step_conf": 0.19013366103172302, "num_tokens": 50326721.0, "reward": 1.0375888347625732, "reward_std": 0.1130094826221466, "rewards/accuracy_reward_step": 0.79296875, "rewards/final_brier_reward_step": 0.8666296601295471, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8538604974746704, "step": 161 }, { "adv/mean_abs_final_conf": 0.3733126223087311, "adv/mean_abs_reasoning": 0.3079017400741577, "adv/mean_abs_step_conf": 0.7446285486221313, "adv/ratio_final_to_reasoning": 1.212440768339988, "adv/ratio_step_to_reasoning": 2.418396688640956, "adv/std_final_conf": 0.6513224840164185, "adv/std_reasoning": 0.6186278462409973, "adv/std_step_conf": 0.9291735887527466, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.771883786316776, "calib/avg_num_step_conf": 13.60546875, "calib/ece": 0.13269076305220887, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8554216867469879, "calib/gap": 0.3629400187441425, "calib/mean_conf": 0.905863453815261, "calib/mu_c": 0.9860309278350516, "calib/mu_w": 0.6230909090909091, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12971887550200806, "calib/std_conf": 0.2454373962779218, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5696476360392507, "calib/step_q_c_n": 2242.0, "calib/step_q_gap": 0.14555416303361013, "calib/step_q_w": 0.4240934730056406, "calib/step_q_w_n": 1241.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2658.0, "completions/max_terminated_length": 2658.0, "completions/mean_length": 699.2421875, "completions/mean_terminated_length": 718.8995971679688, "completions/min_length": 0.0, "completions/min_terminated_length": 308.0, "epoch": 0.1728, "grad_norm": 2.057448387145996, "kl": 0.2279052734375, "learning_rate": 1.0833333333333335e-06, "loss": -0.1111, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.022536642849445343, "mask/share_reasoning": 0.7652218341827393, "mask/share_step_conf": 0.1848977655172348, "num_tokens": 50609871.0, "reward": 1.0423606634140015, "reward_std": 0.14033734798431396, "rewards/accuracy_reward_step": 0.7578125, "rewards/final_brier_reward_step": 0.8525538444519043, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8860735893249512, "step": 162 }, { "adv/mean_abs_final_conf": 0.4807455539703369, "adv/mean_abs_reasoning": 0.44717273116111755, "adv/mean_abs_step_conf": 0.7487645745277405, "adv/ratio_final_to_reasoning": 1.075077974280867, "adv/ratio_step_to_reasoning": 1.6744414906148617, "adv/std_final_conf": 0.7246531844139099, "adv/std_reasoning": 0.7206774950027466, "adv/std_step_conf": 0.9323588013648987, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.775825463064646, "calib/avg_num_step_conf": 15.87890625, "calib/ece": 0.20209016393442636, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7295081967213115, "calib/gap": 0.38604070576176874, "calib/mean_conf": 0.8024180327868852, "calib/mu_c": 0.9400636942675159, "calib/mu_w": 0.5540229885057472, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18053278688524604, "calib/std_conf": 0.350715155547285, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5740049627791562, "calib/step_q_c_n": 2015.0, "calib/step_q_gap": 0.1694098408279367, "calib/step_q_w": 0.40459512195121955, "calib/step_q_w_n": 2050.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 757.2578125, "completions/mean_terminated_length": 794.4999389648438, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.17386666666666667, "grad_norm": 2.0743064880371094, "kl": 0.20587158203125, "learning_rate": 1.0555555555555557e-06, "loss": -0.2623, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.02081698551774025, "mask/share_reasoning": 0.7513790130615234, "mask/share_step_conf": 0.18092897534370422, "num_tokens": 50908561.0, "reward": 0.9499805569648743, "reward_std": 0.1852256953716278, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7580621242523193, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8293989896774292, "step": 163 }, { "adv/mean_abs_final_conf": 0.534378170967102, "adv/mean_abs_reasoning": 0.4416940212249756, "adv/mean_abs_step_conf": 0.7589446306228638, "adv/ratio_final_to_reasoning": 1.2098379087973166, "adv/ratio_step_to_reasoning": 1.7182587813120918, "adv/std_final_conf": 0.7614235877990723, "adv/std_reasoning": 0.7206831574440002, "adv/std_step_conf": 0.932367742061615, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.8968085106382979, "calib/avg_num_step_conf": 15.94921875, "calib/ece": 0.1466945606694561, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.6276150627615062, "calib/gap": 0.5442670579603816, "calib/mean_conf": 0.7455230125523011, "calib/mu_c": 0.9595862068965518, "calib/mu_w": 0.4153191489361703, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14276150627615067, "calib/std_conf": 0.37108181564928355, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5484053738317758, "calib/step_q_c_n": 1712.0, "calib/step_q_gap": 0.18230668129698036, "calib/step_q_w": 0.36609869253479543, "calib/step_q_w_n": 2371.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 764.62109375, "completions/mean_terminated_length": 812.211669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.17493333333333333, "grad_norm": 1.9145711660385132, "kl": 0.2144775390625, "learning_rate": 1.0277777777777777e-06, "loss": -0.3196, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.01828896999359131, "mask/share_reasoning": 0.7554370164871216, "mask/share_step_conf": 0.1676802784204483, "num_tokens": 51210440.0, "reward": 0.960978090763092, "reward_std": 0.20468957722187042, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.806765615940094, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8151905536651611, "step": 164 }, { "adv/mean_abs_final_conf": 0.4792923331260681, "adv/mean_abs_reasoning": 0.42320019006729126, "adv/mean_abs_step_conf": 0.7414774894714355, "adv/ratio_final_to_reasoning": 1.1325428115943377, "adv/ratio_step_to_reasoning": 1.7520726759445366, "adv/std_final_conf": 0.7423551082611084, "adv/std_reasoning": 0.7014817595481873, "adv/std_step_conf": 0.9322285652160645, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7303945965161749, "calib/avg_num_step_conf": 14.32421875, "calib/ece": 0.2420247933884297, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.7396694214876033, "calib/gap": 0.3118258087451119, "calib/mean_conf": 0.8363223140495867, "calib/mu_c": 0.9613103448275861, "calib/mu_w": 0.6494845360824743, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23958677685950408, "calib/std_conf": 0.31260454026292117, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5433864541832669, "calib/step_q_c_n": 1757.0, "calib/step_q_gap": 0.151496401827246, "calib/step_q_w": 0.3918900523560209, "calib/step_q_w_n": 1910.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2631.0, "completions/max_terminated_length": 2631.0, "completions/mean_length": 750.84375, "completions/mean_terminated_length": 791.0123291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 267.0, "epoch": 0.176, "grad_norm": 1.6786460876464844, "kl": 0.20703125, "learning_rate": 1.0000000000000002e-06, "loss": -0.2091, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.019337885081768036, "mask/share_reasoning": 0.7642934918403625, "mask/share_step_conf": 0.1655873954296112, "num_tokens": 51508232.0, "reward": 0.9292345643043518, "reward_std": 0.189374178647995, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7143285274505615, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.841796875, "step": 165 }, { "adv/mean_abs_final_conf": 0.48200446367263794, "adv/mean_abs_reasoning": 0.4373070001602173, "adv/mean_abs_step_conf": 0.7434042692184448, "adv/ratio_final_to_reasoning": 1.1022107203773202, "adv/ratio_step_to_reasoning": 1.6999596826624817, "adv/std_final_conf": 0.7467506527900696, "adv/std_reasoning": 0.7392253875732422, "adv/std_step_conf": 0.928732693195343, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8753036744609779, "calib/avg_num_step_conf": 14.046875, "calib/ece": 0.10749999999999996, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7182539682539683, "calib/gap": 0.6016960218645611, "calib/mean_conf": 0.7752777777777778, "calib/mu_c": 0.9519662921348314, "calib/mu_w": 0.3502702702702703, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08821428571428569, "calib/std_conf": 0.3825876834634367, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5334027470093045, "calib/step_q_c_n": 2257.0, "calib/step_q_gap": 0.24329072311087274, "calib/step_q_w": 0.2901120238984317, "calib/step_q_w_n": 1339.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2872.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 769.36328125, "completions/mean_terminated_length": 781.575439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 320.0, "epoch": 0.17706666666666668, "grad_norm": 1.19493567943573, "kl": 0.208831787109375, "learning_rate": 9.722222222222224e-07, "loss": -0.0943, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.020818617194890976, "mask/share_reasoning": 0.7784860134124756, "mask/share_step_conf": 0.18507036566734314, "num_tokens": 51811373.0, "reward": 1.0529277324676514, "reward_std": 0.14112304151058197, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.8771401643753052, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.893558919429779, "step": 166 }, { "adv/mean_abs_final_conf": 0.34852081537246704, "adv/mean_abs_reasoning": 0.24475020170211792, "adv/mean_abs_step_conf": 0.7498742938041687, "adv/ratio_final_to_reasoning": 1.4239858147150657, "adv/ratio_step_to_reasoning": 3.0638352433998413, "adv/std_final_conf": 0.6023914813995361, "adv/std_reasoning": 0.5229523777961731, "adv/std_step_conf": 0.9294629693031311, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6680033769523006, "calib/avg_num_step_conf": 13.7265625, "calib/ece": 0.12880952380952373, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8809523809523809, "calib/gap": 0.27689742507387105, "calib/mean_conf": 0.9272222222222222, "calib/mu_c": 0.9777669902912622, "calib/mu_w": 0.7008695652173912, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11928571428571422, "calib/std_conf": 0.22004198332774536, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5642033773861969, "calib/step_q_c_n": 2724.0, "calib/step_q_gap": 0.14616540270265255, "calib/step_q_w": 0.4180379746835443, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2516.0, "completions/max_terminated_length": 2516.0, "completions/mean_length": 737.0234375, "completions/mean_terminated_length": 748.7222900390625, "completions/min_length": 0.0, "completions/min_terminated_length": 244.0, "epoch": 0.17813333333333334, "grad_norm": 1.5744816064834595, "kl": 0.20538330078125, "learning_rate": 9.444444444444445e-07, "loss": -0.1278, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.021401219069957733, "mask/share_reasoning": 0.7758229970932007, "mask/share_step_conf": 0.1871507614850998, "num_tokens": 52105659.0, "reward": 1.0399054288864136, "reward_std": 0.11180633306503296, "rewards/accuracy_reward_step": 0.8046875, "rewards/final_brier_reward_step": 0.8593116998672485, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8626866340637207, "step": 167 }, { "adv/mean_abs_final_conf": 0.5078991055488586, "adv/mean_abs_reasoning": 0.4405806064605713, "adv/mean_abs_step_conf": 0.7511824369430542, "adv/ratio_final_to_reasoning": 1.1527949666897375, "adv/ratio_step_to_reasoning": 1.7049829836535928, "adv/std_final_conf": 0.743903636932373, "adv/std_reasoning": 0.701566219329834, "adv/std_step_conf": 0.9324759244918823, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6993558776167472, "calib/avg_num_step_conf": 14.44921875, "calib/ece": 0.19493975903614444, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7991967871485943, "calib/gap": 0.24073671497584526, "calib/mean_conf": 0.8796787148594378, "calib/mu_c": 0.9463888888888888, "calib/mu_w": 0.7056521739130436, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17586345381526092, "calib/std_conf": 0.27079960652914614, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5443938767066612, "calib/step_q_c_n": 2417.0, "calib/step_q_gap": 0.15087593598903248, "calib/step_q_w": 0.3935179407176287, "calib/step_q_w_n": 1282.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 799.359375, "completions/mean_terminated_length": 815.2828979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 231.0, "epoch": 0.1792, "grad_norm": 3.78358793258667, "kl": 0.19183349609375, "learning_rate": 9.166666666666666e-07, "loss": -0.1157, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019614530727267265, "mask/share_reasoning": 0.7847484350204468, "mask/share_step_conf": 0.17610576748847961, "num_tokens": 52414967.0, "reward": 0.9864051342010498, "reward_std": 0.1815844625234604, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7763882875442505, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8612658381462097, "step": 168 }, { "adv/mean_abs_final_conf": 0.5090618133544922, "adv/mean_abs_reasoning": 0.38595885038375854, "adv/mean_abs_step_conf": 0.7447162866592407, "adv/ratio_final_to_reasoning": 1.3189535953077187, "adv/ratio_step_to_reasoning": 1.9295225019941116, "adv/std_final_conf": 0.772860586643219, "adv/std_reasoning": 0.6816369891166687, "adv/std_step_conf": 0.9323936700820923, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7356291883842144, "calib/avg_num_step_conf": 13.77734375, "calib/ece": 0.1758024691358025, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7407407407407407, "calib/gap": 0.40005137751303055, "calib/mean_conf": 0.8232921810699588, "calib/mu_c": 0.9632278481012658, "calib/mu_w": 0.5631764705882353, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17444444444444446, "calib/std_conf": 0.33155057993948056, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5378848063555115, "calib/step_q_c_n": 2014.0, "calib/step_q_gap": 0.15610688170250414, "calib/step_q_w": 0.38177792465300736, "calib/step_q_w_n": 1513.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2911.0, "completions/max_terminated_length": 2911.0, "completions/mean_length": 752.94140625, "completions/mean_terminated_length": 780.3765258789062, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.18026666666666666, "grad_norm": 2.0797224044799805, "kl": 0.21844482421875, "learning_rate": 8.88888888888889e-07, "loss": -0.2049, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.02032134309411049, "mask/share_reasoning": 0.7662501335144043, "mask/share_step_conf": 0.17827224731445312, "num_tokens": 52711904.0, "reward": 0.9632647037506104, "reward_std": 0.17676101624965668, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7732820510864258, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8399661183357239, "step": 169 }, { "adv/mean_abs_final_conf": 0.475443959236145, "adv/mean_abs_reasoning": 0.401976078748703, "adv/mean_abs_step_conf": 0.7568126916885376, "adv/ratio_final_to_reasoning": 1.1827667972585274, "adv/ratio_step_to_reasoning": 1.8827306690596934, "adv/std_final_conf": 0.7249166965484619, "adv/std_reasoning": 0.6816323399543762, "adv/std_step_conf": 0.931459367275238, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7871527777777778, "calib/avg_num_step_conf": 14.08203125, "calib/ece": 0.1501229508196721, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7213114754098361, "calib/gap": 0.4193020833333333, "calib/mean_conf": 0.8113524590163934, "calib/mu_c": 0.9213333333333332, "calib/mu_w": 0.5020312499999999, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11188524590163931, "calib/std_conf": 0.3407913505726059, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5339683860232944, "calib/step_q_c_n": 2404.0, "calib/step_q_gap": 0.19768195804660837, "calib/step_q_w": 0.33628642797668606, "calib/step_q_w_n": 1201.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2939.0, "completions/max_terminated_length": 2939.0, "completions/mean_length": 747.84375, "completions/mean_terminated_length": 784.6229248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 336.0, "epoch": 0.18133333333333335, "grad_norm": 4.0449910163879395, "kl": 0.200775146484375, "learning_rate": 8.611111111111112e-07, "loss": -0.177, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01912543550133705, "mask/share_reasoning": 0.7598277926445007, "mask/share_step_conf": 0.17417177557945251, "num_tokens": 53007504.0, "reward": 0.9965099692344666, "reward_std": 0.166769877076149, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.8074949383735657, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.854275107383728, "step": 170 }, { "adv/mean_abs_final_conf": 0.5136815309524536, "adv/mean_abs_reasoning": 0.39976370334625244, "adv/mean_abs_step_conf": 0.7210954427719116, "adv/ratio_final_to_reasoning": 1.284962908469787, "adv/ratio_step_to_reasoning": 1.8038041891645675, "adv/std_final_conf": 0.7939493656158447, "adv/std_reasoning": 0.7203819751739502, "adv/std_step_conf": 0.9326278567314148, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7286138613861386, "calib/avg_num_step_conf": 12.9296875, "calib/ece": 0.24808764940239042, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7450199203187251, "calib/gap": 0.32988382838283836, "calib/mean_conf": 0.822191235059761, "calib/mu_c": 0.9549333333333334, "calib/mu_w": 0.625049504950495, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23633466135458167, "calib/std_conf": 0.34068301924875966, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5601183431952663, "calib/step_q_c_n": 1859.0, "calib/step_q_gap": 0.20683095518699618, "calib/step_q_w": 0.35328738800827014, "calib/step_q_w_n": 1451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 747.11328125, "completions/mean_terminated_length": 758.9722900390625, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.1824, "grad_norm": 2.1775567531585693, "kl": 0.203399658203125, "learning_rate": 8.333333333333333e-07, "loss": -0.1275, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02061760053038597, "mask/share_reasoning": 0.7885169982910156, "mask/share_step_conf": 0.1752403974533081, "num_tokens": 53305661.0, "reward": 0.9526859521865845, "reward_std": 0.1728176474571228, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7330940961837769, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8597778081893921, "step": 171 }, { "adv/mean_abs_final_conf": 0.42454326152801514, "adv/mean_abs_reasoning": 0.33200234174728394, "adv/mean_abs_step_conf": 0.7488305568695068, "adv/ratio_final_to_reasoning": 1.2787357441327092, "adv/ratio_step_to_reasoning": 2.2554978164566903, "adv/std_final_conf": 0.701727569103241, "adv/std_reasoning": 0.6185781955718994, "adv/std_step_conf": 0.9305645823478699, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6784007509813961, "calib/avg_num_step_conf": 13.05078125, "calib/ece": 0.18140562248995973, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8594377510040161, "calib/gap": 0.2814004096262164, "calib/mean_conf": 0.9003614457831325, "calib/mu_c": 0.9715591397849465, "calib/mu_w": 0.6901587301587301, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1673895582329316, "calib/std_conf": 0.26738408117721973, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5301723356009072, "calib/step_q_c_n": 2205.0, "calib/step_q_gap": 0.1354188144741466, "calib/step_q_w": 0.39475352112676054, "calib/step_q_w_n": 1136.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2612.0, "completions/max_terminated_length": 2612.0, "completions/mean_length": 711.67578125, "completions/mean_terminated_length": 728.7560424804688, "completions/min_length": 0.0, "completions/min_terminated_length": 332.0, "epoch": 0.18346666666666667, "grad_norm": 3.8231351375579834, "kl": 0.215911865234375, "learning_rate": 8.055555555555557e-07, "loss": -0.1219, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.021542852744460106, "mask/share_reasoning": 0.7778093814849854, "mask/share_step_conf": 0.17721021175384521, "num_tokens": 53591202.0, "reward": 1.0019153356552124, "reward_std": 0.14845198392868042, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.799866795539856, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8641201257705688, "step": 172 }, { "adv/mean_abs_final_conf": 0.4479511082172394, "adv/mean_abs_reasoning": 0.4000486135482788, "adv/mean_abs_step_conf": 0.7449871301651001, "adv/ratio_final_to_reasoning": 1.1197416840020609, "adv/ratio_step_to_reasoning": 1.8622414999950831, "adv/std_final_conf": 0.7215878963470459, "adv/std_reasoning": 0.6816327571868896, "adv/std_step_conf": 0.9319837093353271, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6383422778771616, "calib/avg_num_step_conf": 13.26171875, "calib/ece": 0.2302000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.876, "calib/gap": 0.23776833631484795, "calib/mean_conf": 0.9182, "calib/mu_c": 0.9923837209302325, "calib/mu_w": 0.7546153846153846, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2302000000000001, "calib/std_conf": 0.25108954577998666, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5869095127610209, "calib/step_q_c_n": 2155.0, "calib/step_q_gap": 0.14361919018037567, "calib/step_q_w": 0.4432903225806452, "calib/step_q_w_n": 1240.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 739.125, "completions/mean_terminated_length": 753.8486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 214.0, "epoch": 0.18453333333333333, "grad_norm": 21.863386154174805, "kl": 0.211456298828125, "learning_rate": 7.777777777777779e-07, "loss": -0.1071, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.021954456344246864, "mask/share_reasoning": 0.780661404132843, "mask/share_step_conf": 0.17785286903381348, "num_tokens": 53883578.0, "reward": 0.9595485329627991, "reward_std": 0.17794638872146606, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7533035278320312, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8368873596191406, "step": 173 }, { "adv/mean_abs_final_conf": 0.6084309816360474, "adv/mean_abs_reasoning": 0.5327798128128052, "adv/mean_abs_step_conf": 0.7505396604537964, "adv/ratio_final_to_reasoning": 1.1419933094383639, "adv/ratio_step_to_reasoning": 1.4087239088345531, "adv/std_final_conf": 0.8276432752609253, "adv/std_reasoning": 0.775614321231842, "adv/std_step_conf": 0.9331154823303223, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6577911646586346, "calib/avg_num_step_conf": 15.55078125, "calib/ece": 0.22875518672199174, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.7178423236514523, "calib/gap": 0.23516385542168672, "calib/mean_conf": 0.8027800829875519, "calib/mu_c": 0.8759638554216868, "calib/mu_w": 0.6408, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17136929460580916, "calib/std_conf": 0.34784599387363224, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4490396753832281, "calib/step_q_c_n": 2218.0, "calib/step_q_gap": 0.09075266460614351, "calib/step_q_w": 0.3582870107770846, "calib/step_q_w_n": 1763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2606.0, "completions/max_terminated_length": 2606.0, "completions/mean_length": 791.8125, "completions/mean_terminated_length": 841.095458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 309.0, "epoch": 0.1856, "grad_norm": 116.86260223388672, "kl": 0.196624755859375, "learning_rate": 7.5e-07, "loss": -0.2862, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01782982051372528, "mask/share_reasoning": 0.7585914134979248, "mask/share_step_conf": 0.1649850457906723, "num_tokens": 54190514.0, "reward": 0.9199211001396179, "reward_std": 0.21593840420246124, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7083824276924133, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8134909868240356, "step": 174 }, { "adv/mean_abs_final_conf": 0.6689625382423401, "adv/mean_abs_reasoning": 0.6045829057693481, "adv/mean_abs_step_conf": 0.7395843267440796, "adv/ratio_final_to_reasoning": 1.1064860283984825, "adv/ratio_step_to_reasoning": 1.2232967880607515, "adv/std_final_conf": 0.845448911190033, "adv/std_reasoning": 0.826880156993866, "adv/std_step_conf": 0.9331654906272888, "calib/answer_extract_rate": 0.90234375, "calib/auroc": 0.6941247728649305, "calib/avg_num_step_conf": 19.22265625, "calib/ece": 0.24064935064935067, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.5151515151515151, "calib/gap": 0.29001817080557235, "calib/mean_conf": 0.6602164502164501, "calib/mu_c": 0.7907874015748031, "calib/mu_w": 0.5007692307692307, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17554112554112555, "calib/std_conf": 0.39857695557186973, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5009797482211276, "calib/step_q_c_n": 1827.0, "calib/step_q_gap": 0.14841672301104353, "calib/step_q_w": 0.35256302521008404, "calib/step_q_w_n": 3094.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 834.09765625, "completions/mean_terminated_length": 920.3836059570312, "completions/min_length": 0.0, "completions/min_terminated_length": 254.0, "epoch": 0.18666666666666668, "grad_norm": 3.5096359252929688, "kl": 0.185760498046875, "learning_rate": 7.222222222222222e-07, "loss": -0.484, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.016539614647626877, "mask/share_reasoning": 0.7213195562362671, "mask/share_step_conf": 0.16839079558849335, "num_tokens": 54509867.0, "reward": 0.8547284007072449, "reward_std": 0.2743695080280304, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6541910171508789, "rewards/format_reward_step": 0.90234375, "rewards/step_l2_reward": 0.7755782604217529, "step": 175 }, { "adv/mean_abs_final_conf": 0.4440191984176636, "adv/mean_abs_reasoning": 0.3347865343093872, "adv/mean_abs_step_conf": 0.7367527484893799, "adv/ratio_final_to_reasoning": 1.326275560436164, "adv/ratio_step_to_reasoning": 2.2006642232764433, "adv/std_final_conf": 0.7230446338653564, "adv/std_reasoning": 0.6403601765632629, "adv/std_step_conf": 0.9292125105857849, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.732235142118863, "calib/avg_num_step_conf": 16.4453125, "calib/ece": 0.2008467741935484, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7258064516129032, "calib/gap": 0.3366135515360321, "calib/mean_conf": 0.8080241935483871, "calib/mu_c": 0.9247530864197531, "calib/mu_w": 0.588139534883721, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1778225806451613, "calib/std_conf": 0.34240805131921276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.527275720164609, "calib/step_q_c_n": 2430.0, "calib/step_q_gap": 0.18369145050168767, "calib/step_q_w": 0.3435842696629213, "calib/step_q_w_n": 1780.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 841.77734375, "completions/mean_terminated_length": 868.931396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 341.0, "epoch": 0.18773333333333334, "grad_norm": 2.7580301761627197, "kl": 0.18585205078125, "learning_rate": 6.944444444444446e-07, "loss": -0.1996, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019048092886805534, "mask/share_reasoning": 0.7681172490119934, "mask/share_step_conf": 0.1815846860408783, "num_tokens": 54829426.0, "reward": 0.9763741493225098, "reward_std": 0.15018296241760254, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7602488398551941, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8729680180549622, "step": 176 }, { "adv/mean_abs_final_conf": 0.4446534514427185, "adv/mean_abs_reasoning": 0.31699615716934204, "adv/mean_abs_step_conf": 0.7730125188827515, "adv/ratio_final_to_reasoning": 1.402709280179636, "adv/ratio_step_to_reasoning": 2.4385548575271896, "adv/std_final_conf": 0.7017505168914795, "adv/std_reasoning": 0.5962578058242798, "adv/std_step_conf": 0.9323491454124451, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.8686733128834356, "calib/avg_num_step_conf": 16.82421875, "calib/ece": 0.12279835390946497, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6748971193415638, "calib/gap": 0.5731411042944784, "calib/mean_conf": 0.7504526748971193, "calib/mu_c": 0.9391411042944784, "calib/mu_w": 0.366, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10123456790123453, "calib/std_conf": 0.39061337333879975, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5236284403669724, "calib/step_q_c_n": 2180.0, "calib/step_q_gap": 0.19998951229927142, "calib/step_q_w": 0.323638928067701, "calib/step_q_w_n": 2127.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2635.0, "completions/max_terminated_length": 2635.0, "completions/mean_length": 836.375, "completions/mean_terminated_length": 881.1193237304688, "completions/min_length": 0.0, "completions/min_terminated_length": 302.0, "epoch": 0.1888, "grad_norm": 2.547269821166992, "kl": 0.192718505859375, "learning_rate": 6.666666666666667e-07, "loss": -0.2609, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018137561157345772, "mask/share_reasoning": 0.7572490572929382, "mask/share_step_conf": 0.17383214831352234, "num_tokens": 55147370.0, "reward": 0.9972747564315796, "reward_std": 0.1674659103155136, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.8290265798568726, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8483355045318604, "step": 177 }, { "adv/mean_abs_final_conf": 0.4974687099456787, "adv/mean_abs_reasoning": 0.34740737080574036, "adv/mean_abs_step_conf": 0.7481155395507812, "adv/ratio_final_to_reasoning": 1.4319463308792262, "adv/ratio_step_to_reasoning": 2.1534244878445734, "adv/std_final_conf": 0.7631050944328308, "adv/std_reasoning": 0.6611331701278687, "adv/std_step_conf": 0.9296133518218994, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7711500974658869, "calib/avg_num_step_conf": 15.90625, "calib/ece": 0.17333333333333334, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7764227642276422, "calib/gap": 0.35116725146198846, "calib/mean_conf": 0.8631707317073171, "calib/mu_c": 0.9702339181286551, "calib/mu_w": 0.6190666666666667, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1706910569105691, "calib/std_conf": 0.28852108762646045, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5056272259596359, "calib/step_q_c_n": 2527.0, "calib/step_q_gap": 0.17791201560364883, "calib/step_q_w": 0.3277152103559871, "calib/step_q_w_n": 1545.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 794.45703125, "completions/mean_terminated_length": 826.7520141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 300.0, "epoch": 0.18986666666666666, "grad_norm": 3.4901745319366455, "kl": 0.192596435546875, "learning_rate": 6.388888888888889e-07, "loss": -0.1602, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01828203722834587, "mask/share_reasoning": 0.7588692307472229, "mask/share_step_conf": 0.1837862730026245, "num_tokens": 55456823.0, "reward": 0.993706226348877, "reward_std": 0.1627664566040039, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7931882739067078, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8684428930282593, "step": 178 }, { "adv/mean_abs_final_conf": 0.528492271900177, "adv/mean_abs_reasoning": 0.43051037192344666, "adv/mean_abs_step_conf": 0.7316839694976807, "adv/ratio_final_to_reasoning": 1.2275947488534689, "adv/ratio_step_to_reasoning": 1.6995733836298577, "adv/std_final_conf": 0.7771602869033813, "adv/std_reasoning": 0.7015863656997681, "adv/std_step_conf": 0.9322218894958496, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.8905714005684603, "calib/avg_num_step_conf": 17.3359375, "calib/ece": 0.10220338983050842, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.6991525423728814, "calib/gap": 0.5390414583945898, "calib/mean_conf": 0.7877966101694915, "calib/mu_c": 0.9179888268156424, "calib/mu_w": 0.3789473684210526, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06576271186440671, "calib/std_conf": 0.35402799753600916, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4976065573770491, "calib/step_q_c_n": 2440.0, "calib/step_q_gap": 0.19436831913881092, "calib/step_q_w": 0.3032382382382382, "calib/step_q_w_n": 1998.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 799.25, "completions/mean_terminated_length": 863.3248291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 327.0, "epoch": 0.19093333333333334, "grad_norm": 1.8444643020629883, "kl": 0.197601318359375, "learning_rate": 6.111111111111112e-07, "loss": -0.3591, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.017289811745285988, "mask/share_reasoning": 0.7421362400054932, "mask/share_step_conf": 0.1663551926612854, "num_tokens": 55767695.0, "reward": 0.9892854690551758, "reward_std": 0.2062341272830963, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.8187249898910522, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.8356271982192993, "step": 179 }, { "adv/mean_abs_final_conf": 0.5253371000289917, "adv/mean_abs_reasoning": 0.4269232749938965, "adv/mean_abs_step_conf": 0.7388336658477783, "adv/ratio_final_to_reasoning": 1.2305187625024712, "adv/ratio_step_to_reasoning": 1.7306005765517025, "adv/std_final_conf": 0.7783913612365723, "adv/std_reasoning": 0.7206528186798096, "adv/std_step_conf": 0.931144118309021, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.743296470231783, "calib/avg_num_step_conf": 18.3671875, "calib/ece": 0.18205761316872424, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6954732510288066, "calib/gap": 0.356134676564157, "calib/mean_conf": 0.7985185185185186, "calib/mu_c": 0.918695652173913, "calib/mu_w": 0.562560975609756, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1590123456790123, "calib/std_conf": 0.33621511501794427, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5128173258003768, "calib/step_q_c_n": 2655.0, "calib/step_q_gap": 0.11947585047062587, "calib/step_q_w": 0.3933414753297509, "calib/step_q_w_n": 2047.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 951.59765625, "completions/mean_terminated_length": 994.3223876953125, "completions/min_length": 0.0, "completions/min_terminated_length": 371.0, "epoch": 0.192, "grad_norm": 1.8266518115997314, "kl": 0.161590576171875, "learning_rate": 5.833333333333334e-07, "loss": -0.2662, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.015438073314726353, "mask/share_reasoning": 0.768194317817688, "mask/share_step_conf": 0.17339888215065002, "num_tokens": 56115160.0, "reward": 0.9568233489990234, "reward_std": 0.194644495844841, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7633070349693298, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8347145318984985, "step": 180 }, { "adv/mean_abs_final_conf": 0.5601932406425476, "adv/mean_abs_reasoning": 0.477272629737854, "adv/mean_abs_step_conf": 0.7499115467071533, "adv/ratio_final_to_reasoning": 1.1737384583529091, "adv/ratio_step_to_reasoning": 1.5712435618171705, "adv/std_final_conf": 0.7927375435829163, "adv/std_reasoning": 0.7395595908164978, "adv/std_step_conf": 0.9321924448013306, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.770859375, "calib/avg_num_step_conf": 17.01953125, "calib/ece": 0.18929166666666677, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.7166666666666667, "calib/gap": 0.36106250000000006, "calib/mean_conf": 0.7972083333333333, "calib/mu_c": 0.9175625000000001, "calib/mu_w": 0.5565, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15991666666666676, "calib/std_conf": 0.3491789750217247, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5166346593507087, "calib/step_q_c_n": 2187.0, "calib/step_q_gap": 0.18568535059494834, "calib/step_q_w": 0.3309493087557604, "calib/step_q_w_n": 2170.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2255.0, "completions/max_terminated_length": 2255.0, "completions/mean_length": 787.59765625, "completions/mean_terminated_length": 840.1041870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 0.19306666666666666, "grad_norm": 1.3831990957260132, "kl": 0.20159912109375, "learning_rate": 5.555555555555555e-07, "loss": -0.3336, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.01745080202817917, "mask/share_reasoning": 0.749775767326355, "mask/share_step_conf": 0.17027348279953003, "num_tokens": 56423049.0, "reward": 0.9441324472427368, "reward_std": 0.2243872582912445, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7493276596069336, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8264370560646057, "step": 181 }, { "adv/mean_abs_final_conf": 0.3835284411907196, "adv/mean_abs_reasoning": 0.3649190068244934, "adv/mean_abs_step_conf": 0.7412173748016357, "adv/ratio_final_to_reasoning": 1.0509960676703702, "adv/ratio_step_to_reasoning": 2.031183251460842, "adv/std_final_conf": 0.6631271243095398, "adv/std_reasoning": 0.6403862833976746, "adv/std_step_conf": 0.9314233064651489, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7775993237531699, "calib/avg_num_step_conf": 15.7421875, "calib/ece": 0.1294736842105263, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7611336032388664, "calib/gap": 0.4233296703296704, "calib/mean_conf": 0.8308502024291498, "calib/mu_c": 0.9422527472527473, "calib/mu_w": 0.5189230769230769, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1117408906882591, "calib/std_conf": 0.33151914751945144, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4647194186515947, "calib/step_q_c_n": 2477.0, "calib/step_q_gap": 0.16221458928907057, "calib/step_q_w": 0.30250482936252415, "calib/step_q_w_n": 1553.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2640.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 830.52734375, "completions/mean_terminated_length": 853.87548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 0.19413333333333332, "grad_norm": 1.501712441444397, "kl": 0.1904296875, "learning_rate": 5.277777777777779e-07, "loss": -0.1673, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018058259040117264, "mask/share_reasoning": 0.7801955938339233, "mask/share_step_conf": 0.1744024157524109, "num_tokens": 56741824.0, "reward": 1.0078319311141968, "reward_std": 0.15414117276668549, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.8215875029563904, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8581387996673584, "step": 182 }, { "adv/mean_abs_final_conf": 0.5015547275543213, "adv/mean_abs_reasoning": 0.49065980315208435, "adv/mean_abs_step_conf": 0.7444531917572021, "adv/ratio_final_to_reasoning": 1.0222046402257654, "adv/ratio_step_to_reasoning": 1.5172491958271388, "adv/std_final_conf": 0.7602401971817017, "adv/std_reasoning": 0.7577841877937317, "adv/std_step_conf": 0.9302000403404236, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7326443397871969, "calib/avg_num_step_conf": 16.484375, "calib/ece": 0.1668163265306123, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6, "calib/gap": 0.31327838827838816, "calib/mean_conf": 0.7536734693877551, "calib/mu_c": 0.8342307692307691, "calib/mu_w": 0.520952380952381, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0888163265306123, "calib/std_conf": 0.3438514978321786, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47177004538577916, "calib/step_q_c_n": 2644.0, "calib/step_q_gap": 0.11793438548730201, "calib/step_q_w": 0.35383565989847715, "calib/step_q_w_n": 1576.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2512.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 858.68359375, "completions/mean_terminated_length": 893.5894165039062, "completions/min_length": 0.0, "completions/min_terminated_length": 354.0, "epoch": 0.1952, "grad_norm": 2.1440982818603516, "kl": 0.187255859375, "learning_rate": 5.000000000000001e-07, "loss": -0.2307, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01752399280667305, "mask/share_reasoning": 0.7727809548377991, "mask/share_step_conf": 0.17063254117965698, "num_tokens": 57068327.0, "reward": 0.9805730581283569, "reward_std": 0.17938153445720673, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7754956483840942, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.852056622505188, "step": 183 }, { "adv/mean_abs_final_conf": 0.49326831102371216, "adv/mean_abs_reasoning": 0.37065669894218445, "adv/mean_abs_step_conf": 0.7485692501068115, "adv/ratio_final_to_reasoning": 1.3307956187799883, "adv/ratio_step_to_reasoning": 2.019575667303869, "adv/std_final_conf": 0.738467812538147, "adv/std_reasoning": 0.6815593838691711, "adv/std_step_conf": 0.9300689697265625, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8003418660885516, "calib/avg_num_step_conf": 15.9765625, "calib/ece": 0.15186991869918703, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7520325203252033, "calib/gap": 0.40597348453264415, "calib/mean_conf": 0.8376422764227642, "calib/mu_c": 0.9482122905027934, "calib/mu_w": 0.5422388059701493, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13093495934959354, "calib/std_conf": 0.3186087166263112, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5366829865361078, "calib/step_q_c_n": 2451.0, "calib/step_q_gap": 0.18017291942201386, "calib/step_q_w": 0.35651006711409394, "calib/step_q_w_n": 1639.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2413.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 832.3671875, "completions/mean_terminated_length": 866.2032470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 298.0, "epoch": 0.19626666666666667, "grad_norm": 1.1191779375076294, "kl": 0.190216064453125, "learning_rate": 4.7222222222222226e-07, "loss": -0.2065, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018394174054265022, "mask/share_reasoning": 0.7704482078552246, "mask/share_step_conf": 0.1720951348543167, "num_tokens": 57386693.0, "reward": 1.0043140649795532, "reward_std": 0.16147670149803162, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.8159515857696533, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8606454133987427, "step": 184 }, { "adv/mean_abs_final_conf": 0.4545571208000183, "adv/mean_abs_reasoning": 0.38821011781692505, "adv/mean_abs_step_conf": 0.7668112516403198, "adv/ratio_final_to_reasoning": 1.1709048784101543, "adv/ratio_step_to_reasoning": 1.9752479815632684, "adv/std_final_conf": 0.7058054804801941, "adv/std_reasoning": 0.661405622959137, "adv/std_step_conf": 0.9312621355056763, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7954785827604443, "calib/avg_num_step_conf": 15.66796875, "calib/ece": 0.12644897959183674, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7183673469387755, "calib/gap": 0.45998589811387264, "calib/mean_conf": 0.8093877551020408, "calib/mu_c": 0.9257923497267759, "calib/mu_w": 0.4658064516129033, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09444897959183673, "calib/std_conf": 0.33732324570797134, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5495782881002088, "calib/step_q_c_n": 2395.0, "calib/step_q_gap": 0.2175795257239712, "calib/step_q_w": 0.3319987623762376, "calib/step_q_w_n": 1616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2198.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 808.58203125, "completions/mean_terminated_length": 841.451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 379.0, "epoch": 0.19733333333333333, "grad_norm": 2.842849016189575, "kl": 0.179779052734375, "learning_rate": 4.444444444444445e-07, "loss": -0.1784, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01822826638817787, "mask/share_reasoning": 0.7694197297096252, "mask/share_step_conf": 0.17328950762748718, "num_tokens": 57700610.0, "reward": 0.995876133441925, "reward_std": 0.16500087082386017, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.8271839618682861, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8317557573318481, "step": 185 }, { "adv/mean_abs_final_conf": 0.4359062910079956, "adv/mean_abs_reasoning": 0.347331702709198, "adv/mean_abs_step_conf": 0.7288975119590759, "adv/ratio_final_to_reasoning": 1.2550144072882294, "adv/ratio_step_to_reasoning": 2.098563149501335, "adv/std_final_conf": 0.7057287096977234, "adv/std_reasoning": 0.6403841376304626, "adv/std_step_conf": 0.9288153052330017, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.8899591619318181, "calib/avg_num_step_conf": 16.38671875, "calib/ece": 0.09120833333333339, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6583333333333333, "calib/gap": 0.6343323863636363, "calib/mean_conf": 0.7434583333333333, "calib/mu_c": 0.9126136363636363, "calib/mu_w": 0.27828125000000004, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05066666666666673, "calib/std_conf": 0.3907025167531203, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5288340425531914, "calib/step_q_c_n": 2350.0, "calib/step_q_gap": 0.21499122412500715, "calib/step_q_w": 0.3138428184281843, "calib/step_q_w_n": 1845.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 837.27734375, "completions/mean_terminated_length": 893.0958862304688, "completions/min_length": 0.0, "completions/min_terminated_length": 238.0, "epoch": 0.1984, "grad_norm": 3.691143274307251, "kl": 0.183441162109375, "learning_rate": 4.1666666666666667e-07, "loss": -0.2042, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017624402418732643, "mask/share_reasoning": 0.7532752752304077, "mask/share_step_conf": 0.1666003316640854, "num_tokens": 58019993.0, "reward": 1.0225932598114014, "reward_std": 0.15233799815177917, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.8435511589050293, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8766350746154785, "step": 186 }, { "adv/mean_abs_final_conf": 0.553040623664856, "adv/mean_abs_reasoning": 0.4809000492095947, "adv/mean_abs_step_conf": 0.7330244183540344, "adv/ratio_final_to_reasoning": 1.150011576363594, "adv/ratio_step_to_reasoning": 1.5242760310772066, "adv/std_final_conf": 0.792948842048645, "adv/std_reasoning": 0.7395135164260864, "adv/std_step_conf": 0.9329288005828857, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6994841892801077, "calib/avg_num_step_conf": 17.8359375, "calib/ece": 0.23710084033613438, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.6932773109243697, "calib/gap": 0.28153322867608577, "calib/mean_conf": 0.8128991596638655, "calib/mu_c": 0.9205442176870747, "calib/mu_w": 0.639010989010989, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2161764705882352, "calib/std_conf": 0.3218479962472494, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.550241662638956, "calib/step_q_c_n": 2069.0, "calib/step_q_gap": 0.11051799423687353, "calib/step_q_w": 0.4397236684020825, "calib/step_q_w_n": 2497.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2656.0, "completions/max_terminated_length": 2656.0, "completions/mean_length": 835.84375, "completions/mean_terminated_length": 895.2970581054688, "completions/min_length": 0.0, "completions/min_terminated_length": 269.0, "epoch": 0.19946666666666665, "grad_norm": 1.614307165145874, "kl": 0.179290771484375, "learning_rate": 3.8888888888888895e-07, "loss": -0.3185, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.01730452850461006, "mask/share_reasoning": 0.7477213144302368, "mask/share_step_conf": 0.168567955493927, "num_tokens": 58335513.0, "reward": 0.9096522331237793, "reward_std": 0.20705881714820862, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7020113468170166, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.816511869430542, "step": 187 }, { "adv/mean_abs_final_conf": 0.40143418312072754, "adv/mean_abs_reasoning": 0.3243428170681, "adv/mean_abs_step_conf": 0.779598593711853, "adv/ratio_final_to_reasoning": 1.2376848260414572, "adv/ratio_step_to_reasoning": 2.4036252775968405, "adv/std_final_conf": 0.6553907990455627, "adv/std_reasoning": 0.6187350749969482, "adv/std_step_conf": 0.9319950342178345, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7458125445473983, "calib/avg_num_step_conf": 15.53125, "calib/ece": 0.15759183673469385, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.7591836734693878, "calib/gap": 0.35543121881682105, "calib/mean_conf": 0.8375918367346938, "calib/mu_c": 0.926086956521739, "calib/mu_w": 0.570655737704918, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12208163265306121, "calib/std_conf": 0.3266207885686363, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5173934934159565, "calib/step_q_c_n": 2582.0, "calib/step_q_gap": 0.15309650632843863, "calib/step_q_w": 0.3642969870875179, "calib/step_q_w_n": 1394.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2072.0, "completions/max_terminated_length": 2072.0, "completions/mean_length": 871.5078125, "completions/mean_terminated_length": 903.26318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 349.0, "epoch": 0.20053333333333334, "grad_norm": 2.7746031284332275, "kl": 0.186920166015625, "learning_rate": 3.611111111111111e-07, "loss": -0.1654, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017918666824698448, "mask/share_reasoning": 0.7796072959899902, "mask/share_step_conf": 0.16731780767440796, "num_tokens": 58662691.0, "reward": 0.9855931997299194, "reward_std": 0.15250849723815918, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7960191369056702, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8400110602378845, "step": 188 }, { "adv/mean_abs_final_conf": 0.5323823690414429, "adv/mean_abs_reasoning": 0.3076331615447998, "adv/mean_abs_step_conf": 0.7639738321304321, "adv/ratio_final_to_reasoning": 1.7305753591974624, "adv/ratio_step_to_reasoning": 2.4833923244623177, "adv/std_final_conf": 0.7600097060203552, "adv/std_reasoning": 0.5728651881217957, "adv/std_step_conf": 0.9303438663482666, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7872156744989116, "calib/avg_num_step_conf": 14.35546875, "calib/ece": 0.15716000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.58, "calib/gap": 0.3989948202086929, "calib/mean_conf": 0.7205199999999999, "calib/mu_c": 0.8434104046242773, "calib/mu_w": 0.4444155844155844, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09284000000000002, "calib/std_conf": 0.3652365392454594, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5084188817598534, "calib/step_q_c_n": 2182.0, "calib/step_q_gap": 0.17178123942897594, "calib/step_q_w": 0.33663764233087745, "calib/step_q_w_n": 1493.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2359.0, "completions/max_terminated_length": 2359.0, "completions/mean_length": 778.0, "completions/mean_terminated_length": 796.6720581054688, "completions/min_length": 0.0, "completions/min_terminated_length": 319.0, "epoch": 0.2016, "grad_norm": 2.5854461193084717, "kl": 0.20245361328125, "learning_rate": 3.3333333333333335e-07, "loss": -0.0904, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01966189220547676, "mask/share_reasoning": 0.783621072769165, "mask/share_step_conf": 0.1732795536518097, "num_tokens": 58969627.0, "reward": 0.9924060106277466, "reward_std": 0.13933482766151428, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.799544095993042, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8555803894996643, "step": 189 }, { "adv/mean_abs_final_conf": 0.44011932611465454, "adv/mean_abs_reasoning": 0.3728107810020447, "adv/mean_abs_step_conf": 0.7390671968460083, "adv/ratio_final_to_reasoning": 1.1805434513768547, "adv/ratio_step_to_reasoning": 1.9824190568189466, "adv/std_final_conf": 0.6736248731613159, "adv/std_reasoning": 0.661288321018219, "adv/std_step_conf": 0.9317901730537415, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7737532408113468, "calib/avg_num_step_conf": 16.7265625, "calib/ece": 0.16651452282157686, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.7053941908713693, "calib/gap": 0.38488104315998173, "calib/mean_conf": 0.7949792531120332, "calib/mu_c": 0.9275316455696203, "calib/mu_w": 0.5426506024096386, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1529460580912864, "calib/std_conf": 0.34785546252154953, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5116193053676139, "calib/step_q_c_n": 2217.0, "calib/step_q_gap": 0.1892900075467906, "calib/step_q_w": 0.3223292978208233, "calib/step_q_w_n": 2065.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2083.0, "completions/max_terminated_length": 2083.0, "completions/mean_length": 849.9453125, "completions/mean_terminated_length": 902.8464965820312, "completions/min_length": 0.0, "completions/min_terminated_length": 314.0, "epoch": 0.20266666666666666, "grad_norm": 6.191027641296387, "kl": 0.17401123046875, "learning_rate": 3.055555555555556e-07, "loss": -0.25, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.016699232161045074, "mask/share_reasoning": 0.7608840465545654, "mask/share_step_conf": 0.16382291913032532, "num_tokens": 59292821.0, "reward": 0.9518024921417236, "reward_std": 0.15938414633274078, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7602660059928894, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8316202163696289, "step": 190 }, { "adv/mean_abs_final_conf": 0.4920806884765625, "adv/mean_abs_reasoning": 0.3932753801345825, "adv/mean_abs_step_conf": 0.7426341772079468, "adv/ratio_final_to_reasoning": 1.251236953373913, "adv/ratio_step_to_reasoning": 1.8883312170566344, "adv/std_final_conf": 0.7414677143096924, "adv/std_reasoning": 0.6816805005073547, "adv/std_step_conf": 0.932053804397583, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7293021880544056, "calib/avg_num_step_conf": 16.4375, "calib/ece": 0.20124481327800833, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.7053941908713693, "calib/gap": 0.3466972205795388, "calib/mean_conf": 0.7901244813278009, "calib/mu_c": 0.9181578947368422, "calib/mu_w": 0.5714606741573034, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18033195020746892, "calib/std_conf": 0.3554758149914254, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5582369402985076, "calib/step_q_c_n": 2144.0, "calib/step_q_gap": 0.18351794805044552, "calib/step_q_w": 0.37471899224806204, "calib/step_q_w_n": 2064.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 792.23046875, "completions/mean_terminated_length": 838.0619506835938, "completions/min_length": 0.0, "completions/min_terminated_length": 290.0, "epoch": 0.20373333333333332, "grad_norm": 2.695556402206421, "kl": 0.198333740234375, "learning_rate": 2.7777777777777776e-07, "loss": -0.2552, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01833411678671837, "mask/share_reasoning": 0.7514849901199341, "mask/share_step_conf": 0.17549338936805725, "num_tokens": 59599800.0, "reward": 0.9281906485557556, "reward_std": 0.1847456991672516, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7312929630279541, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8180569410324097, "step": 191 }, { "adv/mean_abs_final_conf": 0.45248425006866455, "adv/mean_abs_reasoning": 0.42182350158691406, "adv/mean_abs_step_conf": 0.7288972735404968, "adv/ratio_final_to_reasoning": 1.0726862025619808, "adv/ratio_step_to_reasoning": 1.7279674337687707, "adv/std_final_conf": 0.7338057160377502, "adv/std_reasoning": 0.7016082406044006, "adv/std_step_conf": 0.9309428930282593, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.9181267474370921, "calib/avg_num_step_conf": 15.56640625, "calib/ece": 0.05666666666666652, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6460905349794238, "calib/gap": 0.6170074557315937, "calib/mean_conf": 0.7537037037037039, "calib/mu_c": 0.900972972972973, "calib/mu_w": 0.2839655172413793, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02452674897119328, "calib/std_conf": 0.3672490698241373, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5251578947368422, "calib/step_q_c_n": 2375.0, "calib/step_q_gap": 0.18752435436417142, "calib/step_q_w": 0.33763354037267074, "calib/step_q_w_n": 1610.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2451.0, "completions/max_terminated_length": 2451.0, "completions/mean_length": 832.171875, "completions/mean_terminated_length": 873.0983276367188, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, "epoch": 0.2048, "grad_norm": 3.757995367050171, "kl": 0.1964111328125, "learning_rate": 2.5000000000000004e-07, "loss": -0.307, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.019388530403375626, "mask/share_reasoning": 0.763080358505249, "mask/share_step_conf": 0.17065608501434326, "num_tokens": 59917812.0, "reward": 1.0268250703811646, "reward_std": 0.16568073630332947, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.8615050911903381, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8577700257301331, "step": 192 }, { "adv/mean_abs_final_conf": 0.5093462467193604, "adv/mean_abs_reasoning": 0.4073982834815979, "adv/mean_abs_step_conf": 0.7269412279129028, "adv/ratio_final_to_reasoning": 1.2502415139467995, "adv/ratio_step_to_reasoning": 1.7843502473808008, "adv/std_final_conf": 0.7761521339416504, "adv/std_reasoning": 0.6817643642425537, "adv/std_step_conf": 0.9319035410881042, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7133481646273638, "calib/avg_num_step_conf": 16.76953125, "calib/ece": 0.22276859504132224, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6694214876033058, "calib/gap": 0.3279310344827585, "calib/mean_conf": 0.7721074380165289, "calib/mu_c": 0.8899999999999999, "calib/mu_w": 0.5620689655172414, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17719008264462804, "calib/std_conf": 0.3646011361159927, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5089395306859206, "calib/step_q_c_n": 2216.0, "calib/step_q_gap": 0.06470265057036934, "calib/step_q_w": 0.44423688011555124, "calib/step_q_w_n": 2077.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 790.2578125, "completions/mean_terminated_length": 835.9751586914062, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.20586666666666667, "grad_norm": 2.3553261756896973, "kl": 0.200439453125, "learning_rate": 2.2222222222222224e-07, "loss": -0.214, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018386702984571457, "mask/share_reasoning": 0.7529720067977905, "mask/share_step_conf": 0.17395377159118652, "num_tokens": 60225830.0, "reward": 0.9332730770111084, "reward_std": 0.1623665690422058, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7283660173416138, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8280237913131714, "step": 193 }, { "adv/mean_abs_final_conf": 0.4929681420326233, "adv/mean_abs_reasoning": 0.40592581033706665, "adv/mean_abs_step_conf": 0.7234945297241211, "adv/ratio_final_to_reasoning": 1.2144291628642183, "adv/ratio_step_to_reasoning": 1.7823319219917464, "adv/std_final_conf": 0.7625535130500793, "adv/std_reasoning": 0.7014209032058716, "adv/std_step_conf": 0.931851863861084, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.8175357374918777, "calib/avg_num_step_conf": 15.16015625, "calib/ece": 0.11987654320987658, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.691358024691358, "calib/gap": 0.5247660818713452, "calib/mean_conf": 0.7676131687242799, "calib/mu_c": 0.9230994152046785, "calib/mu_w": 0.3983333333333333, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09189300411522637, "calib/std_conf": 0.38048169403713317, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5554409884959522, "calib/step_q_c_n": 2347.0, "calib/step_q_gap": 0.2952193457319366, "calib/step_q_w": 0.26022164276401566, "calib/step_q_w_n": 1534.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 781.05859375, "completions/mean_terminated_length": 819.4712524414062, "completions/min_length": 0.0, "completions/min_terminated_length": 386.0, "epoch": 0.20693333333333333, "grad_norm": 6.7906880378723145, "kl": 0.21337890625, "learning_rate": 1.9444444444444447e-07, "loss": -0.2742, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01827390119433403, "mask/share_reasoning": 0.7622230648994446, "mask/share_step_conf": 0.17262804508209229, "num_tokens": 60531725.0, "reward": 0.9942914843559265, "reward_std": 0.19554367661476135, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.8150488138198853, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8508778810501099, "step": 194 }, { "adv/mean_abs_final_conf": 0.5361847281455994, "adv/mean_abs_reasoning": 0.44143611192703247, "adv/mean_abs_step_conf": 0.7179086208343506, "adv/ratio_final_to_reasoning": 1.2146372117246005, "adv/ratio_step_to_reasoning": 1.6263024284542853, "adv/std_final_conf": 0.7934735417366028, "adv/std_reasoning": 0.7575832009315491, "adv/std_step_conf": 0.9309477210044861, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.8233430799220272, "calib/avg_num_step_conf": 17.4296875, "calib/ece": 0.12493827160493823, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6995884773662552, "calib/gap": 0.5016447368421055, "calib/mean_conf": 0.777037037037037, "calib/mu_c": 0.9256725146198832, "calib/mu_w": 0.4240277777777777, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09913580246913575, "calib/std_conf": 0.3607289525794692, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5536256597645148, "calib/step_q_c_n": 2463.0, "calib/step_q_gap": 0.2096136537615133, "calib/step_q_w": 0.3440120060030015, "calib/step_q_w_n": 1999.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2046.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 829.6484375, "completions/mean_terminated_length": 874.0328979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 317.0, "epoch": 0.208, "grad_norm": 1.9862955808639526, "kl": 0.1944580078125, "learning_rate": 1.6666666666666668e-07, "loss": -0.2701, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01789841055870056, "mask/share_reasoning": 0.7571510076522827, "mask/share_step_conf": 0.1741693913936615, "num_tokens": 60850099.0, "reward": 0.9972332119941711, "reward_std": 0.1784655749797821, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.8212476372718811, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8497812151908875, "step": 195 }, { "adv/mean_abs_final_conf": 0.41140317916870117, "adv/mean_abs_reasoning": 0.30111199617385864, "adv/mean_abs_step_conf": 0.762473464012146, "adv/ratio_final_to_reasoning": 1.3662796049187016, "adv/ratio_step_to_reasoning": 2.5321922530509298, "adv/std_final_conf": 0.6846246719360352, "adv/std_reasoning": 0.572742760181427, "adv/std_step_conf": 0.9304603338241577, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6749640804597701, "calib/avg_num_step_conf": 12.875, "calib/ece": 0.2270472440944883, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8228346456692913, "calib/gap": 0.25895402298850567, "calib/mean_conf": 0.8753937007874015, "calib/mu_c": 0.9569540229885057, "calib/mu_w": 0.6980000000000001, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20870078740157488, "calib/std_conf": 0.2841502779037192, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5729505135387488, "calib/step_q_c_n": 2142.0, "calib/step_q_gap": 0.09578413572245759, "calib/step_q_w": 0.47716637781629123, "calib/step_q_w_n": 1154.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 700.09765625, "completions/mean_terminated_length": 708.3992309570312, "completions/min_length": 0.0, "completions/min_terminated_length": 278.0, "epoch": 0.20906666666666668, "grad_norm": 1.3349313735961914, "kl": 0.21282958984375, "learning_rate": 1.3888888888888888e-07, "loss": -0.0415, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.021374821662902832, "mask/share_reasoning": 0.7805554270744324, "mask/share_step_conf": 0.1863510012626648, "num_tokens": 61131868.0, "reward": 0.9822046160697937, "reward_std": 0.12111905962228775, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7729214429855347, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8571127653121948, "step": 196 }, { "adv/mean_abs_final_conf": 0.6260608434677124, "adv/mean_abs_reasoning": 0.5000250339508057, "adv/mean_abs_step_conf": 0.7354952096939087, "adv/ratio_final_to_reasoning": 1.2520589989686528, "adv/ratio_step_to_reasoning": 1.4709167736715147, "adv/std_final_conf": 0.8446632027626038, "adv/std_reasoning": 0.7576134204864502, "adv/std_step_conf": 0.931730329990387, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6902694610778444, "calib/avg_num_step_conf": 16.265625, "calib/ece": 0.2023076923076924, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6558704453441295, "calib/gap": 0.3003106287425149, "calib/mean_conf": 0.7699190283400811, "calib/mu_c": 0.8671856287425149, "calib/mu_w": 0.566875, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1480566801619434, "calib/std_conf": 0.35858583227953456, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.508295358649789, "calib/step_q_c_n": 2370.0, "calib/step_q_gap": 0.14093192498200757, "calib/step_q_w": 0.36736343366778146, "calib/step_q_w_n": 1794.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 812.73828125, "completions/mean_terminated_length": 842.3522338867188, "completions/min_length": 0.0, "completions/min_terminated_length": 345.0, "epoch": 0.21013333333333334, "grad_norm": 3.056001901626587, "kl": 0.19342041015625, "learning_rate": 1.1111111111111112e-07, "loss": -0.1512, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01817398890852928, "mask/share_reasoning": 0.7675711512565613, "mask/share_step_conf": 0.17909863591194153, "num_tokens": 61444985.0, "reward": 0.961862325668335, "reward_std": 0.1939438283443451, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7479074001312256, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8523796796798706, "step": 197 }, { "adv/mean_abs_final_conf": 0.49982357025146484, "adv/mean_abs_reasoning": 0.44635361433029175, "adv/mean_abs_step_conf": 0.6956985592842102, "adv/ratio_final_to_reasoning": 1.1197928149442664, "adv/ratio_step_to_reasoning": 1.5586264722601948, "adv/std_final_conf": 0.7426099181175232, "adv/std_reasoning": 0.7395592331886292, "adv/std_step_conf": 0.9324936270713806, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.8129637526652451, "calib/avg_num_step_conf": 16.24609375, "calib/ece": 0.15714876033057862, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6363636363636364, "calib/gap": 0.4352042643923241, "calib/mean_conf": 0.7312809917355372, "calib/mu_c": 0.8517714285714286, "calib/mu_w": 0.4165671641791045, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08264462809917367, "calib/std_conf": 0.3870245838207969, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5408882636655948, "calib/step_q_c_n": 2488.0, "calib/step_q_gap": 0.17511926306715075, "calib/step_q_w": 0.3657690005984441, "calib/step_q_w_n": 1671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 1893.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 786.62109375, "completions/mean_terminated_length": 832.1280517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 288.0, "epoch": 0.2112, "grad_norm": 2.4650635719299316, "kl": 0.19927978515625, "learning_rate": 8.333333333333334e-08, "loss": -0.3366, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.01857197843492031, "mask/share_reasoning": 0.7518168687820435, "mask/share_step_conf": 0.1749236285686493, "num_tokens": 61751744.0, "reward": 0.976554811000824, "reward_std": 0.1942240297794342, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7791269421577454, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8482013940811157, "step": 198 }, { "adv/mean_abs_final_conf": 0.5385314226150513, "adv/mean_abs_reasoning": 0.4549802243709564, "adv/mean_abs_step_conf": 0.7567774057388306, "adv/ratio_final_to_reasoning": 1.1836369885297993, "adv/ratio_step_to_reasoning": 1.663319338296804, "adv/std_final_conf": 0.7775031924247742, "adv/std_reasoning": 0.7207505702972412, "adv/std_step_conf": 0.9314705729484558, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7192427692048233, "calib/avg_num_step_conf": 16.0625, "calib/ece": 0.19364754098360665, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7336065573770492, "calib/gap": 0.28843325744160553, "calib/mean_conf": 0.806844262295082, "calib/mu_c": 0.886045197740113, "calib/mu_w": 0.5976119402985075, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13754098360655748, "calib/std_conf": 0.3499547399422708, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5201003210272874, "calib/step_q_c_n": 2492.0, "calib/step_q_gap": 0.10728550621247263, "calib/step_q_w": 0.4128148148148148, "calib/step_q_w_n": 1620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 818.87109375, "completions/mean_terminated_length": 859.1433715820312, "completions/min_length": 0.0, "completions/min_terminated_length": 253.0, "epoch": 0.21226666666666666, "grad_norm": 2.299373149871826, "kl": 0.196563720703125, "learning_rate": 5.555555555555556e-08, "loss": -0.2315, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01903655380010605, "mask/share_reasoning": 0.756743311882019, "mask/share_step_conf": 0.17734506726264954, "num_tokens": 62065575.0, "reward": 0.9705625772476196, "reward_std": 0.20622840523719788, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7497433423995972, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8624755144119263, "step": 199 }, { "adv/mean_abs_final_conf": 0.42380672693252563, "adv/mean_abs_reasoning": 0.43650728464126587, "adv/mean_abs_step_conf": 0.7625610828399658, "adv/ratio_final_to_reasoning": 0.9709041334346163, "adv/ratio_step_to_reasoning": 1.7469607259055489, "adv/std_final_conf": 0.6845618486404419, "adv/std_reasoning": 0.7015100717544556, "adv/std_step_conf": 0.9306857585906982, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.8689584448011414, "calib/avg_num_step_conf": 15.40234375, "calib/ece": 0.10784232365145224, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6721991701244814, "calib/gap": 0.5163055109684322, "calib/mean_conf": 0.7660995850622406, "calib/mu_c": 0.901067415730337, "calib/mu_w": 0.3847619047619048, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06767634854771781, "calib/std_conf": 0.3675089047384682, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6017685305591678, "calib/step_q_c_n": 2307.0, "calib/step_q_gap": 0.27264872615818975, "calib/step_q_w": 0.32911980440097804, "calib/step_q_w_n": 1636.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2648.0, "completions/max_terminated_length": 2648.0, "completions/mean_length": 824.4921875, "completions/mean_terminated_length": 865.0409545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 246.0, "epoch": 0.21333333333333335, "grad_norm": 1737.5865478515625, "kl": 30.181243896484375, "learning_rate": 2.777777777777778e-08, "loss": 1.625, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018182244151830673, "mask/share_reasoning": 0.7701466679573059, "mask/share_step_conf": 0.16479608416557312, "num_tokens": 62384693.0, "reward": 0.9936589598655701, "reward_std": 0.17970676720142365, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.8194723129272461, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8397207260131836, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.153218922985252, "train_runtime": 14726.264, "train_samples_per_second": 3.477, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 62384693, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }