{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7498364448547363, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5715035496705603, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9352971315383911, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04299506917595863, "learning_rate": 2.5000000000000004e-07, "loss": -0.0136, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 1.0788748264312744, "reward_std": 0.22853493690490723, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420004606246948, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.770571768283844, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.509578923891962, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9354329705238342, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.040479063987731934, "learning_rate": 5.000000000000001e-07, "loss": -0.0158, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 1.016056776046753, "reward_std": 0.2184845209121704, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291916012763977, "step": 2 }, { "adv/mean_abs_final_conf": 0.7734627723693848, "adv/mean_abs_reasoning": 0.40483397245407104, "adv/mean_abs_step_conf": 0.7342471480369568, "adv/ratio_final_to_reasoning": 1.9105678500268037, "adv/ratio_step_to_reasoning": 1.8136994372927981, "adv/std_final_conf": 0.9286358952522278, "adv/std_reasoning": 0.681647002696991, "adv/std_step_conf": 0.9334685206413269, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.45913971367974554, "calib/avg_num_step_conf": 4.94140625, "calib/ece": 0.2468359375000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.32421875, "calib/gap": -0.006550901378579055, "calib/mean_conf": 0.8855859375, "calib/mu_c": 0.8832317073170731, "calib/mu_w": 0.8897826086956522, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24589843750000012, "calib/std_conf": 0.041828897633646694, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7931343283582089, "calib/step_q_c_n": 737.0, "calib/step_q_gap": 0.01995251017639066, "calib/step_q_w": 0.7731818181818182, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 485.28515625, "completions/mean_terminated_length": 487.1882629394531, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.0032, "grad_norm": 0.06393119692802429, "learning_rate": 7.5e-07, "loss": 0.0896, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03299188241362572, "mask/share_reasoning": 0.8538076281547546, "mask/share_step_conf": 0.10929422080516815, "num_tokens": 688150.0, "reward": 1.0805888175964355, "reward_std": 0.19701236486434937, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.702303946018219, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7553948760032654, "step": 3 }, { "adv/mean_abs_final_conf": 0.75523841381073, "adv/mean_abs_reasoning": 0.4327036142349243, "adv/mean_abs_step_conf": 0.7525294423103333, "adv/ratio_final_to_reasoning": 1.7453942813629804, "adv/ratio_step_to_reasoning": 1.7391337108216722, "adv/std_final_conf": 0.9310765862464905, "adv/std_reasoning": 0.7205736637115479, "adv/std_step_conf": 0.9354071617126465, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.47407503908285564, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.2733992094861661, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.22924901185770752, "calib/gap": 0.0018649035956228577, "calib/mean_conf": 0.8741897233201581, "calib/mu_c": 0.8749342105263158, "calib/mu_w": 0.873069306930693, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2733992094861661, "calib/std_conf": 0.05110474763031544, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7992339832869081, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.0020678595084063778, "calib/step_q_w": 0.7971661237785017, "calib/step_q_w_n": 614.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2940.0, "completions/max_terminated_length": 2940.0, "completions/mean_length": 523.75390625, "completions/mean_terminated_length": 525.807861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.004266666666666667, "grad_norm": 0.03903208300471306, "learning_rate": 1.0000000000000002e-06, "loss": 0.0801, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032517582178115845, "mask/share_reasoning": 0.8548072576522827, "mask/share_step_conf": 0.10876892507076263, "num_tokens": 928399.0, "reward": 1.0342053174972534, "reward_std": 0.22188444435596466, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6746652722358704, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7187469601631165, "step": 4 }, { "adv/mean_abs_final_conf": 0.7322856187820435, "adv/mean_abs_reasoning": 0.45622092485427856, "adv/mean_abs_step_conf": 0.7435009479522705, "adv/ratio_final_to_reasoning": 1.605111863327932, "adv/ratio_step_to_reasoning": 1.6296949733065225, "adv/std_final_conf": 0.9311108589172363, "adv/std_reasoning": 0.7392563819885254, "adv/std_step_conf": 0.9351912140846252, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4667686034658512, "calib/avg_num_step_conf": 5.0078125, "calib/ece": 0.32454918032786884, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.2786885245901639, "calib/gap": -0.0030968399592251616, "calib/mean_conf": 0.877827868852459, "calib/mu_c": 0.8764444444444446, "calib/mu_w": 0.8795412844036697, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.32454918032786884, "calib/std_conf": 0.04913801116610084, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.8004434907010014, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.00407985433736513, "calib/step_q_w": 0.7963636363636363, "calib/step_q_w_n": 583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2773.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 533.38671875, "completions/mean_terminated_length": 535.4784545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.005333333333333333, "grad_norm": 0.040636006742715836, "learning_rate": 1.25e-06, "loss": -0.0229, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03362197056412697, "mask/share_reasoning": 0.8532348871231079, "mask/share_step_conf": 0.10923691093921661, "num_tokens": 1171634.0, "reward": 0.9456138610839844, "reward_std": 0.22638621926307678, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6081289052963257, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.6600866317749023, "step": 5 }, { "adv/mean_abs_final_conf": 0.7784135937690735, "adv/mean_abs_reasoning": 0.4036681056022644, "adv/mean_abs_step_conf": 0.7451872229576111, "adv/ratio_final_to_reasoning": 1.9283505012309472, "adv/ratio_step_to_reasoning": 1.8460393888335747, "adv/std_final_conf": 0.93086838722229, "adv/std_reasoning": 0.6816917061805725, "adv/std_step_conf": 0.9351860880851746, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5571309156378601, "calib/avg_num_step_conf": 5.0625, "calib/ece": 0.3103571428571429, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.29365079365079366, "calib/gap": 0.0076620370370369395, "calib/mean_conf": 0.8817857142857142, "calib/mu_c": 0.8850694444444444, "calib/mu_w": 0.8774074074074074, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3103571428571429, "calib/std_conf": 0.03887793415515202, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8020992907801419, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.0034190877344566495, "calib/step_q_w": 0.7986802030456852, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2291.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 455.55078125, "completions/mean_terminated_length": 455.55078125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.0064, "grad_norm": 0.038733117282390594, "learning_rate": 1.5e-06, "loss": -0.0042, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03740730136632919, "mask/share_reasoning": 0.8363658785820007, "mask/share_step_conf": 0.12622681260108948, "num_tokens": 1394207.0, "reward": 1.0009714365005493, "reward_std": 0.2220253348350525, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6465281248092651, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6984013915061951, "step": 6 }, { "adv/mean_abs_final_conf": 0.7462236881256104, "adv/mean_abs_reasoning": 0.5316657423973083, "adv/mean_abs_step_conf": 0.7468922138214111, "adv/ratio_final_to_reasoning": 1.4035579662530995, "adv/ratio_step_to_reasoning": 1.4048153835408606, "adv/std_final_conf": 0.9319174289703369, "adv/std_reasoning": 0.7927316427230835, "adv/std_step_conf": 0.9351266026496887, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.45222758990874934, "calib/avg_num_step_conf": 4.90234375, "calib/ece": 0.24657480314960623, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.29133858267716534, "calib/gap": -0.008946591519055636, "calib/mean_conf": 0.8811417322834646, "calib/mu_c": 0.877901234567901, "calib/mu_w": 0.8868478260869567, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.24496062992125978, "calib/std_conf": 0.045009574556127536, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7882035928143714, "calib/step_q_c_n": 835.0, "calib/step_q_gap": -0.01015355004277152, "calib/step_q_w": 0.7983571428571429, "calib/step_q_w_n": 420.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1375.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 519.80078125, "completions/mean_terminated_length": 521.8392333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.007466666666666667, "grad_norm": 0.04315668344497681, "learning_rate": 1.75e-06, "loss": 0.0193, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030725453048944473, "mask/share_reasoning": 0.8632899522781372, "mask/share_step_conf": 0.10207832604646683, "num_tokens": 1634700.0, "reward": 1.0672800540924072, "reward_std": 0.2497740387916565, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6942952871322632, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7440304756164551, "step": 7 }, { "adv/mean_abs_final_conf": 0.7801766395568848, "adv/mean_abs_reasoning": 0.42296385765075684, "adv/mean_abs_step_conf": 0.7665424346923828, "adv/ratio_final_to_reasoning": 1.844546822251371, "adv/ratio_step_to_reasoning": 1.8123119052061425, "adv/std_final_conf": 0.9308924674987793, "adv/std_reasoning": 0.681679368019104, "adv/std_step_conf": 0.9358082413673401, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.469505104018607, "calib/avg_num_step_conf": 4.7890625, "calib/ece": 0.309402390438247, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.30278884462151395, "calib/gap": 0.0029222121721154126, "calib/mean_conf": 0.8751394422310756, "calib/mu_c": 0.8764084507042255, "calib/mu_w": 0.87348623853211, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.309402390438247, "calib/std_conf": 0.056018996476398596, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.796939393939394, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.031232680158475246, "calib/step_q_w": 0.7657067137809187, "calib/step_q_w_n": 566.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 528.79296875, "completions/mean_terminated_length": 530.86669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.008533333333333334, "grad_norm": 0.04369485378265381, "learning_rate": 2.0000000000000003e-06, "loss": -0.0333, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033193521201610565, "mask/share_reasoning": 0.8561908006668091, "mask/share_step_conf": 0.10670942813158035, "num_tokens": 1876583.0, "reward": 1.025631308555603, "reward_std": 0.22201970219612122, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6433879137039185, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7344164848327637, "step": 8 }, { "adv/mean_abs_final_conf": 0.7931197285652161, "adv/mean_abs_reasoning": 0.478039026260376, "adv/mean_abs_step_conf": 0.7831205129623413, "adv/ratio_final_to_reasoning": 1.6591108361375153, "adv/ratio_step_to_reasoning": 1.638193682822446, "adv/std_final_conf": 0.9304501414299011, "adv/std_reasoning": 0.7207533717155457, "adv/std_step_conf": 0.9356123805046082, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4702091767881242, "calib/avg_num_step_conf": 4.8203125, "calib/ece": 0.26191235059760953, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.30677290836653387, "calib/gap": -0.005671390013495148, "calib/mean_conf": 0.8801593625498008, "calib/mu_c": 0.8780128205128207, "calib/mu_w": 0.8836842105263158, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.26027888446215136, "calib/std_conf": 0.04849091694212926, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7883587786259543, "calib/step_q_c_n": 786.0, "calib/step_q_gap": 0.011104314340239951, "calib/step_q_w": 0.7772544642857143, "calib/step_q_w_n": 448.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 502.4453125, "completions/mean_terminated_length": 506.4015808105469, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.0096, "grad_norm": 0.04787834361195564, "learning_rate": 2.25e-06, "loss": -0.0005, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033189356327056885, "mask/share_reasoning": 0.8479753732681274, "mask/share_step_conf": 0.11102279275655746, "num_tokens": 2112745.0, "reward": 1.0018759965896606, "reward_std": 0.24265122413635254, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6702331900596619, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6796374320983887, "step": 9 }, { "adv/mean_abs_final_conf": 0.7296777963638306, "adv/mean_abs_reasoning": 0.4447851777076721, "adv/mean_abs_step_conf": 0.7466875910758972, "adv/ratio_final_to_reasoning": 1.640517339459094, "adv/ratio_step_to_reasoning": 1.678760058786504, "adv/std_final_conf": 0.9287781715393066, "adv/std_reasoning": 0.739201545715332, "adv/std_step_conf": 0.934630274772644, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5115869358991734, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.2650996015936255, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4063745019920319, "calib/gap": 0.002316031982653066, "calib/mean_conf": 0.890597609561753, "calib/mu_c": 0.8914649681528661, "calib/mu_w": 0.8891489361702131, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2650996015936255, "calib/std_conf": 0.044979426840582565, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7772214606741573, "calib/step_q_c_n": 801.0, "calib/step_q_gap": -0.009859518610211837, "calib/step_q_w": 0.7870809792843692, "calib/step_q_w_n": 531.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 511.8984375, "completions/mean_terminated_length": 515.9291381835938, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.010666666666666666, "grad_norm": 0.056707367300987244, "learning_rate": 2.5e-06, "loss": 0.0109, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032274842262268066, "mask/share_reasoning": 0.8470996618270874, "mask/share_step_conf": 0.11281301081180573, "num_tokens": 2350591.0, "reward": 1.042642593383789, "reward_std": 0.20550422370433807, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6809687614440918, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7237110137939453, "step": 10 }, { "adv/mean_abs_final_conf": 0.7741892337799072, "adv/mean_abs_reasoning": 0.49845805764198303, "adv/mean_abs_step_conf": 0.7594119310379028, "adv/ratio_final_to_reasoning": 1.553168259416458, "adv/ratio_step_to_reasoning": 1.5235222289923331, "adv/std_final_conf": 0.9311660528182983, "adv/std_reasoning": 0.7575029730796814, "adv/std_step_conf": 0.9356948733329773, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.40005658953722334, "calib/avg_num_step_conf": 5.23828125, "calib/ece": 0.3330314960629921, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.37401574803149606, "calib/gap": -0.020489185110664, "calib/mean_conf": 0.8845275590551179, "calib/mu_c": 0.8754929577464788, "calib/mu_w": 0.8959821428571428, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.32925196850393695, "calib/std_conf": 0.054472896789072724, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7695857142857143, "calib/step_q_c_n": 700.0, "calib/step_q_gap": -0.020308201470915943, "calib/step_q_w": 0.7898939157566303, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 531.6328125, "completions/mean_terminated_length": 531.6328125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.011733333333333333, "grad_norm": 0.03830113634467125, "learning_rate": 2.7500000000000004e-06, "loss": 0.0432, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031960394233465195, "mask/share_reasoning": 0.8549282550811768, "mask/share_step_conf": 0.11311139166355133, "num_tokens": 2591169.0, "reward": 0.9934152364730835, "reward_std": 0.22627121210098267, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6214656233787537, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7055557370185852, "step": 11 }, { "adv/mean_abs_final_conf": 0.7641464471817017, "adv/mean_abs_reasoning": 0.4781668484210968, "adv/mean_abs_step_conf": 0.7605932950973511, "adv/ratio_final_to_reasoning": 1.5980749182108867, "adv/ratio_step_to_reasoning": 1.5906441394020188, "adv/std_final_conf": 0.9308655858039856, "adv/std_reasoning": 0.7394251823425293, "adv/std_step_conf": 0.9351321458816528, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.47628443782576324, "calib/avg_num_step_conf": 5.640625, "calib/ece": 0.20963855421686747, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3534136546184739, "calib/gap": -0.004539836187639645, "calib/mean_conf": 0.8853815261044177, "calib/mu_c": 0.8839411764705881, "calib/mu_w": 0.8884810126582278, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.206144578313253, "calib/std_conf": 0.05083198385278664, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7810807860262009, "calib/step_q_c_n": 916.0, "calib/step_q_gap": 0.029660331480746316, "calib/step_q_w": 0.7514204545454546, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 486.28515625, "completions/mean_terminated_length": 490.1141662597656, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.0128, "grad_norm": 0.057817842811346054, "learning_rate": 3e-06, "loss": 0.0705, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.035377874970436096, "mask/share_reasoning": 0.8293168544769287, "mask/share_step_conf": 0.127492755651474, "num_tokens": 2819834.0, "reward": 1.0887587070465088, "reward_std": 0.23510727286338806, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.709521472454071, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7619972229003906, "step": 12 }, { "adv/mean_abs_final_conf": 0.7575913667678833, "adv/mean_abs_reasoning": 0.5171928405761719, "adv/mean_abs_step_conf": 0.7762430906295776, "adv/ratio_final_to_reasoning": 1.4648141028477861, "adv/ratio_step_to_reasoning": 1.5008774865576526, "adv/std_final_conf": 0.9276416897773743, "adv/std_reasoning": 0.7753491401672363, "adv/std_step_conf": 0.9352768659591675, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5217447916666667, "calib/avg_num_step_conf": 5.04296875, "calib/ece": 0.2686328125000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.453125, "calib/gap": 0.009312500000000057, "calib/mean_conf": 0.8936328124999999, "calib/mu_c": 0.897125, "calib/mu_w": 0.8878124999999999, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2686328125000001, "calib/std_conf": 0.05293427101925409, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7646658259773014, "calib/step_q_c_n": 793.0, "calib/step_q_gap": 0.006272251680112695, "calib/step_q_w": 0.7583935742971887, "calib/step_q_w_n": 498.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 484.59765625, "completions/mean_terminated_length": 486.4980773925781, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.013866666666666666, "grad_norm": 0.03953487053513527, "learning_rate": 3.2500000000000002e-06, "loss": 0.0582, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03431730717420578, "mask/share_reasoning": 0.8443996906280518, "mask/share_step_conf": 0.11737672984600067, "num_tokens": 3048483.0, "reward": 1.0782309770584106, "reward_std": 0.23127850890159607, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6862156391143799, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7655808329582214, "step": 13 }, { "adv/mean_abs_final_conf": 0.7818053960800171, "adv/mean_abs_reasoning": 0.595024585723877, "adv/mean_abs_step_conf": 0.7662662267684937, "adv/ratio_final_to_reasoning": 1.313904357630722, "adv/ratio_step_to_reasoning": 1.2877891857801014, "adv/std_final_conf": 0.9330616593360901, "adv/std_reasoning": 0.826580286026001, "adv/std_step_conf": 0.9357913732528687, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.435423880979864, "calib/avg_num_step_conf": 5.234375, "calib/ece": 0.3532669322709163, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5139442231075697, "calib/gap": -0.0076189560087213115, "calib/mean_conf": 0.9030677290836654, "calib/mu_c": 0.8996376811594203, "calib/mu_w": 0.9072566371681416, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3532669322709163, "calib/std_conf": 0.043963850081247945, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7427310924369749, "calib/step_q_c_n": 714.0, "calib/step_q_gap": 0.004056971031224155, "calib/step_q_w": 0.7386741214057507, "calib/step_q_w_n": 626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2627.0, "completions/max_terminated_length": 2627.0, "completions/mean_length": 542.8515625, "completions/mean_terminated_length": 547.1259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.014933333333333333, "grad_norm": 0.046196747571229935, "learning_rate": 3.5e-06, "loss": -0.0362, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03230297192931175, "mask/share_reasoning": 0.8441320657730103, "mask/share_step_conf": 0.11575242131948471, "num_tokens": 3292853.0, "reward": 1.0177867412567139, "reward_std": 0.2599422335624695, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6077121496200562, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.75034499168396, "step": 14 }, { "adv/mean_abs_final_conf": 0.7688440084457397, "adv/mean_abs_reasoning": 0.39704400300979614, "adv/mean_abs_step_conf": 0.7553344964981079, "adv/ratio_final_to_reasoning": 1.9364201514630868, "adv/ratio_step_to_reasoning": 1.9023949254296426, "adv/std_final_conf": 0.9207010269165039, "adv/std_reasoning": 0.6612381339073181, "adv/std_step_conf": 0.9349432587623596, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4999685059208869, "calib/avg_num_step_conf": 4.98046875, "calib/ece": 0.3353725490196079, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6352941176470588, "calib/gap": 0.0038397581254724367, "calib/mean_conf": 0.911843137254902, "calib/mu_c": 0.9134693877551019, "calib/mu_w": 0.9096296296296295, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3353725490196079, "calib/std_conf": 0.04342375447727579, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.711628895184136, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 8.232224916226993e-05, "calib/step_q_w": 0.7115465729349737, "calib/step_q_w_n": 569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 472.23828125, "completions/mean_terminated_length": 472.23828125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.016, "grad_norm": 0.035142287611961365, "learning_rate": 3.7500000000000005e-06, "loss": 0.0654, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034696608781814575, "mask/share_reasoning": 0.8488560914993286, "mask/share_step_conf": 0.11644729971885681, "num_tokens": 3521626.0, "reward": 1.0546379089355469, "reward_std": 0.1892717480659485, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.640849232673645, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7695759534835815, "step": 15 }, { "adv/mean_abs_final_conf": 0.7650634050369263, "adv/mean_abs_reasoning": 0.4421984553337097, "adv/mean_abs_step_conf": 0.764449954032898, "adv/ratio_final_to_reasoning": 1.7301358605144, "adv/ratio_step_to_reasoning": 1.7287485851934914, "adv/std_final_conf": 0.925650417804718, "adv/std_reasoning": 0.7014507055282593, "adv/std_step_conf": 0.9351567625999451, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5157855088344062, "calib/avg_num_step_conf": 6.7734375, "calib/ece": 0.32368421052631585, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7935222672064778, "calib/gap": -2.6023832351618204e-05, "calib/mean_conf": 0.9269230769230768, "calib/mu_c": 0.9269127516778525, "calib/mu_w": 0.9269387755102041, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.32368421052631585, "calib/std_conf": 0.03425680218286145, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.669151219512195, "calib/step_q_c_n": 1025.0, "calib/step_q_gap": 0.024694237847879053, "calib/step_q_w": 0.644456981664316, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 662.47265625, "completions/mean_terminated_length": 667.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 207.0, "epoch": 0.017066666666666667, "grad_norm": 0.050142817199230194, "learning_rate": 4.000000000000001e-06, "loss": 0.0649, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02496221847832203, "mask/share_reasoning": 0.8537300825119019, "mask/share_step_conf": 0.11349518597126007, "num_tokens": 3800067.0, "reward": 1.0430816411972046, "reward_std": 0.21906863152980804, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6314531564712524, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.763556718826294, "step": 16 }, { "adv/mean_abs_final_conf": 0.7514055967330933, "adv/mean_abs_reasoning": 0.4066672623157501, "adv/mean_abs_step_conf": 0.7520856857299805, "adv/ratio_final_to_reasoning": 1.8477159741215576, "adv/ratio_step_to_reasoning": 1.8493883216643976, "adv/std_final_conf": 0.9193625450134277, "adv/std_reasoning": 0.6816253662109375, "adv/std_step_conf": 0.9349757432937622, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5193163685227177, "calib/avg_num_step_conf": 5.671875, "calib/ece": 0.18273809523809512, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.8134920634920635, "calib/gap": 0.0010052910052912312, "calib/mean_conf": 0.9267857142857142, "calib/mu_c": 0.9270370370370371, "calib/mu_w": 0.9260317460317459, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.17976190476190462, "calib/std_conf": 0.038652728839139124, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6473840445269017, "calib/step_q_c_n": 1078.0, "calib/step_q_gap": 0.01521826912583224, "calib/step_q_w": 0.6321657754010694, "calib/step_q_w_n": 374.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 535.9609375, "completions/mean_terminated_length": 538.0628051757812, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.018133333333333335, "grad_norm": 0.074327252805233, "learning_rate": 4.25e-06, "loss": 0.03, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031611405313014984, "mask/share_reasoning": 0.839647650718689, "mask/share_step_conf": 0.12483467906713486, "num_tokens": 4040801.0, "reward": 1.1753484010696411, "reward_std": 0.1905728280544281, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7602598071098328, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8316453695297241, "step": 17 }, { "adv/mean_abs_final_conf": 0.7634913921356201, "adv/mean_abs_reasoning": 0.39834457635879517, "adv/mean_abs_step_conf": 0.7617502212524414, "adv/ratio_final_to_reasoning": 1.9166606939011805, "adv/ratio_step_to_reasoning": 1.9122896769813709, "adv/std_final_conf": 0.9149329662322998, "adv/std_reasoning": 0.6816260814666748, "adv/std_step_conf": 0.9351475834846497, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.46570048309178746, "calib/avg_num_step_conf": 6.01171875, "calib/ece": 0.4038399999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.96, "calib/gap": -0.0011014492753622651, "calib/mean_conf": 0.94384, "calib/mu_c": 0.9433333333333332, "calib/mu_w": 0.9444347826086955, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4038399999999998, "calib/std_conf": 0.02372455268282205, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6368571428571428, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.05236369828026555, "calib/step_q_w": 0.5844934445768772, "calib/step_q_w_n": 839.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2936.0, "completions/max_terminated_length": 2936.0, "completions/mean_length": 552.50390625, "completions/mean_terminated_length": 554.6705932617188, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.0192, "grad_norm": 0.037905480712652206, "learning_rate": 4.5e-06, "loss": 0.0232, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030872829258441925, "mask/share_reasoning": 0.8516048192977905, "mask/share_step_conf": 0.11361609399318695, "num_tokens": 4292962.0, "reward": 1.0092458724975586, "reward_std": 0.20301344990730286, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5696554780006409, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7664117813110352, "step": 18 }, { "adv/mean_abs_final_conf": 0.7377002239227295, "adv/mean_abs_reasoning": 0.39063119888305664, "adv/mean_abs_step_conf": 0.7582512497901917, "adv/ratio_final_to_reasoning": 1.8884826046461665, "adv/ratio_step_to_reasoning": 1.9410923959946924, "adv/std_final_conf": 0.9077786803245544, "adv/std_reasoning": 0.661250114440918, "adv/std_step_conf": 0.9351308941841125, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.45849313373747375, "calib/avg_num_step_conf": 5.27734375, "calib/ece": 0.4124313725490197, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9607843137254902, "calib/gap": 0.005094643078065397, "calib/mean_conf": 0.949686274509804, "calib/mu_c": 0.9520437956204381, "calib/mu_w": 0.9469491525423727, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4124313725490197, "calib/std_conf": 0.06354170783934227, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6217280453257791, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 0.022828820519577464, "calib/step_q_w": 0.5988992248062016, "calib/step_q_w_n": 645.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 511.37890625, "completions/mean_terminated_length": 511.37890625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.020266666666666665, "grad_norm": 0.03694775328040123, "learning_rate": 4.75e-06, "loss": 0.0088, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.031130220741033554, "mask/share_reasoning": 0.8523164987564087, "mask/share_step_conf": 0.11655329167842865, "num_tokens": 4528635.0, "reward": 1.0464892387390137, "reward_std": 0.18434491753578186, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5775191783905029, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8061395287513733, "step": 19 }, { "adv/mean_abs_final_conf": 0.7078642249107361, "adv/mean_abs_reasoning": 0.46765512228012085, "adv/mean_abs_step_conf": 0.7625718116760254, "adv/ratio_final_to_reasoning": 1.5136458282749918, "adv/ratio_step_to_reasoning": 1.630628587917513, "adv/std_final_conf": 0.8795217871665955, "adv/std_reasoning": 0.7392780184745789, "adv/std_step_conf": 0.9349386692047119, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.406062424969988, "calib/avg_num_step_conf": 6.26171875, "calib/ece": 0.35462151394422314, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": -0.003850873682806344, "calib/mean_conf": 0.964183266932271, "calib/mu_c": 0.9626797385620917, "calib/mu_w": 0.966530612244898, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35462151394422314, "calib/std_conf": 0.014100884338506946, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5732155074116305, "calib/step_q_c_n": 877.0, "calib/step_q_gap": -0.00704620057735017, "calib/step_q_w": 0.5802617079889807, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2558.0, "completions/max_terminated_length": 2558.0, "completions/mean_length": 539.5625, "completions/mean_terminated_length": 539.5625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.021333333333333333, "grad_norm": 0.029928909614682198, "learning_rate": 5e-06, "loss": 0.0835, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032758794724941254, "mask/share_reasoning": 0.8348656892776489, "mask/share_step_conf": 0.13237547874450684, "num_tokens": 4771635.0, "reward": 1.0676593780517578, "reward_std": 0.21638810634613037, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.617925763130188, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8016993999481201, "step": 20 }, { "adv/mean_abs_final_conf": 0.7431222200393677, "adv/mean_abs_reasoning": 0.4694896936416626, "adv/mean_abs_step_conf": 0.754639208316803, "adv/ratio_final_to_reasoning": 1.5828296767821164, "adv/ratio_step_to_reasoning": 1.6073605417476542, "adv/std_final_conf": 0.8799741268157959, "adv/std_reasoning": 0.7206478118896484, "adv/std_step_conf": 0.9352614283561707, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5926800472255017, "calib/avg_num_step_conf": 6.60546875, "calib/ece": 0.3619762845849802, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.004256854256854292, "calib/mean_conf": 0.9706719367588933, "calib/mu_c": 0.9723376623376624, "calib/mu_w": 0.9680808080808081, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3619762845849802, "calib/std_conf": 0.012188533902793975, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.550582627118644, "calib/step_q_c_n": 944.0, "calib/step_q_gap": -0.03467841705806285, "calib/step_q_w": 0.5852610441767069, "calib/step_q_w_n": 747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 516.67578125, "completions/mean_terminated_length": 522.8023681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.0224, "grad_norm": 0.03543133661150932, "learning_rate": 4.9722222222222224e-06, "loss": -0.1217, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.031131424009799957, "mask/share_reasoning": 0.8254822492599487, "mask/share_step_conf": 0.1316676139831543, "num_tokens": 5006864.0, "reward": 1.0767557621002197, "reward_std": 0.21480554342269897, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6252531409263611, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8068596124649048, "step": 21 }, { "adv/mean_abs_final_conf": 0.6841797828674316, "adv/mean_abs_reasoning": 0.41725265979766846, "adv/mean_abs_step_conf": 0.7678303122520447, "adv/ratio_final_to_reasoning": 1.6397253961165874, "adv/ratio_step_to_reasoning": 1.8402047158294357, "adv/std_final_conf": 0.8309208154678345, "adv/std_reasoning": 0.6815685033798218, "adv/std_step_conf": 0.9349187016487122, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44874088344571345, "calib/avg_num_step_conf": 6.37109375, "calib/ece": 0.3083921568627451, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.002670290353653182, "calib/mean_conf": 0.9711372549019608, "calib/mu_c": 0.9702366863905327, "calib/mu_w": 0.9729069767441859, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3083921568627451, "calib/std_conf": 0.011231991438748663, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5689056603773586, "calib/step_q_c_n": 1060.0, "calib/step_q_gap": 0.04160268314443394, "calib/step_q_w": 0.5273029772329246, "calib/step_q_w_n": 571.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 513.28125, "completions/mean_terminated_length": 515.2941284179688, "completions/min_length": 0.0, "completions/min_terminated_length": 207.0, "epoch": 0.023466666666666667, "grad_norm": 0.045953840017318726, "learning_rate": 4.944444444444445e-06, "loss": 0.0053, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.030370274558663368, "mask/share_reasoning": 0.8338354229927063, "mask/share_step_conf": 0.13188806176185608, "num_tokens": 5240080.0, "reward": 1.1322379112243652, "reward_std": 0.18259194493293762, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.6774039268493652, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8372147083282471, "step": 22 }, { "adv/mean_abs_final_conf": 0.7485775351524353, "adv/mean_abs_reasoning": 0.4457091689109802, "adv/mean_abs_step_conf": 0.7674142718315125, "adv/ratio_final_to_reasoning": 1.679520161053599, "adv/ratio_step_to_reasoning": 1.7217825554420783, "adv/std_final_conf": 0.8745721578598022, "adv/std_reasoning": 0.701352059841156, "adv/std_step_conf": 0.9352415800094604, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5172980345960692, "calib/avg_num_step_conf": 6.4609375, "calib/ece": 0.4768897637795275, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007086614173227312, "calib/mean_conf": 0.9768897637795275, "calib/mu_c": 0.9772440944881889, "calib/mu_w": 0.9765354330708662, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4768897637795275, "calib/std_conf": 0.011301681532232766, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5868795180722892, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.007656217101415508, "calib/step_q_w": 0.5792233009708737, "calib/step_q_w_n": 824.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 563.00390625, "completions/mean_terminated_length": 563.00390625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.024533333333333334, "grad_norm": 0.04560421034693718, "learning_rate": 4.9166666666666665e-06, "loss": 0.0387, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.030303383246064186, "mask/share_reasoning": 0.8368549942970276, "mask/share_step_conf": 0.13284161686897278, "num_tokens": 5488145.0, "reward": 1.0002158880233765, "reward_std": 0.21627703309059143, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5187183618545532, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7888506054878235, "step": 23 }, { "adv/mean_abs_final_conf": 0.7822250127792358, "adv/mean_abs_reasoning": 0.5776165127754211, "adv/mean_abs_step_conf": 0.7525804042816162, "adv/ratio_final_to_reasoning": 1.3542289658941364, "adv/ratio_step_to_reasoning": 1.3029066649523253, "adv/std_final_conf": 0.8965274691581726, "adv/std_reasoning": 0.7929160594940186, "adv/std_step_conf": 0.935590386390686, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.545442395081529, "calib/avg_num_step_conf": 7.59765625, "calib/ece": 0.4527755102040817, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0017929697941725387, "calib/mean_conf": 0.9793061224489796, "calib/mu_c": 0.9801550387596899, "calib/mu_w": 0.9783620689655174, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.4527755102040817, "calib/std_conf": 0.010687020702478803, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5750436681222707, "calib/step_q_c_n": 916.0, "calib/step_q_gap": 0.00674434839437954, "calib/step_q_w": 0.5682993197278912, "calib/step_q_w_n": 1029.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 648.9921875, "completions/mean_terminated_length": 659.293701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.0256, "grad_norm": 0.05360132455825806, "learning_rate": 4.888888888888889e-06, "loss": -0.0151, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.026258273050189018, "mask/share_reasoning": 0.8305296897888184, "mask/share_step_conf": 0.12758705019950867, "num_tokens": 5758799.0, "reward": 0.9910411238670349, "reward_std": 0.25226694345474243, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5229964852333069, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7779321074485779, "step": 24 }, { "adv/mean_abs_final_conf": 0.7140051126480103, "adv/mean_abs_reasoning": 0.43053969740867615, "adv/mean_abs_step_conf": 0.7535181641578674, "adv/ratio_final_to_reasoning": 1.658395536916689, "adv/ratio_step_to_reasoning": 1.7501711658486494, "adv/std_final_conf": 0.8654311895370483, "adv/std_reasoning": 0.7013791799545288, "adv/std_step_conf": 0.935165286064148, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4696669310071372, "calib/avg_num_step_conf": 6.87890625, "calib/ece": 0.36600790513834003, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0011194818926778538, "calib/mean_conf": 0.982608695652174, "calib/mu_c": 0.982179487179487, "calib/mu_w": 0.9832989690721649, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36600790513834003, "calib/std_conf": 0.00934161941203924, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5901769911504425, "calib/step_q_c_n": 1017.0, "calib/step_q_gap": 0.04207215244076512, "calib/step_q_w": 0.5481048387096774, "calib/step_q_w_n": 744.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 569.87890625, "completions/mean_terminated_length": 572.11376953125, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.02666666666666667, "grad_norm": 0.05113760754466057, "learning_rate": 4.861111111111111e-06, "loss": 0.0074, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02784719131886959, "mask/share_reasoning": 0.837721586227417, "mask/share_step_conf": 0.13052499294281006, "num_tokens": 6007912.0, "reward": 1.0738012790679932, "reward_std": 0.20915639400482178, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6216461062431335, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.804283618927002, "step": 25 }, { "adv/mean_abs_final_conf": 0.6176284551620483, "adv/mean_abs_reasoning": 0.31737247109413147, "adv/mean_abs_step_conf": 0.7418646812438965, "adv/ratio_final_to_reasoning": 1.9460681420565367, "adv/ratio_step_to_reasoning": 2.337520575386837, "adv/std_final_conf": 0.7885181307792664, "adv/std_reasoning": 0.5960696339607239, "adv/std_step_conf": 0.9348067045211792, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4531923101652414, "calib/avg_num_step_conf": 6.32421875, "calib/ece": 0.39157480314960635, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0018864527743842618, "calib/mean_conf": 0.9860629921259844, "calib/mu_c": 0.985298013245033, "calib/mu_w": 0.9871844660194172, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39157480314960635, "calib/std_conf": 0.007116298285162213, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5744040862656073, "calib/step_q_c_n": 881.0, "calib/step_q_gap": 0.03607075293227402, "calib/step_q_w": 0.5383333333333333, "calib/step_q_w_n": 738.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 579.5234375, "completions/mean_terminated_length": 579.5234375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.027733333333333332, "grad_norm": 0.0754050686955452, "learning_rate": 4.833333333333333e-06, "loss": -0.0124, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02674679085612297, "mask/share_reasoning": 0.8562403917312622, "mask/share_step_conf": 0.1170128583908081, "num_tokens": 6261510.0, "reward": 1.070356845855713, "reward_std": 0.15499469637870789, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.5999132394790649, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8162626028060913, "step": 26 }, { "adv/mean_abs_final_conf": 0.5428198575973511, "adv/mean_abs_reasoning": 0.4752519428730011, "adv/mean_abs_step_conf": 0.7459293603897095, "adv/ratio_final_to_reasoning": 1.1421728321948297, "adv/ratio_step_to_reasoning": 1.569545104603686, "adv/std_final_conf": 0.764316976070404, "adv/std_reasoning": 0.7206440567970276, "adv/std_step_conf": 0.935205340385437, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4837928464977645, "calib/avg_num_step_conf": 7.6328125, "calib/ece": 0.46937007874015746, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000398658718331113, "calib/mean_conf": 0.9890551181102362, "calib/mu_c": 0.9888636363636362, "calib/mu_w": 0.9892622950819673, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46937007874015746, "calib/std_conf": 0.0034213481862047958, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5747283702213279, "calib/step_q_c_n": 994.0, "calib/step_q_gap": 0.017290870221327803, "calib/step_q_w": 0.5574375000000001, "calib/step_q_w_n": 960.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 578.6015625, "completions/mean_terminated_length": 578.6015625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.0288, "grad_norm": 0.040866825729608536, "learning_rate": 4.805555555555556e-06, "loss": 0.0375, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02749168500304222, "mask/share_reasoning": 0.8314797878265381, "mask/share_step_conf": 0.14102855324745178, "num_tokens": 6514848.0, "reward": 1.0136632919311523, "reward_std": 0.22315937280654907, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5257288813591003, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8000233173370361, "step": 27 }, { "adv/mean_abs_final_conf": 0.45969158411026, "adv/mean_abs_reasoning": 0.4384334683418274, "adv/mean_abs_step_conf": 0.7739053964614868, "adv/ratio_final_to_reasoning": 1.0484865260146121, "adv/ratio_step_to_reasoning": 1.765160400250527, "adv/std_final_conf": 0.7234077453613281, "adv/std_reasoning": 0.7206025719642639, "adv/std_step_conf": 0.9350380897521973, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4940119760479042, "calib/avg_num_step_conf": 7.3671875, "calib/ece": 0.31653225806451624, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001197604790422746, "calib/mean_conf": 0.9899193548387097, "calib/mu_c": 0.9898802395209577, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.31653225806451624, "calib/std_conf": 0.001267438197217881, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5770805921052632, "calib/step_q_c_n": 1216.0, "calib/step_q_gap": 0.007408950314218399, "calib/step_q_w": 0.5696716417910448, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 655.50390625, "completions/mean_terminated_length": 655.50390625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.029866666666666666, "grad_norm": 0.03843703120946884, "learning_rate": 4.777777777777778e-06, "loss": 0.0027, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02509191818535328, "mask/share_reasoning": 0.8488214612007141, "mask/share_step_conf": 0.12608662247657776, "num_tokens": 6789601.0, "reward": 1.0947589874267578, "reward_std": 0.22782441973686218, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6585718393325806, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8039641976356506, "step": 28 }, { "adv/mean_abs_final_conf": 0.4598812460899353, "adv/mean_abs_reasoning": 0.4270208179950714, "adv/mean_abs_step_conf": 0.737694501876831, "adv/ratio_final_to_reasoning": 1.0769527543156998, "adv/ratio_step_to_reasoning": 1.7275375597387044, "adv/std_final_conf": 0.7559602856636047, "adv/std_reasoning": 0.7392783761024475, "adv/std_step_conf": 0.9354400038719177, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4927007299270073, "calib/avg_num_step_conf": 8.2421875, "calib/ece": 0.4284426229508197, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00014598540145960293, "calib/mean_conf": 0.9899180327868853, "calib/mu_c": 0.9898540145985403, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4284426229508197, "calib/std_conf": 0.0009016393442622958, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5657422222222223, "calib/step_q_c_n": 1125.0, "calib/step_q_gap": 0.0332549125775522, "calib/step_q_w": 0.5324873096446701, "calib/step_q_w_n": 985.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2629.0, "completions/max_terminated_length": 2629.0, "completions/mean_length": 664.14453125, "completions/mean_terminated_length": 677.37451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 212.0, "epoch": 0.030933333333333334, "grad_norm": 0.04360277205705643, "learning_rate": 4.75e-06, "loss": -0.1501, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.022655852138996124, "mask/share_reasoning": 0.8310916423797607, "mask/share_step_conf": 0.12672126293182373, "num_tokens": 7066750.0, "reward": 1.0225168466567993, "reward_std": 0.2292087972164154, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5434179306030273, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8015979528427124, "step": 29 }, { "adv/mean_abs_final_conf": 0.582615077495575, "adv/mean_abs_reasoning": 0.5706990957260132, "adv/mean_abs_step_conf": 0.7505882382392883, "adv/ratio_final_to_reasoning": 1.0208796226571955, "adv/ratio_step_to_reasoning": 1.3152083889048916, "adv/std_final_conf": 0.8106728792190552, "adv/std_reasoning": 0.8098874092102051, "adv/std_step_conf": 0.9352176785469055, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49640287769784175, "calib/avg_num_step_conf": 7.48828125, "calib/ece": 0.42261224489795923, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -7.19424460432494e-05, "calib/mean_conf": 0.9899591836734695, "calib/mu_c": 0.9899280575539567, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42261224489795923, "calib/std_conf": 0.0006375714021148296, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5885585585585585, "calib/step_q_c_n": 999.0, "calib/step_q_gap": -0.006724665842312905, "calib/step_q_w": 0.5952832244008714, "calib/step_q_w_n": 918.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 713.55078125, "completions/mean_terminated_length": 713.55078125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.032, "grad_norm": 0.04562646895647049, "learning_rate": 4.722222222222222e-06, "loss": 0.0619, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02327953279018402, "mask/share_reasoning": 0.8579779863357544, "mask/share_step_conf": 0.11874253302812576, "num_tokens": 7356403.0, "reward": 1.0039558410644531, "reward_std": 0.28641945123672485, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5511531233787537, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7690889835357666, "step": 30 }, { "adv/mean_abs_final_conf": 0.40484005212783813, "adv/mean_abs_reasoning": 0.39064812660217285, "adv/mean_abs_step_conf": 0.7507187128067017, "adv/ratio_final_to_reasoning": 1.036329178509329, "adv/ratio_step_to_reasoning": 1.921726130716138, "adv/std_final_conf": 0.6624351739883423, "adv/std_reasoning": 0.6614300608634949, "adv/std_step_conf": 0.9350001215934753, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.90625, "calib/ece": 0.526, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.526, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5834707158351409, "calib/step_q_c_n": 922.0, "calib/step_q_gap": -0.02866477753746588, "calib/step_q_w": 0.6121354933726068, "calib/step_q_w_n": 1358.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 677.59765625, "completions/mean_terminated_length": 685.6324462890625, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, "epoch": 0.03306666666666667, "grad_norm": 0.0428326353430748, "learning_rate": 4.694444444444445e-06, "loss": -0.0011, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.024062704294919968, "mask/share_reasoning": 0.8310606479644775, "mask/share_step_conf": 0.1331578940153122, "num_tokens": 7635780.0, "reward": 0.9524698853492737, "reward_std": 0.22021548449993134, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.4634960889816284, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7703374028205872, "step": 31 }, { "adv/mean_abs_final_conf": 0.4177335500717163, "adv/mean_abs_reasoning": 0.3731395900249481, "adv/mean_abs_step_conf": 0.7657584547996521, "adv/ratio_final_to_reasoning": 1.1195101276811357, "adv/ratio_step_to_reasoning": 2.0522037201907564, "adv/std_final_conf": 0.6802933216094971, "adv/std_reasoning": 0.640331506729126, "adv/std_step_conf": 0.934921383857727, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5158730158730158, "calib/avg_num_step_conf": 8.03515625, "calib/ece": 0.48984126984126997, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003174603174602719, "calib/mean_conf": 0.98984126984127, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9896825396825396, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.48984126984126997, "calib/std_conf": 0.0012498425196844154, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6162170706006322, "calib/step_q_c_n": 949.0, "calib/step_q_gap": 0.061623207784747724, "calib/step_q_w": 0.5545938628158845, "calib/step_q_w_n": 1108.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2830.0, "completions/max_terminated_length": 2830.0, "completions/mean_length": 643.13671875, "completions/mean_terminated_length": 648.2008056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 222.0, "epoch": 0.034133333333333335, "grad_norm": 0.05958246812224388, "learning_rate": 4.666666666666667e-06, "loss": -0.0464, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.024628007784485817, "mask/share_reasoning": 0.8358334898948669, "mask/share_step_conf": 0.1317259967327118, "num_tokens": 7907127.0, "reward": 0.9951291680335999, "reward_std": 0.19687864184379578, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5022405982017517, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7951366901397705, "step": 32 }, { "adv/mean_abs_final_conf": 0.534870982170105, "adv/mean_abs_reasoning": 0.5064483880996704, "adv/mean_abs_step_conf": 0.7465532422065735, "adv/ratio_final_to_reasoning": 1.0561214029668131, "adv/ratio_step_to_reasoning": 1.4740954058672011, "adv/std_final_conf": 0.773914098739624, "adv/std_reasoning": 0.7576087713241577, "adv/std_step_conf": 0.9350910782814026, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5148124602670058, "calib/avg_num_step_conf": 8.46484375, "calib/ece": 0.5076095617529881, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00029624920533999344, "calib/mean_conf": 0.9896812749003985, "calib/mu_c": 0.9898347107438017, "calib/mu_w": 0.9895384615384617, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5076095617529881, "calib/std_conf": 0.001756606190043715, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5811604439959637, "calib/step_q_c_n": 991.0, "calib/step_q_gap": -0.0017307124666213758, "calib/step_q_w": 0.582891156462585, "calib/step_q_w_n": 1176.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2897.0, "completions/max_terminated_length": 2897.0, "completions/mean_length": 646.796875, "completions/mean_terminated_length": 646.796875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.0352, "grad_norm": 0.050102751702070236, "learning_rate": 4.638888888888889e-06, "loss": 0.0859, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.024720996618270874, "mask/share_reasoning": 0.8411494493484497, "mask/share_step_conf": 0.13412953913211823, "num_tokens": 8179579.0, "reward": 0.9854416847229004, "reward_std": 0.23391908407211304, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.48317378759384155, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7980563044548035, "step": 33 }, { "adv/mean_abs_final_conf": 0.6293555498123169, "adv/mean_abs_reasoning": 0.5206021666526794, "adv/mean_abs_step_conf": 0.7541499137878418, "adv/ratio_final_to_reasoning": 1.2088992134990335, "adv/ratio_step_to_reasoning": 1.4486107859227833, "adv/std_final_conf": 0.8442690372467041, "adv/std_reasoning": 0.7753821015357971, "adv/std_step_conf": 0.9347956776618958, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5234375, "calib/avg_num_step_conf": 8.515625, "calib/ece": 0.5046370967741937, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004687499999997957, "calib/mean_conf": 0.9885080645161292, "calib/mu_c": 0.98875, "calib/mu_w": 0.9882812500000002, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5046370967741937, "calib/std_conf": 0.0035627915109751234, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5425289778714436, "calib/step_q_c_n": 949.0, "calib/step_q_gap": -0.008348357628150227, "calib/step_q_w": 0.5508773354995938, "calib/step_q_w_n": 1231.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3043.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 579.2734375, "completions/mean_terminated_length": 590.812744140625, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.03626666666666667, "grad_norm": 0.07354850322008133, "learning_rate": 4.611111111111112e-06, "loss": -0.0333, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02619742415845394, "mask/share_reasoning": 0.8041331171989441, "mask/share_step_conf": 0.15013819932937622, "num_tokens": 8432985.0, "reward": 0.9838611483573914, "reward_std": 0.23865635693073273, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.480173796415329, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8005531430244446, "step": 34 }, { "adv/mean_abs_final_conf": 0.7236064672470093, "adv/mean_abs_reasoning": 0.5600302219390869, "adv/mean_abs_step_conf": 0.787669837474823, "adv/ratio_final_to_reasoning": 1.2920846748976238, "adv/ratio_step_to_reasoning": 1.406477376787883, "adv/std_final_conf": 0.869294285774231, "adv/std_reasoning": 0.7927871942520142, "adv/std_step_conf": 0.9348177909851074, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5486153846153846, "calib/avg_num_step_conf": 8.19140625, "calib/ece": 0.49258823529411794, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0009723076923078278, "calib/mean_conf": 0.9827843137254905, "calib/mu_c": 0.9832800000000003, "calib/mu_w": 0.9823076923076924, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.49258823529411794, "calib/std_conf": 0.004482268882268098, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.51757852077001, "calib/step_q_c_n": 987.0, "calib/step_q_gap": 0.0436956378871271, "calib/step_q_w": 0.4738828828828829, "calib/step_q_w_n": 1110.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2573.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 656.984375, "completions/mean_terminated_length": 656.984375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.037333333333333336, "grad_norm": 0.06646399945020676, "learning_rate": 4.583333333333333e-06, "loss": -0.025, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.024299103766679764, "mask/share_reasoning": 0.8431927561759949, "mask/share_step_conf": 0.13250812888145447, "num_tokens": 8710429.0, "reward": 1.0185070037841797, "reward_std": 0.22961218655109406, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.505856990814209, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.823375403881073, "step": 35 }, { "adv/mean_abs_final_conf": 0.3987484574317932, "adv/mean_abs_reasoning": 0.3482089340686798, "adv/mean_abs_step_conf": 0.7686375379562378, "adv/ratio_final_to_reasoning": 1.1451413746700283, "adv/ratio_step_to_reasoning": 2.2074032649737636, "adv/std_final_conf": 0.6580438017845154, "adv/std_reasoning": 0.6402967572212219, "adv/std_step_conf": 0.9345235824584961, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4962904498816101, "calib/avg_num_step_conf": 9.3515625, "calib/ece": 0.2594023904382473, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -7.419100236727427e-05, "calib/mean_conf": 0.9805179282868529, "calib/mu_c": 0.9804972375690613, "calib/mu_w": 0.9805714285714285, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2594023904382473, "calib/std_conf": 0.002216085097238788, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48154626108998727, "calib/step_q_c_n": 1578.0, "calib/step_q_gap": 0.03804135912920298, "calib/step_q_w": 0.4435049019607843, "calib/step_q_w_n": 816.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 597.12109375, "completions/mean_terminated_length": 604.2015991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.0384, "grad_norm": 0.04976440966129303, "learning_rate": 4.555555555555556e-06, "loss": -0.0363, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.027500033378601074, "mask/share_reasoning": 0.800820529460907, "mask/share_step_conf": 0.15996065735816956, "num_tokens": 8966004.0, "reward": 1.1506966352462769, "reward_std": 0.17550379037857056, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.717279314994812, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8310760259628296, "step": 36 }, { "adv/mean_abs_final_conf": 0.403596431016922, "adv/mean_abs_reasoning": 0.37260866165161133, "adv/mean_abs_step_conf": 0.7900395393371582, "adv/ratio_final_to_reasoning": 1.0831643827815367, "adv/ratio_step_to_reasoning": 2.120293006167002, "adv/std_final_conf": 0.6856962442398071, "adv/std_reasoning": 0.6612308621406555, "adv/std_step_conf": 0.9350776672363281, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5011961722488039, "calib/avg_num_step_conf": 9.46875, "calib/ece": 0.5167479674796751, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.392344497625931e-05, "calib/mean_conf": 0.9801626016260166, "calib/mu_c": 0.9801754385964916, "calib/mu_w": 0.9801515151515153, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5167479674796751, "calib/std_conf": 0.0012647438362686227, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47984143763213527, "calib/step_q_c_n": 946.0, "calib/step_q_gap": 0.08639759460101215, "calib/step_q_w": 0.3934438430311231, "calib/step_q_w_n": 1478.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2873.0, "completions/max_terminated_length": 2873.0, "completions/mean_length": 656.3671875, "completions/mean_terminated_length": 672.1200561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 293.0, "epoch": 0.039466666666666664, "grad_norm": 0.057230979204177856, "learning_rate": 4.527777777777778e-06, "loss": -0.1028, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02309846132993698, "mask/share_reasoning": 0.8134510517120361, "mask/share_step_conf": 0.14001299440860748, "num_tokens": 9241130.0, "reward": 0.9769242405891418, "reward_std": 0.17409387230873108, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.4654015302658081, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.804797887802124, "step": 37 }, { "adv/mean_abs_final_conf": 0.36862608790397644, "adv/mean_abs_reasoning": 0.3565206229686737, "adv/mean_abs_step_conf": 0.7455395460128784, "adv/ratio_final_to_reasoning": 1.033954459168457, "adv/ratio_step_to_reasoning": 2.091154053880318, "adv/std_final_conf": 0.6618428230285645, "adv/std_reasoning": 0.6613090634346008, "adv/std_step_conf": 0.935276448726654, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.8359375, "calib/ece": 0.4137349397590363, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.9800000000000002, "calib/mu_c": 0.9800000000000004, "calib/mu_w": 0.9800000000000004, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4137349397590363, "calib/std_conf": 2.220446049250313e-16, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4717914213624895, "calib/step_q_c_n": 1189.0, "calib/step_q_gap": 0.07567772145568619, "calib/step_q_w": 0.39611369990680334, "calib/step_q_w_n": 1073.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2482.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 628.61328125, "completions/mean_terminated_length": 636.0671997070312, "completions/min_length": 0.0, "completions/min_terminated_length": 252.0, "epoch": 0.04053333333333333, "grad_norm": 0.052082207053899765, "learning_rate": 4.5e-06, "loss": -0.0047, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.025044191628694534, "mask/share_reasoning": 0.8164798021316528, "mask/share_step_conf": 0.14675727486610413, "num_tokens": 9508943.0, "reward": 1.0493329763412476, "reward_std": 0.18128329515457153, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5671124458312988, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8184314370155334, "step": 38 }, { "adv/mean_abs_final_conf": 0.43636903166770935, "adv/mean_abs_reasoning": 0.4297660291194916, "adv/mean_abs_step_conf": 0.7607371807098389, "adv/ratio_final_to_reasoning": 1.0153641798113873, "adv/ratio_step_to_reasoning": 1.7701193886088296, "adv/std_final_conf": 0.7025984525680542, "adv/std_reasoning": 0.7013990879058838, "adv/std_step_conf": 0.9351004958152771, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.27734375, "calib/ece": 0.4345454545454548, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9800000000000002, "calib/mu_c": 0.9800000000000003, "calib/mu_w": 0.9800000000000004, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4345454545454548, "calib/std_conf": 2.220446049250313e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46675302245250433, "calib/step_q_c_n": 1158.0, "calib/step_q_gap": 0.02381957956014613, "calib/step_q_w": 0.4429334428923582, "calib/step_q_w_n": 1217.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2644.0, "completions/max_terminated_length": 2644.0, "completions/mean_length": 665.265625, "completions/mean_terminated_length": 665.265625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.0416, "grad_norm": 0.051825810223817825, "learning_rate": 4.472222222222223e-06, "loss": 0.0314, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02482118085026741, "mask/share_reasoning": 0.8257359266281128, "mask/share_step_conf": 0.1494428813457489, "num_tokens": 9785339.0, "reward": 1.0435525178909302, "reward_std": 0.1997736394405365, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5525765419006348, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.820414662361145, "step": 39 }, { "adv/mean_abs_final_conf": 0.35352271795272827, "adv/mean_abs_reasoning": 0.3396007716655731, "adv/mean_abs_step_conf": 0.7508925199508667, "adv/ratio_final_to_reasoning": 1.0409950372576449, "adv/ratio_step_to_reasoning": 2.2111036917499094, "adv/std_final_conf": 0.6413707733154297, "adv/std_reasoning": 0.6403252482414246, "adv/std_step_conf": 0.9351861476898193, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.421875, "calib/ece": 0.5157142857142858, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.9800000000000001, "calib/mu_c": 0.9800000000000004, "calib/mu_w": 0.9800000000000003, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5157142857142858, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49702857142857143, "calib/step_q_c_n": 1050.0, "calib/step_q_gap": 0.05893752884413672, "calib/step_q_w": 0.4380910425844347, "calib/step_q_w_n": 1362.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2671.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 686.40234375, "completions/mean_terminated_length": 691.8070678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 215.0, "epoch": 0.042666666666666665, "grad_norm": 0.05516641214489937, "learning_rate": 4.444444444444444e-06, "loss": -0.0246, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.023958513513207436, "mask/share_reasoning": 0.8191065788269043, "mask/share_step_conf": 0.1491224467754364, "num_tokens": 10067818.0, "reward": 1.0020737648010254, "reward_std": 0.1748155653476715, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.47773122787475586, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8254233002662659, "step": 40 }, { "adv/mean_abs_final_conf": 0.43498802185058594, "adv/mean_abs_reasoning": 0.4303048849105835, "adv/mean_abs_step_conf": 0.7763193845748901, "adv/ratio_final_to_reasoning": 1.0108832995028063, "adv/ratio_step_to_reasoning": 1.804114737707911, "adv/std_final_conf": 0.6828204989433289, "adv/std_reasoning": 0.6816701889038086, "adv/std_step_conf": 0.9349315762519836, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.3828125, "calib/ece": 0.250588235294118, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.9800000000000003, "calib/mu_c": 0.9800000000000003, "calib/mu_w": 0.9800000000000003, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.250588235294118, "calib/std_conf": 3.3306690738754696e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47175101803374053, "calib/step_q_c_n": 1719.0, "calib/step_q_gap": 0.008690988751163697, "calib/step_q_w": 0.46306002928257683, "calib/step_q_w_n": 683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2400.0, "completions/max_terminated_length": 2400.0, "completions/mean_length": 631.42578125, "completions/mean_terminated_length": 631.42578125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.04373333333333333, "grad_norm": 0.05414958298206329, "learning_rate": 4.416666666666667e-06, "loss": -0.0339, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.02605331689119339, "mask/share_reasoning": 0.8131687641143799, "mask/share_step_conf": 0.16077792644500732, "num_tokens": 10336711.0, "reward": 1.1583480834960938, "reward_std": 0.18238838016986847, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7369453310966492, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8234797120094299, "step": 41 }, { "adv/mean_abs_final_conf": 0.3892165422439575, "adv/mean_abs_reasoning": 0.38388973474502563, "adv/mean_abs_step_conf": 0.7426350116729736, "adv/ratio_final_to_reasoning": 1.0138758789746485, "adv/ratio_step_to_reasoning": 1.9345008330744289, "adv/std_final_conf": 0.6827824115753174, "adv/std_reasoning": 0.6815664768218994, "adv/std_step_conf": 0.9347171783447266, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5045454545454545, "calib/avg_num_step_conf": 9.15234375, "calib/ece": 0.41133333333333355, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 9.090909090880839e-05, "calib/mean_conf": 0.9799607843137257, "calib/mu_c": 0.9800000000000002, "calib/mu_w": 0.9799090909090914, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.41133333333333355, "calib/std_conf": 0.0006249951941376173, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.49545524691358017, "calib/step_q_c_n": 1296.0, "calib/step_q_gap": 0.029810547773178986, "calib/step_q_w": 0.4656446991404012, "calib/step_q_w_n": 1047.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2875.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 573.15625, "completions/mean_terminated_length": 573.15625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.0448, "grad_norm": 0.05311352014541626, "learning_rate": 4.388888888888889e-06, "loss": 0.0432, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.027020232751965523, "mask/share_reasoning": 0.807121753692627, "mask/share_step_conf": 0.16585806012153625, "num_tokens": 10587807.0, "reward": 1.0639019012451172, "reward_std": 0.17825046181678772, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5789812803268433, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.825777530670166, "step": 42 }, { "adv/mean_abs_final_conf": 0.4678432047367096, "adv/mean_abs_reasoning": 0.46236586570739746, "adv/mean_abs_step_conf": 0.7421578764915466, "adv/ratio_final_to_reasoning": 1.0118463308724837, "adv/ratio_step_to_reasoning": 1.6051311991988875, "adv/std_final_conf": 0.7400902509689331, "adv/std_reasoning": 0.7392212748527527, "adv/std_step_conf": 0.9346301555633545, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.496551724137931, "calib/avg_num_step_conf": 9.6328125, "calib/ece": 0.41133333333333355, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -6.896551724167033e-05, "calib/mean_conf": 0.9799607843137257, "calib/mu_c": 0.9799310344827589, "calib/mu_w": 0.9800000000000005, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41133333333333355, "calib/std_conf": 0.0006249951941376173, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4877061469265367, "calib/step_q_c_n": 1334.0, "calib/step_q_gap": 0.015895192862932395, "calib/step_q_w": 0.4718109540636043, "calib/step_q_w_n": 1132.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2056.0, "completions/max_terminated_length": 2056.0, "completions/mean_length": 668.37109375, "completions/mean_terminated_length": 670.9921875, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.04586666666666667, "grad_norm": 0.053375594317913055, "learning_rate": 4.361111111111112e-06, "loss": 0.0592, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02442849799990654, "mask/share_reasoning": 0.8149913549423218, "mask/share_step_conf": 0.15667389333248138, "num_tokens": 10864134.0, "reward": 1.0666247606277466, "reward_std": 0.195322185754776, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5828839540481567, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8262854814529419, "step": 43 }, { "adv/mean_abs_final_conf": 0.44674229621887207, "adv/mean_abs_reasoning": 0.39439481496810913, "adv/mean_abs_step_conf": 0.7650898694992065, "adv/ratio_final_to_reasoning": 1.132728624373512, "adv/ratio_step_to_reasoning": 1.939908539520409, "adv/std_final_conf": 0.7148959636688232, "adv/std_reasoning": 0.6815752387046814, "adv/std_step_conf": 0.934475302696228, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5001860003720008, "calib/avg_num_step_conf": 10.48828125, "calib/ece": 0.4796062992125987, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001574803149606563, "calib/mean_conf": 0.9796062992125987, "calib/mu_c": 0.9796850393700792, "calib/mu_w": 0.9795275590551186, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4796062992125987, "calib/std_conf": 0.0026327958263465357, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45573998364677026, "calib/step_q_c_n": 1223.0, "calib/step_q_gap": 0.035582664905320194, "calib/step_q_w": 0.42015731874145007, "calib/step_q_w_n": 1462.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 695.90625, "completions/mean_terminated_length": 698.6353149414062, "completions/min_length": 0.0, "completions/min_terminated_length": 234.0, "epoch": 0.046933333333333334, "grad_norm": 0.07896358519792557, "learning_rate": 4.333333333333334e-06, "loss": -0.0264, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.022946707904338837, "mask/share_reasoning": 0.8176969289779663, "mask/share_step_conf": 0.15545010566711426, "num_tokens": 11148606.0, "reward": 1.032286524772644, "reward_std": 0.16667400300502777, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5156804323196411, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8346784710884094, "step": 44 }, { "adv/mean_abs_final_conf": 0.5271316766738892, "adv/mean_abs_reasoning": 0.5128173828125, "adv/mean_abs_step_conf": 0.768696665763855, "adv/ratio_final_to_reasoning": 1.0279130433974053, "adv/ratio_step_to_reasoning": 1.4989676472119733, "adv/std_final_conf": 0.7760584354400635, "adv/std_reasoning": 0.7754290699958801, "adv/std_step_conf": 0.9351422190666199, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.515625, "calib/ece": 0.443114754098361, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9800000000000003, "calib/mu_c": 0.9800000000000002, "calib/mu_w": 0.9800000000000003, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.443114754098361, "calib/std_conf": 3.3306690738754696e-16, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.458319198149576, "calib/step_q_c_n": 1297.0, "calib/step_q_gap": 0.009781563740973831, "calib/step_q_w": 0.44853763440860217, "calib/step_q_w_n": 1395.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 712.76953125, "completions/mean_terminated_length": 715.5647583007812, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.048, "grad_norm": 0.044064924120903015, "learning_rate": 4.305555555555556e-06, "loss": 0.0043, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.023293491452932358, "mask/share_reasoning": 0.8162682056427002, "mask/share_step_conf": 0.15653207898139954, "num_tokens": 11436123.0, "reward": 1.0050601959228516, "reward_std": 0.22470340132713318, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5289937257766724, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7921053171157837, "step": 45 }, { "adv/mean_abs_final_conf": 0.4715219736099243, "adv/mean_abs_reasoning": 0.4628002345561981, "adv/mean_abs_step_conf": 0.7624514102935791, "adv/ratio_final_to_reasoning": 1.0188455804524168, "adv/ratio_step_to_reasoning": 1.647474122446656, "adv/std_final_conf": 0.740247905254364, "adv/std_reasoning": 0.7393082976341248, "adv/std_step_conf": 0.9344412088394165, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 11.03125, "calib/ece": 0.41495934959349634, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.220446049250313e-16, "calib/mean_conf": 0.9800000000000004, "calib/mu_c": 0.9800000000000002, "calib/mu_w": 0.9800000000000004, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41495934959349634, "calib/std_conf": 4.440892098500626e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46224945926459987, "calib/step_q_c_n": 1387.0, "calib/step_q_gap": 0.02676581277886575, "calib/step_q_w": 0.4354836464857341, "calib/step_q_w_n": 1437.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 758.50390625, "completions/mean_terminated_length": 761.4784545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 0.04906666666666667, "grad_norm": 0.04323741793632507, "learning_rate": 4.277777777777778e-06, "loss": 0.1058, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.0240943506360054, "mask/share_reasoning": 0.8130663633346558, "mask/share_step_conf": 0.15893307328224182, "num_tokens": 11735068.0, "reward": 1.0368883609771729, "reward_std": 0.20708303153514862, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5593031048774719, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8091282844543457, "step": 46 }, { "adv/mean_abs_final_conf": 0.5743204355239868, "adv/mean_abs_reasoning": 0.5580986738204956, "adv/mean_abs_step_conf": 0.774113118648529, "adv/ratio_final_to_reasoning": 1.029066117631931, "adv/ratio_step_to_reasoning": 1.387054216325752, "adv/std_final_conf": 0.7937548756599426, "adv/std_reasoning": 0.7929351925849915, "adv/std_step_conf": 0.9345194697380066, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.890625, "calib/ece": 0.37919354838709707, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.220446049250313e-16, "calib/mean_conf": 0.9800000000000003, "calib/mu_c": 0.9800000000000002, "calib/mu_w": 0.9800000000000004, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37919354838709707, "calib/std_conf": 3.3306690738754696e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5025, "calib/step_q_c_n": 1464.0, "calib/step_q_gap": 0.04697129909365555, "calib/step_q_w": 0.4555287009063444, "calib/step_q_w_n": 1324.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2756.0, "completions/max_terminated_length": 2756.0, "completions/mean_length": 708.640625, "completions/mean_terminated_length": 717.0435180664062, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.050133333333333335, "grad_norm": 0.04193958640098572, "learning_rate": 4.25e-06, "loss": 0.0715, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.023382706567645073, "mask/share_reasoning": 0.8059793710708618, "mask/share_step_conf": 0.15891912579536438, "num_tokens": 12022456.0, "reward": 1.0744290351867676, "reward_std": 0.24675029516220093, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.596957802772522, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8283501863479614, "step": 47 }, { "adv/mean_abs_final_conf": 0.5286183953285217, "adv/mean_abs_reasoning": 0.5230193138122559, "adv/mean_abs_step_conf": 0.7533445954322815, "adv/ratio_final_to_reasoning": 1.0107053054608146, "adv/ratio_step_to_reasoning": 1.4403762452694122, "adv/std_final_conf": 0.7767484784126282, "adv/std_reasoning": 0.7754042148590088, "adv/std_step_conf": 0.9348289370536804, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.06640625, "calib/ece": 0.49574803149606333, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.9800000000000003, "calib/mu_c": 0.9800000000000004, "calib/mu_w": 0.9800000000000002, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.49574803149606333, "calib/std_conf": 3.3306690738754696e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5308748906386701, "calib/step_q_c_n": 1143.0, "calib/step_q_gap": 0.04723472327465339, "calib/step_q_w": 0.4836401673640167, "calib/step_q_w_n": 1434.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2207.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 636.82421875, "completions/mean_terminated_length": 641.8385620117188, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.0512, "grad_norm": 0.04802514612674713, "learning_rate": 4.222222222222223e-06, "loss": -0.0538, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.025706924498081207, "mask/share_reasoning": 0.8068351745605469, "mask/share_step_conf": 0.15964537858963013, "num_tokens": 12289171.0, "reward": 1.0197627544403076, "reward_std": 0.21757322549819946, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.5005406141281128, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8296357989311218, "step": 48 }, { "adv/mean_abs_final_conf": 0.4833810031414032, "adv/mean_abs_reasoning": 0.4688076376914978, "adv/mean_abs_step_conf": 0.7697415351867676, "adv/ratio_final_to_reasoning": 1.031086023942928, "adv/ratio_step_to_reasoning": 1.6419133847245497, "adv/std_final_conf": 0.7202240824699402, "adv/std_reasoning": 0.7207056879997253, "adv/std_step_conf": 0.9345253109931946, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.875, "calib/ece": 0.41673469387755135, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9800000000000003, "calib/mu_c": 0.9800000000000003, "calib/mu_w": 0.9800000000000004, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41673469387755135, "calib/std_conf": 3.3306690738754696e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5488761632068718, "calib/step_q_c_n": 1397.0, "calib/step_q_gap": 0.06362021511747018, "calib/step_q_w": 0.48525594808940165, "calib/step_q_w_n": 1387.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3034.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 700.39453125, "completions/mean_terminated_length": 714.3466186523438, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.05226666666666667, "grad_norm": 0.0385357141494751, "learning_rate": 4.194444444444445e-06, "loss": -0.0333, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.022826887667179108, "mask/share_reasoning": 0.8002016544342041, "mask/share_step_conf": 0.1574402153491974, "num_tokens": 12573008.0, "reward": 1.0233741998672485, "reward_std": 0.2162887454032898, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.555243730545044, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7953780889511108, "step": 49 }, { "adv/mean_abs_final_conf": 0.3921908438205719, "adv/mean_abs_reasoning": 0.3828577399253845, "adv/mean_abs_step_conf": 0.7531270980834961, "adv/ratio_final_to_reasoning": 1.0243774721571681, "adv/ratio_step_to_reasoning": 1.9671199496457188, "adv/std_final_conf": 0.6827874183654785, "adv/std_reasoning": 0.6815925240516663, "adv/std_step_conf": 0.9339188933372498, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.4375, "calib/ece": 0.35698412698412707, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9800000000000001, "calib/mu_c": 0.9800000000000004, "calib/mu_w": 0.9800000000000005, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35698412698412707, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5290221270521057, "calib/step_q_c_n": 1401.0, "calib/step_q_gap": 0.016667447249150036, "calib/step_q_w": 0.5123546798029557, "calib/step_q_w_n": 1015.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2673.0, "completions/max_terminated_length": 2673.0, "completions/mean_length": 670.04296875, "completions/mean_terminated_length": 672.670654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 231.0, "epoch": 0.05333333333333334, "grad_norm": 0.04379646107554436, "learning_rate": 4.166666666666667e-06, "loss": -0.0075, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.025415990501642227, "mask/share_reasoning": 0.8144956231117249, "mask/share_step_conf": 0.15618211030960083, "num_tokens": 12849899.0, "reward": 1.0978294610977173, "reward_std": 0.19145922362804413, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6238265037536621, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8348673582077026, "step": 50 }, { "adv/mean_abs_final_conf": 0.42505133152008057, "adv/mean_abs_reasoning": 0.41325896978378296, "adv/mean_abs_step_conf": 0.7430156469345093, "adv/ratio_final_to_reasoning": 1.0285350412175382, "adv/ratio_step_to_reasoning": 1.7979419716485645, "adv/std_final_conf": 0.7020641565322876, "adv/std_reasoning": 0.7014035582542419, "adv/std_step_conf": 0.9343358874320984, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.30859375, "calib/ece": 0.38160642570281145, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.220446049250313e-16, "calib/mean_conf": 0.9800000000000002, "calib/mu_c": 0.9800000000000002, "calib/mu_w": 0.9800000000000004, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38160642570281145, "calib/std_conf": 2.220446049250313e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5214336917562725, "calib/step_q_c_n": 1395.0, "calib/step_q_gap": 0.08580668210997028, "calib/step_q_w": 0.43562700964630224, "calib/step_q_w_n": 1244.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 700.93359375, "completions/mean_terminated_length": 700.93359375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.0544, "grad_norm": 0.04266020655632019, "learning_rate": 4.138888888888889e-06, "loss": 0.1176, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.024903567507863045, "mask/share_reasoning": 0.8158866167068481, "mask/share_step_conf": 0.15920981764793396, "num_tokens": 13138634.0, "reward": 1.0784484148025513, "reward_std": 0.19629380106925964, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5972671508789062, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.832461416721344, "step": 51 }, { "adv/mean_abs_final_conf": 0.44877541065216064, "adv/mean_abs_reasoning": 0.4400636851787567, "adv/mean_abs_step_conf": 0.7490113973617554, "adv/ratio_final_to_reasoning": 1.019796510747905, "adv/ratio_step_to_reasoning": 1.7020522769505557, "adv/std_final_conf": 0.7218416929244995, "adv/std_reasoning": 0.7205371856689453, "adv/std_step_conf": 0.9344486594200134, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.4765625, "calib/ece": 0.3107086614173231, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9800000000000003, "calib/mu_c": 0.9800000000000003, "calib/mu_w": 0.9800000000000004, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3107086614173231, "calib/std_conf": 3.3306690738754696e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5440012886597938, "calib/step_q_c_n": 1552.0, "calib/step_q_gap": 0.06388674112256587, "calib/step_q_w": 0.48011454753722793, "calib/step_q_w_n": 873.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 615.79296875, "completions/mean_terminated_length": 620.6417236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, "epoch": 0.055466666666666664, "grad_norm": 0.052587732672691345, "learning_rate": 4.111111111111111e-06, "loss": -0.0315, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.025957010686397552, "mask/share_reasoning": 0.8111605644226074, "mask/share_step_conf": 0.15506988763809204, "num_tokens": 13404229.0, "reward": 1.1470290422439575, "reward_std": 0.19214141368865967, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6728858947753906, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8604689836502075, "step": 52 }, { "adv/mean_abs_final_conf": 0.4245341718196869, "adv/mean_abs_reasoning": 0.39320626854896545, "adv/mean_abs_step_conf": 0.752773106098175, "adv/ratio_final_to_reasoning": 1.079672949737881, "adv/ratio_step_to_reasoning": 1.914448385769906, "adv/std_final_conf": 0.681718111038208, "adv/std_reasoning": 0.6613555550575256, "adv/std_step_conf": 0.9345915913581848, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5052631578947369, "calib/avg_num_step_conf": 10.28125, "calib/ece": 0.3554545454545458, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001052631578947194, "calib/mean_conf": 0.9799604743083007, "calib/mu_c": 0.9800000000000004, "calib/mu_w": 0.9798947368421057, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3554545454545458, "calib/std_conf": 0.0006274509038097849, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5141645408163266, "calib/step_q_c_n": 1568.0, "calib/step_q_gap": 0.04017017991407096, "calib/step_q_w": 0.4739943609022556, "calib/step_q_w_n": 1064.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2672.0, "completions/max_terminated_length": 2672.0, "completions/mean_length": 664.16796875, "completions/mean_terminated_length": 666.7725830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.05653333333333333, "grad_norm": 0.044773347675800323, "learning_rate": 4.083333333333334e-06, "loss": 0.014, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02365219034254551, "mask/share_reasoning": 0.8136025667190552, "mask/share_step_conf": 0.15883903205394745, "num_tokens": 13680080.0, "reward": 1.1072542667388916, "reward_std": 0.18055878579616547, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6317120790481567, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8411349654197693, "step": 53 }, { "adv/mean_abs_final_conf": 0.3496309518814087, "adv/mean_abs_reasoning": 0.34337419271469116, "adv/mean_abs_step_conf": 0.7803263664245605, "adv/ratio_final_to_reasoning": 1.0182214018975977, "adv/ratio_step_to_reasoning": 2.272524793594293, "adv/std_final_conf": 0.6413673162460327, "adv/std_reasoning": 0.6402059197425842, "adv/std_step_conf": 0.9342555999755859, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.15234375, "calib/ece": 0.2988976377952759, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.9800000000000003, "calib/mu_c": 0.9800000000000004, "calib/mu_w": 0.9800000000000003, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2988976377952759, "calib/std_conf": 3.3306690738754696e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5104761904761904, "calib/step_q_c_n": 1470.0, "calib/step_q_gap": 0.03387825233186054, "calib/step_q_w": 0.4765979381443299, "calib/step_q_w_n": 873.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2502.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 611.46484375, "completions/mean_terminated_length": 611.46484375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.0576, "grad_norm": 0.050735846161842346, "learning_rate": 4.055555555555556e-06, "loss": 0.0082, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.026796435937285423, "mask/share_reasoning": 0.8149226307868958, "mask/share_step_conf": 0.15828096866607666, "num_tokens": 13942847.0, "reward": 1.1371355056762695, "reward_std": 0.15493734180927277, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6880406141281128, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8350911140441895, "step": 54 }, { "adv/mean_abs_final_conf": 0.5220646858215332, "adv/mean_abs_reasoning": 0.5028437376022339, "adv/mean_abs_step_conf": 0.7370898127555847, "adv/ratio_final_to_reasoning": 1.0382244955678532, "adv/ratio_step_to_reasoning": 1.4658426816058854, "adv/std_final_conf": 0.7588990926742554, "adv/std_reasoning": 0.7576790452003479, "adv/std_step_conf": 0.9348087310791016, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.504, "calib/avg_num_step_conf": 9.53515625, "calib/ece": 0.4778486055776895, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000320000000000098, "calib/mean_conf": 0.9798406374501994, "calib/mu_c": 0.9800000000000005, "calib/mu_w": 0.9796800000000004, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4778486055776895, "calib/std_conf": 0.002519743155512655, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45781411359724616, "calib/step_q_c_n": 1162.0, "calib/step_q_gap": 0.03114484072781687, "calib/step_q_w": 0.4266692728694293, "calib/step_q_w_n": 1279.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 630.19921875, "completions/mean_terminated_length": 637.6719360351562, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.058666666666666666, "grad_norm": 0.0496719554066658, "learning_rate": 4.027777777777779e-06, "loss": -0.0886, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.026188170537352562, "mask/share_reasoning": 0.8033748865127563, "mask/share_step_conf": 0.15871819853782654, "num_tokens": 14212002.0, "reward": 1.0265394449234009, "reward_std": 0.22830066084861755, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5116264820098877, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.831280529499054, "step": 55 }, { "adv/mean_abs_final_conf": 0.44553542137145996, "adv/mean_abs_reasoning": 0.44030097126960754, "adv/mean_abs_step_conf": 0.7443605661392212, "adv/ratio_final_to_reasoning": 1.011888345571346, "adv/ratio_step_to_reasoning": 1.6905721647464416, "adv/std_final_conf": 0.7214465141296387, "adv/std_reasoning": 0.7205111980438232, "adv/std_step_conf": 0.9344826936721802, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.29296875, "calib/ece": 0.4305928853754942, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9800000000000002, "calib/mu_c": 0.9800000000000002, "calib/mu_w": 0.9800000000000003, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4305928853754942, "calib/std_conf": 2.220446049250313e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4614436885865457, "calib/step_q_c_n": 1323.0, "calib/step_q_gap": 0.03695435931825297, "calib/step_q_w": 0.42448932926829275, "calib/step_q_w_n": 1312.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2934.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 670.90234375, "completions/mean_terminated_length": 673.5333862304688, "completions/min_length": 0.0, "completions/min_terminated_length": 221.0, "epoch": 0.05973333333333333, "grad_norm": 0.052368298172950745, "learning_rate": 4.000000000000001e-06, "loss": 0.0264, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.024397222325205803, "mask/share_reasoning": 0.8169806003570557, "mask/share_step_conf": 0.154715895652771, "num_tokens": 14490593.0, "reward": 1.0649067163467407, "reward_std": 0.18028205633163452, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5603859424591064, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8421182036399841, "step": 56 }, { "adv/mean_abs_final_conf": 0.44250503182411194, "adv/mean_abs_reasoning": 0.4391246736049652, "adv/mean_abs_step_conf": 0.7414326667785645, "adv/ratio_final_to_reasoning": 1.007697946442854, "adv/ratio_step_to_reasoning": 1.6884331747788655, "adv/std_final_conf": 0.7213237285614014, "adv/std_reasoning": 0.7205445766448975, "adv/std_step_conf": 0.9351280927658081, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.78515625, "calib/ece": 0.3266135458167334, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.220446049250313e-16, "calib/mean_conf": 0.9800000000000003, "calib/mu_c": 0.9800000000000003, "calib/mu_w": 0.9800000000000005, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3266135458167334, "calib/std_conf": 3.3306690738754696e-16, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44513675783855905, "calib/step_q_c_n": 1499.0, "calib/step_q_gap": 0.021886260820666414, "calib/step_q_w": 0.42325049701789264, "calib/step_q_w_n": 1006.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 620.91796875, "completions/mean_terminated_length": 625.8070678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 0.0608, "grad_norm": 0.053272590041160583, "learning_rate": 3.972222222222223e-06, "loss": -0.0094, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0259304977953434, "mask/share_reasoning": 0.8077971935272217, "mask/share_step_conf": 0.15845987200737, "num_tokens": 14756340.0, "reward": 1.1104764938354492, "reward_std": 0.2054135799407959, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6499218940734863, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8317291140556335, "step": 57 }, { "adv/mean_abs_final_conf": 0.6040040254592896, "adv/mean_abs_reasoning": 0.5898081660270691, "adv/mean_abs_step_conf": 0.7271995544433594, "adv/ratio_final_to_reasoning": 1.0240686044207277, "adv/ratio_step_to_reasoning": 1.2329424994939537, "adv/std_final_conf": 0.8113057017326355, "adv/std_reasoning": 0.8100230693817139, "adv/std_step_conf": 0.9353846907615662, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.5625, "calib/ece": 0.4779919678714861, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.9800000000000002, "calib/mu_c": 0.9800000000000004, "calib/mu_w": 0.9800000000000004, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4779919678714861, "calib/std_conf": 2.220446049250313e-16, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.44436501261564343, "calib/step_q_c_n": 1189.0, "calib/step_q_gap": 0.03760593670805268, "calib/step_q_w": 0.40675907590759075, "calib/step_q_w_n": 1515.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 702.85546875, "completions/mean_terminated_length": 711.1897583007812, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.06186666666666667, "grad_norm": 0.04196161404252052, "learning_rate": 3.944444444444445e-06, "loss": -0.0474, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0239163376390934, "mask/share_reasoning": 0.8132352828979492, "mask/share_step_conf": 0.15112963318824768, "num_tokens": 15042591.0, "reward": 1.0031821727752686, "reward_std": 0.26209405064582825, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.49945777654647827, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8103334903717041, "step": 58 }, { "adv/mean_abs_final_conf": 0.462576299905777, "adv/mean_abs_reasoning": 0.4544098377227783, "adv/mean_abs_step_conf": 0.7671006917953491, "adv/ratio_final_to_reasoning": 1.0179715787490955, "adv/ratio_step_to_reasoning": 1.688125185932559, "adv/std_final_conf": 0.7218502759933472, "adv/std_reasoning": 0.7205907106399536, "adv/std_step_conf": 0.9345336556434631, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.30078125, "calib/ece": 0.3887301587301588, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -3.3306690738754696e-16, "calib/mean_conf": 0.9800000000000001, "calib/mu_c": 0.9800000000000002, "calib/mu_w": 0.9800000000000005, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3887301587301588, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.502050909090909, "calib/step_q_c_n": 1375.0, "calib/step_q_gap": 0.08549781875810397, "calib/step_q_w": 0.416553090332805, "calib/step_q_w_n": 1262.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2621.0, "completions/max_terminated_length": 2621.0, "completions/mean_length": 665.68359375, "completions/mean_terminated_length": 668.2941284179688, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.06293333333333333, "grad_norm": 0.0447392538189888, "learning_rate": 3.916666666666667e-06, "loss": 0.0181, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.026031965389847755, "mask/share_reasoning": 0.8124123811721802, "mask/share_step_conf": 0.15764933824539185, "num_tokens": 15319254.0, "reward": 1.0689520835876465, "reward_std": 0.19910673797130585, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5899218916893005, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8241757750511169, "step": 59 }, { "adv/mean_abs_final_conf": 0.4998510181903839, "adv/mean_abs_reasoning": 0.4936041533946991, "adv/mean_abs_step_conf": 0.7445494532585144, "adv/ratio_final_to_reasoning": 1.0126556163531502, "adv/ratio_step_to_reasoning": 1.5083938174708849, "adv/std_final_conf": 0.7406123280525208, "adv/std_reasoning": 0.7393380999565125, "adv/std_step_conf": 0.9346485733985901, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.453125, "calib/ece": 0.4266403162055338, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.9800000000000002, "calib/mu_c": 0.9800000000000003, "calib/mu_w": 0.9800000000000003, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4266403162055338, "calib/std_conf": 2.220446049250313e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5135161870503597, "calib/step_q_c_n": 1112.0, "calib/step_q_gap": 0.06483747982602511, "calib/step_q_w": 0.4486787072243346, "calib/step_q_w_n": 1052.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 590.5625, "completions/mean_terminated_length": 592.8784790039062, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.064, "grad_norm": 0.04761456325650215, "learning_rate": 3.88888888888889e-06, "loss": 0.0079, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.028008155524730682, "mask/share_reasoning": 0.8162366151809692, "mask/share_step_conf": 0.15184903144836426, "num_tokens": 15579294.0, "reward": 1.07675039768219, "reward_std": 0.21444371342658997, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5641359090805054, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8543682098388672, "step": 60 }, { "adv/mean_abs_final_conf": 0.4446122646331787, "adv/mean_abs_reasoning": 0.4401038885116577, "adv/mean_abs_step_conf": 0.7297759056091309, "adv/ratio_final_to_reasoning": 1.0102438906794653, "adv/ratio_step_to_reasoning": 1.6581900879746492, "adv/std_final_conf": 0.7213276624679565, "adv/std_reasoning": 0.7204954028129578, "adv/std_step_conf": 0.9347290396690369, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.38671875, "calib/ece": 0.2858823529411768, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -3.3306690738754696e-16, "calib/mean_conf": 0.9800000000000003, "calib/mu_c": 0.9800000000000002, "calib/mu_w": 0.9800000000000005, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2858823529411768, "calib/std_conf": 3.3306690738754696e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.510771954674221, "calib/step_q_c_n": 1412.0, "calib/step_q_gap": 0.05141141045653391, "calib/step_q_w": 0.4593605442176871, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 522.85546875, "completions/mean_terminated_length": 524.9058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.06506666666666666, "grad_norm": 0.04688733443617821, "learning_rate": 3.861111111111112e-06, "loss": -0.0613, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.031810835003852844, "mask/share_reasoning": 0.8003161549568176, "mask/share_step_conf": 0.16396674513816833, "num_tokens": 15817209.0, "reward": 1.1565698385238647, "reward_std": 0.19978997111320496, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.703040599822998, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8489201068878174, "step": 61 }, { "adv/mean_abs_final_conf": 0.577107310295105, "adv/mean_abs_reasoning": 0.5740156173706055, "adv/mean_abs_step_conf": 0.7687522172927856, "adv/ratio_final_to_reasoning": 1.005386078062931, "adv/ratio_step_to_reasoning": 1.3392531388156486, "adv/std_final_conf": 0.7763068675994873, "adv/std_reasoning": 0.775528073310852, "adv/std_step_conf": 0.9355931878089905, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.0859375, "calib/ece": 0.5302008032128516, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.9800000000000002, "calib/mu_c": 0.9800000000000003, "calib/mu_w": 0.9800000000000003, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5302008032128516, "calib/std_conf": 2.220446049250313e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4828800856531049, "calib/step_q_c_n": 934.0, "calib/step_q_gap": 0.04942462588298996, "calib/step_q_w": 0.43345545977011496, "calib/step_q_w_n": 1392.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 633.60546875, "completions/mean_terminated_length": 636.0902099609375, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.06613333333333334, "grad_norm": 0.040969934314489365, "learning_rate": 3.833333333333334e-06, "loss": 0.0081, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.026683034375309944, "mask/share_reasoning": 0.8198766708374023, "mask/share_step_conf": 0.14953404664993286, "num_tokens": 16086492.0, "reward": 0.9675499200820923, "reward_std": 0.25620460510253906, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.4585171937942505, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.795846700668335, "step": 62 }, { "adv/mean_abs_final_conf": 0.5213485956192017, "adv/mean_abs_reasoning": 0.4952261447906494, "adv/mean_abs_step_conf": 0.7424039840698242, "adv/ratio_final_to_reasoning": 1.0527485293402576, "adv/ratio_step_to_reasoning": 1.49912114269263, "adv/std_final_conf": 0.7758664488792419, "adv/std_reasoning": 0.757529079914093, "adv/std_step_conf": 0.934902548789978, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5040650406504066, "calib/avg_num_step_conf": 8.828125, "calib/ece": 0.466126482213439, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 8.130081300794512e-05, "calib/mean_conf": 0.9799604743083007, "calib/mu_c": 0.9800000000000002, "calib/mu_w": 0.9799186991869923, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.466126482213439, "calib/std_conf": 0.0006274509038097849, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4964018957345971, "calib/step_q_c_n": 1055.0, "calib/step_q_gap": 0.061339655070696664, "calib/step_q_w": 0.4350622406639004, "calib/step_q_w_n": 1205.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2584.0, "completions/max_terminated_length": 2584.0, "completions/mean_length": 671.94140625, "completions/mean_terminated_length": 671.94140625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.0672, "grad_norm": 0.040248576551675797, "learning_rate": 3.8055555555555556e-06, "loss": 0.027, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02550509385764599, "mask/share_reasoning": 0.8300318717956543, "mask/share_step_conf": 0.14446300268173218, "num_tokens": 16367149.0, "reward": 1.043703556060791, "reward_std": 0.22153249382972717, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5226527452468872, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8442112803459167, "step": 63 }, { "adv/mean_abs_final_conf": 0.4774155020713806, "adv/mean_abs_reasoning": 0.45380038022994995, "adv/mean_abs_step_conf": 0.7652921080589294, "adv/ratio_final_to_reasoning": 1.0520385677717246, "adv/ratio_step_to_reasoning": 1.6864069344127484, "adv/std_final_conf": 0.7199251651763916, "adv/std_reasoning": 0.7014497518539429, "adv/std_step_conf": 0.9351070523262024, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5055555555555555, "calib/avg_num_step_conf": 8.43359375, "calib/ece": 0.3414056224899601, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00011111111111117289, "calib/mean_conf": 0.9799598393574299, "calib/mu_c": 0.9800000000000003, "calib/mu_w": 0.9798888888888891, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3414056224899601, "calib/std_conf": 0.0006324504316475356, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5067135761589403, "calib/step_q_c_n": 1208.0, "calib/step_q_gap": 0.04668203041761537, "calib/step_q_w": 0.46003154574132493, "calib/step_q_w_n": 951.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2698.0, "completions/max_terminated_length": 2698.0, "completions/mean_length": 583.59765625, "completions/mean_terminated_length": 588.1929321289062, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.06826666666666667, "grad_norm": 0.043365515768527985, "learning_rate": 3.777777777777778e-06, "loss": 0.0162, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.028047874569892883, "mask/share_reasoning": 0.8139121532440186, "mask/share_step_conf": 0.15022748708724976, "num_tokens": 16620326.0, "reward": 1.088362455368042, "reward_std": 0.2054726779460907, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6348433494567871, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8149001598358154, "step": 64 }, { "adv/mean_abs_final_conf": 0.4592784643173218, "adv/mean_abs_reasoning": 0.4338899254798889, "adv/mean_abs_step_conf": 0.7477743029594421, "adv/ratio_final_to_reasoning": 1.058513778141663, "adv/ratio_step_to_reasoning": 1.7234193721653994, "adv/std_final_conf": 0.7208153605461121, "adv/std_reasoning": 0.7014332413673401, "adv/std_step_conf": 0.9351285099983215, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5093457943925234, "calib/avg_num_step_conf": 7.265625, "calib/ece": 0.4028458498023718, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001869158878503585, "calib/mean_conf": 0.9799209486166011, "calib/mu_c": 0.9800000000000003, "calib/mu_w": 0.97981308411215, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4028458498023718, "calib/std_conf": 0.0008855872135339171, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5538731060606061, "calib/step_q_c_n": 1056.0, "calib/step_q_gap": 0.02426862844866584, "calib/step_q_w": 0.5296044776119403, "calib/step_q_w_n": 804.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 493.0625, "completions/mean_terminated_length": 494.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.06933333333333333, "grad_norm": 0.04905857890844345, "learning_rate": 3.7500000000000005e-06, "loss": -0.0156, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03320653736591339, "mask/share_reasoning": 0.8077315092086792, "mask/share_step_conf": 0.15515567362308502, "num_tokens": 16851574.0, "reward": 1.0689868927001953, "reward_std": 0.21185335516929626, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5867882370948792, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8263109922409058, "step": 65 }, { "adv/mean_abs_final_conf": 0.5579763650894165, "adv/mean_abs_reasoning": 0.45325833559036255, "adv/mean_abs_step_conf": 0.7479343414306641, "adv/ratio_final_to_reasoning": 1.2310338746725091, "adv/ratio_step_to_reasoning": 1.6501281558484526, "adv/std_final_conf": 0.78619784116745, "adv/std_reasoning": 0.7206733822822571, "adv/std_step_conf": 0.9343457818031311, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5081289978678039, "calib/avg_num_step_conf": 8.96484375, "calib/ece": 0.43483739837398416, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00016257995735602382, "calib/mean_conf": 0.9795528455284557, "calib/mu_c": 0.979626865671642, "calib/mu_w": 0.979464285714286, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.43483739837398416, "calib/std_conf": 0.002066784360794499, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4463528301886792, "calib/step_q_c_n": 1060.0, "calib/step_q_gap": 0.10271720265831485, "calib/step_q_w": 0.34363562753036436, "calib/step_q_w_n": 1235.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 650.06640625, "completions/mean_terminated_length": 655.18505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.0704, "grad_norm": 0.052809618413448334, "learning_rate": 3.7222222222222225e-06, "loss": 0.0081, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.028058627620339394, "mask/share_reasoning": 0.814427375793457, "mask/share_step_conf": 0.1497015357017517, "num_tokens": 17124343.0, "reward": 1.0408997535705566, "reward_std": 0.2173253446817398, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5410004258155823, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8292826414108276, "step": 66 }, { "adv/mean_abs_final_conf": 0.514562726020813, "adv/mean_abs_reasoning": 0.36621803045272827, "adv/mean_abs_step_conf": 0.7596355676651001, "adv/ratio_final_to_reasoning": 1.4050720697304202, "adv/ratio_step_to_reasoning": 2.0742713479345047, "adv/std_final_conf": 0.7502995729446411, "adv/std_reasoning": 0.6611066460609436, "adv/std_step_conf": 0.9344931244850159, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6056374787942059, "calib/avg_num_step_conf": 8.1640625, "calib/ece": 0.3581960784313728, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.002112749575884254, "calib/mean_conf": 0.9778039215686277, "calib/mu_c": 0.9786075949367091, "calib/mu_w": 0.9764948453608249, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3581960784313728, "calib/std_conf": 0.004139809637771501, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4553899480069324, "calib/step_q_c_n": 1154.0, "calib/step_q_gap": 0.08926815313513758, "calib/step_q_w": 0.3661217948717948, "calib/step_q_w_n": 936.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2240.0, "completions/max_terminated_length": 2240.0, "completions/mean_length": 578.56640625, "completions/mean_terminated_length": 578.56640625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.07146666666666666, "grad_norm": 0.06071910262107849, "learning_rate": 3.694444444444445e-06, "loss": 0.0373, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.030203592032194138, "mask/share_reasoning": 0.8200080990791321, "mask/share_step_conf": 0.1497882902622223, "num_tokens": 17377464.0, "reward": 1.1311116218566895, "reward_std": 0.15913987159729004, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6344921588897705, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8700499534606934, "step": 67 }, { "adv/mean_abs_final_conf": 0.6473630666732788, "adv/mean_abs_reasoning": 0.4455353021621704, "adv/mean_abs_step_conf": 0.7649122476577759, "adv/ratio_final_to_reasoning": 1.4530006119192886, "adv/ratio_step_to_reasoning": 1.7168386970587473, "adv/std_final_conf": 0.8262444734573364, "adv/std_reasoning": 0.7205067873001099, "adv/std_step_conf": 0.9339404702186584, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.501116213802781, "calib/avg_num_step_conf": 8.4453125, "calib/ece": 0.4406772908366535, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.2324276055307735e-05, "calib/mean_conf": 0.9745418326693228, "calib/mu_c": 0.97455223880597, "calib/mu_w": 0.9745299145299147, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4406772908366535, "calib/std_conf": 0.004978964018458066, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44734989648033124, "calib/step_q_c_n": 966.0, "calib/step_q_gap": 0.09595357541009714, "calib/step_q_w": 0.3513963210702341, "calib/step_q_w_n": 1196.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2641.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 587.65625, "completions/mean_terminated_length": 589.9608154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.07253333333333334, "grad_norm": 0.06573013961315155, "learning_rate": 3.6666666666666666e-06, "loss": -0.046, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0318446159362793, "mask/share_reasoning": 0.8090634346008301, "mask/share_step_conf": 0.15518571436405182, "num_tokens": 17631992.0, "reward": 1.0506805181503296, "reward_std": 0.1893540322780609, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5460590124130249, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8363473415374756, "step": 68 }, { "adv/mean_abs_final_conf": 0.5252597332000732, "adv/mean_abs_reasoning": 0.4546195864677429, "adv/mean_abs_step_conf": 0.7643197774887085, "adv/ratio_final_to_reasoning": 1.1553829813651078, "adv/ratio_step_to_reasoning": 1.6812293184005613, "adv/std_final_conf": 0.7402248978614807, "adv/std_reasoning": 0.7014936804771423, "adv/std_step_conf": 0.9350502490997314, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5089355089355089, "calib/avg_num_step_conf": 8.1796875, "calib/ece": 0.4404016064257028, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00017871017870996209, "calib/mean_conf": 0.9705220883534137, "calib/mu_c": 0.9706060606060605, "calib/mu_w": 0.9704273504273505, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4404016064257028, "calib/std_conf": 0.002224479104277308, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40132663316582917, "calib/step_q_c_n": 995.0, "calib/step_q_gap": 0.025148289216784636, "calib/step_q_w": 0.37617834394904454, "calib/step_q_w_n": 1099.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 623.11328125, "completions/mean_terminated_length": 633.0040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.0736, "grad_norm": 0.05324266478419304, "learning_rate": 3.638888888888889e-06, "loss": -0.0488, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02733626589179039, "mask/share_reasoning": 0.8253809213638306, "mask/share_step_conf": 0.13165783882141113, "num_tokens": 17896005.0, "reward": 1.031106948852539, "reward_std": 0.19873470067977905, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5418062806129456, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8151673674583435, "step": 69 }, { "adv/mean_abs_final_conf": 0.5496417284011841, "adv/mean_abs_reasoning": 0.5222122073173523, "adv/mean_abs_step_conf": 0.7780110836029053, "adv/ratio_final_to_reasoning": 1.0525256221503123, "adv/ratio_step_to_reasoning": 1.4898370292789844, "adv/std_final_conf": 0.7696312665939331, "adv/std_reasoning": 0.7575502991676331, "adv/std_step_conf": 0.9350816011428833, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.516640461215933, "calib/avg_num_step_conf": 8.25, "calib/ece": 0.5462799999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00033280922431877524, "calib/mean_conf": 0.9702799999999999, "calib/mu_c": 0.9704716981132075, "calib/mu_w": 0.9701388888888888, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5462799999999999, "calib/std_conf": 0.0016497272501841038, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4809333333333333, "calib/step_q_c_n": 750.0, "calib/step_q_gap": 0.08823876651982376, "calib/step_q_w": 0.39269456681350956, "calib/step_q_w_n": 1362.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 594.40625, "completions/mean_terminated_length": 599.0866088867188, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.07466666666666667, "grad_norm": 0.04241788014769554, "learning_rate": 3.6111111111111115e-06, "loss": -0.0155, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.029813095927238464, "mask/share_reasoning": 0.815823495388031, "mask/share_step_conf": 0.14655089378356934, "num_tokens": 18155165.0, "reward": 0.989437460899353, "reward_std": 0.21000359952449799, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.44679102301597595, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8359725475311279, "step": 70 }, { "adv/mean_abs_final_conf": 0.6323713064193726, "adv/mean_abs_reasoning": 0.6069374084472656, "adv/mean_abs_step_conf": 0.7431070804595947, "adv/ratio_final_to_reasoning": 1.041905306244304, "adv/ratio_step_to_reasoning": 1.2243553785236165, "adv/std_final_conf": 0.818419337272644, "adv/std_reasoning": 0.8099581599235535, "adv/std_step_conf": 0.93465656042099, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.49574877428163594, "calib/avg_num_step_conf": 8.83984375, "calib/ece": 0.45429133858267723, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00023893750387904422, "calib/mean_conf": 0.9700393700787402, "calib/mu_c": 0.9699236641221373, "calib/mu_w": 0.9701626016260163, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45429133858267723, "calib/std_conf": 0.0022619814838146833, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43796892341842397, "calib/step_q_c_n": 901.0, "calib/step_q_gap": 0.054371272904473966, "calib/step_q_w": 0.38359765051395, "calib/step_q_w_n": 1362.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2765.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 604.88671875, "completions/mean_terminated_length": 604.88671875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.07573333333333333, "grad_norm": 0.04573419690132141, "learning_rate": 3.5833333333333335e-06, "loss": 0.0376, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.030430704355239868, "mask/share_reasoning": 0.8164864182472229, "mask/share_step_conf": 0.15308287739753723, "num_tokens": 18414424.0, "reward": 1.0544276237487793, "reward_std": 0.24058274924755096, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5392640829086304, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8463940620422363, "step": 71 }, { "adv/mean_abs_final_conf": 0.48391905426979065, "adv/mean_abs_reasoning": 0.44466525316238403, "adv/mean_abs_step_conf": 0.7678430676460266, "adv/ratio_final_to_reasoning": 1.0882771946497736, "adv/ratio_step_to_reasoning": 1.726788999556985, "adv/std_final_conf": 0.7654800415039062, "adv/std_reasoning": 0.7391319274902344, "adv/std_step_conf": 0.9339613318443298, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5026279128672746, "calib/avg_num_step_conf": 7.640625, "calib/ece": 0.412806324110672, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.255825734518105e-05, "calib/mean_conf": 0.9701185770750989, "calib/mu_c": 0.970141843971631, "calib/mu_w": 0.9700892857142858, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.412806324110672, "calib/std_conf": 0.001082455647243412, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45534722222222224, "calib/step_q_c_n": 1008.0, "calib/step_q_gap": 0.05440840365682137, "calib/step_q_w": 0.40093881856540087, "calib/step_q_w_n": 948.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 541.44140625, "completions/mean_terminated_length": 543.5647583007812, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.0768, "grad_norm": 0.04849497973918915, "learning_rate": 3.555555555555556e-06, "loss": -0.0047, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030670518055558205, "mask/share_reasoning": 0.8153384923934937, "mask/share_step_conf": 0.1500847041606903, "num_tokens": 18657441.0, "reward": 1.0819460153579712, "reward_std": 0.18929322063922882, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5721668004989624, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.855941653251648, "step": 72 }, { "adv/mean_abs_final_conf": 0.5332915186882019, "adv/mean_abs_reasoning": 0.5199190378189087, "adv/mean_abs_step_conf": 0.7636487483978271, "adv/ratio_final_to_reasoning": 1.0257203139269366, "adv/ratio_step_to_reasoning": 1.4687839699068899, "adv/std_final_conf": 0.7628645896911621, "adv/std_reasoning": 0.7575727105140686, "adv/std_step_conf": 0.9346169233322144, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.497632702053881, "calib/avg_num_step_conf": 7.37890625, "calib/ece": 0.33086274509803926, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -4.734595892252891e-05, "calib/mean_conf": 0.970078431372549, "calib/mu_c": 0.9700613496932514, "calib/mu_w": 0.970108695652174, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33086274509803926, "calib/std_conf": 0.0008821350493491766, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4796269335759782, "calib/step_q_c_n": 1099.0, "calib/step_q_gap": 0.06263959180382628, "calib/step_q_w": 0.4169873417721519, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 516.22265625, "completions/mean_terminated_length": 518.2470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.07786666666666667, "grad_norm": 0.039155565202236176, "learning_rate": 3.5277777777777784e-06, "loss": -0.0826, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031482987105846405, "mask/share_reasoning": 0.8183671832084656, "mask/share_step_conf": 0.14624357223510742, "num_tokens": 18896626.0, "reward": 1.1384083032608032, "reward_std": 0.20831382274627686, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6573105454444885, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8619623184204102, "step": 73 }, { "adv/mean_abs_final_conf": 0.41902828216552734, "adv/mean_abs_reasoning": 0.40599530935287476, "adv/mean_abs_step_conf": 0.7844444513320923, "adv/ratio_final_to_reasoning": 1.0321012891341679, "adv/ratio_step_to_reasoning": 1.9321515132340723, "adv/std_final_conf": 0.6868499517440796, "adv/std_reasoning": 0.681566059589386, "adv/std_step_conf": 0.9346157312393188, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.508130081300813, "calib/avg_num_step_conf": 7.19921875, "calib/ece": 0.4858267716535433, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00016260162601633432, "calib/mean_conf": 0.9700787401574803, "calib/mu_c": 0.9701626016260163, "calib/mu_w": 0.97, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4858267716535433, "calib/std_conf": 0.000883856075615892, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47340071343638523, "calib/step_q_c_n": 841.0, "calib/step_q_gap": 0.06918913659007786, "calib/step_q_w": 0.4042115768463074, "calib/step_q_w_n": 1002.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2841.0, "completions/max_terminated_length": 2841.0, "completions/mean_length": 558.4921875, "completions/mean_terminated_length": 558.4921875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.07893333333333333, "grad_norm": 0.046783946454524994, "learning_rate": 3.5e-06, "loss": 0.06, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0309610553085804, "mask/share_reasoning": 0.8248661160469055, "mask/share_step_conf": 0.14417284727096558, "num_tokens": 19143528.0, "reward": 1.0438826084136963, "reward_std": 0.1730068027973175, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.5102828145027161, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8553006052970886, "step": 74 }, { "adv/mean_abs_final_conf": 0.4345305562019348, "adv/mean_abs_reasoning": 0.41730642318725586, "adv/mean_abs_step_conf": 0.769504725933075, "adv/ratio_final_to_reasoning": 1.041274545651913, "adv/ratio_step_to_reasoning": 1.8439800663882397, "adv/std_final_conf": 0.6885610222816467, "adv/std_reasoning": 0.6816553473472595, "adv/std_step_conf": 0.9336322546005249, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5056497175141242, "calib/avg_num_step_conf": 7.20703125, "calib/ece": 0.2732283464566928, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00011299435028266913, "calib/mean_conf": 0.9700787401574802, "calib/mu_c": 0.9701129943502824, "calib/mu_w": 0.9699999999999998, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2732283464566928, "calib/std_conf": 0.0008838560756158926, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45865293185419964, "calib/step_q_c_n": 1262.0, "calib/step_q_gap": 0.02942480149399379, "calib/step_q_w": 0.42922813036020585, "calib/step_q_w_n": 583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 496.20703125, "completions/mean_terminated_length": 498.1529846191406, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.08, "grad_norm": 0.03853510692715645, "learning_rate": 3.4722222222222224e-06, "loss": -0.0564, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03274504095315933, "mask/share_reasoning": 0.8104146718978882, "mask/share_step_conf": 0.1529339998960495, "num_tokens": 19375309.0, "reward": 1.164655089378357, "reward_std": 0.18265056610107422, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.708564043045044, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8560182452201843, "step": 75 }, { "adv/mean_abs_final_conf": 0.43551361560821533, "adv/mean_abs_reasoning": 0.43244290351867676, "adv/mean_abs_step_conf": 0.7513256669044495, "adv/ratio_final_to_reasoning": 1.0071008497643341, "adv/ratio_step_to_reasoning": 1.7373985346761518, "adv/std_final_conf": 0.7214844226837158, "adv/std_reasoning": 0.7204523086547852, "adv/std_step_conf": 0.9339141249656677, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 6.44140625, "calib/ece": 0.3582352941176471, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.9700000000000001, "calib/mu_c": 0.97, "calib/mu_w": 0.97, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3582352941176471, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45638655462184874, "calib/step_q_c_n": 952.0, "calib/step_q_gap": 0.05774953883992617, "calib/step_q_w": 0.39863701578192257, "calib/step_q_w_n": 697.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 541.484375, "completions/mean_terminated_length": 541.484375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.08106666666666666, "grad_norm": 0.04242115840315819, "learning_rate": 3.444444444444445e-06, "loss": -0.0574, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.032614558935165405, "mask/share_reasoning": 0.8351109027862549, "mask/share_step_conf": 0.1322745382785797, "num_tokens": 19616985.0, "reward": 1.1245042085647583, "reward_std": 0.17931437492370605, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6314507722854614, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8648300170898438, "step": 76 }, { "adv/mean_abs_final_conf": 0.45722830295562744, "adv/mean_abs_reasoning": 0.40404924750328064, "adv/mean_abs_step_conf": 0.7532762885093689, "adv/ratio_final_to_reasoning": 1.1316152815057898, "adv/ratio_step_to_reasoning": 1.864318008668616, "adv/std_final_conf": 0.7099508047103882, "adv/std_reasoning": 0.6816768646240234, "adv/std_step_conf": 0.9351339340209961, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.328125, "calib/ece": 0.3280708661417322, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00030674846625766694, "calib/mean_conf": 0.9698031496062991, "calib/mu_c": 0.9696932515337423, "calib/mu_w": 0.97, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3280708661417322, "calib/std_conf": 0.0038115848483745015, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.40374316939890703, "calib/step_q_c_n": 1098.0, "calib/step_q_gap": 0.007803580709961011, "calib/step_q_w": 0.395939588688946, "calib/step_q_w_n": 778.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2061.0, "completions/max_terminated_length": 2061.0, "completions/mean_length": 508.76953125, "completions/mean_terminated_length": 512.7755737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.08213333333333334, "grad_norm": 0.0632067620754242, "learning_rate": 3.416666666666667e-06, "loss": -0.1064, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03560970723628998, "mask/share_reasoning": 0.8034310340881348, "mask/share_step_conf": 0.15314674377441406, "num_tokens": 19851894.0, "reward": 1.1153215169906616, "reward_std": 0.18604689836502075, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6566660404205322, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.833172082901001, "step": 77 }, { "adv/mean_abs_final_conf": 0.5653814077377319, "adv/mean_abs_reasoning": 0.5597813129425049, "adv/mean_abs_step_conf": 0.7569602727890015, "adv/ratio_final_to_reasoning": 1.0100040759949453, "adv/ratio_step_to_reasoning": 1.3522428407086695, "adv/std_final_conf": 0.7767623662948608, "adv/std_reasoning": 0.7754603028297424, "adv/std_step_conf": 0.9340821504592896, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.59765625, "calib/ece": 0.37784313725490204, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -3.3306690738754696e-16, "calib/mean_conf": 0.9700000000000001, "calib/mu_c": 0.9699999999999998, "calib/mu_w": 0.9700000000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37784313725490204, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3812932454695222, "calib/step_q_c_n": 1214.0, "calib/step_q_gap": 0.020171494443530447, "calib/step_q_w": 0.36112175102599176, "calib/step_q_w_n": 731.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 575.14453125, "completions/mean_terminated_length": 577.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.0832, "grad_norm": 0.04561072587966919, "learning_rate": 3.3888888888888893e-06, "loss": -0.0392, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.02947339601814747, "mask/share_reasoning": 0.8271172046661377, "mask/share_step_conf": 0.1395030915737152, "num_tokens": 20107155.0, "reward": 1.1115992069244385, "reward_std": 0.2046259045600891, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6133222579956055, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8617924451828003, "step": 78 }, { "adv/mean_abs_final_conf": 0.4079931676387787, "adv/mean_abs_reasoning": 0.3414595127105713, "adv/mean_abs_step_conf": 0.7496354579925537, "adv/ratio_final_to_reasoning": 1.1948507874331877, "adv/ratio_step_to_reasoning": 2.1953860709335737, "adv/std_final_conf": 0.7016916871070862, "adv/std_reasoning": 0.6401836276054382, "adv/std_step_conf": 0.9334725737571716, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4964635854341737, "calib/avg_num_step_conf": 8.0546875, "calib/ece": 0.30387351778656135, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9683794466403162, "calib/gap": -0.002706582633052901, "calib/mean_conf": 0.9658498023715416, "calib/mu_c": 0.9649404761904763, "calib/mu_w": 0.9676470588235292, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3028458498023716, "calib/std_conf": 0.024409410847163165, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37789432176656146, "calib/step_q_c_n": 1268.0, "calib/step_q_gap": 0.04780616055749354, "calib/step_q_w": 0.3300881612090679, "calib/step_q_w_n": 794.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2633.0, "completions/max_terminated_length": 2633.0, "completions/mean_length": 575.953125, "completions/mean_terminated_length": 575.953125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.08426666666666667, "grad_norm": 0.05421285331249237, "learning_rate": 3.3611111111111117e-06, "loss": 0.0006, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.029839934781193733, "mask/share_reasoning": 0.8252792358398438, "mask/share_step_conf": 0.14488080143928528, "num_tokens": 20360975.0, "reward": 1.152230978012085, "reward_std": 0.14872446656227112, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6759929656982422, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8663750290870667, "step": 79 }, { "adv/mean_abs_final_conf": 0.4370696544647217, "adv/mean_abs_reasoning": 0.4167598485946655, "adv/mean_abs_step_conf": 0.7575744390487671, "adv/ratio_final_to_reasoning": 1.0487326356858555, "adv/ratio_step_to_reasoning": 1.8177721332881394, "adv/std_final_conf": 0.6915924549102783, "adv/std_reasoning": 0.681614875793457, "adv/std_step_conf": 0.9341986775398254, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5116374589266155, "calib/avg_num_step_conf": 8.19140625, "calib/ece": 0.31618110236220476, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.001143209200437978, "calib/mean_conf": 0.9697244094488189, "calib/mu_c": 0.9701204819277108, "calib/mu_w": 0.9689772727272729, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31618110236220476, "calib/std_conf": 0.005709747385167787, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4008960843373494, "calib/step_q_c_n": 1328.0, "calib/step_q_gap": 0.05092209213969012, "calib/step_q_w": 0.3499739921976593, "calib/step_q_w_n": 769.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3014.0, "completions/max_terminated_length": 3014.0, "completions/mean_length": 541.7734375, "completions/mean_terminated_length": 541.7734375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.08533333333333333, "grad_norm": 0.04697663336992264, "learning_rate": 3.3333333333333333e-06, "loss": 0.0711, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03264261782169342, "mask/share_reasoning": 0.8089589476585388, "mask/share_step_conf": 0.15839841961860657, "num_tokens": 20601829.0, "reward": 1.139902114868164, "reward_std": 0.16698545217514038, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6688238382339478, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8552368879318237, "step": 80 }, { "adv/mean_abs_final_conf": 0.40665262937545776, "adv/mean_abs_reasoning": 0.38142406940460205, "adv/mean_abs_step_conf": 0.7557171583175659, "adv/ratio_final_to_reasoning": 1.0661430727490195, "adv/ratio_step_to_reasoning": 1.9813043248613817, "adv/std_final_conf": 0.6708281636238098, "adv/std_reasoning": 0.6612600684165955, "adv/std_step_conf": 0.9344485998153687, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5261813537675607, "calib/avg_num_step_conf": 7.49609375, "calib/ece": 0.3181526104417671, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9879518072289156, "calib/gap": 0.004093231162196842, "calib/mean_conf": 0.9687550200803213, "calib/mu_c": 0.9701851851851852, "calib/mu_w": 0.9660919540229883, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3181526104417671, "calib/std_conf": 0.01259726116483335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4059841740850643, "calib/step_q_c_n": 1011.0, "calib/step_q_gap": 0.0693762445696458, "calib/step_q_w": 0.3366079295154185, "calib/step_q_w_n": 908.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2366.0, "completions/max_terminated_length": 2366.0, "completions/mean_length": 546.99609375, "completions/mean_terminated_length": 553.4822387695312, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.0864, "grad_norm": 0.04766637831926346, "learning_rate": 3.3055555555555558e-06, "loss": -0.1404, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032697394490242004, "mask/share_reasoning": 0.8120584487915039, "mask/share_step_conf": 0.1435253918170929, "num_tokens": 20848108.0, "reward": 1.109546184539795, "reward_std": 0.167487233877182, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6547554731369019, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8283078670501709, "step": 81 }, { "adv/mean_abs_final_conf": 0.42695850133895874, "adv/mean_abs_reasoning": 0.3536386787891388, "adv/mean_abs_step_conf": 0.7595793008804321, "adv/ratio_final_to_reasoning": 1.2073297604234567, "adv/ratio_step_to_reasoning": 2.147896557812162, "adv/std_final_conf": 0.676507294178009, "adv/std_reasoning": 0.6185661554336548, "adv/std_step_conf": 0.93404620885849, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5224534847051403, "calib/avg_num_step_conf": 7.01171875, "calib/ece": 0.3791796875000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.98828125, "calib/gap": 0.0015736360769472713, "calib/mean_conf": 0.9690234375000002, "calib/mu_c": 0.9696688741721854, "calib/mu_w": 0.9680952380952381, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3791796875000001, "calib/std_conf": 0.010871382188277335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43265495867768594, "calib/step_q_c_n": 968.0, "calib/step_q_gap": 0.07552073860513453, "calib/step_q_w": 0.3571342200725514, "calib/step_q_w_n": 827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 488.78125, "completions/mean_terminated_length": 490.69805908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.08746666666666666, "grad_norm": 0.05314906686544418, "learning_rate": 3.277777777777778e-06, "loss": -0.0194, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03320850431919098, "mask/share_reasoning": 0.8129414319992065, "mask/share_step_conf": 0.14994382858276367, "num_tokens": 21078788.0, "reward": 1.1136729717254639, "reward_std": 0.1398569941520691, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6149379014968872, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8629594445228577, "step": 82 }, { "adv/mean_abs_final_conf": 0.41274362802505493, "adv/mean_abs_reasoning": 0.33305490016937256, "adv/mean_abs_step_conf": 0.7500869631767273, "adv/ratio_final_to_reasoning": 1.239266042370664, "adv/ratio_step_to_reasoning": 2.252142102684201, "adv/std_final_conf": 0.6662124395370483, "adv/std_reasoning": 0.6185774207115173, "adv/std_step_conf": 0.9347962737083435, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5276705276705277, "calib/avg_num_step_conf": 7.77734375, "calib/ece": 0.41266932270916334, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006241956241953561, "calib/mean_conf": 0.9704382470119521, "calib/mu_c": 0.9707142857142855, "calib/mu_w": 0.9700900900900902, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41266932270916334, "calib/std_conf": 0.002233208997169965, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4368304712041885, "calib/step_q_c_n": 955.0, "calib/step_q_gap": 0.0845042163779337, "calib/step_q_w": 0.3523262548262548, "calib/step_q_w_n": 1036.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2395.0, "completions/max_terminated_length": 2395.0, "completions/mean_length": 609.265625, "completions/mean_terminated_length": 614.06298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.08853333333333334, "grad_norm": 0.040096431970596313, "learning_rate": 3.2500000000000002e-06, "loss": -0.0661, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.029325375333428383, "mask/share_reasoning": 0.8259421586990356, "mask/share_step_conf": 0.13691997528076172, "num_tokens": 21342024.0, "reward": 1.0817862749099731, "reward_std": 0.1610364317893982, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5719507932662964, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8574353456497192, "step": 83 }, { "adv/mean_abs_final_conf": 0.5629602670669556, "adv/mean_abs_reasoning": 0.48572349548339844, "adv/mean_abs_step_conf": 0.755538821220398, "adv/ratio_final_to_reasoning": 1.159013867564076, "adv/ratio_step_to_reasoning": 1.5554916083861163, "adv/std_final_conf": 0.7861579656600952, "adv/std_reasoning": 0.7574853897094727, "adv/std_step_conf": 0.9348664283752441, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5920216147488875, "calib/avg_num_step_conf": 6.91015625, "calib/ece": 0.40600790513833995, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": 0.0028111888111884697, "calib/mean_conf": 0.9712252964426878, "calib/mu_c": 0.9724475524475522, "calib/mu_w": 0.9696363636363637, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40600790513833995, "calib/std_conf": 0.007627741677189908, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48983559127439724, "calib/step_q_c_n": 871.0, "calib/step_q_gap": 0.12687345318976467, "calib/step_q_w": 0.36296213808463257, "calib/step_q_w_n": 898.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 503.91015625, "completions/mean_terminated_length": 507.8779602050781, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.0896, "grad_norm": 0.05198689177632332, "learning_rate": 3.2222222222222227e-06, "loss": -0.0975, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.035145025700330734, "mask/share_reasoning": 0.8113081455230713, "mask/share_step_conf": 0.14573431015014648, "num_tokens": 21576945.0, "reward": 1.0832618474960327, "reward_std": 0.20970940589904785, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5838117599487305, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8488913178443909, "step": 84 }, { "adv/mean_abs_final_conf": 0.6328294277191162, "adv/mean_abs_reasoning": 0.4318961501121521, "adv/mean_abs_step_conf": 0.7624772191047668, "adv/ratio_final_to_reasoning": 1.465235167191898, "adv/ratio_step_to_reasoning": 1.7654179573186088, "adv/std_final_conf": 0.8357207775115967, "adv/std_reasoning": 0.72041255235672, "adv/std_step_conf": 0.9348968863487244, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5754936120789779, "calib/avg_num_step_conf": 7.08203125, "calib/ece": 0.4671887550200804, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0016898954703832292, "calib/mean_conf": 0.9732128514056225, "calib/mu_c": 0.9740476190476192, "calib/mu_w": 0.9723577235772359, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4671887550200804, "calib/std_conf": 0.005607553309464117, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5065581339712919, "calib/step_q_c_n": 836.0, "calib/step_q_gap": 0.12335444922205957, "calib/step_q_w": 0.38320368474923233, "calib/step_q_w_n": 977.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 585.8671875, "completions/mean_terminated_length": 590.4802856445312, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.09066666666666667, "grad_norm": 0.05411646142601967, "learning_rate": 3.1944444444444443e-06, "loss": -0.0524, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032674334943294525, "mask/share_reasoning": 0.8182451725006104, "mask/share_step_conf": 0.14126797020435333, "num_tokens": 21834751.0, "reward": 1.0298364162445068, "reward_std": 0.1878812611103058, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5180214643478394, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.832455039024353, "step": 85 }, { "adv/mean_abs_final_conf": 0.544785737991333, "adv/mean_abs_reasoning": 0.3841592073440552, "adv/mean_abs_step_conf": 0.763214111328125, "adv/ratio_final_to_reasoning": 1.4181249012819308, "adv/ratio_step_to_reasoning": 1.9867130521346221, "adv/std_final_conf": 0.7668167948722839, "adv/std_reasoning": 0.6611892580986023, "adv/std_step_conf": 0.9353486895561218, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6578898050974513, "calib/avg_num_step_conf": 7.19140625, "calib/ece": 0.4315748031496064, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0031146926536730346, "calib/mean_conf": 0.9748818897637797, "calib/mu_c": 0.9763043478260871, "calib/mu_w": 0.9731896551724141, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4315748031496064, "calib/std_conf": 0.006000805947482726, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5288233155080214, "calib/step_q_c_n": 935.0, "calib/step_q_gap": 0.05683247665592428, "calib/step_q_w": 0.47199083885209714, "calib/step_q_w_n": 906.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2596.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 535.390625, "completions/mean_terminated_length": 537.490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.09173333333333333, "grad_norm": 0.0701175406575203, "learning_rate": 3.1666666666666667e-06, "loss": -0.0438, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03447920083999634, "mask/share_reasoning": 0.8152116537094116, "mask/share_step_conf": 0.14640289545059204, "num_tokens": 22077323.0, "reward": 1.0588345527648926, "reward_std": 0.18509967625141144, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5626976490020752, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8324810266494751, "step": 86 }, { "adv/mean_abs_final_conf": 0.5743167400360107, "adv/mean_abs_reasoning": 0.5077738165855408, "adv/mean_abs_step_conf": 0.7713330388069153, "adv/ratio_final_to_reasoning": 1.131048355147434, "adv/ratio_step_to_reasoning": 1.5190484692449175, "adv/std_final_conf": 0.7639585733413696, "adv/std_reasoning": 0.7394582033157349, "adv/std_step_conf": 0.9350627660751343, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5658251319355299, "calib/avg_num_step_conf": 6.44140625, "calib/ece": 0.30130434782608717, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0014413065183285223, "calib/mean_conf": 0.9771936758893283, "calib/mu_c": 0.9776608187134505, "calib/mu_w": 0.9762195121951219, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30130434782608717, "calib/std_conf": 0.004913283552019082, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5630849898580123, "calib/step_q_c_n": 986.0, "calib/step_q_gap": 0.040424356373849335, "calib/step_q_w": 0.5226606334841629, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2570.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 477.34765625, "completions/mean_terminated_length": 481.1062927246094, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.0928, "grad_norm": 0.05513214319944382, "learning_rate": 3.138888888888889e-06, "loss": -0.0012, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03634363412857056, "mask/share_reasoning": 0.812298595905304, "mask/share_step_conf": 0.1435452699661255, "num_tokens": 22305020.0, "reward": 1.1288774013519287, "reward_std": 0.23732128739356995, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.682665228843689, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8292262554168701, "step": 87 }, { "adv/mean_abs_final_conf": 0.526688277721405, "adv/mean_abs_reasoning": 0.4350961446762085, "adv/mean_abs_step_conf": 0.7446381449699402, "adv/ratio_final_to_reasoning": 1.2105101002753262, "adv/ratio_step_to_reasoning": 1.711433562630365, "adv/std_final_conf": 0.782920777797699, "adv/std_reasoning": 0.7204387784004211, "adv/std_step_conf": 0.9349430203437805, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6315269994905757, "calib/avg_num_step_conf": 7.00390625, "calib/ece": 0.38584313725490216, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0026305399898116644, "calib/mean_conf": 0.9780000000000002, "calib/mu_c": 0.9790728476821194, "calib/mu_w": 0.9764423076923078, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38584313725490216, "calib/std_conf": 0.0040000000000000036, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5448845401174169, "calib/step_q_c_n": 1022.0, "calib/step_q_gap": 0.11432682286709261, "calib/step_q_w": 0.4305577172503243, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2205.0, "completions/max_terminated_length": 2205.0, "completions/mean_length": 539.45703125, "completions/mean_terminated_length": 539.45703125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.09386666666666667, "grad_norm": 0.05252831056714058, "learning_rate": 3.1111111111111116e-06, "loss": 0.0136, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03185161575675011, "mask/share_reasoning": 0.8286886215209961, "mask/share_step_conf": 0.1394597440958023, "num_tokens": 22552969.0, "reward": 1.1006900072097778, "reward_std": 0.18641909956932068, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6084863543510437, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.850470781326294, "step": 88 }, { "adv/mean_abs_final_conf": 0.5511601567268372, "adv/mean_abs_reasoning": 0.3774305284023285, "adv/mean_abs_step_conf": 0.755623459815979, "adv/ratio_final_to_reasoning": 1.460295644498896, "adv/ratio_step_to_reasoning": 2.0020199823648324, "adv/std_final_conf": 0.7930145859718323, "adv/std_reasoning": 0.6612715125083923, "adv/std_step_conf": 0.9353261590003967, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6280397022332507, "calib/avg_num_step_conf": 7.1015625, "calib/ece": 0.4659055118110238, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0024714640198508775, "calib/mean_conf": 0.977716535433071, "calib/mu_c": 0.978923076923077, "calib/mu_w": 0.9764516129032261, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4659055118110238, "calib/std_conf": 0.004470194622751896, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48424789410348973, "calib/step_q_c_n": 831.0, "calib/step_q_gap": 0.10486228113489804, "calib/step_q_w": 0.3793856129685917, "calib/step_q_w_n": 987.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 566.0546875, "completions/mean_terminated_length": 568.2745361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.09493333333333333, "grad_norm": 0.06655947118997574, "learning_rate": 3.0833333333333336e-06, "loss": -0.0578, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03302885964512825, "mask/share_reasoning": 0.8250543475151062, "mask/share_step_conf": 0.13801056146621704, "num_tokens": 22806767.0, "reward": 1.0520408153533936, "reward_std": 0.1870345175266266, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5262097120285034, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8524354100227356, "step": 89 }, { "adv/mean_abs_final_conf": 0.5208219885826111, "adv/mean_abs_reasoning": 0.4154004752635956, "adv/mean_abs_step_conf": 0.7598018646240234, "adv/ratio_final_to_reasoning": 1.25378284233334, "adv/ratio_step_to_reasoning": 1.829082800499651, "adv/std_final_conf": 0.7493654489517212, "adv/std_reasoning": 0.6815720200538635, "adv/std_step_conf": 0.9325631260871887, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5939135919268368, "calib/avg_num_step_conf": 6.89453125, "calib/ece": 0.38828125000000036, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0016650898770105282, "calib/mean_conf": 0.9781250000000004, "calib/mu_c": 0.9788079470198678, "calib/mu_w": 0.9771428571428573, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38828125000000036, "calib/std_conf": 0.004463392767839285, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4680568136272545, "calib/step_q_c_n": 998.0, "calib/step_q_gap": 0.0973006206676717, "calib/step_q_w": 0.3707561929595828, "calib/step_q_w_n": 767.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2116.0, "completions/max_terminated_length": 2116.0, "completions/mean_length": 504.16015625, "completions/mean_terminated_length": 506.1372985839844, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.096, "grad_norm": 0.05374359339475632, "learning_rate": 3.055555555555556e-06, "loss": -0.0964, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035145342350006104, "mask/share_reasoning": 0.8199383616447449, "mask/share_step_conf": 0.14101001620292664, "num_tokens": 23039152.0, "reward": 1.1121408939361572, "reward_std": 0.16609475016593933, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6080952882766724, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8654782772064209, "step": 90 }, { "adv/mean_abs_final_conf": 0.5854384899139404, "adv/mean_abs_reasoning": 0.46899843215942383, "adv/mean_abs_step_conf": 0.7628809213638306, "adv/ratio_final_to_reasoning": 1.2482738742182742, "adv/ratio_step_to_reasoning": 1.626617210320458, "adv/std_final_conf": 0.8055523633956909, "adv/std_reasoning": 0.7393237352371216, "adv/std_step_conf": 0.9347228407859802, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5655892255892256, "calib/avg_num_step_conf": 6.921875, "calib/ece": 0.37485943775100405, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0013353535353536339, "calib/mean_conf": 0.977269076305221, "calib/mu_c": 0.9778000000000001, "calib/mu_w": 0.9764646464646465, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37485943775100405, "calib/std_conf": 0.004544722826805921, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4203867403314917, "calib/step_q_c_n": 905.0, "calib/step_q_gap": 0.09297036201545938, "calib/step_q_w": 0.3274163783160323, "calib/step_q_w_n": 867.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2226.0, "completions/max_terminated_length": 2226.0, "completions/mean_length": 503.62890625, "completions/mean_terminated_length": 517.787109375, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.09706666666666666, "grad_norm": 0.08242634683847427, "learning_rate": 3.0277777777777776e-06, "loss": -0.1546, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.030973821878433228, "mask/share_reasoning": 0.8174804449081421, "mask/share_step_conf": 0.1242019534111023, "num_tokens": 23275793.0, "reward": 1.0824449062347412, "reward_std": 0.20780614018440247, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6036179065704346, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8330353498458862, "step": 91 }, { "adv/mean_abs_final_conf": 0.5711477994918823, "adv/mean_abs_reasoning": 0.43993639945983887, "adv/mean_abs_step_conf": 0.7688874006271362, "adv/ratio_final_to_reasoning": 1.2982508385147193, "adv/ratio_step_to_reasoning": 1.7477239927662016, "adv/std_final_conf": 0.8007752299308777, "adv/std_reasoning": 0.7204440236091614, "adv/std_step_conf": 0.9330356121063232, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.562315775983596, "calib/avg_num_step_conf": 6.10546875, "calib/ece": 0.37564705882352956, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.001241830065359606, "calib/mean_conf": 0.9756470588235295, "calib/mu_c": 0.9761437908496734, "calib/mu_w": 0.9749019607843138, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37564705882352956, "calib/std_conf": 0.005113701431436252, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40907103825136615, "calib/step_q_c_n": 915.0, "calib/step_q_gap": 0.07323770491803283, "calib/step_q_w": 0.3358333333333333, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 489.6953125, "completions/mean_terminated_length": 491.61572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.09813333333333334, "grad_norm": 0.07487954944372177, "learning_rate": 3e-06, "loss": -0.022, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03821246325969696, "mask/share_reasoning": 0.8173834085464478, "mask/share_step_conf": 0.14049787819385529, "num_tokens": 23507875.0, "reward": 1.1162192821502686, "reward_std": 0.16407504677772522, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6170394420623779, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8644327521324158, "step": 92 }, { "adv/mean_abs_final_conf": 0.538732647895813, "adv/mean_abs_reasoning": 0.3245697617530823, "adv/mean_abs_step_conf": 0.7636405825614929, "adv/ratio_final_to_reasoning": 1.6598362243789548, "adv/ratio_step_to_reasoning": 2.3527779619299087, "adv/std_final_conf": 0.746753990650177, "adv/std_reasoning": 0.5961504578590393, "adv/std_step_conf": 0.934518575668335, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5975177304964538, "calib/avg_num_step_conf": 7.4296875, "calib/ece": 0.4168379446640317, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.001999746707193273, "calib/mean_conf": 0.9741501976284586, "calib/mu_c": 0.9750354609929077, "calib/mu_w": 0.9730357142857144, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4168379446640317, "calib/std_conf": 0.005006830311016599, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4416926503340758, "calib/step_q_c_n": 898.0, "calib/step_q_gap": 0.036832092565151464, "calib/step_q_w": 0.4048605577689243, "calib/step_q_w_n": 1004.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2443.0, "completions/max_terminated_length": 2443.0, "completions/mean_length": 569.02734375, "completions/mean_terminated_length": 571.2588500976562, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.0992, "grad_norm": 0.07459692656993866, "learning_rate": 2.9722222222222225e-06, "loss": -0.0646, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032702356576919556, "mask/share_reasoning": 0.818350076675415, "mask/share_step_conf": 0.1450413316488266, "num_tokens": 23759322.0, "reward": 1.0743638277053833, "reward_std": 0.15945008397102356, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5736898183822632, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8448168635368347, "step": 93 }, { "adv/mean_abs_final_conf": 0.5882172584533691, "adv/mean_abs_reasoning": 0.454916775226593, "adv/mean_abs_step_conf": 0.7690072059631348, "adv/ratio_final_to_reasoning": 1.2930216920674764, "adv/ratio_step_to_reasoning": 1.6904349275317314, "adv/std_final_conf": 0.785946249961853, "adv/std_reasoning": 0.7205522060394287, "adv/std_step_conf": 0.9350289702415466, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6242, "calib/avg_num_step_conf": 6.41796875, "calib/ece": 0.37396000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0024333333333332874, "calib/mean_conf": 0.97396, "calib/mu_c": 0.9749333333333334, "calib/mu_w": 0.9725000000000001, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37396000000000007, "calib/std_conf": 0.004971760251661381, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5005479452054794, "calib/step_q_c_n": 949.0, "calib/step_q_gap": 0.13027416998934105, "calib/step_q_w": 0.37027377521613836, "calib/step_q_w_n": 694.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 518.140625, "completions/mean_terminated_length": 522.220458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.10026666666666667, "grad_norm": 0.050825074315071106, "learning_rate": 2.944444444444445e-06, "loss": -0.0603, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03436311334371567, "mask/share_reasoning": 0.8233369588851929, "mask/share_step_conf": 0.13448745012283325, "num_tokens": 24000646.0, "reward": 1.0840754508972168, "reward_std": 0.19846372306346893, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6067355275154114, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8326102495193481, "step": 94 }, { "adv/mean_abs_final_conf": 0.4853938817977905, "adv/mean_abs_reasoning": 0.3698340356349945, "adv/mean_abs_step_conf": 0.7322413325309753, "adv/ratio_final_to_reasoning": 1.3124640650349637, "adv/ratio_step_to_reasoning": 1.9799187256352375, "adv/std_final_conf": 0.7317978143692017, "adv/std_reasoning": 0.6612231731414795, "adv/std_step_conf": 0.9335426688194275, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7348688873139616, "calib/avg_num_step_conf": 6.828125, "calib/ece": 0.3020158102766799, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.004806520198441078, "calib/mean_conf": 0.9739525691699605, "calib/mu_c": 0.9755294117647061, "calib/mu_w": 0.970722891566265, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3020158102766799, "calib/std_conf": 0.00504815969678723, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5019259962049336, "calib/step_q_c_n": 1054.0, "calib/step_q_gap": 0.11604703366891045, "calib/step_q_w": 0.38587896253602316, "calib/step_q_w_n": 694.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2767.0, "completions/max_terminated_length": 2767.0, "completions/mean_length": 528.54296875, "completions/mean_terminated_length": 530.61572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.10133333333333333, "grad_norm": 0.05134730041027069, "learning_rate": 2.916666666666667e-06, "loss": -0.026, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0339992381632328, "mask/share_reasoning": 0.8192802667617798, "mask/share_step_conf": 0.1428142488002777, "num_tokens": 24242081.0, "reward": 1.1571464538574219, "reward_std": 0.18443702161312103, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6784464716911316, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8707724809646606, "step": 95 }, { "adv/mean_abs_final_conf": 0.43800491094589233, "adv/mean_abs_reasoning": 0.29693323373794556, "adv/mean_abs_step_conf": 0.7541441917419434, "adv/ratio_final_to_reasoning": 1.4750956147012082, "adv/ratio_step_to_reasoning": 2.539776980327851, "adv/std_final_conf": 0.6472437381744385, "adv/std_reasoning": 0.5726427435874939, "adv/std_step_conf": 0.9342457056045532, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6856541759288663, "calib/avg_num_step_conf": 6.33203125, "calib/ece": 0.23807843137254914, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0037924738012073966, "calib/mean_conf": 0.9753333333333335, "calib/mu_c": 0.976329787234043, "calib/mu_w": 0.9725373134328356, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23807843137254914, "calib/std_conf": 0.005143687037512034, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5463881636205395, "calib/step_q_c_n": 1149.0, "calib/step_q_gap": 0.11132460429850566, "calib/step_q_w": 0.4350635593220339, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 457.33203125, "completions/mean_terminated_length": 459.1255187988281, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.1024, "grad_norm": 0.07884582877159119, "learning_rate": 2.888888888888889e-06, "loss": -0.0015, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03597208485007286, "mask/share_reasoning": 0.8180271983146667, "mask/share_step_conf": 0.1420944631099701, "num_tokens": 24464974.0, "reward": 1.182147741317749, "reward_std": 0.1482551097869873, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7442148327827454, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8498453497886658, "step": 96 }, { "adv/mean_abs_final_conf": 0.6110037565231323, "adv/mean_abs_reasoning": 0.45363372564315796, "adv/mean_abs_step_conf": 0.7293815016746521, "adv/ratio_final_to_reasoning": 1.3469099010591783, "adv/ratio_step_to_reasoning": 1.6078643637894015, "adv/std_final_conf": 0.8205944299697876, "adv/std_reasoning": 0.7391649484634399, "adv/std_step_conf": 0.9352025389671326, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6430432516935904, "calib/avg_num_step_conf": 6.61328125, "calib/ece": 0.37557312252964437, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0028465346534654046, "calib/mean_conf": 0.9763636363636364, "calib/mu_c": 0.9775, "calib/mu_w": 0.9746534653465346, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37557312252964437, "calib/std_conf": 0.005050944205764396, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6193447311827956, "calib/step_q_c_n": 930.0, "calib/step_q_gap": 0.1463041020871207, "calib/step_q_w": 0.47304062909567496, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2841.0, "completions/max_terminated_length": 2841.0, "completions/mean_length": 487.59375, "completions/mean_terminated_length": 489.50592041015625, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.10346666666666667, "grad_norm": 0.05761410668492317, "learning_rate": 2.861111111111111e-06, "loss": -0.0134, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03603067994117737, "mask/share_reasoning": 0.8179956674575806, "mask/share_step_conf": 0.14206740260124207, "num_tokens": 24694870.0, "reward": 1.0798836946487427, "reward_std": 0.21278470754623413, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6129417419433594, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8208003044128418, "step": 97 }, { "adv/mean_abs_final_conf": 0.6085901260375977, "adv/mean_abs_reasoning": 0.5149928331375122, "adv/mean_abs_step_conf": 0.7647644281387329, "adv/ratio_final_to_reasoning": 1.181744845515342, "adv/ratio_step_to_reasoning": 1.4850001377291542, "adv/std_final_conf": 0.7990161180496216, "adv/std_reasoning": 0.7576404213905334, "adv/std_step_conf": 0.9346080422401428, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5509554140127388, "calib/avg_num_step_conf": 6.2734375, "calib/ece": 0.34451612903225826, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.001038706516413468, "calib/mean_conf": 0.9775806451612905, "calib/mu_c": 0.9779617834394906, "calib/mu_w": 0.9769230769230771, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.34451612903225826, "calib/std_conf": 0.004375696763306628, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6171648351648352, "calib/step_q_c_n": 910.0, "calib/step_q_gap": 0.03322805355563985, "calib/step_q_w": 0.5839367816091954, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2996.0, "completions/max_terminated_length": 2996.0, "completions/mean_length": 495.140625, "completions/mean_terminated_length": 503.0000305175781, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.10453333333333334, "grad_norm": 0.07149934023618698, "learning_rate": 2.8333333333333335e-06, "loss": -0.1034, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034688372164964676, "mask/share_reasoning": 0.8171087503433228, "mask/share_step_conf": 0.1325778365135193, "num_tokens": 24927810.0, "reward": 1.077172875404358, "reward_std": 0.26091310381889343, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6252793073654175, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.808440089225769, "step": 98 }, { "adv/mean_abs_final_conf": 0.5918498039245605, "adv/mean_abs_reasoning": 0.4674447774887085, "adv/mean_abs_step_conf": 0.7354602813720703, "adv/ratio_final_to_reasoning": 1.266138445495537, "adv/ratio_step_to_reasoning": 1.5733629228317476, "adv/std_final_conf": 0.8150935769081116, "adv/std_reasoning": 0.7392680048942566, "adv/std_step_conf": 0.935352087020874, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.684703196347032, "calib/avg_num_step_conf": 6.765625, "calib/ece": 0.5582071713147414, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.003666666666666818, "calib/mean_conf": 0.976533864541833, "calib/mu_c": 0.978666666666667, "calib/mu_w": 0.9750000000000002, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5582071713147414, "calib/std_conf": 0.0050037683613772975, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6395565092989985, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.08880142701439053, "calib/step_q_w": 0.550755082284608, "calib/step_q_w_n": 1033.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2665.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 562.4921875, "completions/mean_terminated_length": 571.420654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.1056, "grad_norm": 0.05055514723062515, "learning_rate": 2.805555555555556e-06, "loss": -0.1247, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030417390167713165, "mask/share_reasoning": 0.8215442895889282, "mask/share_step_conf": 0.1324133276939392, "num_tokens": 25177608.0, "reward": 0.9419336915016174, "reward_std": 0.21998000144958496, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.438107430934906, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7784231901168823, "step": 99 }, { "adv/mean_abs_final_conf": 0.5585629940032959, "adv/mean_abs_reasoning": 0.4550975561141968, "adv/mean_abs_step_conf": 0.767812967300415, "adv/ratio_final_to_reasoning": 1.2273478213606068, "adv/ratio_step_to_reasoning": 1.6871392891148578, "adv/std_final_conf": 0.798711895942688, "adv/std_reasoning": 0.7205420136451721, "adv/std_step_conf": 0.9352794885635376, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6672497570456755, "calib/avg_num_step_conf": 6.56640625, "calib/ece": 0.39373015873015876, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": 0.004272108843537681, "calib/mean_conf": 0.9770634920634922, "calib/mu_c": 0.9788435374149663, "calib/mu_w": 0.9745714285714286, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39373015873015876, "calib/std_conf": 0.008268074027423371, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6221988372093025, "calib/step_q_c_n": 946.0, "calib/step_q_gap": 0.054375707957601827, "calib/step_q_w": 0.5678231292517006, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2503.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 547.8984375, "completions/mean_terminated_length": 552.2125854492188, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.10666666666666667, "grad_norm": 0.05975564941763878, "learning_rate": 2.7777777777777783e-06, "loss": -0.0241, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03263147920370102, "mask/share_reasoning": 0.8286091685295105, "mask/share_step_conf": 0.1309468150138855, "num_tokens": 25425278.0, "reward": 1.0638865232467651, "reward_std": 0.22568199038505554, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5942621231079102, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8145281076431274, "step": 100 }, { "adv/mean_abs_final_conf": 0.5452511310577393, "adv/mean_abs_reasoning": 0.4778488278388977, "adv/mean_abs_step_conf": 0.7576862573623657, "adv/ratio_final_to_reasoning": 1.1410536121301644, "adv/ratio_step_to_reasoning": 1.5856191607480778, "adv/std_final_conf": 0.7721120715141296, "adv/std_reasoning": 0.7392907738685608, "adv/std_step_conf": 0.9355528950691223, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6051843533595358, "calib/avg_num_step_conf": 6.76953125, "calib/ece": 0.43818897637795295, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0021298895751451496, "calib/mean_conf": 0.9775590551181105, "calib/mu_c": 0.9785401459854017, "calib/mu_w": 0.9764102564102566, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43818897637795295, "calib/std_conf": 0.004386187236915108, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.602814569536424, "calib/step_q_c_n": 906.0, "calib/step_q_gap": 0.047498850068467546, "calib/step_q_w": 0.5553157194679564, "calib/step_q_w_n": 827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 568.68359375, "completions/mean_terminated_length": 570.9137573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.10773333333333333, "grad_norm": 0.05229601263999939, "learning_rate": 2.7500000000000004e-06, "loss": -0.0096, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031221158802509308, "mask/share_reasoning": 0.8331286907196045, "mask/share_step_conf": 0.13174395263195038, "num_tokens": 25677853.0, "reward": 1.027117133140564, "reward_std": 0.22584015130996704, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5561999678611755, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7950435876846313, "step": 101 }, { "adv/mean_abs_final_conf": 0.48673343658447266, "adv/mean_abs_reasoning": 0.3910636007785797, "adv/mean_abs_step_conf": 0.746878981590271, "adv/ratio_final_to_reasoning": 1.244640093364407, "adv/ratio_step_to_reasoning": 1.9098657612298569, "adv/std_final_conf": 0.7263456583023071, "adv/std_reasoning": 0.6612106561660767, "adv/std_step_conf": 0.9347814917564392, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6418454204971059, "calib/avg_num_step_conf": 6.7578125, "calib/ece": 0.3285433070866144, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0028593803200548384, "calib/mean_conf": 0.9781496062992128, "calib/mu_c": 0.9791515151515156, "calib/mu_w": 0.9762921348314607, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3285433070866144, "calib/std_conf": 0.003983388222953356, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5955947580645161, "calib/step_q_c_n": 992.0, "calib/step_q_gap": 0.05136711578809339, "calib/step_q_w": 0.5442276422764227, "calib/step_q_w_n": 738.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 501.609375, "completions/mean_terminated_length": 501.609375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.1088, "grad_norm": 0.07377835363149643, "learning_rate": 2.7222222222222224e-06, "loss": 0.01, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03854910284280777, "mask/share_reasoning": 0.80856853723526, "mask/share_step_conf": 0.15288236737251282, "num_tokens": 25912961.0, "reward": 1.1215847730636597, "reward_std": 0.18449583649635315, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6605261564254761, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8368663191795349, "step": 102 }, { "adv/mean_abs_final_conf": 0.4801088869571686, "adv/mean_abs_reasoning": 0.4155534505844116, "adv/mean_abs_step_conf": 0.7427923679351807, "adv/ratio_final_to_reasoning": 1.155348093685589, "adv/ratio_step_to_reasoning": 1.787477319441237, "adv/std_final_conf": 0.7518900036811829, "adv/std_reasoning": 0.7204484343528748, "adv/std_step_conf": 0.9343893527984619, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6622718052738337, "calib/avg_num_step_conf": 6.9453125, "calib/ece": 0.3909311740890692, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.003231913455037261, "calib/mean_conf": 0.9779757085020246, "calib/mu_c": 0.9793103448275864, "calib/mu_w": 0.9760784313725491, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3909311740890692, "calib/std_conf": 0.00421480629559188, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6107300115874854, "calib/step_q_c_n": 863.0, "calib/step_q_gap": 0.06798684218857831, "calib/step_q_w": 0.5427431693989071, "calib/step_q_w_n": 915.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2995.0, "completions/max_terminated_length": 2995.0, "completions/mean_length": 617.6953125, "completions/mean_terminated_length": 622.55908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.10986666666666667, "grad_norm": 0.060474298894405365, "learning_rate": 2.6944444444444444e-06, "loss": 0.007, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033057354390621185, "mask/share_reasoning": 0.8269200921058655, "mask/share_step_conf": 0.13221006095409393, "num_tokens": 26175643.0, "reward": 1.0655455589294434, "reward_std": 0.19261986017227173, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5849835872650146, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8265714645385742, "step": 103 }, { "adv/mean_abs_final_conf": 0.6086057424545288, "adv/mean_abs_reasoning": 0.5149220824241638, "adv/mean_abs_step_conf": 0.7702325582504272, "adv/ratio_final_to_reasoning": 1.1819375459473762, "adv/ratio_step_to_reasoning": 1.4958235129950264, "adv/std_final_conf": 0.8138259053230286, "adv/std_reasoning": 0.757559597492218, "adv/std_step_conf": 0.9335253834724426, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.67715625, "calib/avg_num_step_conf": 7.01171875, "calib/ece": 0.4819762845849804, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0035431250000000913, "calib/mean_conf": 0.9760474308300396, "calib/mu_c": 0.9778400000000004, "calib/mu_w": 0.9742968750000003, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4819762845849804, "calib/std_conf": 0.004889058054092092, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5650879396984924, "calib/step_q_c_n": 796.0, "calib/step_q_gap": 0.07133418594473862, "calib/step_q_w": 0.49375375375375374, "calib/step_q_w_n": 999.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2414.0, "completions/max_terminated_length": 2414.0, "completions/mean_length": 537.5390625, "completions/mean_terminated_length": 541.7716674804688, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.11093333333333333, "grad_norm": 0.07125498354434967, "learning_rate": 2.666666666666667e-06, "loss": -0.0048, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032777514308691025, "mask/share_reasoning": 0.8187971711158752, "mask/share_step_conf": 0.14061282575130463, "num_tokens": 26419933.0, "reward": 1.0168688297271729, "reward_std": 0.21766817569732666, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.513393759727478, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8166875839233398, "step": 104 }, { "adv/mean_abs_final_conf": 0.551266074180603, "adv/mean_abs_reasoning": 0.5299026370048523, "adv/mean_abs_step_conf": 0.7634249925613403, "adv/ratio_final_to_reasoning": 1.0403157781899377, "adv/ratio_step_to_reasoning": 1.4406891742913701, "adv/std_final_conf": 0.8026061058044434, "adv/std_reasoning": 0.7926851511001587, "adv/std_step_conf": 0.9345788359642029, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6324555628703095, "calib/avg_num_step_conf": 7.8984375, "calib/ece": 0.36264822134387353, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0026491112574061892, "calib/mean_conf": 0.9752964426877471, "calib/mu_c": 0.9763225806451614, "calib/mu_w": 0.9736734693877552, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36264822134387353, "calib/std_conf": 0.004991204437095454, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5292671166827387, "calib/step_q_c_n": 1037.0, "calib/step_q_gap": 0.05610975627664733, "calib/step_q_w": 0.4731573604060914, "calib/step_q_w_n": 985.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 577.125, "completions/mean_terminated_length": 581.6693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.112, "grad_norm": 0.06227808818221092, "learning_rate": 2.6388888888888893e-06, "loss": 0.0437, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03263796865940094, "mask/share_reasoning": 0.8158507347106934, "mask/share_step_conf": 0.1436988264322281, "num_tokens": 26673437.0, "reward": 1.1061831712722778, "reward_std": 0.21990132331848145, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6249972581863403, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8457460403442383, "step": 105 }, { "adv/mean_abs_final_conf": 0.4969498813152313, "adv/mean_abs_reasoning": 0.3592734932899475, "adv/mean_abs_step_conf": 0.7686994075775146, "adv/ratio_final_to_reasoning": 1.3832077528585547, "adv/ratio_step_to_reasoning": 2.139593991581073, "adv/std_final_conf": 0.7539573907852173, "adv/std_reasoning": 0.6611307263374329, "adv/std_step_conf": 0.9345561265945435, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6483600305110602, "calib/avg_num_step_conf": 6.74609375, "calib/ece": 0.4277777777777778, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0029672006102209325, "calib/mean_conf": 0.9753968253968255, "calib/mu_c": 0.9767391304347826, "calib/mu_w": 0.9737719298245616, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4277777777777778, "calib/std_conf": 0.004984228085113522, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5138065326633166, "calib/step_q_c_n": 796.0, "calib/step_q_gap": 0.058919314618203766, "calib/step_q_w": 0.4548872180451128, "calib/step_q_w_n": 931.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 545.23046875, "completions/mean_terminated_length": 547.36865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.11306666666666666, "grad_norm": 0.05896085128188133, "learning_rate": 2.6111111111111113e-06, "loss": -0.0141, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032741058617830276, "mask/share_reasoning": 0.8298637866973877, "mask/share_step_conf": 0.13348886370658875, "num_tokens": 26917600.0, "reward": 1.0589934587478638, "reward_std": 0.16930758953094482, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.561801552772522, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8343318700790405, "step": 106 }, { "adv/mean_abs_final_conf": 0.5585216283798218, "adv/mean_abs_reasoning": 0.4725401997566223, "adv/mean_abs_step_conf": 0.7294274568557739, "adv/ratio_final_to_reasoning": 1.1819557969194652, "adv/ratio_step_to_reasoning": 1.5436304831450511, "adv/std_final_conf": 0.8102872371673584, "adv/std_reasoning": 0.7752585411071777, "adv/std_step_conf": 0.933884859085083, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.519613337069207, "calib/avg_num_step_conf": 6.52734375, "calib/ece": 0.31571428571428584, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003922667413843728, "calib/mean_conf": 0.9744444444444446, "calib/mu_c": 0.9745783132530123, "calib/mu_w": 0.9741860465116279, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31571428571428584, "calib/std_conf": 0.004969039949999537, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47640812557710066, "calib/step_q_c_n": 1083.0, "calib/step_q_gap": -0.0019422145589537143, "calib/step_q_w": 0.4783503401360544, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1915.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 492.140625, "completions/mean_terminated_length": 499.9524230957031, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.11413333333333334, "grad_norm": 0.07943214476108551, "learning_rate": 2.5833333333333337e-06, "loss": -0.0572, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03573369234800339, "mask/share_reasoning": 0.8075776100158691, "mask/share_step_conf": 0.1410636603832245, "num_tokens": 27148204.0, "reward": 1.1372442245483398, "reward_std": 0.2124222069978714, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6651140451431274, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8552078008651733, "step": 107 }, { "adv/mean_abs_final_conf": 0.472678542137146, "adv/mean_abs_reasoning": 0.3605843186378479, "adv/mean_abs_step_conf": 0.7309489846229553, "adv/ratio_final_to_reasoning": 1.3108682704859378, "adv/ratio_step_to_reasoning": 2.0271236069949077, "adv/std_final_conf": 0.6875787973403931, "adv/std_reasoning": 0.6403064131736755, "adv/std_step_conf": 0.9338570237159729, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6144736842105263, "calib/avg_num_step_conf": 7.04296875, "calib/ece": 0.2261811023622049, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.002289473684210841, "calib/mean_conf": 0.974212598425197, "calib/mu_c": 0.9747894736842108, "calib/mu_w": 0.9724999999999999, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2261811023622049, "calib/std_conf": 0.004937610632684351, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47036405886909366, "calib/step_q_c_n": 1291.0, "calib/step_q_gap": 0.038860152619093646, "calib/step_q_w": 0.43150390625, "calib/step_q_w_n": 512.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2372.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 543.02734375, "completions/mean_terminated_length": 545.1569213867188, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.1152, "grad_norm": 0.08610007166862488, "learning_rate": 2.5555555555555557e-06, "loss": -0.0756, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03323179855942726, "mask/share_reasoning": 0.8196040391921997, "mask/share_step_conf": 0.14325785636901855, "num_tokens": 27390451.0, "reward": 1.1965038776397705, "reward_std": 0.16265276074409485, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.7552535533905029, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8605860471725464, "step": 108 }, { "adv/mean_abs_final_conf": 0.3658500611782074, "adv/mean_abs_reasoning": 0.26386594772338867, "adv/mean_abs_step_conf": 0.7497037053108215, "adv/ratio_final_to_reasoning": 1.3864997144752036, "adv/ratio_step_to_reasoning": 2.841229464351868, "adv/std_final_conf": 0.6053043007850647, "adv/std_reasoning": 0.5483630895614624, "adv/std_step_conf": 0.9330347180366516, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6764097926490537, "calib/avg_num_step_conf": 7.3203125, "calib/ece": 0.42448, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": 0.0043640591693041575, "calib/mean_conf": 0.97248, "calib/mu_c": 0.9744525547445255, "calib/mu_w": 0.9700884955752214, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.42448, "calib/std_conf": 0.007915150030163675, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44699999999999995, "calib/step_q_c_n": 910.0, "calib/step_q_gap": 0.08826556016597509, "calib/step_q_w": 0.35873443983402487, "calib/step_q_w_n": 964.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 587.4765625, "completions/mean_terminated_length": 587.4765625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.11626666666666667, "grad_norm": 0.06257660686969757, "learning_rate": 2.5277777777777778e-06, "loss": 0.0175, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03028855472803116, "mask/share_reasoning": 0.8285435438156128, "mask/share_step_conf": 0.14116786420345306, "num_tokens": 27645445.0, "reward": 1.0708765983581543, "reward_std": 0.14244344830513, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5602999925613403, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8537813425064087, "step": 109 }, { "adv/mean_abs_final_conf": 0.5903299450874329, "adv/mean_abs_reasoning": 0.41868531703948975, "adv/mean_abs_step_conf": 0.7497169971466064, "adv/ratio_final_to_reasoning": 1.4099609445624621, "adv/ratio_step_to_reasoning": 1.7906455436455975, "adv/std_final_conf": 0.7605558633804321, "adv/std_reasoning": 0.6614421010017395, "adv/std_step_conf": 0.9334558844566345, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5956521739130435, "calib/avg_num_step_conf": 6.7265625, "calib/ece": 0.42758893280632426, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.001913043478260601, "calib/mean_conf": 0.9730434782608697, "calib/mu_c": 0.9739130434782607, "calib/mu_w": 0.9720000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42758893280632426, "calib/std_conf": 0.004601306627938423, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44374149659863943, "calib/step_q_c_n": 735.0, "calib/step_q_gap": 0.08096540743957159, "calib/step_q_w": 0.36277608915906784, "calib/step_q_w_n": 987.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2436.0, "completions/max_terminated_length": 2436.0, "completions/mean_length": 484.45703125, "completions/mean_terminated_length": 488.2716369628906, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.11733333333333333, "grad_norm": 0.0538322739303112, "learning_rate": 2.5e-06, "loss": -0.0599, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037332724779844284, "mask/share_reasoning": 0.8117356300354004, "mask/share_step_conf": 0.14311917126178741, "num_tokens": 27874386.0, "reward": 1.075239658355713, "reward_std": 0.18618369102478027, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5634796619415283, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8543539643287659, "step": 110 }, { "adv/mean_abs_final_conf": 0.5618973970413208, "adv/mean_abs_reasoning": 0.41221410036087036, "adv/mean_abs_step_conf": 0.7482247352600098, "adv/ratio_final_to_reasoning": 1.3631202730557035, "adv/ratio_step_to_reasoning": 1.8151361988951396, "adv/std_final_conf": 0.7664877772331238, "adv/std_reasoning": 0.68166184425354, "adv/std_step_conf": 0.9334881901741028, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6659549689440992, "calib/avg_num_step_conf": 6.5390625, "calib/ece": 0.42140000000000016, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": 0.0050271739130431925, "calib/mean_conf": 0.9734, "calib/mu_c": 0.9756521739130434, "calib/mu_w": 0.9706250000000002, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42140000000000016, "calib/std_conf": 0.010509043724335723, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4369565217391304, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.07084777973486606, "calib/step_q_w": 0.36610874200426435, "calib/step_q_w_n": 938.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 558.140625, "completions/mean_terminated_length": 562.5354614257812, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.1184, "grad_norm": 0.057587627321481705, "learning_rate": 2.4722222222222226e-06, "loss": -0.0325, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0363595113158226, "mask/share_reasoning": 0.8237424492835999, "mask/share_step_conf": 0.13208553194999695, "num_tokens": 28124678.0, "reward": 1.0738959312438965, "reward_std": 0.18189433217048645, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5639668107032776, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8537999987602234, "step": 111 }, { "adv/mean_abs_final_conf": 0.5185309648513794, "adv/mean_abs_reasoning": 0.4189787209033966, "adv/mean_abs_step_conf": 0.7416942119598389, "adv/ratio_final_to_reasoning": 1.2376069212616085, "adv/ratio_step_to_reasoning": 1.7702431530665979, "adv/std_final_conf": 0.7534166574478149, "adv/std_reasoning": 0.7013196349143982, "adv/std_step_conf": 0.9343290328979492, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7158685256280731, "calib/avg_num_step_conf": 7.2109375, "calib/ece": 0.3785887096774195, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9758064516129032, "calib/gap": 0.006921937091668218, "calib/mean_conf": 0.9713306451612905, "calib/mu_c": 0.9741496598639455, "calib/mu_w": 0.9672277227722773, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3785887096774195, "calib/std_conf": 0.01641951129591469, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.412217659137577, "calib/step_q_c_n": 974.0, "calib/step_q_gap": 0.1006465582201459, "calib/step_q_w": 0.3115711009174311, "calib/step_q_w_n": 872.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 598.7265625, "completions/mean_terminated_length": 605.8261108398438, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.11946666666666667, "grad_norm": 0.08669447153806686, "learning_rate": 2.4444444444444447e-06, "loss": -0.0538, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02860027551651001, "mask/share_reasoning": 0.8302577137947083, "mask/share_step_conf": 0.12942329049110413, "num_tokens": 28385872.0, "reward": 1.0846667289733887, "reward_std": 0.17266321182250977, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.5990207195281982, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8411459922790527, "step": 112 }, { "adv/mean_abs_final_conf": 0.5774078369140625, "adv/mean_abs_reasoning": 0.44517821073532104, "adv/mean_abs_step_conf": 0.7621119022369385, "adv/ratio_final_to_reasoning": 1.297026276196968, "adv/ratio_step_to_reasoning": 1.7119254353849072, "adv/std_final_conf": 0.8084132671356201, "adv/std_reasoning": 0.7392378449440002, "adv/std_step_conf": 0.9335689544677734, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6918721061193842, "calib/avg_num_step_conf": 6.69921875, "calib/ece": 0.4561660079051384, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": 0.004625829057689601, "calib/mean_conf": 0.9739525691699605, "calib/mu_c": 0.9761832061068701, "calib/mu_w": 0.9715573770491805, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4561660079051384, "calib/std_conf": 0.008208602166376925, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41018404907975453, "calib/step_q_c_n": 815.0, "calib/step_q_gap": 0.039806271301976726, "calib/step_q_w": 0.3703777777777778, "calib/step_q_w_n": 900.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3042.0, "completions/max_terminated_length": 3042.0, "completions/mean_length": 484.421875, "completions/mean_terminated_length": 486.32159423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.12053333333333334, "grad_norm": 0.0665605217218399, "learning_rate": 2.4166666666666667e-06, "loss": -0.0361, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03515036031603813, "mask/share_reasoning": 0.8132357597351074, "mask/share_step_conf": 0.14770758152008057, "num_tokens": 28615084.0, "reward": 1.0729321241378784, "reward_std": 0.18777276575565338, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5378601551055908, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8725234866142273, "step": 113 }, { "adv/mean_abs_final_conf": 0.4570245146751404, "adv/mean_abs_reasoning": 0.40027526021003723, "adv/mean_abs_step_conf": 0.7232214212417603, "adv/ratio_final_to_reasoning": 1.141775573227596, "adv/ratio_step_to_reasoning": 1.8068101957194727, "adv/std_final_conf": 0.7423800826072693, "adv/std_reasoning": 0.7013442516326904, "adv/std_step_conf": 0.9326123595237732, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.730661040787623, "calib/avg_num_step_conf": 7.09765625, "calib/ece": 0.29232, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.004613220815752661, "calib/mean_conf": 0.9763200000000001, "calib/mu_c": 0.9777777777777781, "calib/mu_w": 0.9731645569620254, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29232, "calib/std_conf": 0.004822613399392495, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43347910592808553, "calib/step_q_c_n": 1029.0, "calib/step_q_gap": 0.08303494349153734, "calib/step_q_w": 0.3504441624365482, "calib/step_q_w_n": 788.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2923.0, "completions/max_terminated_length": 2923.0, "completions/mean_length": 497.50390625, "completions/mean_terminated_length": 503.4031677246094, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.1216, "grad_norm": 0.06393461674451828, "learning_rate": 2.388888888888889e-06, "loss": -0.0075, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.037301406264305115, "mask/share_reasoning": 0.7991474866867065, "mask/share_step_conf": 0.15183240175247192, "num_tokens": 28847469.0, "reward": 1.1506597995758057, "reward_std": 0.18302500247955322, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6839609146118164, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8589682579040527, "step": 114 }, { "adv/mean_abs_final_conf": 0.5060404539108276, "adv/mean_abs_reasoning": 0.4313282370567322, "adv/mean_abs_step_conf": 0.7706390619277954, "adv/ratio_final_to_reasoning": 1.1732142958316654, "adv/ratio_step_to_reasoning": 1.7866649936633618, "adv/std_final_conf": 0.7697705626487732, "adv/std_reasoning": 0.7204693555831909, "adv/std_step_conf": 0.9332576394081116, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6138629888629888, "calib/avg_num_step_conf": 6.5234375, "calib/ece": 0.3597254901960786, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9529411764705882, "calib/gap": 0.01085664335664338, "calib/mean_conf": 0.9714901960784316, "calib/mu_c": 0.9757051282051284, "calib/mu_w": 0.9648484848484851, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3597254901960786, "calib/std_conf": 0.023028858748441506, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4334791454730417, "calib/step_q_c_n": 983.0, "calib/step_q_gap": 0.05564799554582195, "calib/step_q_w": 0.3778311499272198, "calib/step_q_w_n": 687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 450.578125, "completions/mean_terminated_length": 452.3451232910156, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.12266666666666666, "grad_norm": 0.05441709980368614, "learning_rate": 2.361111111111111e-06, "loss": 0.052, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03709343448281288, "mask/share_reasoning": 0.8064676523208618, "mask/share_step_conf": 0.15253272652626038, "num_tokens": 29068081.0, "reward": 1.1251858472824097, "reward_std": 0.1664474904537201, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6352245807647705, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8627021908760071, "step": 115 }, { "adv/mean_abs_final_conf": 0.5148700475692749, "adv/mean_abs_reasoning": 0.40699562430381775, "adv/mean_abs_step_conf": 0.758590042591095, "adv/ratio_final_to_reasoning": 1.265050572595174, "adv/ratio_step_to_reasoning": 1.863877637231834, "adv/std_final_conf": 0.7608790397644043, "adv/std_reasoning": 0.7012465000152588, "adv/std_step_conf": 0.9334179162979126, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6482262465889357, "calib/avg_num_step_conf": 6.99609375, "calib/ece": 0.42666666666666664, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9607843137254902, "calib/gap": 0.008298189034979142, "calib/mean_conf": 0.971764705882353, "calib/mu_c": 0.975539568345324, "calib/mu_w": 0.9672413793103448, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42666666666666664, "calib/std_conf": 0.021116472740302206, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4242675159235668, "calib/step_q_c_n": 942.0, "calib/step_q_gap": 0.04038059012851386, "calib/step_q_w": 0.38388692579505296, "calib/step_q_w_n": 849.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 562.62890625, "completions/mean_terminated_length": 562.62890625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.12373333333333333, "grad_norm": 0.08195675164461136, "learning_rate": 2.3333333333333336e-06, "loss": -0.0084, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03406328707933426, "mask/share_reasoning": 0.8288680911064148, "mask/share_step_conf": 0.13706859946250916, "num_tokens": 29316634.0, "reward": 1.107461929321289, "reward_std": 0.1593656986951828, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5714179277420044, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8904621601104736, "step": 116 }, { "adv/mean_abs_final_conf": 0.6606265306472778, "adv/mean_abs_reasoning": 0.490254670381546, "adv/mean_abs_step_conf": 0.7552804946899414, "adv/ratio_final_to_reasoning": 1.3475170570698247, "adv/ratio_step_to_reasoning": 1.540588066406662, "adv/std_final_conf": 0.8490501046180725, "adv/std_reasoning": 0.7392802834510803, "adv/std_step_conf": 0.9333322048187256, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6984117212509234, "calib/avg_num_step_conf": 6.46875, "calib/ece": 0.4463137254901964, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8392156862745098, "calib/gap": 0.02378047278995321, "calib/mean_conf": 0.9600392156862746, "calib/mu_c": 0.9716030534351147, "calib/mu_w": 0.9478225806451614, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4463137254901964, "calib/std_conf": 0.038719686832468826, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4325380710659899, "calib/step_q_c_n": 788.0, "calib/step_q_gap": 0.04828691899225723, "calib/step_q_w": 0.38425115207373267, "calib/step_q_w_n": 868.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 515.0390625, "completions/mean_terminated_length": 515.0390625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.1248, "grad_norm": 0.09659411013126373, "learning_rate": 2.305555555555556e-06, "loss": -0.1011, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03404964506626129, "mask/share_reasoning": 0.8259721398353577, "mask/share_step_conf": 0.13997818529605865, "num_tokens": 29555084.0, "reward": 1.0920822620391846, "reward_std": 0.18369436264038086, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5591816306114197, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8822801113128662, "step": 117 }, { "adv/mean_abs_final_conf": 0.5085911154747009, "adv/mean_abs_reasoning": 0.33327728509902954, "adv/mean_abs_step_conf": 0.7403470277786255, "adv/ratio_final_to_reasoning": 1.5260299402750441, "adv/ratio_step_to_reasoning": 2.221414602434246, "adv/std_final_conf": 0.7594017386436462, "adv/std_reasoning": 0.6401320695877075, "adv/std_step_conf": 0.9297709465026855, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7194628647214855, "calib/avg_num_step_conf": 6.90625, "calib/ece": 0.36622489959839366, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7429718875502008, "calib/gap": 0.03466909814323582, "calib/mean_conf": 0.9485542168674699, "calib/mu_c": 0.9630344827586206, "calib/mu_w": 0.9283653846153848, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36622489959839366, "calib/std_conf": 0.05085926396649889, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4019910011248594, "calib/step_q_c_n": 889.0, "calib/step_q_gap": 0.04269634811006989, "calib/step_q_w": 0.3592946530147895, "calib/step_q_w_n": 879.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2543.0, "completions/max_terminated_length": 2543.0, "completions/mean_length": 499.3046875, "completions/mean_terminated_length": 507.2301940917969, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.12586666666666665, "grad_norm": 0.06263020634651184, "learning_rate": 2.277777777777778e-06, "loss": -0.0716, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03446434438228607, "mask/share_reasoning": 0.8053621053695679, "mask/share_step_conf": 0.14454856514930725, "num_tokens": 29786914.0, "reward": 1.0896894931793213, "reward_std": 0.1556907594203949, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6195191144943237, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8341772556304932, "step": 118 }, { "adv/mean_abs_final_conf": 0.6502874493598938, "adv/mean_abs_reasoning": 0.4082077741622925, "adv/mean_abs_step_conf": 0.7355071902275085, "adv/ratio_final_to_reasoning": 1.5930305362125634, "adv/ratio_step_to_reasoning": 1.8017961361389716, "adv/std_final_conf": 0.8503894209861755, "adv/std_reasoning": 0.6816251277923584, "adv/std_step_conf": 0.9338120222091675, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7246560228452752, "calib/avg_num_step_conf": 7.41796875, "calib/ece": 0.35003984063745025, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5139442231075697, "calib/gap": 0.049859813084112026, "calib/mean_conf": 0.9237450199203188, "calib/mu_c": 0.9449999999999998, "calib/mu_w": 0.8951401869158878, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35003984063745025, "calib/std_conf": 0.05878177893667188, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42159609120521174, "calib/step_q_c_n": 921.0, "calib/step_q_gap": 0.08709711370009926, "calib/step_q_w": 0.3344989775051125, "calib/step_q_w_n": 978.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2936.0, "completions/max_terminated_length": 2936.0, "completions/mean_length": 639.0, "completions/mean_terminated_length": 639.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.12693333333333334, "grad_norm": 0.11909914016723633, "learning_rate": 2.25e-06, "loss": 0.0345, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030238190665841103, "mask/share_reasoning": 0.8378813862800598, "mask/share_step_conf": 0.13188043236732483, "num_tokens": 30055562.0, "reward": 1.1247520446777344, "reward_std": 0.1627296805381775, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6410672068595886, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8665619492530823, "step": 119 }, { "adv/mean_abs_final_conf": 0.5520824790000916, "adv/mean_abs_reasoning": 0.27621161937713623, "adv/mean_abs_step_conf": 0.7582372426986694, "adv/ratio_final_to_reasoning": 1.9987663091257735, "adv/ratio_step_to_reasoning": 2.7451315929739395, "adv/std_final_conf": 0.8035014867782593, "adv/std_reasoning": 0.5725486874580383, "adv/std_step_conf": 0.932507336139679, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7933700118392646, "calib/avg_num_step_conf": 6.1015625, "calib/ece": 0.2482812499999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.515625, "calib/gap": 0.06056689184483577, "calib/mean_conf": 0.9240625, "calib/mu_c": 0.9436994219653179, "calib/mu_w": 0.8831325301204821, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2482812499999999, "calib/std_conf": 0.05941035131818359, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4170348258706468, "calib/step_q_c_n": 1005.0, "calib/step_q_gap": 0.058489044901167464, "calib/step_q_w": 0.3585457809694793, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1254.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 469.94140625, "completions/mean_terminated_length": 471.7843322753906, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.128, "grad_norm": 0.09170673042535782, "learning_rate": 2.222222222222222e-06, "loss": -0.0359, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03412052243947983, "mask/share_reasoning": 0.82478928565979, "mask/share_step_conf": 0.13718390464782715, "num_tokens": 30282555.0, "reward": 1.2048192024230957, "reward_std": 0.10698007047176361, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7422664165496826, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8881438374519348, "step": 120 }, { "adv/mean_abs_final_conf": 0.6064180135726929, "adv/mean_abs_reasoning": 0.48254069685935974, "adv/mean_abs_step_conf": 0.7727996706962585, "adv/ratio_final_to_reasoning": 1.256718899607007, "adv/ratio_step_to_reasoning": 1.6015222668804183, "adv/std_final_conf": 0.8220515847206116, "adv/std_reasoning": 0.739255428314209, "adv/std_step_conf": 0.9320149421691895, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5662593984962406, "calib/avg_num_step_conf": 6.72265625, "calib/ece": 0.38208661417322853, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5984251968503937, "calib/gap": 0.013408521303258425, "calib/mean_conf": 0.9332677165354332, "calib/mu_c": 0.9392857142857145, "calib/mu_w": 0.9258771929824561, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38208661417322853, "calib/std_conf": 0.06015032989206602, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4264102564102564, "calib/step_q_c_n": 858.0, "calib/step_q_gap": 0.030454288855215894, "calib/step_q_w": 0.39595596755504053, "calib/step_q_w_n": 863.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2808.0, "completions/max_terminated_length": 2808.0, "completions/mean_length": 547.4296875, "completions/mean_terminated_length": 549.5764770507812, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.12906666666666666, "grad_norm": 0.10754235833883286, "learning_rate": 2.1944444444444445e-06, "loss": 0.0076, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0344381257891655, "mask/share_reasoning": 0.8288736939430237, "mask/share_step_conf": 0.1327819526195526, "num_tokens": 30527753.0, "reward": 1.109503984451294, "reward_std": 0.17705771327018738, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6048824191093445, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8708754181861877, "step": 121 }, { "adv/mean_abs_final_conf": 0.5294678211212158, "adv/mean_abs_reasoning": 0.43470725417137146, "adv/mean_abs_step_conf": 0.7290816307067871, "adv/ratio_final_to_reasoning": 1.217987084504662, "adv/ratio_step_to_reasoning": 1.6771784314861389, "adv/std_final_conf": 0.7790201306343079, "adv/std_reasoning": 0.7205578684806824, "adv/std_step_conf": 0.9326818585395813, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7954926624737946, "calib/avg_num_step_conf": 7.2890625, "calib/ece": 0.2971084337349401, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6867469879518072, "calib/gap": 0.0826457023060797, "calib/mean_conf": 0.9356626506024097, "calib/mu_c": 0.9655345911949686, "calib/mu_w": 0.882888888888889, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2971084337349401, "calib/std_conf": 0.07459436929369202, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4255555555555556, "calib/step_q_c_n": 981.0, "calib/step_q_gap": 0.11382674199623355, "calib/step_q_w": 0.311728813559322, "calib/step_q_w_n": 885.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 519.33203125, "completions/mean_terminated_length": 523.4212646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.13013333333333332, "grad_norm": 0.06262241303920746, "learning_rate": 2.166666666666667e-06, "loss": -0.0341, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03336969390511513, "mask/share_reasoning": 0.8161087036132812, "mask/share_step_conf": 0.14270910620689392, "num_tokens": 30768046.0, "reward": 1.1628289222717285, "reward_std": 0.19497352838516235, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6939992308616638, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8752723932266235, "step": 122 }, { "adv/mean_abs_final_conf": 0.6652189493179321, "adv/mean_abs_reasoning": 0.5161447525024414, "adv/mean_abs_step_conf": 0.7464606761932373, "adv/ratio_final_to_reasoning": 1.288822459383205, "adv/ratio_step_to_reasoning": 1.44622351108705, "adv/std_final_conf": 0.8896883130073547, "adv/std_reasoning": 0.7754158973693848, "adv/std_step_conf": 0.93485027551651, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6206302794022093, "calib/avg_num_step_conf": 6.27734375, "calib/ece": 0.38052208835341383, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.570281124497992, "calib/gap": 0.03020272904483412, "calib/mean_conf": 0.9226907630522089, "calib/mu_c": 0.9365185185185185, "calib/mu_w": 0.9063157894736844, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.38052208835341383, "calib/std_conf": 0.07372741472306667, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4364375, "calib/step_q_c_n": 800.0, "calib/step_q_gap": 0.06425387717121583, "calib/step_q_w": 0.37218362282878414, "calib/step_q_w_n": 806.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 603.11328125, "completions/mean_terminated_length": 605.4784545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.1312, "grad_norm": 0.08993158489465714, "learning_rate": 2.138888888888889e-06, "loss": -0.0675, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03210265934467316, "mask/share_reasoning": 0.8405189514160156, "mask/share_step_conf": 0.12347208708524704, "num_tokens": 31027731.0, "reward": 1.0731170177459717, "reward_std": 0.22716030478477478, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5956000089645386, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8342767953872681, "step": 123 }, { "adv/mean_abs_final_conf": 0.5562883615493774, "adv/mean_abs_reasoning": 0.3537698984146118, "adv/mean_abs_step_conf": 0.7436978816986084, "adv/ratio_final_to_reasoning": 1.5724581544171339, "adv/ratio_step_to_reasoning": 2.102207918286502, "adv/std_final_conf": 0.7942464351654053, "adv/std_reasoning": 0.6402589082717896, "adv/std_step_conf": 0.9312188029289246, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6676470588235294, "calib/avg_num_step_conf": 6.49609375, "calib/ece": 0.3279051383399212, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6482213438735178, "calib/gap": 0.048364705882352976, "calib/mean_conf": 0.9326482213438737, "calib/mu_c": 0.951764705882353, "calib/mu_w": 0.9034, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3279051383399212, "calib/std_conf": 0.070803566481362, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4387348178137652, "calib/step_q_c_n": 988.0, "calib/step_q_gap": 0.03294222522117268, "calib/step_q_w": 0.40579259259259254, "calib/step_q_w_n": 675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2152.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 525.47265625, "completions/mean_terminated_length": 527.5333862304688, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.13226666666666667, "grad_norm": 0.08522067964076996, "learning_rate": 2.1111111111111114e-06, "loss": -0.0915, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03211629018187523, "mask/share_reasoning": 0.8295165300369263, "mask/share_step_conf": 0.1344609558582306, "num_tokens": 31269068.0, "reward": 1.132234811782837, "reward_std": 0.15816253423690796, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6636874675750732, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8552088737487793, "step": 124 }, { "adv/mean_abs_final_conf": 0.5968010425567627, "adv/mean_abs_reasoning": 0.5435364246368408, "adv/mean_abs_step_conf": 0.7577736377716064, "adv/ratio_final_to_reasoning": 1.0979964092664263, "adv/ratio_step_to_reasoning": 1.3941542892510035, "adv/std_final_conf": 0.7921422123908997, "adv/std_reasoning": 0.7577725648880005, "adv/std_step_conf": 0.9344490766525269, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5902048144700092, "calib/avg_num_step_conf": 6.61328125, "calib/ece": 0.3313253012048195, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6024096385542169, "calib/gap": 0.029971405772043935, "calib/mean_conf": 0.9176706827309239, "calib/mu_c": 0.9300684931506851, "calib/mu_w": 0.9000970873786411, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3313253012048195, "calib/std_conf": 0.08431745797170943, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42316427783902977, "calib/step_q_c_n": 907.0, "calib/step_q_gap": 0.012136288017146835, "calib/step_q_w": 0.41102798982188293, "calib/step_q_w_n": 786.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2591.0, "completions/max_terminated_length": 2591.0, "completions/mean_length": 538.6796875, "completions/mean_terminated_length": 545.0671997070312, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.13333333333333333, "grad_norm": 0.08532305061817169, "learning_rate": 2.0833333333333334e-06, "loss": -0.0838, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033499591052532196, "mask/share_reasoning": 0.82036292552948, "mask/share_step_conf": 0.13441874086856842, "num_tokens": 31511778.0, "reward": 1.1019268035888672, "reward_std": 0.21884110569953918, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6370406150817871, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.839333713054657, "step": 125 }, { "adv/mean_abs_final_conf": 0.54444819688797, "adv/mean_abs_reasoning": 0.41777682304382324, "adv/mean_abs_step_conf": 0.7403782606124878, "adv/ratio_final_to_reasoning": 1.3032034494428126, "adv/ratio_step_to_reasoning": 1.7721860567043108, "adv/std_final_conf": 0.7756986021995544, "adv/std_reasoning": 0.6817764043807983, "adv/std_step_conf": 0.9344750046730042, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7156347352024922, "calib/avg_num_step_conf": 7.6875, "calib/ece": 0.3045418326693228, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.545816733067729, "calib/gap": 0.10299389927310498, "calib/mean_conf": 0.8782470119521912, "calib/mu_c": 0.9221527777777779, "calib/mu_w": 0.819158878504673, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3045418326693228, "calib/std_conf": 0.13293842275301604, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41729032258064513, "calib/step_q_c_n": 930.0, "calib/step_q_gap": 0.09042134377525018, "calib/step_q_w": 0.32686897880539495, "calib/step_q_w_n": 1038.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2424.0, "completions/max_terminated_length": 2424.0, "completions/mean_length": 537.33203125, "completions/mean_terminated_length": 543.7035522460938, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.1344, "grad_norm": 0.10794106125831604, "learning_rate": 2.0555555555555555e-06, "loss": -0.0897, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032997895032167435, "mask/share_reasoning": 0.8024303317070007, "mask/share_step_conf": 0.15285301208496094, "num_tokens": 31754799.0, "reward": 1.1234350204467773, "reward_std": 0.1804707646369934, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6802804470062256, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8391845226287842, "step": 126 }, { "adv/mean_abs_final_conf": 0.6498432755470276, "adv/mean_abs_reasoning": 0.4853948950767517, "adv/mean_abs_step_conf": 0.7725515365600586, "adv/ratio_final_to_reasoning": 1.3387929748298506, "adv/ratio_step_to_reasoning": 1.5915938638742813, "adv/std_final_conf": 0.8430218696594238, "adv/std_reasoning": 0.7394492626190186, "adv/std_step_conf": 0.9329238533973694, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6920798898071625, "calib/avg_num_step_conf": 7.234375, "calib/ece": 0.35410788381742747, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.43983402489626555, "calib/gap": 0.0855454545454547, "calib/mean_conf": 0.8480497925311203, "calib/mu_c": 0.8910000000000001, "calib/mu_w": 0.8054545454545454, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3521161825726142, "calib/std_conf": 0.14278474357536333, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4363019390581717, "calib/step_q_c_n": 722.0, "calib/step_q_gap": 0.07835503640330443, "calib/step_q_w": 0.3579469026548673, "calib/step_q_w_n": 1130.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2625.0, "completions/max_terminated_length": 2625.0, "completions/mean_length": 540.75390625, "completions/mean_terminated_length": 553.7320556640625, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.13546666666666668, "grad_norm": 0.1398288458585739, "learning_rate": 2.027777777777778e-06, "loss": -0.0272, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.033511899411678314, "mask/share_reasoning": 0.8006240129470825, "mask/share_step_conf": 0.14242660999298096, "num_tokens": 31996904.0, "reward": 1.0577702522277832, "reward_std": 0.20324760675430298, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6117273569107056, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.814521312713623, "step": 127 }, { "adv/mean_abs_final_conf": 0.7356740832328796, "adv/mean_abs_reasoning": 0.5345216393470764, "adv/mean_abs_step_conf": 0.7763804197311401, "adv/ratio_final_to_reasoning": 1.3763223583080995, "adv/ratio_step_to_reasoning": 1.4524770609464879, "adv/std_final_conf": 0.8902674317359924, "adv/std_reasoning": 0.7576658129692078, "adv/std_step_conf": 0.9353314638137817, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.655264857881137, "calib/avg_num_step_conf": 5.9140625, "calib/ece": 0.26156626506024083, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.22088353413654618, "calib/gap": 0.0783507751937984, "calib/mean_conf": 0.7786746987951807, "calib/mu_c": 0.8164341085271317, "calib/mu_w": 0.7380833333333333, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2610843373493975, "calib/std_conf": 0.1617631598415949, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4414387464387464, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.07308899274416514, "calib/step_q_w": 0.3683497536945813, "calib/step_q_w_n": 812.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2324.0, "completions/max_terminated_length": 2324.0, "completions/mean_length": 524.58984375, "completions/mean_terminated_length": 532.9166870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.13653333333333334, "grad_norm": 0.15617628395557404, "learning_rate": 2.0000000000000003e-06, "loss": -0.1302, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03690294548869133, "mask/share_reasoning": 0.8143789172172546, "mask/share_step_conf": 0.13309314846992493, "num_tokens": 32237863.0, "reward": 1.115687608718872, "reward_std": 0.20451577007770538, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6724511981010437, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8424077033996582, "step": 128 }, { "adv/mean_abs_final_conf": 0.7323572635650635, "adv/mean_abs_reasoning": 0.3838053047657013, "adv/mean_abs_step_conf": 0.7599761486053467, "adv/ratio_final_to_reasoning": 1.9081478407708306, "adv/ratio_step_to_reasoning": 1.9801085059761838, "adv/std_final_conf": 0.8994174003601074, "adv/std_reasoning": 0.6403990983963013, "adv/std_step_conf": 0.9336506128311157, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6279545454545454, "calib/avg_num_step_conf": 6.3125, "calib/ece": 0.18479999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.124, "calib/gap": 0.09089610389610414, "calib/mean_conf": 0.73672, "calib/mu_c": 0.7767142857142858, "calib/mu_w": 0.6858181818181817, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18075999999999995, "calib/std_conf": 0.16621564787949417, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43683823529411764, "calib/step_q_c_n": 816.0, "calib/step_q_gap": 0.041325735294117705, "calib/step_q_w": 0.39551249999999993, "calib/step_q_w_n": 800.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 485.7890625, "completions/mean_terminated_length": 489.6141662597656, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.1376, "grad_norm": 0.15342004597187042, "learning_rate": 1.9722222222222224e-06, "loss": -0.0737, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03681214898824692, "mask/share_reasoning": 0.8119223117828369, "mask/share_step_conf": 0.14345306158065796, "num_tokens": 32464609.0, "reward": 1.1368520259857178, "reward_std": 0.15180076658725739, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7222031354904175, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8306880593299866, "step": 129 }, { "adv/mean_abs_final_conf": 0.7020096182823181, "adv/mean_abs_reasoning": 0.2727423906326294, "adv/mean_abs_step_conf": 0.743945837020874, "adv/ratio_final_to_reasoning": 2.5738925901983847, "adv/ratio_step_to_reasoning": 2.727650202431981, "adv/std_final_conf": 0.9080822467803955, "adv/std_reasoning": 0.5726157426834106, "adv/std_step_conf": 0.9340441226959229, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6890972222222222, "calib/avg_num_step_conf": 6.19921875, "calib/ece": 0.05103999999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.116, "calib/gap": 0.17471527777777762, "calib/mean_conf": 0.69104, "calib/mu_c": 0.7539374999999999, "calib/mu_w": 0.5792222222222223, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05103999999999999, "calib/std_conf": 0.20370497882967908, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4516629464285714, "calib/step_q_c_n": 896.0, "calib/step_q_gap": 0.08234312008993178, "calib/step_q_w": 0.36931982633863963, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3032.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 477.81640625, "completions/mean_terminated_length": 479.6902160644531, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.13866666666666666, "grad_norm": 0.16025367379188538, "learning_rate": 1.944444444444445e-06, "loss": 0.0115, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03768278285861015, "mask/share_reasoning": 0.8125240802764893, "mask/share_step_conf": 0.1458868533372879, "num_tokens": 32692218.0, "reward": 1.1927818059921265, "reward_std": 0.12730564177036285, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7871171832084656, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.851568341255188, "step": 130 }, { "adv/mean_abs_final_conf": 0.6797161102294922, "adv/mean_abs_reasoning": 0.3434407413005829, "adv/mean_abs_step_conf": 0.7750881910324097, "adv/ratio_final_to_reasoning": 1.9791365102912983, "adv/ratio_step_to_reasoning": 2.256832395880617, "adv/std_final_conf": 0.8826343417167664, "adv/std_reasoning": 0.6402589678764343, "adv/std_step_conf": 0.9340447187423706, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7046836589468236, "calib/avg_num_step_conf": 6.55078125, "calib/ece": 0.1591967871485945, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.060240963855421686, "calib/gap": 0.1846002070125501, "calib/mean_conf": 0.6292369477911647, "calib/mu_c": 0.7263559322033898, "calib/mu_w": 0.5417557251908397, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.157269076305221, "calib/std_conf": 0.2165441886147318, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42487394957983193, "calib/step_q_c_n": 714.0, "calib/step_q_gap": 0.04866418841679976, "calib/step_q_w": 0.3762097611630322, "calib/step_q_w_n": 963.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 470.01171875, "completions/mean_terminated_length": 475.5849914550781, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.13973333333333332, "grad_norm": 0.11347653716802597, "learning_rate": 1.916666666666667e-06, "loss": -0.0434, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.035636741667985916, "mask/share_reasoning": 0.8070310950279236, "mask/share_step_conf": 0.1456134170293808, "num_tokens": 32918749.0, "reward": 1.1516947746276855, "reward_std": 0.12257316708564758, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7506062388420105, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8440431356430054, "step": 131 }, { "adv/mean_abs_final_conf": 0.7347065806388855, "adv/mean_abs_reasoning": 0.5124211311340332, "adv/mean_abs_step_conf": 0.7653183937072754, "adv/ratio_final_to_reasoning": 1.4337944631848318, "adv/ratio_step_to_reasoning": 1.4935340234965686, "adv/std_final_conf": 0.889552116394043, "adv/std_reasoning": 0.7576735615730286, "adv/std_step_conf": 0.9337661266326904, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7162384598869249, "calib/avg_num_step_conf": 7.77734375, "calib/ece": 0.06260162601626003, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.14227642276422764, "calib/gap": 0.20626207686252068, "calib/mean_conf": 0.6051219512195123, "calib/mu_c": 0.6797452229299363, "calib/mu_w": 0.47348314606741565, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.01475609756097548, "calib/std_conf": 0.2543487326699838, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4245586708203531, "calib/step_q_c_n": 963.0, "calib/step_q_gap": 0.10524026770448164, "calib/step_q_w": 0.31931840311587145, "calib/step_q_w_n": 1027.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 580.7734375, "completions/mean_terminated_length": 589.9921264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.1408, "grad_norm": 0.1195579469203949, "learning_rate": 1.888888888888889e-06, "loss": -0.0584, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03593384847044945, "mask/share_reasoning": 0.7987890243530273, "mask/share_step_conf": 0.1496521532535553, "num_tokens": 33173019.0, "reward": 1.1788052320480347, "reward_std": 0.18831761181354523, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7660914063453674, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8511168360710144, "step": 132 }, { "adv/mean_abs_final_conf": 0.7405315041542053, "adv/mean_abs_reasoning": 0.45831504464149475, "adv/mean_abs_step_conf": 0.751035749912262, "adv/ratio_final_to_reasoning": 1.6157695733803965, "adv/ratio_step_to_reasoning": 1.6386888423000394, "adv/std_final_conf": 0.9202378988265991, "adv/std_reasoning": 0.7393971681594849, "adv/std_step_conf": 0.9354689121246338, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6978641456582634, "calib/avg_num_step_conf": 7.25390625, "calib/ece": 0.09942148760330577, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.06611570247933884, "calib/gap": 0.15078011204481806, "calib/mean_conf": 0.520909090909091, "calib/mu_c": 0.6081372549019609, "calib/mu_w": 0.45735714285714285, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09942148760330577, "calib/std_conf": 0.22624129589211192, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4150064516129032, "calib/step_q_c_n": 775.0, "calib/step_q_gap": 0.04018205235227473, "calib/step_q_w": 0.3748243992606285, "calib/step_q_w_n": 1082.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2730.0, "completions/max_terminated_length": 2730.0, "completions/mean_length": 574.1484375, "completions/mean_terminated_length": 595.06884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.14186666666666667, "grad_norm": 0.12294993549585342, "learning_rate": 1.8611111111111113e-06, "loss": -0.219, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03028670698404312, "mask/share_reasoning": 0.8093153238296509, "mask/share_step_conf": 0.12524168193340302, "num_tokens": 33426345.0, "reward": 1.1139624118804932, "reward_std": 0.2063348889350891, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.7242355346679688, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8222510814666748, "step": 133 }, { "adv/mean_abs_final_conf": 0.719078779220581, "adv/mean_abs_reasoning": 0.5342764854431152, "adv/mean_abs_step_conf": 0.7093563675880432, "adv/ratio_final_to_reasoning": 1.3458926207920148, "adv/ratio_step_to_reasoning": 1.3276952793453398, "adv/std_final_conf": 0.9208303093910217, "adv/std_reasoning": 0.7929035425186157, "adv/std_step_conf": 0.9340146780014038, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6706476649270708, "calib/avg_num_step_conf": 7.1875, "calib/ece": 0.11578947368421054, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.1214574898785425, "calib/gap": 0.16008229626655962, "calib/mean_conf": 0.5161943319838057, "calib/mu_c": 0.5848936170212766, "calib/mu_w": 0.424811320754717, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.030566801619433225, "calib/std_conf": 0.25029717294785286, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4097724810400866, "calib/step_q_c_n": 923.0, "calib/step_q_gap": 0.09069941669984671, "calib/step_q_w": 0.3190730643402399, "calib/step_q_w_n": 917.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2678.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 618.80859375, "completions/mean_terminated_length": 626.146240234375, "completions/min_length": 0.0, "completions/min_terminated_length": 80.0, "epoch": 0.14293333333333333, "grad_norm": 0.1309865415096283, "learning_rate": 1.8333333333333333e-06, "loss": -0.1192, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.029283631592988968, "mask/share_reasoning": 0.8341566324234009, "mask/share_step_conf": 0.12484101951122284, "num_tokens": 33693712.0, "reward": 1.156282901763916, "reward_std": 0.2100716233253479, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7374527454376221, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8479920625686646, "step": 134 }, { "adv/mean_abs_final_conf": 0.7240814566612244, "adv/mean_abs_reasoning": 0.4685104787349701, "adv/mean_abs_step_conf": 0.7005765438079834, "adv/ratio_final_to_reasoning": 1.5454968234997093, "adv/ratio_step_to_reasoning": 1.495327373892719, "adv/std_final_conf": 0.905745804309845, "adv/std_reasoning": 0.739366888999939, "adv/std_step_conf": 0.9335731863975525, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6283891547049442, "calib/avg_num_step_conf": 7.11328125, "calib/ece": 0.16039840637450203, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.23107569721115537, "calib/gap": 0.12990829346092503, "calib/mean_conf": 0.6083665338645419, "calib/mu_c": 0.6596052631578947, "calib/mu_w": 0.5296969696969697, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08159362549800796, "calib/std_conf": 0.2731039692196431, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4204934541792548, "calib/step_q_c_n": 993.0, "calib/step_q_gap": 0.08036060393770894, "calib/step_q_w": 0.3401328502415459, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 566.87890625, "completions/mean_terminated_length": 571.342529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.144, "grad_norm": 0.16953939199447632, "learning_rate": 1.8055555555555557e-06, "loss": -0.0455, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033975474536418915, "mask/share_reasoning": 0.8183913230895996, "mask/share_step_conf": 0.13982072472572327, "num_tokens": 33944713.0, "reward": 1.1777615547180176, "reward_std": 0.16259250044822693, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.732399582862854, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8727073073387146, "step": 135 }, { "adv/mean_abs_final_conf": 0.6876845955848694, "adv/mean_abs_reasoning": 0.4227200746536255, "adv/mean_abs_step_conf": 0.7711788415908813, "adv/ratio_final_to_reasoning": 1.6268084645574363, "adv/ratio_step_to_reasoning": 1.8243250979333807, "adv/std_final_conf": 0.890141487121582, "adv/std_reasoning": 0.7012588381767273, "adv/std_step_conf": 0.9338822960853577, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7928475033738192, "calib/avg_num_step_conf": 7.93359375, "calib/ece": 0.12668000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.324, "calib/gap": 0.3159302101407366, "calib/mean_conf": 0.59372, "calib/mu_c": 0.7617948717948719, "calib/mu_w": 0.44586466165413535, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1262000000000001, "calib/std_conf": 0.3059218226933149, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4130848329048843, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.10942162460480448, "calib/step_q_w": 0.3036632083000798, "calib/step_q_w_n": 1253.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 546.58203125, "completions/mean_terminated_length": 553.0632934570312, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.14506666666666668, "grad_norm": 0.17194612324237823, "learning_rate": 1.777777777777778e-06, "loss": -0.0445, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03519256412982941, "mask/share_reasoning": 0.8008227348327637, "mask/share_step_conf": 0.15226593613624573, "num_tokens": 34193126.0, "reward": 1.1875487565994263, "reward_std": 0.15522706508636475, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.7767167687416077, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8749620914459229, "step": 136 }, { "adv/mean_abs_final_conf": 0.6520270109176636, "adv/mean_abs_reasoning": 0.3962523937225342, "adv/mean_abs_step_conf": 0.7401241064071655, "adv/ratio_final_to_reasoning": 1.6454840935906854, "adv/ratio_step_to_reasoning": 1.8678098053974632, "adv/std_final_conf": 0.8592166304588318, "adv/std_reasoning": 0.7013719081878662, "adv/std_step_conf": 0.9343339800834656, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6180776989653153, "calib/avg_num_step_conf": 7.67578125, "calib/ece": 0.2571370967741937, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4959677419354839, "calib/gap": 0.11666493134639155, "calib/mean_conf": 0.7086693548387096, "calib/mu_c": 0.7655905511811025, "calib/mu_w": 0.648925619834711, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22685483870967757, "calib/std_conf": 0.3003322144906771, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4082, "calib/step_q_c_n": 900.0, "calib/step_q_gap": 0.062425352112676136, "calib/step_q_w": 0.34577464788732387, "calib/step_q_w_n": 1065.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 543.8359375, "completions/mean_terminated_length": 550.2846069335938, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.14613333333333334, "grad_norm": 0.11026394367218018, "learning_rate": 1.75e-06, "loss": -0.082, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.033572569489479065, "mask/share_reasoning": 0.7995332479476929, "mask/share_step_conf": 0.15517544746398926, "num_tokens": 34439332.0, "reward": 1.1067839860916138, "reward_std": 0.22292152047157288, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6548605561256409, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8438256978988647, "step": 137 }, { "adv/mean_abs_final_conf": 0.5672576427459717, "adv/mean_abs_reasoning": 0.46197739243507385, "adv/mean_abs_step_conf": 0.7603617310523987, "adv/ratio_final_to_reasoning": 1.2278904812981597, "adv/ratio_step_to_reasoning": 1.6458851526143883, "adv/std_final_conf": 0.799165666103363, "adv/std_reasoning": 0.7206169366836548, "adv/std_step_conf": 0.933541476726532, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7085352622061483, "calib/avg_num_step_conf": 7.29296875, "calib/ece": 0.14988188976377964, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5748031496062992, "calib/gap": 0.2578669077757687, "calib/mean_conf": 0.733740157480315, "calib/mu_c": 0.8139428571428573, "calib/mu_w": 0.5560759493670886, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09732283464566938, "calib/std_conf": 0.3059408288371813, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.39558082859463856, "calib/step_q_c_n": 1231.0, "calib/step_q_gap": 0.06086384746256307, "calib/step_q_w": 0.3347169811320755, "calib/step_q_w_n": 636.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 515.046875, "completions/mean_terminated_length": 517.0667114257812, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.1472, "grad_norm": 0.07487356662750244, "learning_rate": 1.7222222222222224e-06, "loss": -0.1116, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.036223817616701126, "mask/share_reasoning": 0.8065900206565857, "mask/share_step_conf": 0.15327994525432587, "num_tokens": 34675520.0, "reward": 1.2233290672302246, "reward_std": 0.16680964827537537, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7919644117355347, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8802123665809631, "step": 138 }, { "adv/mean_abs_final_conf": 0.6250104904174805, "adv/mean_abs_reasoning": 0.4324566721916199, "adv/mean_abs_step_conf": 0.7376917004585266, "adv/ratio_final_to_reasoning": 1.4452557460844095, "adv/ratio_step_to_reasoning": 1.7058164387198038, "adv/std_final_conf": 0.8434977531433105, "adv/std_reasoning": 0.7205010652542114, "adv/std_step_conf": 0.9194177985191345, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6968005952380952, "calib/avg_num_step_conf": 5.95703125, "calib/ece": 0.20024000000000017, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.704, "calib/gap": 0.2039502164502166, "calib/mean_conf": 0.8148000000000001, "calib/mu_c": 0.8931168831168832, "calib/mu_w": 0.6891666666666666, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19952000000000017, "calib/std_conf": 0.26922585314192987, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.436105651105651, "calib/step_q_c_n": 814.0, "calib/step_q_gap": 0.08655572142913909, "calib/step_q_w": 0.34954992967651194, "calib/step_q_w_n": 711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2971.0, "completions/max_terminated_length": 2971.0, "completions/mean_length": 475.13671875, "completions/mean_terminated_length": 478.8779602050781, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.14826666666666666, "grad_norm": 0.14141923189163208, "learning_rate": 1.6944444444444446e-06, "loss": 0.0495, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03738235682249069, "mask/share_reasoning": 0.8101898431777954, "mask/share_step_conf": 0.1446153074502945, "num_tokens": 34900251.0, "reward": 1.1667213439941406, "reward_std": 0.20426005125045776, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7265039086341858, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8613967895507812, "step": 139 }, { "adv/mean_abs_final_conf": 0.4288480281829834, "adv/mean_abs_reasoning": 0.33103907108306885, "adv/mean_abs_step_conf": 0.7590134143829346, "adv/ratio_final_to_reasoning": 1.2954604626575061, "adv/ratio_step_to_reasoning": 2.2928212428208288, "adv/std_final_conf": 0.6866632699966431, "adv/std_reasoning": 0.6185633540153503, "adv/std_step_conf": 0.9310550689697266, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7131553708439897, "calib/avg_num_step_conf": 5.73828125, "calib/ece": 0.15658730158730189, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7936507936507936, "calib/gap": 0.2169980818414322, "calib/mean_conf": 0.8618253968253968, "calib/mu_c": 0.9203804347826088, "calib/mu_w": 0.7033823529411766, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14412698412698444, "calib/std_conf": 0.24140264985082757, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4601015228426396, "calib/step_q_c_n": 985.0, "calib/step_q_gap": 0.12627920879305282, "calib/step_q_w": 0.3338223140495868, "calib/step_q_w_n": 484.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2959.0, "completions/max_terminated_length": 2959.0, "completions/mean_length": 485.9453125, "completions/mean_terminated_length": 487.85101318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.14933333333333335, "grad_norm": 0.1173972561955452, "learning_rate": 1.6666666666666667e-06, "loss": 0.0124, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03842870146036148, "mask/share_reasoning": 0.8230201005935669, "mask/share_step_conf": 0.13464492559432983, "num_tokens": 35129669.0, "reward": 1.235023021697998, "reward_std": 0.13625219464302063, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.8001695275306702, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8861677646636963, "step": 140 }, { "adv/mean_abs_final_conf": 0.5950652956962585, "adv/mean_abs_reasoning": 0.48388898372650146, "adv/mean_abs_step_conf": 0.7792325019836426, "adv/ratio_final_to_reasoning": 1.229755823564264, "adv/ratio_step_to_reasoning": 1.6103538790708904, "adv/std_final_conf": 0.8279574513435364, "adv/std_reasoning": 0.7575324177742004, "adv/std_step_conf": 0.9340521097183228, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6933891612200437, "calib/avg_num_step_conf": 6.671875, "calib/ece": 0.26662650602409677, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8232931726907631, "calib/gap": 0.2202246732026144, "calib/mean_conf": 0.8759437751004017, "calib/mu_c": 0.9608496732026145, "calib/mu_w": 0.7406250000000001, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2640562248995988, "calib/std_conf": 0.23709082878886054, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41911298838437167, "calib/step_q_c_n": 947.0, "calib/step_q_gap": 0.09636660205060033, "calib/step_q_w": 0.32274638633377134, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 569.62109375, "completions/mean_terminated_length": 574.1063232421875, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.1504, "grad_norm": 0.08798349648714066, "learning_rate": 1.638888888888889e-06, "loss": 0.0634, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03277936577796936, "mask/share_reasoning": 0.8236855864524841, "mask/share_step_conf": 0.1357225626707077, "num_tokens": 35382588.0, "reward": 1.165555715560913, "reward_std": 0.2309819459915161, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7223124504089355, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8636784553527832, "step": 141 }, { "adv/mean_abs_final_conf": 0.5308142900466919, "adv/mean_abs_reasoning": 0.3850702941417694, "adv/mean_abs_step_conf": 0.7345419526100159, "adv/ratio_final_to_reasoning": 1.3784867285848454, "adv/ratio_step_to_reasoning": 1.907552890432995, "adv/std_final_conf": 0.7917589545249939, "adv/std_reasoning": 0.6814852356910706, "adv/std_step_conf": 0.9330111742019653, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5695026587425712, "calib/avg_num_step_conf": 6.98828125, "calib/ece": 0.4083464566929137, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9330708661417323, "calib/gap": 0.02395871129183602, "calib/mean_conf": 0.9468503937007876, "calib/mu_c": 0.957697841726619, "calib/mu_w": 0.933739130434783, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4039763779527562, "calib/std_conf": 0.13166230238729587, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4570890410958904, "calib/step_q_c_n": 876.0, "calib/step_q_gap": 0.1062337779379956, "calib/step_q_w": 0.3508552631578948, "calib/step_q_w_n": 912.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 514.02734375, "completions/mean_terminated_length": 518.0748291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.15146666666666667, "grad_norm": 0.07767757773399353, "learning_rate": 1.6111111111111113e-06, "loss": -0.0422, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034924302250146866, "mask/share_reasoning": 0.8092361688613892, "mask/share_step_conf": 0.14802706241607666, "num_tokens": 35619339.0, "reward": 1.0883498191833496, "reward_std": 0.17449073493480682, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5785929560661316, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8612378835678101, "step": 142 }, { "adv/mean_abs_final_conf": 0.4907079339027405, "adv/mean_abs_reasoning": 0.43104416131973267, "adv/mean_abs_step_conf": 0.7607996463775635, "adv/ratio_final_to_reasoning": 1.1384168443445204, "adv/ratio_step_to_reasoning": 1.7650155474747988, "adv/std_final_conf": 0.75885409116745, "adv/std_reasoning": 0.7205471992492676, "adv/std_step_conf": 0.9344484806060791, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6228807812288079, "calib/avg_num_step_conf": 7.0703125, "calib/ece": 0.35720647773279385, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9392712550607287, "calib/gap": 0.07718974637189746, "calib/mean_conf": 0.9482995951417007, "calib/mu_c": 0.9798630136986304, "calib/mu_w": 0.9026732673267329, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35720647773279385, "calib/std_conf": 0.13272427097218972, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4336813441483198, "calib/step_q_c_n": 863.0, "calib/step_q_gap": 0.1462367823742966, "calib/step_q_w": 0.2874445617740232, "calib/step_q_w_n": 947.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 551.10546875, "completions/mean_terminated_length": 555.4448852539062, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.15253333333333333, "grad_norm": 0.06190464645624161, "learning_rate": 1.5833333333333333e-06, "loss": 0.0387, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033856652677059174, "mask/share_reasoning": 0.8064756989479065, "mask/share_step_conf": 0.15185511112213135, "num_tokens": 35867758.0, "reward": 1.1040698289871216, "reward_std": 0.2155992090702057, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6275339722633362, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8490495681762695, "step": 143 }, { "adv/mean_abs_final_conf": 0.5087292194366455, "adv/mean_abs_reasoning": 0.4547651410102844, "adv/mean_abs_step_conf": 0.7651829123497009, "adv/ratio_final_to_reasoning": 1.1186636211968162, "adv/ratio_step_to_reasoning": 1.682589194611107, "adv/std_final_conf": 0.7767125964164734, "adv/std_reasoning": 0.739241898059845, "adv/std_step_conf": 0.934445858001709, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5587776517300056, "calib/avg_num_step_conf": 6.9609375, "calib/ece": 0.2990944881889768, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9645669291338582, "calib/gap": 0.015146057855927375, "calib/mean_conf": 0.9558661417322838, "calib/mu_c": 0.9607558139534887, "calib/mu_w": 0.9456097560975614, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.288897637795276, "calib/std_conf": 0.12398500431327406, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4573327222731439, "calib/step_q_c_n": 1091.0, "calib/step_q_gap": 0.11548033442943917, "calib/step_q_w": 0.34185238784370475, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2965.0, "completions/max_terminated_length": 2965.0, "completions/mean_length": 510.98046875, "completions/mean_terminated_length": 512.9843139648438, "completions/min_length": 0.0, "completions/min_terminated_length": 91.0, "epoch": 0.1536, "grad_norm": 0.06918393820524216, "learning_rate": 1.5555555555555558e-06, "loss": 0.038, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03654374182224274, "mask/share_reasoning": 0.8113521337509155, "mask/share_step_conf": 0.14819784462451935, "num_tokens": 36102697.0, "reward": 1.1596009731292725, "reward_std": 0.20381799340248108, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6893793344497681, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8651941418647766, "step": 144 }, { "adv/mean_abs_final_conf": 0.6027562618255615, "adv/mean_abs_reasoning": 0.6014331579208374, "adv/mean_abs_step_conf": 0.7539108991622925, "adv/ratio_final_to_reasoning": 1.0021999184569372, "adv/ratio_step_to_reasoning": 1.2535240021826743, "adv/std_final_conf": 0.8105014562606812, "adv/std_reasoning": 0.8099634647369385, "adv/std_step_conf": 0.92684006690979, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.52, "calib/avg_num_step_conf": 6.75, "calib/ece": 0.3752589641434265, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": 0.00790000000000013, "calib/mean_conf": 0.9768525896414345, "calib/mu_c": 0.9800000000000005, "calib/mu_w": 0.9721000000000004, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3752589641434265, "calib/std_conf": 0.039690713928804806, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4916683937823835, "calib/step_q_c_n": 965.0, "calib/step_q_gap": 0.10609827582694448, "calib/step_q_w": 0.385570117955439, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2448.0, "completions/max_terminated_length": 2448.0, "completions/mean_length": 474.296875, "completions/mean_terminated_length": 481.825439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.15466666666666667, "grad_norm": 0.06234937906265259, "learning_rate": 1.527777777777778e-06, "loss": 0.0024, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0364404022693634, "mask/share_reasoning": 0.7930680513381958, "mask/share_step_conf": 0.1548665463924408, "num_tokens": 36326821.0, "reward": 1.0898528099060059, "reward_std": 0.2445923089981079, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6094160079956055, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8380056619644165, "step": 145 }, { "adv/mean_abs_final_conf": 0.5295411348342896, "adv/mean_abs_reasoning": 0.4761459529399872, "adv/mean_abs_step_conf": 0.7413994073867798, "adv/ratio_final_to_reasoning": 1.112140366970697, "adv/ratio_step_to_reasoning": 1.5570843410701525, "adv/std_final_conf": 0.7750407457351685, "adv/std_reasoning": 0.7393215894699097, "adv/std_step_conf": 0.9353768229484558, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5346930995043843, "calib/avg_num_step_conf": 6.53515625, "calib/ece": 0.49091633466135476, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.005731986275257617, "calib/mean_conf": 0.9769721115537852, "calib/mu_c": 0.9799180327868856, "calib/mu_w": 0.974186046511628, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.49091633466135476, "calib/std_conf": 0.04159656383203995, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4806713286713286, "calib/step_q_c_n": 715.0, "calib/step_q_gap": 0.09888281092602591, "calib/step_q_w": 0.3817885177453027, "calib/step_q_w_n": 958.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 520.83203125, "completions/mean_terminated_length": 527.0079345703125, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.15573333333333333, "grad_norm": 0.06934022903442383, "learning_rate": 1.5e-06, "loss": -0.1303, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03379040211439133, "mask/share_reasoning": 0.8066377639770508, "mask/share_step_conf": 0.1478530764579773, "num_tokens": 36567370.0, "reward": 1.0298337936401367, "reward_std": 0.22394514083862305, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.5003616809844971, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.844745397567749, "step": 146 }, { "adv/mean_abs_final_conf": 0.38683611154556274, "adv/mean_abs_reasoning": 0.3548612594604492, "adv/mean_abs_step_conf": 0.7753262519836426, "adv/ratio_final_to_reasoning": 1.0901052206536432, "adv/ratio_step_to_reasoning": 2.1848714992515434, "adv/std_final_conf": 0.6618477702140808, "adv/std_reasoning": 0.6403102874755859, "adv/std_step_conf": 0.9351269602775574, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5350567959263611, "calib/avg_num_step_conf": 6.9921875, "calib/ece": 0.5202811244979922, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9718875502008032, "calib/gap": 0.02156090873482197, "calib/mean_conf": 0.9608433734939761, "calib/mu_c": 0.9727927927927933, "calib/mu_w": 0.9512318840579713, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.517670682730924, "calib/std_conf": 0.1125841566811578, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5180636237897648, "calib/step_q_c_n": 723.0, "calib/step_q_gap": 0.1042023304439354, "calib/step_q_w": 0.4138612933458294, "calib/step_q_w_n": 1067.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 521.671875, "completions/mean_terminated_length": 527.8577270507812, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.1568, "grad_norm": 0.07205287367105484, "learning_rate": 1.4722222222222225e-06, "loss": -0.1356, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03373868763446808, "mask/share_reasoning": 0.8053416013717651, "mask/share_step_conf": 0.14920096099376678, "num_tokens": 36804598.0, "reward": 0.9906857013702393, "reward_std": 0.195381760597229, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.47235190868377686, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8185129165649414, "step": 147 }, { "adv/mean_abs_final_conf": 0.42417389154434204, "adv/mean_abs_reasoning": 0.36322999000549316, "adv/mean_abs_step_conf": 0.7607502937316895, "adv/ratio_final_to_reasoning": 1.167783231604657, "adv/ratio_step_to_reasoning": 2.0944038616419984, "adv/std_final_conf": 0.7210102081298828, "adv/std_reasoning": 0.6610972285270691, "adv/std_step_conf": 0.9344878196716309, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5526422764227642, "calib/avg_num_step_conf": 6.359375, "calib/ece": 0.3232677165354333, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": 0.031050135501354936, "calib/mean_conf": 0.968937007874016, "calib/mu_c": 0.9799390243902442, "calib/mu_w": 0.9488888888888892, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3232677165354333, "calib/std_conf": 0.0875972573575166, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49106045589692765, "calib/step_q_c_n": 1009.0, "calib/step_q_gap": 0.11026885654313123, "calib/step_q_w": 0.3807915993537964, "calib/step_q_w_n": 619.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 484.01953125, "completions/mean_terminated_length": 485.91766357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 95.0, "epoch": 0.15786666666666666, "grad_norm": 0.08672936260700226, "learning_rate": 1.4444444444444445e-06, "loss": -0.0994, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.039496589452028275, "mask/share_reasoning": 0.7975192070007324, "mask/share_step_conf": 0.1590779423713684, "num_tokens": 37033619.0, "reward": 1.135124683380127, "reward_std": 0.17193685472011566, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6679917573928833, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8504635095596313, "step": 148 }, { "adv/mean_abs_final_conf": 0.5512717366218567, "adv/mean_abs_reasoning": 0.5307047367095947, "adv/mean_abs_step_conf": 0.7301586866378784, "adv/ratio_final_to_reasoning": 1.0387541291599898, "adv/ratio_step_to_reasoning": 1.3758284713358913, "adv/std_final_conf": 0.8100273013114929, "adv/std_reasoning": 0.7927494645118713, "adv/std_step_conf": 0.9347800612449646, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.531609756097561, "calib/avg_num_step_conf": 6.79296875, "calib/ece": 0.47762096774193585, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9879032258064516, "calib/gap": 0.012074796747967409, "calib/mean_conf": 0.9735887096774196, "calib/mu_c": 0.9796747967479679, "calib/mu_w": 0.9676000000000005, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.47762096774193585, "calib/std_conf": 0.05830627035620779, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5213084112149532, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.09424780515434716, "calib/step_q_w": 0.42706060606060603, "calib/step_q_w_n": 990.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2659.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 543.52734375, "completions/mean_terminated_length": 549.9723510742188, "completions/min_length": 0.0, "completions/min_terminated_length": 27.0, "epoch": 0.15893333333333334, "grad_norm": 0.08807147294282913, "learning_rate": 1.4166666666666667e-06, "loss": -0.0886, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03507719561457634, "mask/share_reasoning": 0.8086420297622681, "mask/share_step_conf": 0.1445620059967041, "num_tokens": 37277218.0, "reward": 1.0231540203094482, "reward_std": 0.24548467993736267, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.50790935754776, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8328907489776611, "step": 149 }, { "adv/mean_abs_final_conf": 0.4628121554851532, "adv/mean_abs_reasoning": 0.4400729835033417, "adv/mean_abs_step_conf": 0.750774621963501, "adv/ratio_final_to_reasoning": 1.051671365510305, "adv/ratio_step_to_reasoning": 1.7060229782494702, "adv/std_final_conf": 0.7392811179161072, "adv/std_reasoning": 0.7205407023429871, "adv/std_step_conf": 0.9340653419494629, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5326130586031577, "calib/avg_num_step_conf": 6.53515625, "calib/ece": 0.3857831325301207, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.00014784586566762847, "calib/mean_conf": 0.9791967871485946, "calib/mu_c": 0.9792567567567572, "calib/mu_w": 0.9791089108910895, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3853012048192773, "calib/std_conf": 0.006658435911988018, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5091600454029511, "calib/step_q_c_n": 881.0, "calib/step_q_gap": 0.12332671206961776, "calib/step_q_w": 0.38583333333333336, "calib/step_q_w_n": 792.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 465.40625, "completions/mean_terminated_length": 472.7936706542969, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.16, "grad_norm": 0.07247061282396317, "learning_rate": 1.3888888888888892e-06, "loss": -0.0224, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0417606383562088, "mask/share_reasoning": 0.7887001037597656, "mask/share_step_conf": 0.15391427278518677, "num_tokens": 37501322.0, "reward": 1.0706796646118164, "reward_std": 0.1999371200799942, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5901405811309814, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8284165859222412, "step": 150 }, { "adv/mean_abs_final_conf": 0.5551609396934509, "adv/mean_abs_reasoning": 0.4884258508682251, "adv/mean_abs_step_conf": 0.7733472585678101, "adv/ratio_final_to_reasoning": 1.1366329990654622, "adv/ratio_step_to_reasoning": 1.5833462892946986, "adv/std_final_conf": 0.7760764360427856, "adv/std_reasoning": 0.7393485307693481, "adv/std_step_conf": 0.9337877631187439, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5478505291005291, "calib/avg_num_step_conf": 6.484375, "calib/ece": 0.5027125506072876, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9392712550607287, "calib/gap": 0.016022486772486966, "calib/mean_conf": 0.9396356275303648, "calib/mu_c": 0.9483928571428574, "calib/mu_w": 0.9323703703703704, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.49445344129554675, "calib/std_conf": 0.1601702940721179, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4606629055007052, "calib/step_q_c_n": 709.0, "calib/step_q_gap": 0.021914219906593757, "calib/step_q_w": 0.43874868559411145, "calib/step_q_w_n": 951.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2617.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 550.63671875, "completions/mean_terminated_length": 550.63671875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.16106666666666666, "grad_norm": 0.08831620961427689, "learning_rate": 1.3611111111111112e-06, "loss": -0.0575, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.034764066338539124, "mask/share_reasoning": 0.824368953704834, "mask/share_step_conf": 0.1408669650554657, "num_tokens": 37749309.0, "reward": 1.002524733543396, "reward_std": 0.22087067365646362, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.480559766292572, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8293472528457642, "step": 151 }, { "adv/mean_abs_final_conf": 0.5274214744567871, "adv/mean_abs_reasoning": 0.4665879011154175, "adv/mean_abs_step_conf": 0.7688575983047485, "adv/ratio_final_to_reasoning": 1.130379663073007, "adv/ratio_step_to_reasoning": 1.647830122612974, "adv/std_final_conf": 0.7581343054771423, "adv/std_reasoning": 0.7206973433494568, "adv/std_step_conf": 0.9336856603622437, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5688613231552163, "calib/avg_num_step_conf": 6.40625, "calib/ece": 0.4482868525896417, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9721115537848606, "calib/gap": 0.016660305343511372, "calib/mean_conf": 0.9611952191235061, "calib/mu_c": 0.9691603053435115, "calib/mu_w": 0.9525000000000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.44378486055776917, "calib/std_conf": 0.10977778050535121, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44863247863247857, "calib/step_q_c_n": 819.0, "calib/step_q_gap": 0.06633040798692436, "calib/step_q_w": 0.3823020706455542, "calib/step_q_w_n": 821.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2761.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 512.44140625, "completions/mean_terminated_length": 514.4509887695312, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.16213333333333332, "grad_norm": 0.07020580768585205, "learning_rate": 1.3333333333333334e-06, "loss": -0.007, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03603512793779373, "mask/share_reasoning": 0.8120272755622864, "mask/share_step_conf": 0.1480313539505005, "num_tokens": 37985886.0, "reward": 1.0656317472457886, "reward_std": 0.22288453578948975, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5391280651092529, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8629859685897827, "step": 152 }, { "adv/mean_abs_final_conf": 0.5444529056549072, "adv/mean_abs_reasoning": 0.45993316173553467, "adv/mean_abs_step_conf": 0.7532547116279602, "adv/ratio_final_to_reasoning": 1.183765274937866, "adv/ratio_step_to_reasoning": 1.6377482084257449, "adv/std_final_conf": 0.7684758305549622, "adv/std_reasoning": 0.7205182313919067, "adv/std_step_conf": 0.9347703456878662, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5891861761426979, "calib/avg_num_step_conf": 6.32421875, "calib/ece": 0.4164313725490199, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9529411764705882, "calib/gap": 0.04690821256038624, "calib/mean_conf": 0.9576078431372551, "calib/mu_c": 0.9791304347826087, "calib/mu_w": 0.9322222222222225, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4164313725490199, "calib/std_conf": 0.10660612468727179, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4672058823529412, "calib/step_q_c_n": 748.0, "calib/step_q_gap": 0.0961381441210239, "calib/step_q_w": 0.3710677382319173, "calib/step_q_w_n": 871.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 512.5, "completions/mean_terminated_length": 514.5098266601562, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.1632, "grad_norm": 0.07010144740343094, "learning_rate": 1.3055555555555556e-06, "loss": -0.0974, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034779004752635956, "mask/share_reasoning": 0.8166590929031372, "mask/share_step_conf": 0.14465561509132385, "num_tokens": 38224406.0, "reward": 1.087091088294983, "reward_std": 0.20078277587890625, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5799410343170166, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8597024083137512, "step": 153 }, { "adv/mean_abs_final_conf": 0.6241574287414551, "adv/mean_abs_reasoning": 0.5177608132362366, "adv/mean_abs_step_conf": 0.7719071507453918, "adv/ratio_final_to_reasoning": 1.2054937584793104, "adv/ratio_step_to_reasoning": 1.490856648498806, "adv/std_final_conf": 0.8262401819229126, "adv/std_reasoning": 0.7575730681419373, "adv/std_step_conf": 0.9345754981040955, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6089230371900827, "calib/avg_num_step_conf": 6.2734375, "calib/ece": 0.4795983935742974, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9839357429718876, "calib/gap": 0.025874225206611645, "calib/mean_conf": 0.9655421686746989, "calib/mu_c": 0.9788429752066119, "calib/mu_w": 0.9529687500000003, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4795983935742974, "calib/std_conf": 0.09680336147099437, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.462365747460087, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.0889088227054523, "calib/step_q_w": 0.3734569247546347, "calib/step_q_w_n": 917.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 450.4453125, "completions/mean_terminated_length": 461.2560119628906, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.16426666666666667, "grad_norm": 0.09528306126594543, "learning_rate": 1.2777777777777779e-06, "loss": -0.0861, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03584619611501694, "mask/share_reasoning": 0.8004294633865356, "mask/share_step_conf": 0.14028680324554443, "num_tokens": 38444160.0, "reward": 1.0437710285186768, "reward_std": 0.22803018987178802, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.5094179511070251, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8593744039535522, "step": 154 }, { "adv/mean_abs_final_conf": 0.5692579746246338, "adv/mean_abs_reasoning": 0.4835493862628937, "adv/mean_abs_step_conf": 0.7541655898094177, "adv/ratio_final_to_reasoning": 1.1772488825270528, "adv/ratio_step_to_reasoning": 1.5596454286458277, "adv/std_final_conf": 0.8165688514709473, "adv/std_reasoning": 0.7574070692062378, "adv/std_step_conf": 0.9348653554916382, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6760942760942762, "calib/avg_num_step_conf": 6.16796875, "calib/ece": 0.4878906250000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.93359375, "calib/gap": 0.02003183348637927, "calib/mean_conf": 0.9311718750000002, "calib/mu_c": 0.9417355371900828, "calib/mu_w": 0.9217037037037036, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.47320312500000017, "calib/std_conf": 0.1767595519596731, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4213231481481481, "calib/step_q_c_n": 756.0, "calib/step_q_gap": 0.08427576054182978, "calib/step_q_w": 0.3370473876063183, "calib/step_q_w_n": 823.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 436.0625, "completions/mean_terminated_length": 437.7725830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.16533333333333333, "grad_norm": 0.10584117472171783, "learning_rate": 1.25e-06, "loss": -0.0045, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.038839589804410934, "mask/share_reasoning": 0.7988035082817078, "mask/share_step_conf": 0.15845061838626862, "num_tokens": 38663008.0, "reward": 1.0421983003616333, "reward_std": 0.19966325163841248, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.515350341796875, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8501973748207092, "step": 155 }, { "adv/mean_abs_final_conf": 0.630531907081604, "adv/mean_abs_reasoning": 0.46726131439208984, "adv/mean_abs_step_conf": 0.724333643913269, "adv/ratio_final_to_reasoning": 1.3494203086380696, "adv/ratio_step_to_reasoning": 1.5501682283619649, "adv/std_final_conf": 0.8626736402511597, "adv/std_reasoning": 0.7391840219497681, "adv/std_step_conf": 0.9196922183036804, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5732221583285414, "calib/avg_num_step_conf": 6.41015625, "calib/ece": 0.49123015873015896, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8968253968253969, "calib/gap": 0.0300651715545337, "calib/mean_conf": 0.9061507936507938, "calib/mu_c": 0.9229729729729734, "calib/mu_w": 0.8929078014184397, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4784523809523812, "calib/std_conf": 0.21891295535709995, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.43821766561514186, "calib/step_q_c_n": 634.0, "calib/step_q_gap": 0.07983236273530075, "calib/step_q_w": 0.3583853028798411, "calib/step_q_w_n": 1007.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2838.0, "completions/max_terminated_length": 2838.0, "completions/mean_length": 507.0234375, "completions/mean_terminated_length": 509.01177978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.1664, "grad_norm": 0.09417561441659927, "learning_rate": 1.2222222222222223e-06, "loss": -0.0404, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03862348943948746, "mask/share_reasoning": 0.8037759065628052, "mask/share_step_conf": 0.15369439125061035, "num_tokens": 38897566.0, "reward": 1.0278263092041016, "reward_std": 0.2014266550540924, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.4957183599472046, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8508935570716858, "step": 156 }, { "adv/mean_abs_final_conf": 0.594262957572937, "adv/mean_abs_reasoning": 0.48525696992874146, "adv/mean_abs_step_conf": 0.7258875966072083, "adv/ratio_final_to_reasoning": 1.2246355939208926, "adv/ratio_step_to_reasoning": 1.4958828859558735, "adv/std_final_conf": 0.8026941418647766, "adv/std_reasoning": 0.7393622994422913, "adv/std_step_conf": 0.9335121512413025, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.766922471467926, "calib/avg_num_step_conf": 6.21484375, "calib/ece": 0.29592885375494093, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.8853754940711462, "calib/gap": 0.1753607503607505, "calib/mean_conf": 0.9020948616600791, "calib/mu_c": 0.9707142857142858, "calib/mu_w": 0.7953535353535353, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2946640316205536, "calib/std_conf": 0.22021719177808333, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.447638737758433, "calib/step_q_c_n": 919.0, "calib/step_q_gap": 0.12689469013938537, "calib/step_q_w": 0.3207440476190476, "calib/step_q_w_n": 672.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1375.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 466.95703125, "completions/mean_terminated_length": 470.63385009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.16746666666666668, "grad_norm": 0.09445346146821976, "learning_rate": 1.1944444444444446e-06, "loss": -0.0623, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03903867304325104, "mask/share_reasoning": 0.8008126616477966, "mask/share_step_conf": 0.15233619511127472, "num_tokens": 39120835.0, "reward": 1.1484313011169434, "reward_std": 0.22829920053482056, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6980769038200378, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8554403781890869, "step": 157 }, { "adv/mean_abs_final_conf": 0.5869907140731812, "adv/mean_abs_reasoning": 0.48486799001693726, "adv/mean_abs_step_conf": 0.7452125549316406, "adv/ratio_final_to_reasoning": 1.2106196452619538, "adv/ratio_step_to_reasoning": 1.536939064394845, "adv/std_final_conf": 0.7773375511169434, "adv/std_reasoning": 0.7394952774047852, "adv/std_step_conf": 0.9342270493507385, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6505353620738236, "calib/avg_num_step_conf": 6.44921875, "calib/ece": 0.32323886639676147, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8704453441295547, "calib/gap": 0.037802197802197623, "calib/mean_conf": 0.8999190283400811, "calib/mu_c": 0.9138461538461539, "calib/mu_w": 0.8760439560439562, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2957894736842109, "calib/std_conf": 0.21804057844458355, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4356462585034014, "calib/step_q_c_n": 882.0, "calib/step_q_gap": 0.08148501012888898, "calib/step_q_w": 0.3541612483745124, "calib/step_q_w_n": 769.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 500.8515625, "completions/mean_terminated_length": 510.8287048339844, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.16853333333333334, "grad_norm": 0.07168283313512802, "learning_rate": 1.1666666666666668e-06, "loss": -0.0864, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.038769736886024475, "mask/share_reasoning": 0.7923343777656555, "mask/share_step_conf": 0.1493646502494812, "num_tokens": 39354293.0, "reward": 1.0865575075149536, "reward_std": 0.24829015135765076, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6341601610183716, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8171157240867615, "step": 158 }, { "adv/mean_abs_final_conf": 0.6913203001022339, "adv/mean_abs_reasoning": 0.502968430519104, "adv/mean_abs_step_conf": 0.7115085124969482, "adv/ratio_final_to_reasoning": 1.3744805004734304, "adv/ratio_step_to_reasoning": 1.4146186307610082, "adv/std_final_conf": 0.867906928062439, "adv/std_reasoning": 0.7575962543487549, "adv/std_step_conf": 0.934805154800415, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6377729674667346, "calib/avg_num_step_conf": 6.41015625, "calib/ece": 0.366468253968254, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8214285714285714, "calib/gap": 0.07891449672120698, "calib/mean_conf": 0.8520238095238094, "calib/mu_c": 0.887410071942446, "calib/mu_w": 0.808495575221239, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33345238095238094, "calib/std_conf": 0.27575918192733595, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4125581395348837, "calib/step_q_c_n": 860.0, "calib/step_q_gap": 0.09972843402912185, "calib/step_q_w": 0.31282970550576183, "calib/step_q_w_n": 781.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2720.0, "completions/max_terminated_length": 2720.0, "completions/mean_length": 459.76171875, "completions/mean_terminated_length": 463.38189697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.1696, "grad_norm": 0.07267262041568756, "learning_rate": 1.138888888888889e-06, "loss": -0.1434, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03936787322163582, "mask/share_reasoning": 0.788456916809082, "mask/share_step_conf": 0.16436274349689484, "num_tokens": 39576776.0, "reward": 1.1042141914367676, "reward_std": 0.23494693636894226, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6156214475631714, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8582253456115723, "step": 159 }, { "adv/mean_abs_final_conf": 0.6293020248413086, "adv/mean_abs_reasoning": 0.4387770891189575, "adv/mean_abs_step_conf": 0.7438819408416748, "adv/ratio_final_to_reasoning": 1.4342180584335331, "adv/ratio_step_to_reasoning": 1.6953527412640268, "adv/std_final_conf": 0.8369735479354858, "adv/std_reasoning": 0.7014278769493103, "adv/std_step_conf": 0.9351868629455566, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.67088370933566, "calib/avg_num_step_conf": 5.85546875, "calib/ece": 0.3426274509803922, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7803921568627451, "calib/gap": 0.09647949644771281, "calib/mean_conf": 0.8202745098039215, "calib/mu_c": 0.8630281690140845, "calib/mu_w": 0.7665486725663717, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3030196078431373, "calib/std_conf": 0.30170028488430284, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3733294392523364, "calib/step_q_c_n": 856.0, "calib/step_q_gap": 0.04816614220723536, "calib/step_q_w": 0.32516329704510105, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2301.0, "completions/max_terminated_length": 2301.0, "completions/mean_length": 462.1171875, "completions/mean_terminated_length": 463.929443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 92.0, "epoch": 0.17066666666666666, "grad_norm": 0.08778801560401917, "learning_rate": 1.111111111111111e-06, "loss": -0.0843, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.037485793232917786, "mask/share_reasoning": 0.8103216886520386, "mask/share_step_conf": 0.14828626811504364, "num_tokens": 39799918.0, "reward": 1.1009936332702637, "reward_std": 0.1922154426574707, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6375530958175659, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8372268676757812, "step": 160 }, { "adv/mean_abs_final_conf": 0.5192180275917053, "adv/mean_abs_reasoning": 0.38679128885269165, "adv/mean_abs_step_conf": 0.7360755205154419, "adv/ratio_final_to_reasoning": 1.3423725987516952, "adv/ratio_step_to_reasoning": 1.9030302432580743, "adv/std_final_conf": 0.759878396987915, "adv/std_reasoning": 0.6815119385719299, "adv/std_step_conf": 0.9340335726737976, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7095197783592425, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.18051792828685265, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7370517928286853, "calib/gap": 0.2445090041557645, "calib/mean_conf": 0.7917529880478087, "calib/mu_c": 0.862865168539326, "calib/mu_w": 0.6183561643835616, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13155378486055783, "calib/std_conf": 0.32005370991773774, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3799685863874346, "calib/step_q_c_n": 955.0, "calib/step_q_gap": 0.05801159714012277, "calib/step_q_w": 0.3219569892473118, "calib/step_q_w_n": 465.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 433.796875, "completions/mean_terminated_length": 440.68255615234375, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.17173333333333332, "grad_norm": 0.10147473961114883, "learning_rate": 1.0833333333333335e-06, "loss": -0.0665, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04036349803209305, "mask/share_reasoning": 0.7908325791358948, "mask/share_step_conf": 0.15317894518375397, "num_tokens": 40014890.0, "reward": 1.1826231479644775, "reward_std": 0.16954448819160461, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7700152397155762, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8400498628616333, "step": 161 }, { "adv/mean_abs_final_conf": 0.555371880531311, "adv/mean_abs_reasoning": 0.40420442819595337, "adv/mean_abs_step_conf": 0.7501338720321655, "adv/ratio_final_to_reasoning": 1.3739876205959662, "adv/ratio_step_to_reasoning": 1.8558279417674013, "adv/std_final_conf": 0.7655318379402161, "adv/std_reasoning": 0.6816686391830444, "adv/std_step_conf": 0.9353340268135071, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6940372575965796, "calib/avg_num_step_conf": 5.36328125, "calib/ece": 0.22521912350597617, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7211155378486056, "calib/gap": 0.14669033440219892, "calib/mean_conf": 0.7970916334661354, "calib/mu_c": 0.8403389830508475, "calib/mu_w": 0.6936486486486486, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15856573705179292, "calib/std_conf": 0.3047968227476998, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.39499999999999996, "calib/step_q_c_n": 888.0, "calib/step_q_gap": 0.07203092783505155, "calib/step_q_w": 0.3229690721649484, "calib/step_q_w_n": 485.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2077.0, "completions/max_terminated_length": 2077.0, "completions/mean_length": 420.1171875, "completions/mean_terminated_length": 428.4860534667969, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.1728, "grad_norm": 0.0986003577709198, "learning_rate": 1.0555555555555557e-06, "loss": -0.1312, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.039783887565135956, "mask/share_reasoning": 0.7854666113853455, "mask/share_step_conf": 0.15521827340126038, "num_tokens": 40226584.0, "reward": 1.153937578201294, "reward_std": 0.1762813925743103, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.736830472946167, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8249673843383789, "step": 162 }, { "adv/mean_abs_final_conf": 0.6111153364181519, "adv/mean_abs_reasoning": 0.3719629943370819, "adv/mean_abs_step_conf": 0.7672944068908691, "adv/ratio_final_to_reasoning": 1.6429465987800504, "adv/ratio_step_to_reasoning": 2.0628245781770653, "adv/std_final_conf": 0.8349888920783997, "adv/std_reasoning": 0.6403541564941406, "adv/std_step_conf": 0.9314863085746765, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7884820594904628, "calib/avg_num_step_conf": 6.12890625, "calib/ece": 0.15734693877551031, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5061224489795918, "calib/gap": 0.36323062558356706, "calib/mean_conf": 0.632938775510204, "calib/mu_c": 0.8093650793650796, "calib/mu_w": 0.44613445378151256, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13800000000000012, "calib/std_conf": 0.3673606029204716, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4011027190332327, "calib/step_q_c_n": 662.0, "calib/step_q_gap": 0.13657129676200885, "calib/step_q_w": 0.26453142227122384, "calib/step_q_w_n": 907.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2335.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 450.796875, "completions/mean_terminated_length": 469.1219482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.17386666666666667, "grad_norm": 0.08291774988174438, "learning_rate": 1.0277777777777777e-06, "loss": -0.182, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.037739530205726624, "mask/share_reasoning": 0.7714070677757263, "mask/share_step_conf": 0.15179085731506348, "num_tokens": 40446820.0, "reward": 1.1517189741134644, "reward_std": 0.1930740922689438, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7490097880363464, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8430562019348145, "step": 163 }, { "adv/mean_abs_final_conf": 0.6865717172622681, "adv/mean_abs_reasoning": 0.536221444606781, "adv/mean_abs_step_conf": 0.7406268119812012, "adv/ratio_final_to_reasoning": 1.2803883995459024, "adv/ratio_step_to_reasoning": 1.381195808989537, "adv/std_final_conf": 0.8637810349464417, "adv/std_reasoning": 0.8099890351295471, "adv/std_step_conf": 0.9336626529693604, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.761631481861037, "calib/avg_num_step_conf": 6.7578125, "calib/ece": 0.1879338842975207, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.4090909090909091, "calib/gap": 0.3282325613172099, "calib/mean_conf": 0.5486776859504132, "calib/mu_c": 0.7100813008130082, "calib/mu_w": 0.38184873949579834, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1141735537190083, "calib/std_conf": 0.3860082689748488, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.35082677165354337, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.08844040801717973, "calib/step_q_w": 0.26238636363636364, "calib/step_q_w_n": 968.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 535.20703125, "completions/mean_terminated_length": 559.2366943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.17493333333333333, "grad_norm": 0.07999632507562637, "learning_rate": 1.0000000000000002e-06, "loss": -0.2353, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.031994570046663284, "mask/share_reasoning": 0.7777503728866577, "mask/share_step_conf": 0.1472862958908081, "num_tokens": 40689969.0, "reward": 1.1251270771026611, "reward_std": 0.2629609704017639, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7215191125869751, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.829573392868042, "step": 164 }, { "adv/mean_abs_final_conf": 0.6166242957115173, "adv/mean_abs_reasoning": 0.3994438350200653, "adv/mean_abs_step_conf": 0.7586116790771484, "adv/ratio_final_to_reasoning": 1.5437071288897033, "adv/ratio_step_to_reasoning": 1.8991698270647763, "adv/std_final_conf": 0.8585282564163208, "adv/std_reasoning": 0.7014773488044739, "adv/std_step_conf": 0.9335530400276184, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.7362260919212104, "calib/avg_num_step_conf": 5.94921875, "calib/ece": 0.17637130801687773, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.43037974683544306, "calib/gap": 0.2973786754210678, "calib/mean_conf": 0.5712236286919832, "calib/mu_c": 0.7268141592920355, "calib/mu_w": 0.42943548387096775, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1354008438818566, "calib/std_conf": 0.37505074282668666, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.374524959742351, "calib/step_q_c_n": 621.0, "calib/step_q_gap": 0.08076664488647523, "calib/step_q_w": 0.2937583148558758, "calib/step_q_w_n": 902.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3068.0, "completions/max_terminated_length": 3068.0, "completions/mean_length": 469.37890625, "completions/mean_terminated_length": 498.5933837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.176, "grad_norm": 0.1938386857509613, "learning_rate": 9.722222222222224e-07, "loss": -0.1938, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.03441419079899788, "mask/share_reasoning": 0.7516778707504272, "mask/share_step_conf": 0.15531416237354279, "num_tokens": 40915706.0, "reward": 1.0904395580291748, "reward_std": 0.24750910699367523, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.6902323961257935, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.8119936585426331, "step": 165 }, { "adv/mean_abs_final_conf": 0.7183117866516113, "adv/mean_abs_reasoning": 0.47158336639404297, "adv/mean_abs_step_conf": 0.7550788521766663, "adv/ratio_final_to_reasoning": 1.5231915242137875, "adv/ratio_step_to_reasoning": 1.6011566691810366, "adv/std_final_conf": 0.8728970885276794, "adv/std_reasoning": 0.7396736741065979, "adv/std_step_conf": 0.9338595867156982, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.7622749231444884, "calib/avg_num_step_conf": 6.0703125, "calib/ece": 0.22136752136752139, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.38461538461538464, "calib/gap": 0.3347351778656126, "calib/mean_conf": 0.5348717948717948, "calib/mu_c": 0.6335757575757576, "calib/mu_w": 0.298840579710145, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.025555555555555574, "calib/std_conf": 0.3772813577155442, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.35048582995951416, "calib/step_q_c_n": 988.0, "calib/step_q_gap": 0.0885246992174647, "calib/step_q_w": 0.26196113074204946, "calib/step_q_w_n": 566.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2883.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 488.7109375, "completions/mean_terminated_length": 527.8902587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.17706666666666668, "grad_norm": 0.10186661779880524, "learning_rate": 9.444444444444445e-07, "loss": -0.3896, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.03354319930076599, "mask/share_reasoning": 0.7347061634063721, "mask/share_step_conf": 0.1575319468975067, "num_tokens": 41147000.0, "reward": 1.089632272720337, "reward_std": 0.3075636029243469, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6920523643493652, "rewards/format_reward_step": 0.90625, "rewards/step_l2_reward": 0.7847039699554443, "step": 166 }, { "adv/mean_abs_final_conf": 0.7088315486907959, "adv/mean_abs_reasoning": 0.5106958150863647, "adv/mean_abs_step_conf": 0.7436408996582031, "adv/ratio_final_to_reasoning": 1.3879721112868804, "adv/ratio_step_to_reasoning": 1.4561327461288176, "adv/std_final_conf": 0.894888162612915, "adv/std_reasoning": 0.7757686972618103, "adv/std_step_conf": 0.9358707070350647, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.5773578811369509, "calib/avg_num_step_conf": 5.984375, "calib/ece": 0.2913913043478261, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.391304347826087, "calib/gap": 0.1000952842377264, "calib/mean_conf": 0.572086956521739, "calib/mu_c": 0.6095138888888889, "calib/mu_w": 0.5094186046511625, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11869565217391312, "calib/std_conf": 0.35580770491003416, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.363421926910299, "calib/step_q_c_n": 903.0, "calib/step_q_gap": 0.04022637842063287, "calib/step_q_w": 0.32319554848966614, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 443.8515625, "completions/mean_terminated_length": 487.66522216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.17813333333333334, "grad_norm": 0.08605474978685379, "learning_rate": 9.166666666666666e-07, "loss": -0.5211, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.03470229357481003, "mask/share_reasoning": 0.7189175486564636, "mask/share_step_conf": 0.15653637051582336, "num_tokens": 41366234.0, "reward": 1.0280582904815674, "reward_std": 0.3161855936050415, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6122652292251587, "rewards/format_reward_step": 0.89453125, "rewards/step_l2_reward": 0.7682965993881226, "step": 167 }, { "adv/mean_abs_final_conf": 0.7873885631561279, "adv/mean_abs_reasoning": 0.6657405495643616, "adv/mean_abs_step_conf": 0.7577903866767883, "adv/ratio_final_to_reasoning": 1.1827258587018152, "adv/ratio_step_to_reasoning": 1.1382668325861496, "adv/std_final_conf": 0.9266418814659119, "adv/std_reasoning": 0.8594803810119629, "adv/std_step_conf": 0.9359583258628845, "calib/answer_extract_rate": 0.8671875, "calib/auroc": 0.7620819397993311, "calib/avg_num_step_conf": 5.62109375, "calib/ece": 0.18878378378378377, "calib/final_conf_rate": 0.8671875, "calib/format_rate": 0.859375, "calib/frac_conf_gt_0.9": 0.21621621621621623, "calib/gap": 0.31751839464882925, "calib/mean_conf": 0.40310810810810815, "calib/mu_c": 0.5346923076923076, "calib/mu_w": 0.2171739130434783, "calib/nonempty_final_conf_rate": 0.8671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.003153153153153153, "calib/std_conf": 0.3323947737404039, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.34687407407407406, "calib/step_q_c_n": 675.0, "calib/step_q_gap": 0.0813766918751212, "calib/step_q_w": 0.26549738219895286, "calib/step_q_w_n": 764.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12890625, "completions/max_length": 2851.0, "completions/max_terminated_length": 2851.0, "completions/mean_length": 477.03125, "completions/mean_terminated_length": 547.6233520507812, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.1792, "grad_norm": 0.0872519463300705, "learning_rate": 8.88888888888889e-07, "loss": -0.6337, "mask/has_final_conf_rate": 0.8671875, "mask/share_final_conf": 0.029452074319124222, "mask/share_reasoning": 0.6938341856002808, "mask/share_step_conf": 0.1478075236082077, "num_tokens": 41593026.0, "reward": 1.0312435626983643, "reward_std": 0.3793541491031647, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.660910964012146, "rewards/format_reward_step": 0.859375, "rewards/step_l2_reward": 0.7520923614501953, "step": 168 }, { "adv/mean_abs_final_conf": 0.7701253890991211, "adv/mean_abs_reasoning": 0.6043051481246948, "adv/mean_abs_step_conf": 0.784436821937561, "adv/ratio_final_to_reasoning": 1.2743981935103592, "adv/ratio_step_to_reasoning": 1.2980806540732912, "adv/std_final_conf": 0.9355708360671997, "adv/std_reasoning": 0.8268880248069763, "adv/std_step_conf": 0.9356573820114136, "calib/answer_extract_rate": 0.83203125, "calib/auroc": 0.6584119496855345, "calib/avg_num_step_conf": 4.75390625, "calib/ece": 0.22218457943925218, "calib/final_conf_rate": 0.8359375, "calib/format_rate": 0.82421875, "calib/frac_conf_gt_0.9": 0.21962616822429906, "calib/gap": 0.178536862334032, "calib/mean_conf": 0.4128621495327102, "calib/mu_c": 0.5012962962962961, "calib/mu_w": 0.32275943396226414, "calib/nonempty_final_conf_rate": 0.8359375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.06518691588785036, "calib/std_conf": 0.32377449161076505, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.35374999999999995, "calib/step_q_c_n": 520.0, "calib/step_q_gap": 0.05394727403156385, "calib/step_q_w": 0.2998027259684361, "calib/step_q_w_n": 697.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 2865.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 430.98828125, "completions/mean_terminated_length": 506.1146545410156, "completions/min_length": 0.0, "completions/min_terminated_length": 74.0, "epoch": 0.18026666666666666, "grad_norm": 0.10154373943805695, "learning_rate": 8.611111111111112e-07, "loss": -0.5491, "mask/has_final_conf_rate": 0.8359375, "mask/share_final_conf": 0.0335194393992424, "mask/share_reasoning": 0.6624668836593628, "mask/share_step_conf": 0.15557613968849182, "num_tokens": 41807543.0, "reward": 0.9480201005935669, "reward_std": 0.3434249460697174, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.5995436906814575, "rewards/format_reward_step": 0.82421875, "rewards/step_l2_reward": 0.69818514585495, "step": 169 }, { "adv/mean_abs_final_conf": 0.7581398487091064, "adv/mean_abs_reasoning": 0.6952020525932312, "adv/mean_abs_step_conf": 0.7796958684921265, "adv/ratio_final_to_reasoning": 1.0905316603728452, "adv/ratio_step_to_reasoning": 1.1215385017689719, "adv/std_final_conf": 0.9218901991844177, "adv/std_reasoning": 0.8908805251121521, "adv/std_step_conf": 0.9365332126617432, "calib/answer_extract_rate": 0.7578125, "calib/auroc": 0.6416815942678011, "calib/avg_num_step_conf": 5.51953125, "calib/ece": 0.323160621761658, "calib/final_conf_rate": 0.75390625, "calib/format_rate": 0.73828125, "calib/frac_conf_gt_0.9": 0.21243523316062177, "calib/gap": 0.12132781012091348, "calib/mean_conf": 0.41487046632124347, "calib/mu_c": 0.4632758620689654, "calib/mu_w": 0.34194805194805195, "calib/nonempty_final_conf_rate": 0.75390625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.06849740932642484, "calib/std_conf": 0.33441172353493587, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.34593893129770986, "calib/step_q_c_n": 655.0, "calib/step_q_gap": 0.04616320570404231, "calib/step_q_w": 0.29977572559366755, "calib/step_q_w_n": 758.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22265625, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 455.69921875, "completions/mean_terminated_length": 586.2261352539062, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.18133333333333335, "grad_norm": 0.09000255912542343, "learning_rate": 8.333333333333333e-07, "loss": -0.9007, "mask/has_final_conf_rate": 0.75390625, "mask/share_final_conf": 0.024297218769788742, "mask/share_reasoning": 0.594523012638092, "mask/share_step_conf": 0.1585235595703125, "num_tokens": 42028354.0, "reward": 0.8514015078544617, "reward_std": 0.4654655456542969, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.49094337224960327, "rewards/format_reward_step": 0.73828125, "rewards/step_l2_reward": 0.648531436920166, "step": 170 }, { "adv/mean_abs_final_conf": 0.7609789967536926, "adv/mean_abs_reasoning": 0.6566393375396729, "adv/mean_abs_step_conf": 0.7822959423065186, "adv/ratio_final_to_reasoning": 1.1588994951246212, "adv/ratio_step_to_reasoning": 1.1913631998315266, "adv/std_final_conf": 0.9215835332870483, "adv/std_reasoning": 0.843531608581543, "adv/std_step_conf": 0.921733021736145, "calib/answer_extract_rate": 0.7109375, "calib/auroc": 0.5436290617013508, "calib/avg_num_step_conf": 5.06640625, "calib/ece": 0.35318681318681316, "calib/final_conf_rate": 0.7109375, "calib/format_rate": 0.703125, "calib/frac_conf_gt_0.9": 0.17032967032967034, "calib/gap": 0.018617500304247303, "calib/mean_conf": 0.3986813186813187, "calib/mu_c": 0.4071717171717172, "calib/mu_w": 0.3885542168674699, "calib/nonempty_final_conf_rate": 0.7109375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10395604395604394, "calib/std_conf": 0.32226434031028584, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3401717557251908, "calib/step_q_c_n": 524.0, "calib/step_q_gap": 0.04644601187008085, "calib/step_q_w": 0.29372574385510997, "calib/step_q_w_n": 773.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.26953125, "completions/max_length": 2899.0, "completions/max_terminated_length": 2899.0, "completions/mean_length": 387.21484375, "completions/mean_terminated_length": 530.0909423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.1824, "grad_norm": 0.13014192879199982, "learning_rate": 8.055555555555557e-07, "loss": -1.0885, "mask/has_final_conf_rate": 0.7109375, "mask/share_final_conf": 0.02836516499519348, "mask/share_reasoning": 0.5377077460289001, "mask/share_step_conf": 0.16439583897590637, "num_tokens": 42234377.0, "reward": 0.7764548063278198, "reward_std": 0.4468100070953369, "rewards/accuracy_reward_step": 0.38671875, "rewards/final_brier_reward_step": 0.4449746012687683, "rewards/format_reward_step": 0.703125, "rewards/step_l2_reward": 0.5933108329772949, "step": 171 }, { "adv/mean_abs_final_conf": 0.7410951852798462, "adv/mean_abs_reasoning": 0.6668893694877625, "adv/mean_abs_step_conf": 0.7876383066177368, "adv/ratio_final_to_reasoning": 1.1112715529549995, "adv/ratio_step_to_reasoning": 1.1810629208600545, "adv/std_final_conf": 0.9066214561462402, "adv/std_reasoning": 0.8595982193946838, "adv/std_step_conf": 0.9360899329185486, "calib/answer_extract_rate": 0.7734375, "calib/auroc": 0.7890518596123625, "calib/avg_num_step_conf": 4.828125, "calib/ece": 0.1681818181818183, "calib/final_conf_rate": 0.7734375, "calib/format_rate": 0.765625, "calib/frac_conf_gt_0.9": 0.1919191919191919, "calib/gap": 0.30592771084337345, "calib/mean_conf": 0.4157575757575757, "calib/mu_c": 0.544, "calib/mu_w": 0.23807228915662657, "calib/nonempty_final_conf_rate": 0.7734375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.0015656565656565666, "calib/std_conf": 0.30808988229722994, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3722596964586847, "calib/step_q_c_n": 593.0, "calib/step_q_gap": 0.07675425322384799, "calib/step_q_w": 0.2955054432348367, "calib/step_q_w_n": 643.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19921875, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 396.65625, "completions/mean_terminated_length": 495.3365783691406, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.18346666666666667, "grad_norm": 0.11630342900753021, "learning_rate": 7.777777777777779e-07, "loss": -0.8021, "mask/has_final_conf_rate": 0.7734375, "mask/share_final_conf": 0.03203282505273819, "mask/share_reasoning": 0.5971311926841736, "mask/share_step_conf": 0.1716172695159912, "num_tokens": 42439273.0, "reward": 0.9423291683197021, "reward_std": 0.44360941648483276, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.600367546081543, "rewards/format_reward_step": 0.765625, "rewards/step_l2_reward": 0.6942147016525269, "step": 172 }, { "adv/mean_abs_final_conf": 0.7796030044555664, "adv/mean_abs_reasoning": 0.6838363409042358, "adv/mean_abs_step_conf": 0.7759481072425842, "adv/ratio_final_to_reasoning": 1.1400432498581436, "adv/ratio_step_to_reasoning": 1.1346985540671166, "adv/std_final_conf": 0.9365084767341614, "adv/std_reasoning": 0.875312328338623, "adv/std_step_conf": 0.9361023902893066, "calib/answer_extract_rate": 0.69140625, "calib/auroc": 0.6069118579581484, "calib/avg_num_step_conf": 3.99609375, "calib/ece": 0.28174157303370784, "calib/final_conf_rate": 0.6953125, "calib/format_rate": 0.68359375, "calib/frac_conf_gt_0.9": 0.12359550561797752, "calib/gap": 0.07989220038046912, "calib/mean_conf": 0.37432584269662916, "calib/mu_c": 0.41157894736842093, "calib/mu_w": 0.3316867469879518, "calib/nonempty_final_conf_rate": 0.6953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.06117977528089885, "calib/std_conf": 0.27864206330795527, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.37427160493827155, "calib/step_q_c_n": 405.0, "calib/step_q_gap": 0.04116480882176671, "calib/step_q_w": 0.33310679611650484, "calib/step_q_w_n": 618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 402.9765625, "completions/mean_terminated_length": 548.7340087890625, "completions/min_length": 0.0, "completions/min_terminated_length": 88.0, "epoch": 0.18453333333333333, "grad_norm": 0.08679164946079254, "learning_rate": 7.5e-07, "loss": -1.104, "mask/has_final_conf_rate": 0.6953125, "mask/share_final_conf": 0.03351902961730957, "mask/share_reasoning": 0.5200715065002441, "mask/share_step_conf": 0.1807844340801239, "num_tokens": 42645595.0, "reward": 0.7727449536323547, "reward_std": 0.44814908504486084, "rewards/accuracy_reward_step": 0.37109375, "rewards/final_brier_reward_step": 0.46951329708099365, "rewards/format_reward_step": 0.68359375, "rewards/step_l2_reward": 0.5766927003860474, "step": 173 }, { "adv/mean_abs_final_conf": 0.7140973210334778, "adv/mean_abs_reasoning": 0.7074859142303467, "adv/mean_abs_step_conf": 0.7206076383590698, "adv/ratio_final_to_reasoning": 1.009344930648299, "adv/ratio_step_to_reasoning": 1.0185469757980947, "adv/std_final_conf": 0.8916718363761902, "adv/std_reasoning": 0.8908472061157227, "adv/std_step_conf": 0.8918491005897522, "calib/answer_extract_rate": 0.484375, "calib/auroc": 0.5900680272108844, "calib/avg_num_step_conf": 4.32421875, "calib/ece": 0.1684677419354839, "calib/final_conf_rate": 0.484375, "calib/format_rate": 0.484375, "calib/frac_conf_gt_0.9": 0.04032258064516129, "calib/gap": 0.08675646258503394, "calib/mean_conf": 0.26201612903225807, "calib/mu_c": 0.31448979591836734, "calib/mu_w": 0.2277333333333334, "calib/nonempty_final_conf_rate": 0.484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.017661290322580646, "calib/std_conf": 0.20782661755814813, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34782178217821785, "calib/step_q_c_n": 202.0, "calib/step_q_gap": 0.04724719654285875, "calib/step_q_w": 0.3005745856353591, "calib/step_q_w_n": 905.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45703125, "completions/max_length": 3027.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 381.796875, "completions/mean_terminated_length": 703.1654663085938, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.1856, "grad_norm": 0.2889764606952667, "learning_rate": 7.222222222222222e-07, "loss": -1.5071, "mask/has_final_conf_rate": 0.484375, "mask/share_final_conf": 0.018657710403203964, "mask/share_reasoning": 0.3527323603630066, "mask/share_step_conf": 0.17157870531082153, "num_tokens": 42847567.0, "reward": 0.567007303237915, "reward_std": 0.4658864140510559, "rewards/accuracy_reward_step": 0.19140625, "rewards/final_brier_reward_step": 0.35918477177619934, "rewards/format_reward_step": 0.484375, "rewards/step_l2_reward": 0.42644909024238586, "step": 174 }, { "adv/mean_abs_final_conf": 0.7631533741950989, "adv/mean_abs_reasoning": 0.751017153263092, "adv/mean_abs_step_conf": 0.7918944358825684, "adv/ratio_final_to_reasoning": 1.0161597120375696, "adv/ratio_step_to_reasoning": 1.054429226339064, "adv/std_final_conf": 0.9218874573707581, "adv/std_reasoning": 0.9059832692146301, "adv/std_step_conf": 0.9221577048301697, "calib/answer_extract_rate": 0.4296875, "calib/auroc": 0.712914769030579, "calib/avg_num_step_conf": 3.86328125, "calib/ece": 0.20828828828828827, "calib/final_conf_rate": 0.43359375, "calib/format_rate": 0.41796875, "calib/frac_conf_gt_0.9": 0.09009009009009009, "calib/gap": 0.21828887443070916, "calib/mean_conf": 0.2900900900900901, "calib/mu_c": 0.4041509433962264, "calib/mu_w": 0.18586206896551727, "calib/nonempty_final_conf_rate": 0.43359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.010450450450450456, "calib/std_conf": 0.2692983726378555, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3528125, "calib/step_q_c_n": 192.0, "calib/step_q_gap": 0.059181383312421565, "calib/step_q_w": 0.2936311166875784, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 434.8046875, "completions/mean_terminated_length": 856.2307739257812, "completions/min_length": 0.0, "completions/min_terminated_length": 74.0, "epoch": 0.18666666666666668, "grad_norm": 0.16033883392810822, "learning_rate": 6.944444444444446e-07, "loss": -1.7116, "mask/has_final_conf_rate": 0.43359375, "mask/share_final_conf": 0.01755821704864502, "mask/share_reasoning": 0.30797672271728516, "mask/share_step_conf": 0.18227756023406982, "num_tokens": 43064701.0, "reward": 0.4913473129272461, "reward_std": 0.5304877758026123, "rewards/accuracy_reward_step": 0.20703125, "rewards/final_brier_reward_step": 0.311006635427475, "rewards/format_reward_step": 0.41796875, "rewards/step_l2_reward": 0.36445868015289307, "step": 175 }, { "adv/mean_abs_final_conf": 0.7785801291465759, "adv/mean_abs_reasoning": 0.7697429656982422, "adv/mean_abs_step_conf": 0.8165347576141357, "adv/ratio_final_to_reasoning": 1.01148066801795, "adv/ratio_step_to_reasoning": 1.0607888529042266, "adv/std_final_conf": 0.9365408420562744, "adv/std_reasoning": 0.9209882616996765, "adv/std_step_conf": 0.936777651309967, "calib/answer_extract_rate": 0.4765625, "calib/auroc": 0.6894117647058824, "calib/avg_num_step_conf": 3.37890625, "calib/ece": 0.16142857142857137, "calib/final_conf_rate": 0.4921875, "calib/format_rate": 0.4765625, "calib/frac_conf_gt_0.9": 0.03968253968253968, "calib/gap": 0.16075294117647057, "calib/mean_conf": 0.24333333333333337, "calib/mu_c": 0.3390196078431373, "calib/mu_w": 0.1782666666666667, "calib/nonempty_final_conf_rate": 0.4921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0, "calib/std_conf": 0.21761696362449057, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3471938775510203, "calib/step_q_c_n": 196.0, "calib/step_q_gap": 0.04832990146731331, "calib/step_q_w": 0.298863976083707, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 3064.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 388.859375, "completions/mean_terminated_length": 711.05712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.18773333333333334, "grad_norm": 0.20968042314052582, "learning_rate": 6.666666666666667e-07, "loss": -1.6118, "mask/has_final_conf_rate": 0.4921875, "mask/share_final_conf": 0.020962495356798172, "mask/share_reasoning": 0.3428986966609955, "mask/share_step_conf": 0.18301381170749664, "num_tokens": 43268313.0, "reward": 0.555877149105072, "reward_std": 0.538494348526001, "rewards/accuracy_reward_step": 0.19921875, "rewards/final_brier_reward_step": 0.36044570803642273, "rewards/format_reward_step": 0.4765625, "rewards/step_l2_reward": 0.4107682406902313, "step": 176 }, { "adv/mean_abs_final_conf": 0.7055525779724121, "adv/mean_abs_reasoning": 0.6864043474197388, "adv/mean_abs_step_conf": 0.7357356548309326, "adv/ratio_final_to_reasoning": 1.0278964295967143, "adv/ratio_step_to_reasoning": 1.0718691651599164, "adv/std_final_conf": 0.8761857748031616, "adv/std_reasoning": 0.8595938086509705, "adv/std_step_conf": 0.8763537406921387, "calib/answer_extract_rate": 0.41796875, "calib/auroc": 0.5895833333333333, "calib/avg_num_step_conf": 3.015625, "calib/ece": 0.23651376146788994, "calib/final_conf_rate": 0.42578125, "calib/format_rate": 0.41796875, "calib/frac_conf_gt_0.9": 0.009174311926605505, "calib/gap": 0.05122916666666663, "calib/mean_conf": 0.20458715596330276, "calib/mu_c": 0.23466666666666663, "calib/mu_w": 0.1834375, "calib/nonempty_final_conf_rate": 0.42578125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.014128440366972478, "calib/std_conf": 0.17430086879460374, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.34139860139860134, "calib/step_q_c_n": 143.0, "calib/step_q_gap": 0.0337356443238796, "calib/step_q_w": 0.30766295707472174, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53515625, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 327.7890625, "completions/mean_terminated_length": 705.1597290039062, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.1888, "grad_norm": 0.4212135970592499, "learning_rate": 6.388888888888889e-07, "loss": -1.9844, "mask/has_final_conf_rate": 0.42578125, "mask/share_final_conf": 0.016804736107587814, "mask/share_reasoning": 0.26395851373672485, "mask/share_step_conf": 0.18408048152923584, "num_tokens": 43456059.0, "reward": 0.48361408710479736, "reward_std": 0.47823816537857056, "rewards/accuracy_reward_step": 0.17578125, "rewards/final_brier_reward_step": 0.2939867079257965, "rewards/format_reward_step": 0.41796875, "rewards/step_l2_reward": 0.3696609139442444, "step": 177 }, { "adv/mean_abs_final_conf": 0.7811000943183899, "adv/mean_abs_reasoning": 0.8019535541534424, "adv/mean_abs_step_conf": 0.8349236249923706, "adv/ratio_final_to_reasoning": 0.9739966738384671, "adv/ratio_step_to_reasoning": 1.04111219492472, "adv/std_final_conf": 0.9219565987586975, "adv/std_reasoning": 0.9210929274559021, "adv/std_step_conf": 0.9368710517883301, "calib/answer_extract_rate": 0.4375, "calib/auroc": 0.7688524590163934, "calib/avg_num_step_conf": 2.91015625, "calib/ece": 0.3042342342342341, "calib/final_conf_rate": 0.43359375, "calib/format_rate": 0.4296875, "calib/frac_conf_gt_0.9": 0.07207207207207207, "calib/gap": 0.21966229508196722, "calib/mean_conf": 0.2453153153153153, "calib/mu_c": 0.3442622950819672, "calib/mu_w": 0.12459999999999999, "calib/nonempty_final_conf_rate": 0.43359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0, "calib/std_conf": 0.25174640400133486, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.40408284023668634, "calib/step_q_c_n": 169.0, "calib/step_q_gap": 0.12107954162557522, "calib/step_q_w": 0.2830032986111111, "calib/step_q_w_n": 576.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.50390625, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 344.79296875, "completions/mean_terminated_length": 695.0157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.18986666666666666, "grad_norm": 0.15451431274414062, "learning_rate": 6.111111111111112e-07, "loss": -2.3624, "mask/has_final_conf_rate": 0.43359375, "mask/share_final_conf": 0.01961149275302887, "mask/share_reasoning": 0.29202020168304443, "mask/share_step_conf": 0.1844620406627655, "num_tokens": 43650398.0, "reward": 0.5029100775718689, "reward_std": 0.5544787049293518, "rewards/accuracy_reward_step": 0.23828125, "rewards/final_brier_reward_step": 0.3045867085456848, "rewards/format_reward_step": 0.4296875, "rewards/step_l2_reward": 0.3784264326095581, "step": 178 }, { "adv/mean_abs_final_conf": 0.7245436906814575, "adv/mean_abs_reasoning": 0.7352625131607056, "adv/mean_abs_step_conf": 0.7421489953994751, "adv/ratio_final_to_reasoning": 0.9854217748254693, "adv/ratio_step_to_reasoning": 1.0093660184158801, "adv/std_final_conf": 0.9060723781585693, "adv/std_reasoning": 0.9059785008430481, "adv/std_step_conf": 0.9071795344352722, "calib/answer_extract_rate": 0.3203125, "calib/auroc": 0.7464285714285713, "calib/avg_num_step_conf": 2.484375, "calib/ece": 0.22743975903614458, "calib/final_conf_rate": 0.32421875, "calib/format_rate": 0.30078125, "calib/frac_conf_gt_0.9": 0.07228915662650602, "calib/gap": 0.1914181547619048, "calib/mean_conf": 0.21472891566265062, "calib/mu_c": 0.32542857142857146, "calib/mu_w": 0.13401041666666666, "calib/nonempty_final_conf_rate": 0.32421875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.010240963855421682, "calib/std_conf": 0.24911651482602507, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.3837313432835821, "calib/step_q_c_n": 67.0, "calib/step_q_gap": 0.07870058757180703, "calib/step_q_w": 0.30503075571177507, "calib/step_q_w_n": 569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.56640625, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 429.3515625, "completions/mean_terminated_length": 990.2162475585938, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.19093333333333334, "grad_norm": 0.2442111372947693, "learning_rate": 5.833333333333334e-07, "loss": -1.9978, "mask/has_final_conf_rate": 0.32421875, "mask/share_final_conf": 0.011269412003457546, "mask/share_reasoning": 0.24272221326828003, "mask/share_step_conf": 0.17960213124752045, "num_tokens": 43866576.0, "reward": 0.355862557888031, "reward_std": 0.5103425979614258, "rewards/accuracy_reward_step": 0.140625, "rewards/final_brier_reward_step": 0.22041286528110504, "rewards/format_reward_step": 0.30078125, "rewards/step_l2_reward": 0.26868730783462524, "step": 179 }, { "adv/mean_abs_final_conf": 0.655032753944397, "adv/mean_abs_reasoning": 0.6900363564491272, "adv/mean_abs_step_conf": 0.6993515491485596, "adv/ratio_final_to_reasoning": 0.9492728141385824, "adv/ratio_step_to_reasoning": 1.0134995679754726, "adv/std_final_conf": 0.844301164150238, "adv/std_reasoning": 0.85964435338974, "adv/std_step_conf": 0.8606016039848328, "calib/answer_extract_rate": 0.37109375, "calib/auroc": 0.6911637931034482, "calib/avg_num_step_conf": 2.578125, "calib/ece": 0.34959183673469385, "calib/final_conf_rate": 0.3828125, "calib/format_rate": 0.37109375, "calib/frac_conf_gt_0.9": 0.02040816326530612, "calib/gap": 0.1368534482758621, "calib/mean_conf": 0.2422448979591837, "calib/mu_c": 0.2981034482758621, "calib/mu_w": 0.16125, "calib/nonempty_final_conf_rate": 0.3828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0, "calib/std_conf": 0.19933834743892975, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3713533834586466, "calib/step_q_c_n": 133.0, "calib/step_q_gap": 0.05163801343967128, "calib/step_q_w": 0.3197153700189753, "calib/step_q_w_n": 527.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 559.1796875, "completions/mean_terminated_length": 1118.359375, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.192, "grad_norm": 0.13445325195789337, "learning_rate": 5.555555555555555e-07, "loss": -1.4242, "mask/has_final_conf_rate": 0.3828125, "mask/share_final_conf": 0.012486796826124191, "mask/share_reasoning": 0.2714724540710449, "mask/share_step_conf": 0.2160407453775406, "num_tokens": 44113582.0, "reward": 0.42552027106285095, "reward_std": 0.4537741541862488, "rewards/accuracy_reward_step": 0.2265625, "rewards/final_brier_reward_step": 0.24204804003238678, "rewards/format_reward_step": 0.37109375, "rewards/step_l2_reward": 0.3263075053691864, "step": 180 }, { "adv/mean_abs_final_conf": 0.6773039102554321, "adv/mean_abs_reasoning": 0.6787272691726685, "adv/mean_abs_step_conf": 0.7220475673675537, "adv/ratio_final_to_reasoning": 0.9979029000573805, "adv/ratio_step_to_reasoning": 1.0638257812268106, "adv/std_final_conf": 0.875947117805481, "adv/std_reasoning": 0.8751358389854431, "adv/std_step_conf": 0.891865611076355, "calib/answer_extract_rate": 0.37890625, "calib/auroc": 0.7386877828054299, "calib/avg_num_step_conf": 2.2421875, "calib/ece": 0.18888888888888886, "calib/final_conf_rate": 0.38671875, "calib/format_rate": 0.375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.12079638009049776, "calib/mean_conf": 0.17010101010101009, "calib/mu_c": 0.24941176470588236, "calib/mu_w": 0.1286153846153846, "calib/nonempty_final_conf_rate": 0.38671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.007777777777777778, "calib/std_conf": 0.15428882105327657, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.39999999999999997, "calib/step_q_c_n": 67.0, "calib/step_q_gap": 0.08163708086785015, "calib/step_q_w": 0.3183629191321498, "calib/step_q_w_n": 507.0, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.5546875, "completions/max_length": 3043.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 389.5234375, "completions/mean_terminated_length": 874.7192993164062, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.19306666666666666, "grad_norm": 0.1355988085269928, "learning_rate": 5.277777777777779e-07, "loss": -1.8989, "mask/has_final_conf_rate": 0.38671875, "mask/share_final_conf": 0.01411872822791338, "mask/share_reasoning": 0.25829386711120605, "mask/share_step_conf": 0.17289991676807404, "num_tokens": 44319564.0, "reward": 0.44823896884918213, "reward_std": 0.4739459753036499, "rewards/accuracy_reward_step": 0.1328125, "rewards/final_brier_reward_step": 0.2897332012653351, "rewards/format_reward_step": 0.375, "rewards/step_l2_reward": 0.3367881774902344, "step": 181 }, { "adv/mean_abs_final_conf": 0.7794720530509949, "adv/mean_abs_reasoning": 0.7892525792121887, "adv/mean_abs_step_conf": 0.7970051765441895, "adv/ratio_final_to_reasoning": 0.9876078629087832, "adv/ratio_step_to_reasoning": 1.009822707630223, "adv/std_final_conf": 0.9069880843162537, "adv/std_reasoning": 0.9061655402183533, "adv/std_step_conf": 0.8967230916023254, "calib/answer_extract_rate": 0.4140625, "calib/auroc": 0.672941590429275, "calib/avg_num_step_conf": 1.984375, "calib/ece": 0.2689719626168224, "calib/final_conf_rate": 0.41796875, "calib/format_rate": 0.4140625, "calib/frac_conf_gt_0.9": 0.018691588785046728, "calib/gap": 0.13818789584799437, "calib/mean_conf": 0.18897196261682248, "calib/mu_c": 0.26387755102040816, "calib/mu_w": 0.1256896551724138, "calib/nonempty_final_conf_rate": 0.41796875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.0, "calib/std_conf": 0.2071206004702062, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.33185567010309275, "calib/step_q_c_n": 97.0, "calib/step_q_gap": 0.01767075526124362, "calib/step_q_w": 0.31418491484184913, "calib/step_q_w_n": 411.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.54296875, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 346.546875, "completions/mean_terminated_length": 758.2564697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 87.0, "epoch": 0.19413333333333332, "grad_norm": 0.17529059946537018, "learning_rate": 5.000000000000001e-07, "loss": -2.0152, "mask/has_final_conf_rate": 0.41796875, "mask/share_final_conf": 0.015305116772651672, "mask/share_reasoning": 0.2717783451080322, "mask/share_step_conf": 0.1699477881193161, "num_tokens": 44514440.0, "reward": 0.49129849672317505, "reward_std": 0.5451165437698364, "rewards/accuracy_reward_step": 0.19140625, "rewards/final_brier_reward_step": 0.29107969999313354, "rewards/format_reward_step": 0.4140625, "rewards/step_l2_reward": 0.38028234243392944, "step": 182 }, { "adv/mean_abs_final_conf": 0.5986714363098145, "adv/mean_abs_reasoning": 0.5951822996139526, "adv/mean_abs_step_conf": 0.5982235670089722, "adv/ratio_final_to_reasoning": 1.0058622991613242, "adv/ratio_step_to_reasoning": 1.0051098081999283, "adv/std_final_conf": 0.8110201954841614, "adv/std_reasoning": 0.8103538751602173, "adv/std_step_conf": 0.8114042282104492, "calib/answer_extract_rate": 0.33984375, "calib/auroc": 0.6104525862068966, "calib/avg_num_step_conf": 1.90234375, "calib/ece": 0.2295555555555555, "calib/final_conf_rate": 0.3515625, "calib/format_rate": 0.33984375, "calib/frac_conf_gt_0.9": 0.05555555555555555, "calib/gap": 0.0735668103448276, "calib/mean_conf": 0.1897777777777778, "calib/mu_c": 0.23718750000000002, "calib/mu_w": 0.16362068965517243, "calib/nonempty_final_conf_rate": 0.3515625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03188888888888888, "calib/std_conf": 0.23606485821475123, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3470370370370371, "calib/step_q_c_n": 54.0, "calib/step_q_gap": 0.021425027799161833, "calib/step_q_w": 0.32561200923787526, "calib/step_q_w_n": 433.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5859375, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 383.5078125, "completions/mean_terminated_length": 926.2075805664062, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.1952, "grad_norm": 0.21741926670074463, "learning_rate": 4.7222222222222226e-07, "loss": -1.6925, "mask/has_final_conf_rate": 0.3515625, "mask/share_final_conf": 0.012612584047019482, "mask/share_reasoning": 0.23839637637138367, "mask/share_step_conf": 0.16305354237556458, "num_tokens": 44719298.0, "reward": 0.39942651987075806, "reward_std": 0.408824622631073, "rewards/accuracy_reward_step": 0.125, "rewards/final_brier_reward_step": 0.24213281273841858, "rewards/format_reward_step": 0.33984375, "rewards/step_l2_reward": 0.30916762351989746, "step": 183 }, { "adv/mean_abs_final_conf": 0.7714831829071045, "adv/mean_abs_reasoning": 0.8002102971076965, "adv/mean_abs_step_conf": 0.8143944144248962, "adv/ratio_final_to_reasoning": 0.9641005441889161, "adv/ratio_step_to_reasoning": 1.0177254871231565, "adv/std_final_conf": 0.9366480708122253, "adv/std_reasoning": 0.9359897375106812, "adv/std_step_conf": 0.9369352459907532, "calib/answer_extract_rate": 0.44140625, "calib/auroc": 0.6713501291989665, "calib/avg_num_step_conf": 1.93359375, "calib/ece": 0.3282608695652174, "calib/final_conf_rate": 0.44921875, "calib/format_rate": 0.43359375, "calib/frac_conf_gt_0.9": 0.14782608695652175, "calib/gap": 0.19810400516795856, "calib/mean_conf": 0.3435652173913044, "calib/mu_c": 0.41763888888888884, "calib/mu_w": 0.21953488372093027, "calib/nonempty_final_conf_rate": 0.44921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.022869565217391315, "calib/std_conf": 0.32192948540445754, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3510743801652892, "calib/step_q_c_n": 121.0, "calib/step_q_gap": 0.019924647544968366, "calib/step_q_w": 0.3311497326203208, "calib/step_q_w_n": 374.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.47265625, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 451.2265625, "completions/mean_terminated_length": 855.6592407226562, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.19626666666666667, "grad_norm": 0.12983690202236176, "learning_rate": 4.444444444444445e-07, "loss": -1.5325, "mask/has_final_conf_rate": 0.44921875, "mask/share_final_conf": 0.017030686140060425, "mask/share_reasoning": 0.3019544780254364, "mask/share_step_conf": 0.20835858583450317, "num_tokens": 44940092.0, "reward": 0.5032200813293457, "reward_std": 0.5530544519424438, "rewards/accuracy_reward_step": 0.28125, "rewards/final_brier_reward_step": 0.29165664315223694, "rewards/format_reward_step": 0.43359375, "rewards/step_l2_reward": 0.3812098801136017, "step": 184 }, { "adv/mean_abs_final_conf": 0.7000118494033813, "adv/mean_abs_reasoning": 0.7548315525054932, "adv/mean_abs_step_conf": 0.7643401622772217, "adv/ratio_final_to_reasoning": 0.9273749183905334, "adv/ratio_step_to_reasoning": 1.012596995634545, "adv/std_final_conf": 0.8915876150131226, "adv/std_reasoning": 0.9061233401298523, "adv/std_step_conf": 0.907081127166748, "calib/answer_extract_rate": 0.3828125, "calib/auroc": 0.7020016339869282, "calib/avg_num_step_conf": 1.68359375, "calib/ece": 0.30151515151515146, "calib/final_conf_rate": 0.38671875, "calib/format_rate": 0.375, "calib/frac_conf_gt_0.9": 0.050505050505050504, "calib/gap": 0.16599264705882352, "calib/mean_conf": 0.21363636363636365, "calib/mu_c": 0.29411764705882354, "calib/mu_w": 0.12812500000000002, "calib/nonempty_final_conf_rate": 0.38671875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.0, "calib/std_conf": 0.2240375804760548, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3832911392405063, "calib/step_q_c_n": 79.0, "calib/step_q_gap": 0.04158659378596086, "calib/step_q_w": 0.3417045454545454, "calib/step_q_w_n": 352.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 482.5, "completions/mean_terminated_length": 996.1290283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.19733333333333333, "grad_norm": 0.1541745811700821, "learning_rate": 4.1666666666666667e-07, "loss": -1.5324, "mask/has_final_conf_rate": 0.38671875, "mask/share_final_conf": 0.011759690940380096, "mask/share_reasoning": 0.2782679796218872, "mask/share_step_conf": 0.1943473368883133, "num_tokens": 45170532.0, "reward": 0.4367647171020508, "reward_std": 0.502619743347168, "rewards/accuracy_reward_step": 0.19921875, "rewards/final_brier_reward_step": 0.2577418088912964, "rewards/format_reward_step": 0.375, "rewards/step_l2_reward": 0.333962619304657, "step": 185 }, { "adv/mean_abs_final_conf": 0.7071709632873535, "adv/mean_abs_reasoning": 0.7387014627456665, "adv/mean_abs_step_conf": 0.7503018379211426, "adv/ratio_final_to_reasoning": 0.9573163164709085, "adv/ratio_step_to_reasoning": 1.0157037392783261, "adv/std_final_conf": 0.8916911482810974, "adv/std_reasoning": 0.9060976505279541, "adv/std_step_conf": 0.9071915745735168, "calib/answer_extract_rate": 0.390625, "calib/auroc": 0.7592592592592592, "calib/avg_num_step_conf": 1.75, "calib/ece": 0.22000000000000003, "calib/final_conf_rate": 0.39453125, "calib/format_rate": 0.390625, "calib/frac_conf_gt_0.9": 0.12871287128712872, "calib/gap": 0.2488416075650118, "calib/mean_conf": 0.298019801980198, "calib/mu_c": 0.43106382978723407, "calib/mu_w": 0.18222222222222226, "calib/nonempty_final_conf_rate": 0.39453125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.026336633663366357, "calib/std_conf": 0.3058052826473249, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.37469879518072285, "calib/step_q_c_n": 83.0, "calib/step_q_gap": 0.0377672883314078, "calib/step_q_w": 0.33693150684931505, "calib/step_q_w_n": 365.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.51953125, "completions/max_length": 3048.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 479.26171875, "completions/mean_terminated_length": 997.48779296875, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.1984, "grad_norm": 0.09268806129693985, "learning_rate": 3.8888888888888895e-07, "loss": -1.891, "mask/has_final_conf_rate": 0.39453125, "mask/share_final_conf": 0.012729963287711143, "mask/share_reasoning": 0.2655085027217865, "mask/share_step_conf": 0.2022302895784378, "num_tokens": 45398263.0, "reward": 0.47248899936676025, "reward_std": 0.5240097045898438, "rewards/accuracy_reward_step": 0.18359375, "rewards/final_brier_reward_step": 0.29347658157348633, "rewards/format_reward_step": 0.390625, "rewards/step_l2_reward": 0.35777178406715393, "step": 186 }, { "adv/mean_abs_final_conf": 0.7503759860992432, "adv/mean_abs_reasoning": 0.7579132318496704, "adv/mean_abs_step_conf": 0.7848829627037048, "adv/ratio_final_to_reasoning": 0.990055265650881, "adv/ratio_step_to_reasoning": 1.0355841931776484, "adv/std_final_conf": 0.9218424558639526, "adv/std_reasoning": 0.921066164970398, "adv/std_step_conf": 0.9221832752227783, "calib/answer_extract_rate": 0.375, "calib/auroc": 0.6381551362683437, "calib/avg_num_step_conf": 1.7109375, "calib/ece": 0.2854081632653061, "calib/final_conf_rate": 0.3828125, "calib/format_rate": 0.3671875, "calib/frac_conf_gt_0.9": 0.04081632653061224, "calib/gap": 0.06703144654088047, "calib/mean_conf": 0.22908163265306125, "calib/mu_c": 0.2653333333333333, "calib/mu_w": 0.19830188679245284, "calib/nonempty_final_conf_rate": 0.3828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.027653061224489794, "calib/std_conf": 0.21257453307397892, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.33675, "calib/step_q_c_n": 80.0, "calib/step_q_gap": 0.001498603351955341, "calib/step_q_w": 0.33525139664804465, "calib/step_q_w_n": 358.0, "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.53515625, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 446.62109375, "completions/mean_terminated_length": 960.79833984375, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.19946666666666665, "grad_norm": 0.15147480368614197, "learning_rate": 3.611111111111111e-07, "loss": -1.9245, "mask/has_final_conf_rate": 0.3828125, "mask/share_final_conf": 0.0132368765771389, "mask/share_reasoning": 0.24744805693626404, "mask/share_step_conf": 0.20415881276130676, "num_tokens": 45614142.0, "reward": 0.4245920777320862, "reward_std": 0.5307902097702026, "rewards/accuracy_reward_step": 0.17578125, "rewards/final_brier_reward_step": 0.2493503987789154, "rewards/format_reward_step": 0.3671875, "rewards/step_l2_reward": 0.3274933695793152, "step": 187 }, { "adv/mean_abs_final_conf": 0.7576410174369812, "adv/mean_abs_reasoning": 0.728103756904602, "adv/mean_abs_step_conf": 0.7716506123542786, "adv/ratio_final_to_reasoning": 1.0405673782785456, "adv/ratio_step_to_reasoning": 1.0598085850220138, "adv/std_final_conf": 0.9068464636802673, "adv/std_reasoning": 0.8908753991127014, "adv/std_step_conf": 0.9070549607276917, "calib/answer_extract_rate": 0.47265625, "calib/auroc": 0.609681697612732, "calib/avg_num_step_conf": 1.5625, "calib/ece": 0.2597560975609756, "calib/final_conf_rate": 0.48046875, "calib/format_rate": 0.46875, "calib/frac_conf_gt_0.9": 0.04065040650406504, "calib/gap": 0.12151724137931033, "calib/mean_conf": 0.23130081300813007, "calib/mu_c": 0.29551724137931035, "calib/mu_w": 0.17400000000000002, "calib/nonempty_final_conf_rate": 0.48046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.009756097560975606, "calib/std_conf": 0.22595042031021628, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3656321839080459, "calib/step_q_c_n": 87.0, "calib/step_q_gap": 0.021127391575777588, "calib/step_q_w": 0.34450479233226833, "calib/step_q_w_n": 313.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 514.01171875, "completions/mean_terminated_length": 913.7986450195312, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.20053333333333334, "grad_norm": 0.14727431535720825, "learning_rate": 3.3333333333333335e-07, "loss": -1.3545, "mask/has_final_conf_rate": 0.48046875, "mask/share_final_conf": 0.01569814234972, "mask/share_reasoning": 0.31940892338752747, "mask/share_step_conf": 0.22739294171333313, "num_tokens": 45849801.0, "reward": 0.5597624778747559, "reward_std": 0.5230478644371033, "rewards/accuracy_reward_step": 0.2265625, "rewards/final_brier_reward_step": 0.32674336433410645, "rewards/format_reward_step": 0.46875, "rewards/step_l2_reward": 0.435812771320343, "step": 188 }, { "adv/mean_abs_final_conf": 0.7794215679168701, "adv/mean_abs_reasoning": 0.8135305643081665, "adv/mean_abs_step_conf": 0.8352118134498596, "adv/ratio_final_to_reasoning": 0.9580728765608174, "adv/ratio_step_to_reasoning": 1.0266508107905339, "adv/std_final_conf": 0.9219034910202026, "adv/std_reasoning": 0.935815155506134, "adv/std_step_conf": 0.9369481205940247, "calib/answer_extract_rate": 0.4140625, "calib/auroc": 0.662152530292231, "calib/avg_num_step_conf": 1.546875, "calib/ece": 0.23214953271028038, "calib/final_conf_rate": 0.41796875, "calib/format_rate": 0.3984375, "calib/frac_conf_gt_0.9": 0.08411214953271028, "calib/gap": 0.1729116179615111, "calib/mean_conf": 0.22859813084112152, "calib/mu_c": 0.3271739130434783, "calib/mu_w": 0.1542622950819672, "calib/nonempty_final_conf_rate": 0.41796875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.015420560747663549, "calib/std_conf": 0.2735760025210266, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3747826086956521, "calib/step_q_c_n": 69.0, "calib/step_q_gap": 0.04169392368036162, "calib/step_q_w": 0.3330886850152905, "calib/step_q_w_n": 327.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 376.17578125, "completions/mean_terminated_length": 776.6209716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.2016, "grad_norm": 0.2416364550590515, "learning_rate": 3.055555555555556e-07, "loss": -1.9951, "mask/has_final_conf_rate": 0.41796875, "mask/share_final_conf": 0.01586807146668434, "mask/share_reasoning": 0.28485164046287537, "mask/share_step_conf": 0.1836552917957306, "num_tokens": 46053870.0, "reward": 0.46894943714141846, "reward_std": 0.5665953755378723, "rewards/accuracy_reward_step": 0.1796875, "rewards/final_brier_reward_step": 0.2840324342250824, "rewards/format_reward_step": 0.3984375, "rewards/step_l2_reward": 0.3588276207447052, "step": 189 }, { "adv/mean_abs_final_conf": 0.7758157253265381, "adv/mean_abs_reasoning": 0.7713011503219604, "adv/mean_abs_step_conf": 0.8179733753204346, "adv/ratio_final_to_reasoning": 1.0058531936620256, "adv/ratio_step_to_reasoning": 1.0605110273451452, "adv/std_final_conf": 0.921683132648468, "adv/std_reasoning": 0.9060755372047424, "adv/std_step_conf": 0.922048807144165, "calib/answer_extract_rate": 0.44140625, "calib/auroc": 0.637952366223322, "calib/avg_num_step_conf": 1.52734375, "calib/ece": 0.26342105263157894, "calib/final_conf_rate": 0.4453125, "calib/format_rate": 0.4375, "calib/frac_conf_gt_0.9": 0.02631578947368421, "calib/gap": 0.0982523971543458, "calib/mean_conf": 0.24535087719298246, "calib/mu_c": 0.2979245283018868, "calib/mu_w": 0.199672131147541, "calib/nonempty_final_conf_rate": 0.4453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02192982456140351, "calib/std_conf": 0.22736931787239203, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37897435897435894, "calib/step_q_c_n": 78.0, "calib/step_q_gap": 0.04494879986892769, "calib/step_q_w": 0.33402555910543125, "calib/step_q_w_n": 313.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.47265625, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 502.70703125, "completions/mean_terminated_length": 953.281494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.20266666666666666, "grad_norm": 0.1024363711476326, "learning_rate": 2.7777777777777776e-07, "loss": -1.7315, "mask/has_final_conf_rate": 0.4453125, "mask/share_final_conf": 0.014365598559379578, "mask/share_reasoning": 0.27649250626564026, "mask/share_step_conf": 0.23648564517498016, "num_tokens": 46288171.0, "reward": 0.5105445384979248, "reward_std": 0.5440113544464111, "rewards/accuracy_reward_step": 0.20703125, "rewards/final_brier_reward_step": 0.30749452114105225, "rewards/format_reward_step": 0.4375, "rewards/step_l2_reward": 0.38979214429855347, "step": 190 }, { "adv/mean_abs_final_conf": 0.7739953994750977, "adv/mean_abs_reasoning": 0.753760814666748, "adv/mean_abs_step_conf": 0.7896829843521118, "adv/ratio_final_to_reasoning": 1.026844835144814, "adv/ratio_step_to_reasoning": 1.0476572527868082, "adv/std_final_conf": 0.9358944296836853, "adv/std_reasoning": 0.9209585189819336, "adv/std_step_conf": 0.9367083311080933, "calib/answer_extract_rate": 0.4765625, "calib/auroc": 0.594832251082251, "calib/avg_num_step_conf": 1.3984375, "calib/ece": 0.28459016393442627, "calib/final_conf_rate": 0.4765625, "calib/format_rate": 0.46875, "calib/frac_conf_gt_0.9": 0.06557377049180328, "calib/gap": 0.04901515151515151, "calib/mean_conf": 0.27098360655737713, "calib/mu_c": 0.2975, "calib/mu_w": 0.24848484848484848, "calib/nonempty_final_conf_rate": 0.4765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.04827868852459017, "calib/std_conf": 0.25365980133482374, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3499999999999999, "calib/step_q_c_n": 65.0, "calib/step_q_gap": -0.0038225255972696437, "calib/step_q_w": 0.35382252559726957, "calib/step_q_w_n": 293.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 459.91015625, "completions/mean_terminated_length": 853.1666870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.20373333333333332, "grad_norm": 0.1299065798521042, "learning_rate": 2.5000000000000004e-07, "loss": -1.4379, "mask/has_final_conf_rate": 0.4765625, "mask/share_final_conf": 0.01533226203173399, "mask/share_reasoning": 0.29629403352737427, "mask/share_step_conf": 0.22743621468544006, "num_tokens": 46510076.0, "reward": 0.533978283405304, "reward_std": 0.5094114542007446, "rewards/accuracy_reward_step": 0.21875, "rewards/final_brier_reward_step": 0.3145742118358612, "rewards/format_reward_step": 0.46875, "rewards/step_l2_reward": 0.41058823466300964, "step": 191 }, { "adv/mean_abs_final_conf": 0.732417643070221, "adv/mean_abs_reasoning": 0.7503454685211182, "adv/mean_abs_step_conf": 0.7655529975891113, "adv/ratio_final_to_reasoning": 0.9761072383281906, "adv/ratio_step_to_reasoning": 1.0202673697729743, "adv/std_final_conf": 0.9219067692756653, "adv/std_reasoning": 0.9209879040718079, "adv/std_step_conf": 0.9221699237823486, "calib/answer_extract_rate": 0.4375, "calib/auroc": 0.5488588878174221, "calib/avg_num_step_conf": 1.41796875, "calib/ece": 0.3138839285714286, "calib/final_conf_rate": 0.4375, "calib/format_rate": 0.421875, "calib/frac_conf_gt_0.9": 0.08035714285714286, "calib/gap": 0.016311475409836074, "calib/mean_conf": 0.26111607142857146, "calib/mu_c": 0.27, "calib/mu_w": 0.25368852459016394, "calib/nonempty_final_conf_rate": 0.4375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.05982142857142858, "calib/std_conf": 0.26199245986749975, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.34548387096774186, "calib/step_q_c_n": 62.0, "calib/step_q_gap": 0.0072778909012966575, "calib/step_q_w": 0.3382059800664452, "calib/step_q_w_n": 301.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45703125, "completions/max_length": 3042.0, "completions/max_terminated_length": 3042.0, "completions/mean_length": 508.93359375, "completions/mean_terminated_length": 937.3165893554688, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.2048, "grad_norm": 0.21384847164154053, "learning_rate": 2.2222222222222224e-07, "loss": -1.3701, "mask/has_final_conf_rate": 0.4375, "mask/share_final_conf": 0.013786101713776588, "mask/share_reasoning": 0.33207717537879944, "mask/share_step_conf": 0.19710546731948853, "num_tokens": 46745339.0, "reward": 0.48342451453208923, "reward_std": 0.5122061371803284, "rewards/accuracy_reward_step": 0.19921875, "rewards/final_brier_reward_step": 0.27475929260253906, "rewards/format_reward_step": 0.421875, "rewards/step_l2_reward": 0.37858062982559204, "step": 192 }, { "adv/mean_abs_final_conf": 0.7556190490722656, "adv/mean_abs_reasoning": 0.7361305952072144, "adv/mean_abs_step_conf": 0.7891688346862793, "adv/ratio_final_to_reasoning": 1.0264741799783033, "adv/ratio_step_to_reasoning": 1.0720500408818563, "adv/std_final_conf": 0.9069178700447083, "adv/std_reasoning": 0.8907724618911743, "adv/std_step_conf": 0.9070672988891602, "calib/answer_extract_rate": 0.49609375, "calib/auroc": 0.6909090909090909, "calib/avg_num_step_conf": 1.55078125, "calib/ece": 0.24330708661417322, "calib/final_conf_rate": 0.49609375, "calib/format_rate": 0.48828125, "calib/frac_conf_gt_0.9": 0.06299212598425197, "calib/gap": 0.13722222222222233, "calib/mean_conf": 0.24220472440944882, "calib/mu_c": 0.3200000000000001, "calib/mu_w": 0.1827777777777778, "calib/nonempty_final_conf_rate": 0.49609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.02622047244094488, "calib/std_conf": 0.2458855549222285, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.34407894736842104, "calib/step_q_c_n": 76.0, "calib/step_q_gap": 0.009686424003935101, "calib/step_q_w": 0.33439252336448594, "calib/step_q_w_n": 321.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 3049.0, "completions/max_terminated_length": 3049.0, "completions/mean_length": 446.3515625, "completions/mean_terminated_length": 816.1857299804688, "completions/min_length": 0.0, "completions/min_terminated_length": 91.0, "epoch": 0.20586666666666667, "grad_norm": 0.12955591082572937, "learning_rate": 1.9444444444444447e-07, "loss": -1.5353, "mask/has_final_conf_rate": 0.49609375, "mask/share_final_conf": 0.0178175400942564, "mask/share_reasoning": 0.31756535172462463, "mask/share_step_conf": 0.21149210631847382, "num_tokens": 46965317.0, "reward": 0.5772705674171448, "reward_std": 0.5131789445877075, "rewards/accuracy_reward_step": 0.21484375, "rewards/final_brier_reward_step": 0.3521054685115814, "rewards/format_reward_step": 0.48828125, "rewards/step_l2_reward": 0.4412071108818054, "step": 193 }, { "adv/mean_abs_final_conf": 0.8192306160926819, "adv/mean_abs_reasoning": 0.819528341293335, "adv/mean_abs_step_conf": 0.8271200656890869, "adv/ratio_final_to_reasoning": 0.9996367115258232, "adv/ratio_step_to_reasoning": 1.009263528804594, "adv/std_final_conf": 0.9367586374282837, "adv/std_reasoning": 0.9357778429985046, "adv/std_step_conf": 0.9369503855705261, "calib/answer_extract_rate": 0.453125, "calib/auroc": 0.7305976806422836, "calib/avg_num_step_conf": 1.4296875, "calib/ece": 0.2682758620689655, "calib/final_conf_rate": 0.453125, "calib/format_rate": 0.453125, "calib/frac_conf_gt_0.9": 0.06896551724137931, "calib/gap": 0.20591733571216173, "calib/mean_conf": 0.25017241379310345, "calib/mu_c": 0.3513559322033898, "calib/mu_w": 0.14543859649122806, "calib/nonempty_final_conf_rate": 0.453125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.004913793103448276, "calib/std_conf": 0.2617579256733104, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3655294117647058, "calib/step_q_c_n": 85.0, "calib/step_q_gap": 0.017522294326983434, "calib/step_q_w": 0.3480071174377224, "calib/step_q_w_n": 281.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.50390625, "completions/max_length": 2978.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 342.58984375, "completions/mean_terminated_length": 690.5748291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.20693333333333333, "grad_norm": 0.19686636328697205, "learning_rate": 1.6666666666666668e-07, "loss": -2.0312, "mask/has_final_conf_rate": 0.453125, "mask/share_final_conf": 0.020617563277482986, "mask/share_reasoning": 0.26393723487854004, "mask/share_step_conf": 0.21153897047042847, "num_tokens": 47158964.0, "reward": 0.53669673204422, "reward_std": 0.5804100036621094, "rewards/accuracy_reward_step": 0.23046875, "rewards/final_brier_reward_step": 0.3252031207084656, "rewards/format_reward_step": 0.453125, "rewards/step_l2_reward": 0.4076477289199829, "step": 194 }, { "adv/mean_abs_final_conf": 0.8255274295806885, "adv/mean_abs_reasoning": 0.8241317868232727, "adv/mean_abs_step_conf": 0.818082869052887, "adv/ratio_final_to_reasoning": 1.0016934703644855, "adv/ratio_step_to_reasoning": 0.9926602542613941, "adv/std_final_conf": 0.9363267421722412, "adv/std_reasoning": 0.9357849359512329, "adv/std_step_conf": 0.9369533658027649, "calib/answer_extract_rate": 0.52734375, "calib/auroc": 0.6524175824175824, "calib/avg_num_step_conf": 1.3046875, "calib/ece": 0.2911851851851852, "calib/final_conf_rate": 0.52734375, "calib/format_rate": 0.515625, "calib/frac_conf_gt_0.9": 0.02962962962962963, "calib/gap": 0.10741758241758251, "calib/mean_conf": 0.23414814814814813, "calib/mu_c": 0.2898461538461539, "calib/mu_w": 0.1824285714285714, "calib/nonempty_final_conf_rate": 0.52734375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.021925925925925925, "calib/std_conf": 0.23289923169946325, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3679761904761904, "calib/step_q_c_n": 84.0, "calib/step_q_gap": 0.026256190476190433, "calib/step_q_w": 0.34171999999999997, "calib/step_q_w_n": 250.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3054.0, "completions/max_terminated_length": 3054.0, "completions/mean_length": 458.34765625, "completions/mean_terminated_length": 771.9539794921875, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.208, "grad_norm": 0.09223850816488266, "learning_rate": 1.3888888888888888e-07, "loss": -1.6689, "mask/has_final_conf_rate": 0.52734375, "mask/share_final_conf": 0.02058326080441475, "mask/share_reasoning": 0.3318544030189514, "mask/share_step_conf": 0.24131232500076294, "num_tokens": 47382285.0, "reward": 0.6079497337341309, "reward_std": 0.5632410049438477, "rewards/accuracy_reward_step": 0.25390625, "rewards/final_brier_reward_step": 0.35428789258003235, "rewards/format_reward_step": 0.515625, "rewards/step_l2_reward": 0.47180354595184326, "step": 195 }, { "adv/mean_abs_final_conf": 0.8046687841415405, "adv/mean_abs_reasoning": 0.7653719186782837, "adv/mean_abs_step_conf": 0.8247447609901428, "adv/ratio_final_to_reasoning": 1.0513434900134804, "adv/ratio_step_to_reasoning": 1.0775738446406418, "adv/std_final_conf": 0.9367269277572632, "adv/std_reasoning": 0.9060116410255432, "adv/std_step_conf": 0.9361777305603027, "calib/answer_extract_rate": 0.49609375, "calib/auroc": 0.7307135969141756, "calib/avg_num_step_conf": 1.3203125, "calib/ece": 0.2507751937984496, "calib/final_conf_rate": 0.50390625, "calib/format_rate": 0.484375, "calib/frac_conf_gt_0.9": 0.13953488372093023, "calib/gap": 0.21747830279652844, "calib/mean_conf": 0.3462790697674419, "calib/mu_c": 0.4491176470588235, "calib/mu_w": 0.23163934426229507, "calib/nonempty_final_conf_rate": 0.50390625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.03496124031007749, "calib/std_conf": 0.3126557156660814, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4024675324675324, "calib/step_q_c_n": 77.0, "calib/step_q_gap": 0.057831517141861966, "calib/step_q_w": 0.34463601532567045, "calib/step_q_w_n": 261.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 368.390625, "completions/mean_terminated_length": 673.6286010742188, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.20906666666666668, "grad_norm": 0.1020633801817894, "learning_rate": 1.1111111111111112e-07, "loss": -1.5135, "mask/has_final_conf_rate": 0.50390625, "mask/share_final_conf": 0.021770047023892403, "mask/share_reasoning": 0.3151125907897949, "mask/share_step_conf": 0.20999234914779663, "num_tokens": 47579137.0, "reward": 0.5831013321876526, "reward_std": 0.5391672849655151, "rewards/accuracy_reward_step": 0.265625, "rewards/final_brier_reward_step": 0.3524019718170166, "rewards/format_reward_step": 0.484375, "rewards/step_l2_reward": 0.44253382086753845, "step": 196 }, { "adv/mean_abs_final_conf": 0.7114823460578918, "adv/mean_abs_reasoning": 0.7125142812728882, "adv/mean_abs_step_conf": 0.7193067073822021, "adv/ratio_final_to_reasoning": 0.9985516989032798, "adv/ratio_step_to_reasoning": 1.0095330385479144, "adv/std_final_conf": 0.8917362093925476, "adv/std_reasoning": 0.8907146453857422, "adv/std_step_conf": 0.8919350504875183, "calib/answer_extract_rate": 0.33984375, "calib/auroc": 0.6805555555555556, "calib/avg_num_step_conf": 1.375, "calib/ece": 0.18109890109890112, "calib/final_conf_rate": 0.35546875, "calib/format_rate": 0.33203125, "calib/frac_conf_gt_0.9": 0.13186813186813187, "calib/gap": 0.21496969696969695, "calib/mean_conf": 0.2934065934065934, "calib/mu_c": 0.42333333333333334, "calib/mu_w": 0.2083636363636364, "calib/nonempty_final_conf_rate": 0.35546875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.039450549450549474, "calib/std_conf": 0.30445129811442734, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.39638297872340433, "calib/step_q_c_n": 47.0, "calib/step_q_gap": 0.048415765608650296, "calib/step_q_w": 0.34796721311475404, "calib/step_q_w_n": 305.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 340.95703125, "completions/mean_terminated_length": 839.2788696289062, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.21013333333333334, "grad_norm": 0.22904542088508606, "learning_rate": 8.333333333333334e-08, "loss": -2.1411, "mask/has_final_conf_rate": 0.35546875, "mask/share_final_conf": 0.011857477948069572, "mask/share_reasoning": 0.23438555002212524, "mask/share_step_conf": 0.16000698506832123, "num_tokens": 47771478.0, "reward": 0.40049824118614197, "reward_std": 0.5027368664741516, "rewards/accuracy_reward_step": 0.140625, "rewards/final_brier_reward_step": 0.25186797976493835, "rewards/format_reward_step": 0.33203125, "rewards/step_l2_reward": 0.30306482315063477, "step": 197 }, { "adv/mean_abs_final_conf": 0.7501786351203918, "adv/mean_abs_reasoning": 0.7775917053222656, "adv/mean_abs_step_conf": 0.7811833620071411, "adv/ratio_final_to_reasoning": 0.9647461900451823, "adv/ratio_step_to_reasoning": 1.0046189493281528, "adv/std_final_conf": 0.9219053983688354, "adv/std_reasoning": 0.921049952507019, "adv/std_step_conf": 0.9221871495246887, "calib/answer_extract_rate": 0.46484375, "calib/auroc": 0.6527815468113976, "calib/avg_num_step_conf": 1.265625, "calib/ece": 0.2981967213114755, "calib/final_conf_rate": 0.4765625, "calib/format_rate": 0.45703125, "calib/frac_conf_gt_0.9": 0.08196721311475409, "calib/gap": 0.13176119402985073, "calib/mean_conf": 0.27836065573770485, "calib/mu_c": 0.3377611940298507, "calib/mu_w": 0.206, "calib/nonempty_final_conf_rate": 0.4765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.013688524590163946, "calib/std_conf": 0.24966653384897258, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3542857142857142, "calib/step_q_c_n": 70.0, "calib/step_q_gap": 0.005860517435320511, "calib/step_q_w": 0.3484251968503937, "calib/step_q_w_n": 254.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 443.6171875, "completions/mean_terminated_length": 822.9420166015625, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.2112, "grad_norm": 0.13247403502464294, "learning_rate": 5.555555555555556e-08, "loss": -1.6816, "mask/has_final_conf_rate": 0.4765625, "mask/share_final_conf": 0.018223222345113754, "mask/share_reasoning": 0.29040613770484924, "mask/share_step_conf": 0.2304331362247467, "num_tokens": 47990428.0, "reward": 0.5378419160842896, "reward_std": 0.5349186062812805, "rewards/accuracy_reward_step": 0.26171875, "rewards/final_brier_reward_step": 0.31049844622612, "rewards/format_reward_step": 0.45703125, "rewards/step_l2_reward": 0.41429033875465393, "step": 198 }, { "adv/mean_abs_final_conf": 0.7660308480262756, "adv/mean_abs_reasoning": 0.7764954566955566, "adv/mean_abs_step_conf": 0.793433666229248, "adv/ratio_final_to_reasoning": 0.9865232840977408, "adv/ratio_step_to_reasoning": 1.0218136621246612, "adv/std_final_conf": 0.9069642424583435, "adv/std_reasoning": 0.9061921238899231, "adv/std_step_conf": 0.9068816304206848, "calib/answer_extract_rate": 0.57421875, "calib/auroc": 0.6813743218806512, "calib/avg_num_step_conf": 1.2890625, "calib/ece": 0.23738255033557049, "calib/final_conf_rate": 0.58203125, "calib/format_rate": 0.5625, "calib/frac_conf_gt_0.9": 0.11409395973154363, "calib/gap": 0.19223508137432188, "calib/mean_conf": 0.33563758389261744, "calib/mu_c": 0.4259493670886076, "calib/mu_w": 0.2337142857142857, "calib/nonempty_final_conf_rate": 0.58203125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.02140939597315436, "calib/std_conf": 0.2951448788765415, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3555670103092783, "calib/step_q_c_n": 97.0, "calib/step_q_gap": 0.00912924206893495, "calib/step_q_w": 0.34643776824034334, "calib/step_q_w_n": 233.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.37109375, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 508.765625, "completions/mean_terminated_length": 808.9689331054688, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.21226666666666666, "grad_norm": 0.1422998607158661, "learning_rate": 2.777777777777778e-08, "loss": -1.4611, "mask/has_final_conf_rate": 0.58203125, "mask/share_final_conf": 0.018554989248514175, "mask/share_reasoning": 0.3966650366783142, "mask/share_step_conf": 0.21368621289730072, "num_tokens": 48224872.0, "reward": 0.6758462190628052, "reward_std": 0.5250706672668457, "rewards/accuracy_reward_step": 0.3125, "rewards/final_brier_reward_step": 0.40119296312332153, "rewards/format_reward_step": 0.5625, "rewards/step_l2_reward": 0.5169996619224548, "step": 199 }, { "adv/mean_abs_final_conf": 0.7240675687789917, "adv/mean_abs_reasoning": 0.7000408172607422, "adv/mean_abs_step_conf": 0.7183365821838379, "adv/ratio_final_to_reasoning": 1.0343219294158676, "adv/ratio_step_to_reasoning": 1.0261352830749026, "adv/std_final_conf": 0.8912106156349182, "adv/std_reasoning": 0.8754467368125916, "adv/std_step_conf": 0.8915499448776245, "calib/answer_extract_rate": 0.515625, "calib/auroc": 0.75, "calib/avg_num_step_conf": 1.1796875, "calib/ece": 0.2829323308270677, "calib/final_conf_rate": 0.51953125, "calib/format_rate": 0.5078125, "calib/frac_conf_gt_0.9": 0.12781954887218044, "calib/gap": 0.19675438596491232, "calib/mean_conf": 0.3343609022556391, "calib/mu_c": 0.4186842105263158, "calib/mu_w": 0.2219298245614035, "calib/nonempty_final_conf_rate": 0.51953125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.02293233082706766, "calib/std_conf": 0.28058036893869176, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.3819101123595505, "calib/step_q_c_n": 89.0, "calib/step_q_gap": 0.045900722688189066, "calib/step_q_w": 0.33600938967136146, "calib/step_q_w_n": 213.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2989.0, "completions/max_terminated_length": 2989.0, "completions/mean_length": 426.30078125, "completions/mean_terminated_length": 737.3851318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.21333333333333335, "grad_norm": 0.09993603080511093, "learning_rate": 0.0, "loss": -1.3081, "mask/has_final_conf_rate": 0.51953125, "mask/share_final_conf": 0.018429240211844444, "mask/share_reasoning": 0.37678319215774536, "mask/share_step_conf": 0.18291257321834564, "num_tokens": 48442053.0, "reward": 0.60547935962677, "reward_std": 0.48481401801109314, "rewards/accuracy_reward_step": 0.296875, "rewards/final_brier_reward_step": 0.3608894348144531, "rewards/format_reward_step": 0.5078125, "rewards/step_l2_reward": 0.4594211280345917, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.2843201345717534, "train_runtime": 30116.5704, "train_samples_per_second": 1.7, "train_steps_per_second": 0.007 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 48442053, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }