{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7490277290344238, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5698086470140988, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9343300461769104, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04301927983760834, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.0135, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 0.8933746814727783, "reward_std": 0.19672557711601257, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420004606246948, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.7698483467102051, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.5081617139634353, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9345317482948303, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.04039499908685684, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0158, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 0.8337589502334595, "reward_std": 0.1928534209728241, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291916012763977, "step": 2 }, { "adv/mean_abs_final_conf": 0.7824219465255737, "adv/mean_abs_reasoning": 0.49416670203208923, "adv/mean_abs_step_conf": 0.7591285705566406, "adv/ratio_final_to_reasoning": 1.583315798713541, "adv/ratio_step_to_reasoning": 1.5361791222172347, "adv/std_final_conf": 0.931019127368927, "adv/std_reasoning": 0.7392831444740295, "adv/std_step_conf": 0.9340925216674805, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.41486037234042555, "calib/avg_num_step_conf": 4.84375, "calib/ece": 0.25031496062992126, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2992125984251969, "calib/gap": -0.012628989361702203, "calib/mean_conf": 0.8802362204724409, "calib/mu_c": 0.8755625, "calib/mu_w": 0.8881914893617022, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25031496062992126, "calib/std_conf": 0.04860194362675066, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8069333333333334, "calib/step_q_c_n": 675.0, "calib/step_q_gap": 0.04603067846607678, "calib/step_q_w": 0.7609026548672566, "calib/step_q_w_n": 565.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2572.0, "completions/max_terminated_length": 2572.0, "completions/mean_length": 499.26171875, "completions/mean_terminated_length": 501.2196350097656, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.04080616310238838, "kl": 0.0015122145414352417, "learning_rate": 7.5e-07, "loss": 0.0555, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.032926708459854126, "mask/share_reasoning": 0.8567208051681519, "mask/share_step_conf": 0.10644622147083282, "num_tokens": 691728.0, "reward": 0.878947377204895, "reward_std": 0.1959269642829895, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6897921562194824, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7454462647438049, "step": 3 }, { "adv/mean_abs_final_conf": 0.760870099067688, "adv/mean_abs_reasoning": 0.5114138722419739, "adv/mean_abs_step_conf": 0.7488071918487549, "adv/ratio_final_to_reasoning": 1.487777591429285, "adv/ratio_step_to_reasoning": 1.4641902234017992, "adv/std_final_conf": 0.9306450486183167, "adv/std_reasoning": 0.7575408220291138, "adv/std_step_conf": 0.9347666501998901, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4202674897119342, "calib/avg_num_step_conf": 4.8125, "calib/ece": 0.23678571428571432, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.24603174603174602, "calib/gap": -0.010506172839506278, "calib/mean_conf": 0.8775793650793651, "calib/mu_c": 0.8738271604938271, "calib/mu_w": 0.8843333333333334, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23575396825396827, "calib/std_conf": 0.052486710816122675, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7923658536585366, "calib/step_q_c_n": 820.0, "calib/step_q_gap": 0.01923478569737147, "calib/step_q_w": 0.7731310679611652, "calib/step_q_w_n": 412.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2321.0, "completions/max_terminated_length": 2321.0, "completions/mean_length": 503.68359375, "completions/mean_terminated_length": 503.68359375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.004266666666666667, "grad_norm": 0.0455099456012249, "kl": 0.0005451589822769165, "learning_rate": 1.0000000000000002e-06, "loss": 0.027, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0339834988117218, "mask/share_reasoning": 0.8529292345046997, "mask/share_step_conf": 0.1130872368812561, "num_tokens": 926839.0, "reward": 0.8701273202896118, "reward_std": 0.20101894438266754, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6966761350631714, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7201408743858337, "step": 4 }, { "adv/mean_abs_final_conf": 0.7657008767127991, "adv/mean_abs_reasoning": 0.39336252212524414, "adv/mean_abs_step_conf": 0.7815566658973694, "adv/ratio_final_to_reasoning": 1.946552692859247, "adv/ratio_step_to_reasoning": 1.9868610300615437, "adv/std_final_conf": 0.9317295551300049, "adv/std_reasoning": 0.6815266609191895, "adv/std_step_conf": 0.9331948757171631, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4420645161290322, "calib/avg_num_step_conf": 4.71875, "calib/ece": 0.3807630522088353, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.27309236947791166, "calib/gap": -0.011701290322580715, "calib/mean_conf": 0.8732128514056225, "calib/mu_c": 0.8673387096774194, "calib/mu_w": 0.8790400000000002, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.37799196787148587, "calib/std_conf": 0.05030947274314032, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7987844408427877, "calib/step_q_c_n": 617.0, "calib/step_q_gap": 0.011102545749725135, "calib/step_q_w": 0.7876818950930625, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 496.82421875, "completions/mean_terminated_length": 498.7725830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.005333333333333333, "grad_norm": 0.042361173778772354, "kl": 0.0002884864807128906, "learning_rate": 1.25e-06, "loss": 0.0051, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.034748584032058716, "mask/share_reasoning": 0.8492770791053772, "mask/share_step_conf": 0.1120680719614029, "num_tokens": 1160714.0, "reward": 0.7748649716377258, "reward_std": 0.16152727603912354, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.5831875205039978, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.675917387008667, "step": 5 }, { "adv/mean_abs_final_conf": 0.7618625164031982, "adv/mean_abs_reasoning": 0.2755002975463867, "adv/mean_abs_step_conf": 0.7575180530548096, "adv/ratio_final_to_reasoning": 2.765378198094038, "adv/ratio_step_to_reasoning": 2.749608838180163, "adv/std_final_conf": 0.9300011992454529, "adv/std_reasoning": 0.5726578831672668, "adv/std_step_conf": 0.9342772364616394, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5214813870474249, "calib/avg_num_step_conf": 4.83984375, "calib/ece": 0.29996062992125977, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3228346456692913, "calib/gap": 0.004041305456399913, "calib/mean_conf": 0.8826377952755905, "calib/mu_c": 0.8843243243243243, "calib/mu_w": 0.8802830188679244, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29996062992125977, "calib/std_conf": 0.04387404471739072, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7977023121387283, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.0004079794147795468, "calib/step_q_w": 0.7972943327239488, "calib/step_q_w_n": 547.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2624.0, "completions/max_terminated_length": 2624.0, "completions/mean_length": 447.80859375, "completions/mean_terminated_length": 447.80859375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.0064, "grad_norm": 0.039196934551000595, "kl": 0.0003452599048614502, "learning_rate": 1.5e-06, "loss": 0.0626, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.037614528089761734, "mask/share_reasoning": 0.8385595083236694, "mask/share_step_conf": 0.12382596731185913, "num_tokens": 1381305.0, "reward": 0.8490424156188965, "reward_std": 0.13823771476745605, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6616894602775574, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.722332775592804, "step": 6 }, { "adv/mean_abs_final_conf": 0.7597512006759644, "adv/mean_abs_reasoning": 0.4481978416442871, "adv/mean_abs_step_conf": 0.7643946409225464, "adv/ratio_final_to_reasoning": 1.6951246304281449, "adv/ratio_step_to_reasoning": 1.7054848772101168, "adv/std_final_conf": 0.930804967880249, "adv/std_reasoning": 0.7205727696418762, "adv/std_step_conf": 0.9342158436775208, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5157617044012556, "calib/avg_num_step_conf": 5.6328125, "calib/ece": 0.24736220472440945, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.29133858267716534, "calib/gap": 0.004130768717024003, "calib/mean_conf": 0.8812204724409448, "calib/mu_c": 0.8827329192546585, "calib/mu_w": 0.8786021505376345, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24736220472440945, "calib/std_conf": 0.046791616010890866, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7910278372591005, "calib/step_q_c_n": 934.0, "calib/step_q_gap": 0.00996484513311624, "calib/step_q_w": 0.7810629921259843, "calib/step_q_w_n": 508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 547.94140625, "completions/mean_terminated_length": 550.0902099609375, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.007466666666666667, "grad_norm": 0.036365069448947906, "kl": 0.0002646446228027344, "learning_rate": 1.75e-06, "loss": 0.0327, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.029931582510471344, "mask/share_reasoning": 0.8557358980178833, "mask/share_step_conf": 0.11042627692222595, "num_tokens": 1629002.0, "reward": 0.8819910287857056, "reward_std": 0.18995745480060577, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6971203088760376, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7434240579605103, "step": 7 }, { "adv/mean_abs_final_conf": 0.7698913812637329, "adv/mean_abs_reasoning": 0.47465720772743225, "adv/mean_abs_step_conf": 0.7837836742401123, "adv/ratio_final_to_reasoning": 1.6219945019898157, "adv/ratio_step_to_reasoning": 1.6512625563882581, "adv/std_final_conf": 0.93187016248703, "adv/std_reasoning": 0.7392329573631287, "adv/std_step_conf": 0.9343845248222351, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.49318181818181817, "calib/avg_num_step_conf": 4.9453125, "calib/ece": 0.3134920634920635, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.28174603174603174, "calib/gap": 0.004327784891165032, "calib/mean_conf": 0.876984126984127, "calib/mu_c": 0.8788732394366198, "calib/mu_w": 0.8745454545454547, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3134920634920635, "calib/std_conf": 0.05139745286243065, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.798998482549317, "calib/step_q_c_n": 659.0, "calib/step_q_gap": 0.05534115141257889, "calib/step_q_w": 0.7436573311367382, "calib/step_q_w_n": 607.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 529.98046875, "completions/mean_terminated_length": 529.98046875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.008533333333333334, "grad_norm": 0.05035858228802681, "kl": 0.0004501938819885254, "learning_rate": 2.0000000000000003e-06, "loss": -0.0013, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03286789357662201, "mask/share_reasoning": 0.8621331453323364, "mask/share_step_conf": 0.10499894618988037, "num_tokens": 1871189.0, "reward": 0.8457349538803101, "reward_std": 0.1846785992383957, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6442609429359436, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7401776909828186, "step": 8 }, { "adv/mean_abs_final_conf": 0.7846425175666809, "adv/mean_abs_reasoning": 0.5143224000930786, "adv/mean_abs_step_conf": 0.7686464786529541, "adv/ratio_final_to_reasoning": 1.5255849588209294, "adv/ratio_step_to_reasoning": 1.4944837683792298, "adv/std_final_conf": 0.932033896446228, "adv/std_reasoning": 0.7576586604118347, "adv/std_step_conf": 0.9345789551734924, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4397239665096808, "calib/avg_num_step_conf": 4.84765625, "calib/ece": 0.29884462151394425, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2549800796812749, "calib/gap": -0.009276556776556588, "calib/mean_conf": 0.878605577689243, "calib/mu_c": 0.874761904761905, "calib/mu_w": 0.8840384615384616, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2958964143426295, "calib/std_conf": 0.0430144954809309, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7825727411944869, "calib/step_q_c_n": 653.0, "calib/step_q_gap": 0.054018319425779504, "calib/step_q_w": 0.7285544217687074, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2660.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 510.5859375, "completions/mean_terminated_length": 514.6063232421875, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0096, "grad_norm": 0.04146807640790939, "kl": 0.005201190710067749, "learning_rate": 2.25e-06, "loss": -0.0547, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03358618915081024, "mask/share_reasoning": 0.8540961742401123, "mask/share_step_conf": 0.10450513660907745, "num_tokens": 2109435.0, "reward": 0.8226215839385986, "reward_std": 0.2236328274011612, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6483156085014343, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6867712736129761, "step": 9 }, { "adv/mean_abs_final_conf": 0.7660097479820251, "adv/mean_abs_reasoning": 0.3731532096862793, "adv/mean_abs_step_conf": 0.7574515342712402, "adv/ratio_final_to_reasoning": 2.0528022487761306, "adv/ratio_step_to_reasoning": 2.0298673965796827, "adv/std_final_conf": 0.9304073452949524, "adv/std_reasoning": 0.6611678600311279, "adv/std_step_conf": 0.9340184330940247, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5357142857142858, "calib/avg_num_step_conf": 5.01171875, "calib/ece": 0.2694094488188976, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.27165354330708663, "calib/gap": 0.013868831168830975, "calib/mean_conf": 0.8757086614173228, "calib/mu_c": 0.881168831168831, "calib/mu_w": 0.8673000000000001, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2694094488188976, "calib/std_conf": 0.06993310754770636, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7612042620689656, "calib/step_q_c_n": 725.0, "calib/step_q_gap": -0.01596419671239646, "calib/step_q_w": 0.7771684587813621, "calib/step_q_w_n": 558.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2128.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 506.41796875, "completions/mean_terminated_length": 506.41796875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.010666666666666666, "grad_norm": 0.04930815100669861, "kl": 0.0004813075065612793, "learning_rate": 2.5e-06, "loss": 0.0962, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03370947390794754, "mask/share_reasoning": 0.8564748764038086, "mask/share_step_conf": 0.10981567949056625, "num_tokens": 2345878.0, "reward": 0.8647788763046265, "reward_std": 0.16030901670455933, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6840370893478394, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7275517582893372, "step": 10 }, { "adv/mean_abs_final_conf": 0.7875567674636841, "adv/mean_abs_reasoning": 0.405804842710495, "adv/mean_abs_step_conf": 0.7895094752311707, "adv/ratio_final_to_reasoning": 1.9407278686063747, "adv/ratio_step_to_reasoning": 1.9455398066661669, "adv/std_final_conf": 0.928180456161499, "adv/std_reasoning": 0.6816299557685852, "adv/std_step_conf": 0.933238685131073, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.42639821029082775, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.305748031496063, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3110236220472441, "calib/gap": -0.021482901885586325, "calib/mean_conf": 0.8778740157480315, "calib/mu_c": 0.8689932885906041, "calib/mu_w": 0.8904761904761904, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.29850393700787403, "calib/std_conf": 0.07778559186119627, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7642151238591917, "calib/step_q_c_n": 767.0, "calib/step_q_gap": -0.01411560937013745, "calib/step_q_w": 0.7783307332293291, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 528.2890625, "completions/mean_terminated_length": 530.36083984375, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.011733333333333333, "grad_norm": 0.03836844116449356, "kl": 0.0006773471832275391, "learning_rate": 2.7500000000000004e-06, "loss": -0.002, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032403960824012756, "mask/share_reasoning": 0.8480488657951355, "mask/share_step_conf": 0.11564093828201294, "num_tokens": 2585600.0, "reward": 0.8380727171897888, "reward_std": 0.1705556958913803, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6433242559432983, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7195398807525635, "step": 11 }, { "adv/mean_abs_final_conf": 0.7522432804107666, "adv/mean_abs_reasoning": 0.495732843875885, "adv/mean_abs_step_conf": 0.7614065408706665, "adv/ratio_final_to_reasoning": 1.5174368406364926, "adv/ratio_step_to_reasoning": 1.5359211121006486, "adv/std_final_conf": 0.9305465817451477, "adv/std_reasoning": 0.7576424479484558, "adv/std_step_conf": 0.9341990351676941, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5511680869820746, "calib/avg_num_step_conf": 5.5390625, "calib/ece": 0.203266129032258, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.2903225806451613, "calib/gap": 0.015575962386129905, "calib/mean_conf": 0.8726209677419355, "calib/mu_c": 0.8777710843373494, "calib/mu_w": 0.8621951219512195, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.203266129032258, "calib/std_conf": 0.059194743219900266, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7626148409893994, "calib/step_q_c_n": 849.0, "calib/step_q_gap": 0.015245772448098771, "calib/step_q_w": 0.7473690685413006, "calib/step_q_w_n": 569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 479.0, "completions/mean_terminated_length": 482.7716369628906, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.0128, "grad_norm": 0.03968612849712372, "kl": 0.001405954360961914, "learning_rate": 3e-06, "loss": 0.0169, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.036165084689855576, "mask/share_reasoning": 0.828498125076294, "mask/share_step_conf": 0.12752431631088257, "num_tokens": 2812400.0, "reward": 0.9073599576950073, "reward_std": 0.20147816836833954, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7176058888435364, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7728952169418335, "step": 12 }, { "adv/mean_abs_final_conf": 0.7512305974960327, "adv/mean_abs_reasoning": 0.39446693658828735, "adv/mean_abs_step_conf": 0.7417395114898682, "adv/ratio_final_to_reasoning": 1.9044196808821683, "adv/ratio_step_to_reasoning": 1.8803591446855172, "adv/std_final_conf": 0.9301111698150635, "adv/std_reasoning": 0.6814743280410767, "adv/std_step_conf": 0.9348357319831848, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5096972095784682, "calib/avg_num_step_conf": 4.69921875, "calib/ece": 0.24792968750000005, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.36328125, "calib/gap": 0.0034217296655455476, "calib/mean_conf": 0.8812109375, "calib/mu_c": 0.8824539877300616, "calib/mu_w": 0.879032258064516, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.24621093750000006, "calib/std_conf": 0.05488356316212618, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7557579787234041, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.009505207104778979, "calib/step_q_w": 0.7462527716186251, "calib/step_q_w_n": 451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1367.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 460.07421875, "completions/mean_terminated_length": 461.8784484863281, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.013866666666666666, "grad_norm": 0.038122136145830154, "kl": 0.0019731521606445312, "learning_rate": 3.2500000000000002e-06, "loss": -0.0146, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03544770926237106, "mask/share_reasoning": 0.8494640588760376, "mask/share_step_conf": 0.11118200421333313, "num_tokens": 3034771.0, "reward": 0.8951612710952759, "reward_std": 0.17123009264469147, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6993730068206787, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7659494876861572, "step": 13 }, { "adv/mean_abs_final_conf": 0.7730479836463928, "adv/mean_abs_reasoning": 0.5005540251731873, "adv/mean_abs_step_conf": 0.7642529606819153, "adv/ratio_final_to_reasoning": 1.544384711278518, "adv/ratio_step_to_reasoning": 1.5268141344333221, "adv/std_final_conf": 0.9311867356300354, "adv/std_reasoning": 0.7394110560417175, "adv/std_step_conf": 0.9345505237579346, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.47611822140124027, "calib/avg_num_step_conf": 5.5078125, "calib/ece": 0.32891566265060246, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4979919678714859, "calib/gap": -0.006167040506662724, "calib/mean_conf": 0.8975903614457832, "calib/mu_c": 0.8949650349650352, "calib/mu_w": 0.9011320754716979, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.32610441767068277, "calib/std_conf": 0.04852665465823042, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.720193298969072, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.028521374678851186, "calib/step_q_w": 0.6916719242902208, "calib/step_q_w_n": 634.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2471.0, "completions/max_terminated_length": 2471.0, "completions/mean_length": 526.9140625, "completions/mean_terminated_length": 533.1620483398438, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.014933333333333333, "grad_norm": 0.043646715581417084, "kl": 0.0052623748779296875, "learning_rate": 3.5e-06, "loss": -0.0802, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03268653526902199, "mask/share_reasoning": 0.8374663591384888, "mask/share_step_conf": 0.11812833696603775, "num_tokens": 3275061.0, "reward": 0.8433182835578918, "reward_std": 0.1970943808555603, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6279773712158203, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7524092197418213, "step": 14 }, { "adv/mean_abs_final_conf": 0.7509260773658752, "adv/mean_abs_reasoning": 0.43354499340057373, "adv/mean_abs_step_conf": 0.7791382670402527, "adv/ratio_final_to_reasoning": 1.7320603139154633, "adv/ratio_step_to_reasoning": 1.7971335822124652, "adv/std_final_conf": 0.9233602285385132, "adv/std_reasoning": 0.7012984156608582, "adv/std_step_conf": 0.9344486594200134, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5131959281136107, "calib/avg_num_step_conf": 4.91796875, "calib/ece": 0.33823529411764713, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.596078431372549, "calib/gap": 0.009061203971346021, "calib/mean_conf": 0.9107843137254903, "calib/mu_c": 0.9146575342465755, "calib/mu_w": 0.9055963302752295, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33823529411764713, "calib/std_conf": 0.04947739183668031, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6933720930232558, "calib/step_q_c_n": 688.0, "calib/step_q_gap": 0.016401865352502765, "calib/step_q_w": 0.676970227670753, "calib/step_q_w_n": 571.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1123.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 454.046875, "completions/mean_terminated_length": 455.8274841308594, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.016, "grad_norm": 0.032701823860406876, "kl": 0.0075130462646484375, "learning_rate": 3.7500000000000005e-06, "loss": 0.0092, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.035241350531578064, "mask/share_reasoning": 0.8442916870117188, "mask/share_step_conf": 0.1165606826543808, "num_tokens": 3499177.0, "reward": 0.8645692467689514, "reward_std": 0.1731414645910263, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6403363347053528, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7747396230697632, "step": 15 }, { "adv/mean_abs_final_conf": 0.7602238655090332, "adv/mean_abs_reasoning": 0.44012323021888733, "adv/mean_abs_step_conf": 0.7481175661087036, "adv/ratio_final_to_reasoning": 1.727297750520802, "adv/ratio_step_to_reasoning": 1.6997911374426677, "adv/std_final_conf": 0.9273681044578552, "adv/std_reasoning": 0.7205715179443359, "adv/std_step_conf": 0.9346485137939453, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5498666666666667, "calib/avg_num_step_conf": 5.97265625, "calib/ece": 0.30691999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.676, "calib/gap": 0.026200000000000112, "calib/mean_conf": 0.9069200000000001, "calib/mu_c": 0.9174000000000001, "calib/mu_w": 0.8912, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30691999999999997, "calib/std_conf": 0.08308858886754547, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6207349272349273, "calib/step_q_c_n": 962.0, "calib/step_q_gap": -0.01829505512838847, "calib/step_q_w": 0.6390299823633158, "calib/step_q_w_n": 567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2575.0, "completions/max_terminated_length": 2575.0, "completions/mean_length": 628.4921875, "completions/mean_terminated_length": 633.44091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.017066666666666667, "grad_norm": 0.049372486770153046, "kl": 0.010541915893554688, "learning_rate": 4.000000000000001e-06, "loss": 0.0158, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.026201680302619934, "mask/share_reasoning": 0.8591794967651367, "mask/share_step_conf": 0.10680627077817917, "num_tokens": 3768919.0, "reward": 0.8785654306411743, "reward_std": 0.1860627830028534, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6557347774505615, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7888960838317871, "step": 16 }, { "adv/mean_abs_final_conf": 0.763668417930603, "adv/mean_abs_reasoning": 0.4707931876182556, "adv/mean_abs_step_conf": 0.7634800672531128, "adv/ratio_final_to_reasoning": 1.6220889299482097, "adv/ratio_step_to_reasoning": 1.6216888589989187, "adv/std_final_conf": 0.9241020679473877, "adv/std_reasoning": 0.7206013798713684, "adv/std_step_conf": 0.9347437024116516, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5685767790262172, "calib/avg_num_step_conf": 5.65234375, "calib/ece": 0.22802371541501973, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7747035573122529, "calib/gap": 0.01153707865168574, "calib/mean_conf": 0.9269169960474308, "calib/mu_c": 0.9303370786516855, "calib/mu_w": 0.9187999999999997, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.22569169960474306, "calib/std_conf": 0.04471231081930742, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6131984585741811, "calib/step_q_c_n": 1038.0, "calib/step_q_gap": 0.01755053681378993, "calib/step_q_w": 0.5956479217603912, "calib/step_q_w_n": 409.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 534.51171875, "completions/mean_terminated_length": 534.51171875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.018133333333333335, "grad_norm": 3.9036219120025635, "kl": 13.952611923217773, "learning_rate": 4.25e-06, "loss": 0.3213, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03322366625070572, "mask/share_reasoning": 0.8428764343261719, "mask/share_step_conf": 0.12389989197254181, "num_tokens": 4009282.0, "reward": 0.9437460899353027, "reward_std": 0.18401694297790527, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7354055047035217, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8161492347717285, "step": 17 }, { "adv/mean_abs_final_conf": 0.7250304222106934, "adv/mean_abs_reasoning": 0.4055905044078827, "adv/mean_abs_step_conf": 0.7682268023490906, "adv/ratio_final_to_reasoning": 1.7875922003380174, "adv/ratio_step_to_reasoning": 1.8940946447220621, "adv/std_final_conf": 0.9232602715492249, "adv/std_reasoning": 0.7013160586357117, "adv/std_step_conf": 0.9348356127738953, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4806957186544342, "calib/avg_num_step_conf": 4.93359375, "calib/ece": 0.3623715415019763, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.83399209486166, "calib/gap": -0.001804918450560411, "calib/mean_conf": 0.9315415019762846, "calib/mu_c": 0.9307638888888891, "calib/mu_w": 0.9325688073394495, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3623715415019763, "calib/std_conf": 0.04609231217906996, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5888536953242836, "calib/step_q_c_n": 663.0, "calib/step_q_gap": -0.011679638009049809, "calib/step_q_w": 0.6005333333333334, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1899.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 495.640625, "completions/mean_terminated_length": 497.5843505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.0192, "grad_norm": 0.03445158153772354, "kl": 0.014752388000488281, "learning_rate": 4.5e-06, "loss": -0.062, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0338444709777832, "mask/share_reasoning": 0.8541897535324097, "mask/share_step_conf": 0.10805948078632355, "num_tokens": 4246886.0, "reward": 0.843170166015625, "reward_std": 0.18039628863334656, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6090675592422485, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7686790227890015, "step": 18 }, { "adv/mean_abs_final_conf": 0.7084481716156006, "adv/mean_abs_reasoning": 0.34135955572128296, "adv/mean_abs_step_conf": 0.7834237217903137, "adv/ratio_final_to_reasoning": 2.07537231561797, "adv/ratio_step_to_reasoning": 2.2950103744275205, "adv/std_final_conf": 0.9185612201690674, "adv/std_reasoning": 0.6610927581787109, "adv/std_step_conf": 0.934778094291687, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.43786184210526313, "calib/avg_num_step_conf": 4.6015625, "calib/ece": 0.3341666666666669, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8650793650793651, "calib/gap": 0.006036842105263451, "calib/mean_conf": 0.9373412698412698, "calib/mu_c": 0.9397368421052633, "calib/mu_w": 0.9336999999999999, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3341666666666669, "calib/std_conf": 0.07056795247028086, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6050581395348836, "calib/step_q_c_n": 688.0, "calib/step_q_gap": 0.040119364024679416, "calib/step_q_w": 0.5649387755102042, "calib/step_q_w_n": 490.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 490.83203125, "completions/mean_terminated_length": 496.6521911621094, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.020266666666666665, "grad_norm": 0.03231712058186531, "kl": 0.018993377685546875, "learning_rate": 4.75e-06, "loss": -0.0115, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032016873359680176, "mask/share_reasoning": 0.8505151867866516, "mask/share_step_conf": 0.10574917495250702, "num_tokens": 4477299.0, "reward": 0.8782888650894165, "reward_std": 0.1701107621192932, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6328773498535156, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8088566064834595, "step": 19 }, { "adv/mean_abs_final_conf": 0.7251197099685669, "adv/mean_abs_reasoning": 0.4031530022621155, "adv/mean_abs_step_conf": 0.7815650105476379, "adv/ratio_final_to_reasoning": 1.7986216297531634, "adv/ratio_step_to_reasoning": 1.9386312545416509, "adv/std_final_conf": 0.9166728258132935, "adv/std_reasoning": 0.7012306451797485, "adv/std_step_conf": 0.934552013874054, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4580340386979874, "calib/avg_num_step_conf": 5.3984375, "calib/ece": 0.3506299212598426, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9409448818897638, "calib/gap": -0.0028796997346796083, "calib/mean_conf": 0.9486614173228347, "calib/mu_c": 0.9475163398692809, "calib/mu_w": 0.9503960396039605, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3484645669291339, "calib/std_conf": 0.0423774231365238, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5716861219195849, "calib/step_q_c_n": 771.0, "calib/step_q_gap": 0.024730311772285463, "calib/step_q_w": 0.5469558101472994, "calib/step_q_w_n": 611.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2155.0, "completions/max_terminated_length": 2155.0, "completions/mean_length": 460.74609375, "completions/mean_terminated_length": 462.552978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.021333333333333333, "grad_norm": 0.03185487538576126, "kl": 0.024953842163085938, "learning_rate": 5e-06, "loss": 0.0105, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.036644428968429565, "mask/share_reasoning": 0.8307029008865356, "mask/share_step_conf": 0.1287464201450348, "num_tokens": 4700122.0, "reward": 0.8812888860702515, "reward_std": 0.16984190046787262, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6324000358581543, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.812208890914917, "step": 20 }, { "adv/mean_abs_final_conf": 0.737258791923523, "adv/mean_abs_reasoning": 0.4711818993091583, "adv/mean_abs_step_conf": 0.759905993938446, "adv/ratio_final_to_reasoning": 1.5647010061389957, "adv/ratio_step_to_reasoning": 1.6127656751089372, "adv/std_final_conf": 0.9146026968955994, "adv/std_reasoning": 0.7392024993896484, "adv/std_step_conf": 0.9352498054504395, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5120763901891031, "calib/avg_num_step_conf": 5.1953125, "calib/ece": 0.38578124999999985, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9609375, "calib/gap": -0.0014004868002245319, "calib/mean_conf": 0.95671875, "calib/mu_c": 0.9561224489795919, "calib/mu_w": 0.9575229357798164, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.38414062499999985, "calib/std_conf": 0.033298117791213055, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5543406593406593, "calib/step_q_c_n": 728.0, "calib/step_q_gap": 0.008941988244313914, "calib/step_q_w": 0.5453986710963454, "calib/step_q_w_n": 602.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 492.125, "completions/mean_terminated_length": 494.054931640625, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.0224, "grad_norm": 0.025124864652752876, "kl": 0.026861190795898438, "learning_rate": 4.9722222222222224e-06, "loss": -0.0157, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03341342881321907, "mask/share_reasoning": 0.8461604118347168, "mask/share_step_conf": 0.11651992797851562, "num_tokens": 4929066.0, "reward": 0.8638095855712891, "reward_std": 0.19264093041419983, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6035058498382568, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8100508451461792, "step": 21 }, { "adv/mean_abs_final_conf": 0.6960275173187256, "adv/mean_abs_reasoning": 0.3700941503047943, "adv/mean_abs_step_conf": 0.7384877800941467, "adv/ratio_final_to_reasoning": 1.8806768946375023, "adv/ratio_step_to_reasoning": 1.9954051678092144, "adv/std_final_conf": 0.9030138254165649, "adv/std_reasoning": 0.6815323829650879, "adv/std_step_conf": 0.9350609183311462, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.45413126884077837, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.31256916996047424, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": -0.002593861331871783, "calib/mean_conf": 0.960790513833992, "calib/mu_c": 0.9598780487804879, "calib/mu_w": 0.9624719101123597, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31256916996047424, "calib/std_conf": 0.020200760878050606, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5592044198895028, "calib/step_q_c_n": 905.0, "calib/step_q_gap": 0.029487915035133927, "calib/step_q_w": 0.5297165048543688, "calib/step_q_w_n": 515.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2621.0, "completions/max_terminated_length": 2621.0, "completions/mean_length": 482.3046875, "completions/mean_terminated_length": 482.3046875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.023466666666666667, "grad_norm": 0.04782715439796448, "kl": 0.041934967041015625, "learning_rate": 4.944444444444445e-06, "loss": 0.0586, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0337035208940506, "mask/share_reasoning": 0.8390634059906006, "mask/share_step_conf": 0.12723305821418762, "num_tokens": 5154352.0, "reward": 0.9010443091392517, "reward_std": 0.1715245544910431, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6647961139678955, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8115112781524658, "step": 22 }, { "adv/mean_abs_final_conf": 0.7233649492263794, "adv/mean_abs_reasoning": 0.4360201060771942, "adv/mean_abs_step_conf": 0.7778668403625488, "adv/ratio_final_to_reasoning": 1.6590174148948829, "adv/ratio_step_to_reasoning": 1.7840159880719655, "adv/std_final_conf": 0.9036288261413574, "adv/std_reasoning": 0.7204331755638123, "adv/std_step_conf": 0.9352081418037415, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4829948617567898, "calib/avg_num_step_conf": 5.375, "calib/ece": 0.4408203125, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.984375, "calib/gap": -0.001261316368974863, "calib/mean_conf": 0.9642578125, "calib/mu_c": 0.9636567164179104, "calib/mu_w": 0.9649180327868853, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4408203125, "calib/std_conf": 0.020940669944269776, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5581728045325779, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 0.019948923935563023, "calib/step_q_w": 0.5382238805970149, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 508.5703125, "completions/mean_terminated_length": 510.5647277832031, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.024533333333333334, "grad_norm": 0.025247525423765182, "kl": 0.031337738037109375, "learning_rate": 4.9166666666666665e-06, "loss": -0.0069, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03410639613866806, "mask/share_reasoning": 0.8367390632629395, "mask/share_step_conf": 0.1252482831478119, "num_tokens": 5388482.0, "reward": 0.824210524559021, "reward_std": 0.1818922758102417, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5551589727401733, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.7885745763778687, "step": 23 }, { "adv/mean_abs_final_conf": 0.7442210912704468, "adv/mean_abs_reasoning": 0.625908374786377, "adv/mean_abs_step_conf": 0.7788434028625488, "adv/ratio_final_to_reasoning": 1.189025616607942, "adv/ratio_step_to_reasoning": 1.2443409199123894, "adv/std_final_conf": 0.9220814108848572, "adv/std_reasoning": 0.8429231643676758, "adv/std_step_conf": 0.9353823661804199, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5473770491803278, "calib/avg_num_step_conf": 6.10546875, "calib/ece": 0.4607692307692309, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9838056680161943, "calib/gap": 0.0032131147540985783, "calib/mean_conf": 0.9644129554655871, "calib/mu_c": 0.9660000000000002, "calib/mu_w": 0.9627868852459016, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.45955465587044547, "calib/std_conf": 0.026133407237792707, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5604884004884005, "calib/step_q_c_n": 819.0, "calib/step_q_gap": 0.03531366930560487, "calib/step_q_w": 0.5251747311827957, "calib/step_q_w_n": 744.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 593.07421875, "completions/mean_terminated_length": 593.07421875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.0256, "grad_norm": 0.03646933659911156, "kl": 0.027853012084960938, "learning_rate": 4.888888888888889e-06, "loss": -0.0254, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03060779720544815, "mask/share_reasoning": 0.8466688394546509, "mask/share_step_conf": 0.12272335588932037, "num_tokens": 5644821.0, "reward": 0.7971993088722229, "reward_std": 0.24867865443229675, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5218691229820251, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7819044589996338, "step": 24 }, { "adv/mean_abs_final_conf": 0.7325584888458252, "adv/mean_abs_reasoning": 0.4987294375896454, "adv/mean_abs_step_conf": 0.749326229095459, "adv/ratio_final_to_reasoning": 1.468849507633384, "adv/ratio_step_to_reasoning": 1.5024704230753763, "adv/std_final_conf": 0.9082023501396179, "adv/std_reasoning": 0.7576895356178284, "adv/std_step_conf": 0.9353220462799072, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6032865389661507, "calib/avg_num_step_conf": 5.62890625, "calib/ece": 0.37721115537848604, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9880478087649402, "calib/gap": 0.007340593020204844, "calib/mean_conf": 0.9668525896414343, "calib/mu_c": 0.9698648648648648, "calib/mu_w": 0.9625242718446599, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37721115537848604, "calib/std_conf": 0.01931223275365144, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.548428927680798, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.014344420638544464, "calib/step_q_w": 0.5340845070422535, "calib/step_q_w_n": 639.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2224.0, "completions/max_terminated_length": 2224.0, "completions/mean_length": 487.37109375, "completions/mean_terminated_length": 487.37109375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.02666666666666667, "grad_norm": 0.02042507752776146, "kl": 0.034358978271484375, "learning_rate": 4.861111111111111e-06, "loss": 0.0507, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03349972888827324, "mask/share_reasoning": 0.8386950492858887, "mask/share_step_conf": 0.12780524790287018, "num_tokens": 5872812.0, "reward": 0.8485996723175049, "reward_std": 0.22764171659946442, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6068382859230042, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7786422371864319, "step": 25 }, { "adv/mean_abs_final_conf": 0.7012295126914978, "adv/mean_abs_reasoning": 0.4339819550514221, "adv/mean_abs_step_conf": 0.752228856086731, "adv/ratio_final_to_reasoning": 1.6158033865910617, "adv/ratio_step_to_reasoning": 1.7333182804745881, "adv/std_final_conf": 0.8902946710586548, "adv/std_reasoning": 0.7205467820167542, "adv/std_step_conf": 0.935208797454834, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5849214077062178, "calib/avg_num_step_conf": 5.34375, "calib/ece": 0.3228915662650604, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9718875502008032, "calib/gap": 0.018116566977326443, "calib/mean_conf": 0.957429718875502, "calib/mu_c": 0.9640506329113924, "calib/mu_w": 0.9459340659340659, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3228915662650604, "calib/std_conf": 0.07118489472683348, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5745430463576159, "calib/step_q_c_n": 755.0, "calib/step_q_gap": 0.057430485183064484, "calib/step_q_w": 0.5171125611745514, "calib/step_q_w_n": 613.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2566.0, "completions/max_terminated_length": 2566.0, "completions/mean_length": 538.953125, "completions/mean_terminated_length": 538.953125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.027733333333333332, "grad_norm": 0.02381037175655365, "kl": 0.03139495849609375, "learning_rate": 4.833333333333333e-06, "loss": 0.0775, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03021368384361267, "mask/share_reasoning": 0.8615808486938477, "mask/share_step_conf": 0.10820543766021729, "num_tokens": 6116024.0, "reward": 0.8803870677947998, "reward_std": 0.1974932700395584, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6489335894584656, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7938718199729919, "step": 26 }, { "adv/mean_abs_final_conf": 0.7713116407394409, "adv/mean_abs_reasoning": 0.47726085782051086, "adv/mean_abs_step_conf": 0.7782514691352844, "adv/ratio_final_to_reasoning": 1.616121724839872, "adv/ratio_step_to_reasoning": 1.6306626792930308, "adv/std_final_conf": 0.9125264883041382, "adv/std_reasoning": 0.720649242401123, "adv/std_step_conf": 0.9354422092437744, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5927287581699346, "calib/avg_num_step_conf": 5.84375, "calib/ece": 0.4235968379446642, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9802371541501976, "calib/gap": 0.011120537958773102, "calib/mean_conf": 0.9574308300395258, "calib/mu_c": 0.9625735294117647, "calib/mu_w": 0.9514529914529916, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4217391304347828, "calib/std_conf": 0.055589902953488125, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5399999999999999, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.0006224627875507371, "calib/step_q_w": 0.5393775372124492, "calib/step_q_w_n": 739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 512.63671875, "completions/mean_terminated_length": 512.63671875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.0288, "grad_norm": 0.020446553826332092, "kl": 0.03476715087890625, "learning_rate": 4.805555555555556e-06, "loss": 0.031, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032595522701740265, "mask/share_reasoning": 0.8426357507705688, "mask/share_step_conf": 0.12476875633001328, "num_tokens": 6352475.0, "reward": 0.8334004282951355, "reward_std": 0.20353764295578003, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5707800388336182, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7921144962310791, "step": 27 }, { "adv/mean_abs_final_conf": 0.7157641649246216, "adv/mean_abs_reasoning": 0.3361932337284088, "adv/mean_abs_step_conf": 0.7597331404685974, "adv/ratio_final_to_reasoning": 2.129026087130731, "adv/ratio_step_to_reasoning": 2.2598109189857825, "adv/std_final_conf": 0.8849893808364868, "adv/std_reasoning": 0.6185228228569031, "adv/std_step_conf": 0.9350174069404602, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5315514735033698, "calib/avg_num_step_conf": 5.234375, "calib/ece": 0.32996078431372555, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9686274509803922, "calib/gap": 0.011650588079820556, "calib/mean_conf": 0.9575686274509804, "calib/mu_c": 0.9618633540372672, "calib/mu_w": 0.9502127659574466, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.32807843137254905, "calib/std_conf": 0.06293764573812151, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.566830985915493, "calib/step_q_c_n": 852.0, "calib/step_q_gap": -0.009582948510736489, "calib/step_q_w": 0.5764139344262295, "calib/step_q_w_n": 488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1450.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 526.67578125, "completions/mean_terminated_length": 528.7412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.029866666666666666, "grad_norm": 0.023936253041028976, "kl": 0.031139373779296875, "learning_rate": 4.777777777777778e-06, "loss": -0.0249, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03148103132843971, "mask/share_reasoning": 0.8523135781288147, "mask/share_step_conf": 0.11229914426803589, "num_tokens": 6594248.0, "reward": 0.8912913799285889, "reward_std": 0.15562722086906433, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6597297191619873, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7978529930114746, "step": 28 }, { "adv/mean_abs_final_conf": 0.74814772605896, "adv/mean_abs_reasoning": 0.5354911088943481, "adv/mean_abs_step_conf": 0.7605692148208618, "adv/ratio_final_to_reasoning": 1.3971244594587073, "adv/ratio_step_to_reasoning": 1.4203209020430652, "adv/std_final_conf": 0.9092041850090027, "adv/std_reasoning": 0.7754234671592712, "adv/std_step_conf": 0.935039222240448, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.542939903234021, "calib/avg_num_step_conf": 5.96484375, "calib/ece": 0.439402390438247, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9800796812749004, "calib/gap": 0.003364527629233449, "calib/mean_conf": 0.9652988047808766, "calib/mu_c": 0.9668939393939393, "calib/mu_w": 0.9635294117647059, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.439402390438247, "calib/std_conf": 0.021742113067573822, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5532034632034633, "calib/step_q_c_n": 693.0, "calib/step_q_gap": 0.05847924257996212, "calib/step_q_w": 0.49472422062350113, "calib/step_q_w_n": 834.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2380.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 574.78125, "completions/mean_terminated_length": 574.78125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.030933333333333334, "grad_norm": 0.025617143139243126, "kl": 0.033428192138671875, "learning_rate": 4.75e-06, "loss": 0.0279, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.029048418626189232, "mask/share_reasoning": 0.854861855506897, "mask/share_step_conf": 0.11608975380659103, "num_tokens": 6848520.0, "reward": 0.8259302377700806, "reward_std": 0.22149044275283813, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5478870868682861, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8047546148300171, "step": 29 }, { "adv/mean_abs_final_conf": 0.7529537677764893, "adv/mean_abs_reasoning": 0.5834614038467407, "adv/mean_abs_step_conf": 0.7625214457511902, "adv/ratio_final_to_reasoning": 1.2904945602439017, "adv/ratio_step_to_reasoning": 1.3068926937136078, "adv/std_final_conf": 0.9145447611808777, "adv/std_reasoning": 0.7929871082305908, "adv/std_step_conf": 0.9354909062385559, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4917687074829932, "calib/avg_num_step_conf": 6.015625, "calib/ece": 0.35708502024291505, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9352226720647774, "calib/gap": 0.011062585034013583, "calib/mean_conf": 0.9519838056680162, "calib/mu_c": 0.9564625850340136, "calib/mu_w": 0.9454, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3569635627530365, "calib/std_conf": 0.07052515372967161, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5104925544100801, "calib/step_q_c_n": 873.0, "calib/step_q_gap": 0.006054773300634864, "calib/step_q_w": 0.5044377811094453, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 615.70703125, "completions/mean_terminated_length": 618.12158203125, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.032, "grad_norm": 0.019550230354070663, "kl": 0.032825469970703125, "learning_rate": 4.722222222222222e-06, "loss": 0.0398, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.028160445392131805, "mask/share_reasoning": 0.854744553565979, "mask/share_step_conf": 0.1131887435913086, "num_tokens": 7113125.0, "reward": 0.8469289541244507, "reward_std": 0.23248617351055145, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6096968650817871, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7771298289299011, "step": 30 }, { "adv/mean_abs_final_conf": 0.7611095309257507, "adv/mean_abs_reasoning": 0.4538407325744629, "adv/mean_abs_step_conf": 0.7753937244415283, "adv/ratio_final_to_reasoning": 1.6770410328933474, "adv/ratio_step_to_reasoning": 1.7085150555857331, "adv/std_final_conf": 0.9070990085601807, "adv/std_reasoning": 0.7207038402557373, "adv/std_step_conf": 0.9356343746185303, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5348313117253264, "calib/avg_num_step_conf": 6.21484375, "calib/ece": 0.520217741935484, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9516129032258065, "calib/gap": 0.02418651819447204, "calib/mean_conf": 0.9516693548387097, "calib/mu_c": 0.9654205607476635, "calib/mu_w": 0.9412340425531914, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.520217741935484, "calib/std_conf": 0.10482598651084025, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5421069182389937, "calib/step_q_c_n": 636.0, "calib/step_q_gap": 0.036309710560110675, "calib/step_q_w": 0.505797207678883, "calib/step_q_w_n": 955.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 605.9765625, "completions/mean_terminated_length": 608.3529663085938, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.03306666666666667, "grad_norm": 0.026389990001916885, "kl": 0.029544830322265625, "learning_rate": 4.694444444444445e-06, "loss": 0.0218, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.02856561541557312, "mask/share_reasoning": 0.8553394079208374, "mask/share_step_conf": 0.11218871921300888, "num_tokens": 7374167.0, "reward": 0.7536863684654236, "reward_std": 0.2233159840106964, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.4697951674461365, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7602337598800659, "step": 31 }, { "adv/mean_abs_final_conf": 0.7454090714454651, "adv/mean_abs_reasoning": 0.44065701961517334, "adv/mean_abs_step_conf": 0.75468909740448, "adv/ratio_final_to_reasoning": 1.6915856057312608, "adv/ratio_step_to_reasoning": 1.7126451271865621, "adv/std_final_conf": 0.9135096669197083, "adv/std_reasoning": 0.701328456401825, "adv/std_step_conf": 0.9353702664375305, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5898970478789382, "calib/avg_num_step_conf": 5.58984375, "calib/ece": 0.40207843137254895, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9176470588235294, "calib/gap": 0.035063259737038055, "calib/mean_conf": 0.938078431372549, "calib/mu_c": 0.9540287769784171, "calib/mu_w": 0.9189655172413791, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39752941176470585, "calib/std_conf": 0.13279437785424603, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5582739726027397, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.040970120962226186, "calib/step_q_w": 0.5173038516405135, "calib/step_q_w_n": 701.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 538.4921875, "completions/mean_terminated_length": 538.4921875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.034133333333333335, "grad_norm": 0.03299793228507042, "kl": 0.03919219970703125, "learning_rate": 4.666666666666667e-06, "loss": 0.0164, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.029952574521303177, "mask/share_reasoning": 0.8586723804473877, "mask/share_step_conf": 0.11137507855892181, "num_tokens": 7618725.0, "reward": 0.8499400615692139, "reward_std": 0.191944882273674, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5950214862823486, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7970460057258606, "step": 32 }, { "adv/mean_abs_final_conf": 0.7218849658966064, "adv/mean_abs_reasoning": 0.4429064989089966, "adv/mean_abs_step_conf": 0.7579190731048584, "adv/ratio_final_to_reasoning": 1.6298811773473914, "adv/ratio_step_to_reasoning": 1.7112394488945781, "adv/std_final_conf": 0.8951165676116943, "adv/std_reasoning": 0.7013714909553528, "adv/std_step_conf": 0.9354180693626404, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5398657289002557, "calib/avg_num_step_conf": 6.046875, "calib/ece": 0.4180478087649402, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9482071713147411, "calib/gap": 0.011603580562659865, "calib/mean_conf": 0.9573306772908368, "calib/mu_c": 0.9626470588235295, "calib/mu_w": 0.9510434782608697, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4167729083665338, "calib/std_conf": 0.05388165997824674, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5722263940520446, "calib/step_q_c_n": 807.0, "calib/step_q_gap": 0.07556917407903507, "calib/step_q_w": 0.4966572199730095, "calib/step_q_w_n": 741.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 547.3359375, "completions/mean_terminated_length": 549.4823608398438, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.0352, "grad_norm": 0.020352911204099655, "kl": 0.03536224365234375, "learning_rate": 4.638888888888889e-06, "loss": 0.0601, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030452530831098557, "mask/share_reasoning": 0.8487410545349121, "mask/share_step_conf": 0.11690014600753784, "num_tokens": 7865715.0, "reward": 0.8267344832420349, "reward_std": 0.19006367027759552, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5706027746200562, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.780522346496582, "step": 33 }, { "adv/mean_abs_final_conf": 0.7223448157310486, "adv/mean_abs_reasoning": 0.5534857511520386, "adv/mean_abs_step_conf": 0.7550450563430786, "adv/ratio_final_to_reasoning": 1.3050829478943997, "adv/ratio_step_to_reasoning": 1.3641634943112984, "adv/std_final_conf": 0.9112024903297424, "adv/std_reasoning": 0.7928467392921448, "adv/std_step_conf": 0.9351396560668945, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4595193340494093, "calib/avg_num_step_conf": 6.12890625, "calib/ece": 0.36218666666666666, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.896, "calib/gap": -0.016228070175438702, "calib/mean_conf": 0.9401333333333335, "calib/mu_c": 0.9337719298245613, "calib/mu_w": 0.95, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.34716, "calib/std_conf": 0.11772578307235847, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.506442731277533, "calib/step_q_c_n": 908.0, "calib/step_q_gap": -0.001121565242890532, "calib/step_q_w": 0.5075642965204236, "calib/step_q_w_n": 661.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2178.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 510.0234375, "completions/mean_terminated_length": 512.0235595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.03626666666666667, "grad_norm": 0.019217276945710182, "kl": 0.038822174072265625, "learning_rate": 4.611111111111112e-06, "loss": 0.0494, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03184622898697853, "mask/share_reasoning": 0.8305675387382507, "mask/share_step_conf": 0.13367998600006104, "num_tokens": 8101393.0, "reward": 0.8644185066223145, "reward_std": 0.21517950296401978, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6149966716766357, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7997777462005615, "step": 34 }, { "adv/mean_abs_final_conf": 0.7637768983840942, "adv/mean_abs_reasoning": 0.4960789382457733, "adv/mean_abs_step_conf": 0.7720634937286377, "adv/ratio_final_to_reasoning": 1.539627747722873, "adv/ratio_step_to_reasoning": 1.5563319347094169, "adv/std_final_conf": 0.9017539024353027, "adv/std_reasoning": 0.7394310832023621, "adv/std_step_conf": 0.9350427985191345, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5493046776232617, "calib/avg_num_step_conf": 5.20703125, "calib/ece": 0.4012252964426877, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8932806324110671, "calib/gap": 0.020494943109987007, "calib/mean_conf": 0.9387747035573122, "calib/mu_c": 0.9479285714285712, "calib/mu_w": 0.9274336283185842, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.39332015810276677, "calib/std_conf": 0.11451427164975285, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5202301808066759, "calib/step_q_c_n": 719.0, "calib/step_q_gap": 0.057233438135666115, "calib/step_q_w": 0.4629967426710098, "calib/step_q_w_n": 614.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2079.0, "completions/max_terminated_length": 2079.0, "completions/mean_length": 565.484375, "completions/mean_terminated_length": 565.484375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.037333333333333336, "grad_norm": 0.03530250862240791, "kl": 0.03521728515625, "learning_rate": 4.583333333333333e-06, "loss": -0.0291, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.028718896210193634, "mask/share_reasoning": 0.871752142906189, "mask/share_step_conf": 0.0995289534330368, "num_tokens": 8355413.0, "reward": 0.8562443256378174, "reward_std": 0.22214269638061523, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5939667820930481, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.812271773815155, "step": 35 }, { "adv/mean_abs_final_conf": 0.7034646272659302, "adv/mean_abs_reasoning": 0.30368494987487793, "adv/mean_abs_step_conf": 0.7625923156738281, "adv/ratio_final_to_reasoning": 2.3164290082724435, "adv/ratio_step_to_reasoning": 2.5111297612477204, "adv/std_final_conf": 0.9043211340904236, "adv/std_reasoning": 0.5960076451301575, "adv/std_step_conf": 0.9348888397216797, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4618394513916902, "calib/avg_num_step_conf": 5.8046875, "calib/ece": 0.22792328042328053, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9246031746031746, "calib/gap": 0.008001882479494626, "calib/mean_conf": 0.9431878306878307, "calib/mu_c": 0.9453153153153154, "calib/mu_w": 0.9373134328358208, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2184920634920636, "calib/std_conf": 0.11297182458457421, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47746388384754984, "calib/step_q_c_n": 1102.0, "calib/step_q_gap": -0.0036298661524502007, "calib/step_q_w": 0.48109375000000004, "calib/step_q_w_n": 384.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2301.0, "completions/max_terminated_length": 2301.0, "completions/mean_length": 498.4765625, "completions/mean_terminated_length": 500.431396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.0384, "grad_norm": 0.03462150692939758, "kl": 0.042942047119140625, "learning_rate": 4.555555555555556e-06, "loss": -0.0177, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03405049070715904, "mask/share_reasoning": 0.829599142074585, "mask/share_step_conf": 0.1324441134929657, "num_tokens": 8585735.0, "reward": 0.9496276378631592, "reward_std": 0.14591683447360992, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7397283315658569, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8181206583976746, "step": 36 }, { "adv/mean_abs_final_conf": 0.7528542876243591, "adv/mean_abs_reasoning": 0.4255194067955017, "adv/mean_abs_step_conf": 0.7782937288284302, "adv/ratio_final_to_reasoning": 1.7692595815874732, "adv/ratio_step_to_reasoning": 1.8290440257228187, "adv/std_final_conf": 0.9233216047286987, "adv/std_reasoning": 0.6816492676734924, "adv/std_step_conf": 0.9350104928016663, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5996240601503761, "calib/avg_num_step_conf": 5.49609375, "calib/ece": 0.4537349397590361, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8112449799196787, "calib/gap": 0.043233082706766846, "calib/mean_conf": 0.9069076305220883, "calib/mu_c": 0.9299999999999999, "calib/mu_w": 0.8867669172932331, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4473895582329317, "calib/std_conf": 0.16586536945712516, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5145901639344262, "calib/step_q_c_n": 549.0, "calib/step_q_gap": 0.06427547861974092, "calib/step_q_w": 0.4503146853146853, "calib/step_q_w_n": 858.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2245.0, "completions/max_terminated_length": 2245.0, "completions/mean_length": 515.546875, "completions/mean_terminated_length": 517.5686645507812, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.039466666666666664, "grad_norm": 0.02837124653160572, "kl": 0.040561676025390625, "learning_rate": 4.527777777777778e-06, "loss": 0.0259, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03237922489643097, "mask/share_reasoning": 0.8477351069450378, "mask/share_step_conf": 0.11597943305969238, "num_tokens": 8824811.0, "reward": 0.8100247383117676, "reward_std": 0.19293108582496643, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.5355929732322693, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7993002533912659, "step": 37 }, { "adv/mean_abs_final_conf": 0.7337380647659302, "adv/mean_abs_reasoning": 0.3961666226387024, "adv/mean_abs_step_conf": 0.7522833347320557, "adv/ratio_final_to_reasoning": 1.85209460574645, "adv/ratio_step_to_reasoning": 1.8989063988313977, "adv/std_final_conf": 0.9243285655975342, "adv/std_reasoning": 0.6816597580909729, "adv/std_step_conf": 0.9352226257324219, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6171377641965877, "calib/avg_num_step_conf": 5.65234375, "calib/ece": 0.4039043824701196, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7689243027888446, "calib/gap": 0.03243060860707914, "calib/mean_conf": 0.9108366533864543, "calib/mu_c": 0.9262121212121213, "calib/mu_w": 0.8937815126050421, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3944223107569722, "calib/std_conf": 0.1487092446801543, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48603050397877984, "calib/step_q_c_n": 754.0, "calib/step_q_gap": 0.043750561698837565, "calib/step_q_w": 0.4422799422799423, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2186.0, "completions/max_terminated_length": 2186.0, "completions/mean_length": 544.875, "completions/mean_terminated_length": 544.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.04053333333333333, "grad_norm": 0.022631574422121048, "kl": 0.041919708251953125, "learning_rate": 4.5e-06, "loss": -0.0302, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030402235686779022, "mask/share_reasoning": 0.8540716171264648, "mask/share_step_conf": 0.11552612483501434, "num_tokens": 9071187.0, "reward": 0.8525607585906982, "reward_std": 0.19134631752967834, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5848976373672485, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.82100510597229, "step": 38 }, { "adv/mean_abs_final_conf": 0.7698835730552673, "adv/mean_abs_reasoning": 0.41834837198257446, "adv/mean_abs_step_conf": 0.7499821186065674, "adv/ratio_final_to_reasoning": 1.840292982154441, "adv/ratio_step_to_reasoning": 1.7927214944147232, "adv/std_final_conf": 0.9209132790565491, "adv/std_reasoning": 0.6816251873970032, "adv/std_step_conf": 0.9349772930145264, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6363636363636364, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.4293307086614172, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7401574803149606, "calib/gap": 0.05593641331346233, "calib/mean_conf": 0.8942913385826772, "calib/mu_c": 0.9233606557377048, "calib/mu_w": 0.8674242424242424, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4216535433070865, "calib/std_conf": 0.16674539081501089, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47681818181818186, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.025802650157488893, "calib/step_q_w": 0.45101553166069297, "calib/step_q_w_n": 837.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 512.64453125, "completions/mean_terminated_length": 512.64453125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.0416, "grad_norm": 0.027761587873101234, "kl": 0.03902435302734375, "learning_rate": 4.472222222222223e-06, "loss": 0.0511, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03315240889787674, "mask/share_reasoning": 0.8416691422462463, "mask/share_step_conf": 0.12517844140529633, "num_tokens": 9308512.0, "reward": 0.8408872485160828, "reward_std": 0.19443252682685852, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.569301187992096, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8202857375144958, "step": 39 }, { "adv/mean_abs_final_conf": 0.7725449800491333, "adv/mean_abs_reasoning": 0.5383433103561401, "adv/mean_abs_step_conf": 0.7697858810424805, "adv/ratio_final_to_reasoning": 1.435041478528, "adv/ratio_step_to_reasoning": 1.4299163122009073, "adv/std_final_conf": 0.935536801815033, "adv/std_reasoning": 0.7927297353744507, "adv/std_step_conf": 0.9351649284362793, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5562437437437437, "calib/avg_num_step_conf": 5.3046875, "calib/ece": 0.43980392156862746, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6745098039215687, "calib/gap": 0.04463213213213213, "calib/mean_conf": 0.8680392156862745, "calib/mu_c": 0.8932432432432431, "calib/mu_w": 0.848611111111111, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4362745098039216, "calib/std_conf": 0.19242130663324078, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4874868421052631, "calib/step_q_c_n": 532.0, "calib/step_q_gap": 0.04840693895756332, "calib/step_q_w": 0.4390799031476998, "calib/step_q_w_n": 826.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 529.17578125, "completions/mean_terminated_length": 531.2510375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.042666666666666665, "grad_norm": 0.039750393480062485, "kl": 0.042789459228515625, "learning_rate": 4.444444444444444e-06, "loss": -0.0242, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.032181888818740845, "mask/share_reasoning": 0.852412223815918, "mask/share_step_conf": 0.11149965226650238, "num_tokens": 9550741.0, "reward": 0.8274902701377869, "reward_std": 0.20201276242733002, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.5496792793273926, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8193637132644653, "step": 40 }, { "adv/mean_abs_final_conf": 0.7592229843139648, "adv/mean_abs_reasoning": 0.47157496213912964, "adv/mean_abs_step_conf": 0.7730763554573059, "adv/ratio_final_to_reasoning": 1.6099730589385488, "adv/ratio_step_to_reasoning": 1.6393498754694777, "adv/std_final_conf": 0.9329091310501099, "adv/std_reasoning": 0.7392388582229614, "adv/std_step_conf": 0.9349347949028015, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.562404999155548, "calib/avg_num_step_conf": 5.12890625, "calib/ece": 0.20612648221343866, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6521739130434783, "calib/gap": 0.00318780611383207, "calib/mean_conf": 0.8506324110671936, "calib/mu_c": 0.851413612565445, "calib/mu_w": 0.8482258064516129, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.15090909090909085, "calib/std_conf": 0.21578149918890283, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4662576687116564, "calib/step_q_c_n": 978.0, "calib/step_q_gap": 0.00951140005494, "calib/step_q_w": 0.4567462686567164, "calib/step_q_w_n": 335.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 468.28125, "completions/mean_terminated_length": 470.11767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.04373333333333333, "grad_norm": 0.05282951146364212, "kl": 0.04245758056640625, "learning_rate": 4.416666666666667e-06, "loss": 0.0053, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035104911774396896, "mask/share_reasoning": 0.8374688625335693, "mask/share_step_conf": 0.12351995706558228, "num_tokens": 9777869.0, "reward": 0.9565410614013672, "reward_std": 0.19869878888130188, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7464367151260376, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8205516338348389, "step": 41 }, { "adv/mean_abs_final_conf": 0.7502855658531189, "adv/mean_abs_reasoning": 0.308381050825119, "adv/mean_abs_step_conf": 0.7523162364959717, "adv/ratio_final_to_reasoning": 2.4329820650316196, "adv/ratio_step_to_reasoning": 2.439567004791762, "adv/std_final_conf": 0.9343463778495789, "adv/std_reasoning": 0.5959736108779907, "adv/std_step_conf": 0.934657096862793, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.590472027972028, "calib/avg_num_step_conf": 5.7421875, "calib/ece": 0.32019607843137265, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.615686274509804, "calib/gap": 0.07060939060939053, "calib/mean_conf": 0.8238823529411764, "calib/mu_c": 0.8548951048951049, "calib/mu_w": 0.7842857142857144, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.29164705882352954, "calib/std_conf": 0.24580764415264922, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4692866407263294, "calib/step_q_c_n": 771.0, "calib/step_q_gap": 0.0379847809266155, "calib/step_q_w": 0.4313018597997139, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1702.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 450.12890625, "completions/mean_terminated_length": 451.8941345214844, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.0448, "grad_norm": 0.023945538327097893, "kl": 0.0476837158203125, "learning_rate": 4.388888888888889e-06, "loss": -0.0103, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.035657431930303574, "mask/share_reasoning": 0.827800989151001, "mask/share_step_conf": 0.13263539969921112, "num_tokens": 9997470.0, "reward": 0.896149754524231, "reward_std": 0.16470317542552948, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6523573994636536, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8297858238220215, "step": 42 }, { "adv/mean_abs_final_conf": 0.7824147939682007, "adv/mean_abs_reasoning": 0.5487573146820068, "adv/mean_abs_step_conf": 0.7633628845214844, "adv/ratio_final_to_reasoning": 1.4257938309607652, "adv/ratio_step_to_reasoning": 1.3910755521570348, "adv/std_final_conf": 0.932792067527771, "adv/std_reasoning": 0.7754303812980652, "adv/std_step_conf": 0.9347939491271973, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.60501389239707, "calib/avg_num_step_conf": 5.3515625, "calib/ece": 0.27352941176470585, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5098039215686274, "calib/gap": 0.0822448850719879, "calib/mean_conf": 0.8023137254901961, "calib/mu_c": 0.8368243243243243, "calib/mu_w": 0.7545794392523364, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24772549019607842, "calib/std_conf": 0.23805531489312692, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4695673671199011, "calib/step_q_c_n": 809.0, "calib/step_q_gap": 0.04070818708425045, "calib/step_q_w": 0.42885918003565066, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 489.28125, "completions/mean_terminated_length": 489.28125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.04586666666666667, "grad_norm": 0.025007378309965134, "kl": 0.042606353759765625, "learning_rate": 4.361111111111112e-06, "loss": -0.0209, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03451922535896301, "mask/share_reasoning": 0.8459039926528931, "mask/share_step_conf": 0.11957676708698273, "num_tokens": 10227950.0, "reward": 0.9199116230010986, "reward_std": 0.18631719052791595, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6879050731658936, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8370743989944458, "step": 43 }, { "adv/mean_abs_final_conf": 0.7647424340248108, "adv/mean_abs_reasoning": 0.4228684604167938, "adv/mean_abs_step_conf": 0.7544612884521484, "adv/ratio_final_to_reasoning": 1.8084641102603256, "adv/ratio_step_to_reasoning": 1.7841512410467435, "adv/std_final_conf": 0.9265362024307251, "adv/std_reasoning": 0.7205361127853394, "adv/std_step_conf": 0.9344058632850647, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6182170542635659, "calib/avg_num_step_conf": 5.703125, "calib/ece": 0.3047011952191234, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5219123505976095, "calib/gap": 0.09971470326598042, "calib/mean_conf": 0.7993625498007969, "calib/mu_c": 0.8478294573643411, "calib/mu_w": 0.7481147540983607, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.29505976095617525, "calib/std_conf": 0.24303278030476025, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4634710743801653, "calib/step_q_c_n": 726.0, "calib/step_q_gap": 0.03848469835836699, "calib/step_q_w": 0.42498637602179834, "calib/step_q_w_n": 734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 531.890625, "completions/mean_terminated_length": 531.890625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.046933333333333334, "grad_norm": 0.030400488525629044, "kl": 0.0402679443359375, "learning_rate": 4.333333333333334e-06, "loss": -0.0253, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031088586896657944, "mask/share_reasoning": 0.8493733406066895, "mask/share_step_conf": 0.1195380911231041, "num_tokens": 10470434.0, "reward": 0.8869525194168091, "reward_std": 0.18943586945533752, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6466039419174194, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8296449184417725, "step": 44 }, { "adv/mean_abs_final_conf": 0.7844476699829102, "adv/mean_abs_reasoning": 0.5681071281433105, "adv/mean_abs_step_conf": 0.7434054017066956, "adv/ratio_final_to_reasoning": 1.3808094127363697, "adv/ratio_step_to_reasoning": 1.3085655237882605, "adv/std_final_conf": 0.9305549263954163, "adv/std_reasoning": 0.7928244471549988, "adv/std_step_conf": 0.9343764781951904, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6013617677286742, "calib/avg_num_step_conf": 5.8359375, "calib/ece": 0.2962948207171314, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.47808764940239046, "calib/gap": 0.08765994347379247, "calib/mean_conf": 0.7457768924302789, "calib/mu_c": 0.7848920863309353, "calib/mu_w": 0.6972321428571429, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2441434262948207, "calib/std_conf": 0.29521903210368977, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.461213282247765, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.056431285060704495, "calib/step_q_w": 0.4047819971870605, "calib/step_q_w_n": 711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 510.1484375, "completions/mean_terminated_length": 510.1484375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.048, "grad_norm": 0.031916260719299316, "kl": 0.044345855712890625, "learning_rate": 4.305555555555556e-06, "loss": -0.0493, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03434551879763603, "mask/share_reasoning": 0.835544228553772, "mask/share_step_conf": 0.13011020421981812, "num_tokens": 10706080.0, "reward": 0.89674973487854, "reward_std": 0.20731501281261444, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6590714454650879, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8297404646873474, "step": 45 }, { "adv/mean_abs_final_conf": 0.7703957557678223, "adv/mean_abs_reasoning": 0.4477524757385254, "adv/mean_abs_step_conf": 0.7582173943519592, "adv/ratio_final_to_reasoning": 1.7205840224492948, "adv/ratio_step_to_reasoning": 1.6933851523687307, "adv/std_final_conf": 0.9339072108268738, "adv/std_reasoning": 0.7014877200126648, "adv/std_step_conf": 0.9343469738960266, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5816142921406079, "calib/avg_num_step_conf": 5.95703125, "calib/ece": 0.30355999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4, "calib/gap": 0.049293104556262346, "calib/mean_conf": 0.7646000000000002, "calib/mu_c": 0.7876691729323307, "calib/mu_w": 0.7383760683760684, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.26808, "calib/std_conf": 0.2485398157237588, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.41774371727748694, "calib/step_q_c_n": 764.0, "calib/step_q_gap": -0.0005611447461661978, "calib/step_q_w": 0.41830486202365313, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2455.0, "completions/max_terminated_length": 2455.0, "completions/mean_length": 541.796875, "completions/mean_terminated_length": 541.796875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.04906666666666667, "grad_norm": 0.023222772404551506, "kl": 0.0390472412109375, "learning_rate": 4.277777777777778e-06, "loss": -0.002, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03602005913853645, "mask/share_reasoning": 0.8326444625854492, "mask/share_step_conf": 0.13133545219898224, "num_tokens": 10949548.0, "reward": 0.8750042915344238, "reward_std": 0.18136216700077057, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.640655517578125, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8109156489372253, "step": 46 }, { "adv/mean_abs_final_conf": 0.7664064764976501, "adv/mean_abs_reasoning": 0.4121689200401306, "adv/mean_abs_step_conf": 0.7210854291915894, "adv/ratio_final_to_reasoning": 1.8594475207471475, "adv/ratio_step_to_reasoning": 1.7494900613112245, "adv/std_final_conf": 0.9348024725914001, "adv/std_reasoning": 0.6817787289619446, "adv/std_step_conf": 0.9342029094696045, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6607632093933463, "calib/avg_num_step_conf": 6.2109375, "calib/ece": 0.18179282868525892, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.3745019920318725, "calib/gap": 0.1404827136333986, "calib/mean_conf": 0.7221912350597609, "calib/mu_c": 0.780958904109589, "calib/mu_w": 0.6404761904761904, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16115537848605574, "calib/std_conf": 0.25993176258025147, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42186956521739133, "calib/step_q_c_n": 920.0, "calib/step_q_gap": 0.021048669695003352, "calib/step_q_w": 0.400820895522388, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2286.0, "completions/max_terminated_length": 2286.0, "completions/mean_length": 537.7421875, "completions/mean_terminated_length": 539.8510131835938, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.050133333333333335, "grad_norm": 0.0441867858171463, "kl": 0.057865142822265625, "learning_rate": 4.25e-06, "loss": -0.1043, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031595584005117416, "mask/share_reasoning": 0.840315043926239, "mask/share_step_conf": 0.12418308854103088, "num_tokens": 11193186.0, "reward": 0.9337431788444519, "reward_std": 0.1738939881324768, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7196019887924194, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8385093212127686, "step": 47 }, { "adv/mean_abs_final_conf": 0.7661705017089844, "adv/mean_abs_reasoning": 0.5280641913414001, "adv/mean_abs_step_conf": 0.7433052062988281, "adv/ratio_final_to_reasoning": 1.4509041027052818, "adv/ratio_step_to_reasoning": 1.407603883176907, "adv/std_final_conf": 0.9345859289169312, "adv/std_reasoning": 0.7753331065177917, "adv/std_step_conf": 0.934105396270752, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.609538002980626, "calib/avg_num_step_conf": 5.58203125, "calib/ece": 0.25224409448818896, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3346456692913386, "calib/gap": 0.09073273720814712, "calib/mean_conf": 0.6824803149606299, "calib/mu_c": 0.7260606060606061, "calib/mu_w": 0.635327868852459, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20751968503937007, "calib/std_conf": 0.284834095941698, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4449210903873745, "calib/step_q_c_n": 697.0, "calib/step_q_gap": 0.036738030278084866, "calib/step_q_w": 0.4081830601092896, "calib/step_q_w_n": 732.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 485.16796875, "completions/mean_terminated_length": 487.07061767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.0512, "grad_norm": 0.028268778696656227, "kl": 0.05120849609375, "learning_rate": 4.222222222222223e-06, "loss": -0.0433, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034794606268405914, "mask/share_reasoning": 0.8305359482765198, "mask/share_step_conf": 0.1307632029056549, "num_tokens": 11421077.0, "reward": 0.9115187525749207, "reward_std": 0.16462844610214233, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6826753616333008, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8380183577537537, "step": 48 }, { "adv/mean_abs_final_conf": 0.7610164880752563, "adv/mean_abs_reasoning": 0.37937378883361816, "adv/mean_abs_step_conf": 0.7463734149932861, "adv/ratio_final_to_reasoning": 2.0059806725577847, "adv/ratio_step_to_reasoning": 1.9673826631196782, "adv/std_final_conf": 0.927111029624939, "adv/std_reasoning": 0.6403971314430237, "adv/std_step_conf": 0.9335190057754517, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6618327886710238, "calib/avg_num_step_conf": 5.7890625, "calib/ece": 0.1867469879518073, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.39759036144578314, "calib/gap": 0.14107434640522876, "calib/mean_conf": 0.7339759036144577, "calib/mu_c": 0.7883660130718955, "calib/mu_w": 0.6472916666666667, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.153132530120482, "calib/std_conf": 0.26491768711331565, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4299315849486887, "calib/step_q_c_n": 877.0, "calib/step_q_gap": 0.025981171725548258, "calib/step_q_w": 0.40395041322314046, "calib/step_q_w_n": 605.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1952.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 500.1640625, "completions/mean_terminated_length": 500.1640625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.05226666666666667, "grad_norm": 0.034990474581718445, "kl": 0.04461669921875, "learning_rate": 4.194444444444445e-06, "loss": 0.0257, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03338824212551117, "mask/share_reasoning": 0.8339203000068665, "mask/share_step_conf": 0.13269150257110596, "num_tokens": 11653655.0, "reward": 0.9292441010475159, "reward_std": 0.16790857911109924, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7214101552963257, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8237967491149902, "step": 49 }, { "adv/mean_abs_final_conf": 0.7604638338088989, "adv/mean_abs_reasoning": 0.4985535740852356, "adv/mean_abs_step_conf": 0.7410811185836792, "adv/ratio_final_to_reasoning": 1.5253402509534224, "adv/ratio_step_to_reasoning": 1.4864623525033234, "adv/std_final_conf": 0.9323478937149048, "adv/std_reasoning": 0.7394198775291443, "adv/std_step_conf": 0.9342080354690552, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.696236559139785, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.16378486055776886, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2868525896414343, "calib/gap": 0.17506669388866214, "calib/mean_conf": 0.6755776892430279, "calib/mu_c": 0.7404430379746836, "calib/mu_w": 0.5653763440860214, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10494023904382463, "calib/std_conf": 0.2817333318214269, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4413221153846154, "calib/step_q_c_n": 832.0, "calib/step_q_gap": 0.0021435439560439917, "calib/step_q_w": 0.4391785714285714, "calib/step_q_w_n": 560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 502.4453125, "completions/mean_terminated_length": 504.41571044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.05333333333333334, "grad_norm": 0.033615924417972565, "kl": 0.0509490966796875, "learning_rate": 4.166666666666667e-06, "loss": -0.0329, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03367741405963898, "mask/share_reasoning": 0.8400186896324158, "mask/share_step_conf": 0.12239763140678406, "num_tokens": 11887641.0, "reward": 0.953680157661438, "reward_std": 0.16476009786128998, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7499589920043945, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8386512994766235, "step": 50 }, { "adv/mean_abs_final_conf": 0.7251266837120056, "adv/mean_abs_reasoning": 0.48975202441215515, "adv/mean_abs_step_conf": 0.7647483348846436, "adv/ratio_final_to_reasoning": 1.4805996658867688, "adv/ratio_step_to_reasoning": 1.5615011204957527, "adv/std_final_conf": 0.9339321851730347, "adv/std_reasoning": 0.7574661374092102, "adv/std_step_conf": 0.9339545965194702, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7184901247401246, "calib/avg_num_step_conf": 5.75, "calib/ece": 0.1662698412698413, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4087301587301587, "calib/gap": 0.2079625779625781, "calib/mean_conf": 0.7244444444444444, "calib/mu_c": 0.8102702702702703, "calib/mu_w": 0.6023076923076922, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15170634920634923, "calib/std_conf": 0.27266339372688003, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4352357320099255, "calib/step_q_c_n": 806.0, "calib/step_q_gap": 0.036962458736652215, "calib/step_q_w": 0.3982732732732733, "calib/step_q_w_n": 666.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2251.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 527.70703125, "completions/mean_terminated_length": 527.70703125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.0544, "grad_norm": 0.03140642121434212, "kl": 0.047603607177734375, "learning_rate": 4.138888888888889e-06, "loss": 0.0705, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03250539302825928, "mask/share_reasoning": 0.8480877876281738, "mask/share_step_conf": 0.11940683424472809, "num_tokens": 12132030.0, "reward": 0.9555783867835999, "reward_std": 0.16661688685417175, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7533218860626221, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8453348875045776, "step": 51 }, { "adv/mean_abs_final_conf": 0.7324730157852173, "adv/mean_abs_reasoning": 0.37932929396629333, "adv/mean_abs_step_conf": 0.7724111080169678, "adv/ratio_final_to_reasoning": 1.930968758374627, "adv/ratio_step_to_reasoning": 2.036254832682664, "adv/std_final_conf": 0.9282681345939636, "adv/std_reasoning": 0.6612043976783752, "adv/std_step_conf": 0.9335633516311646, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7267468944099378, "calib/avg_num_step_conf": 5.2734375, "calib/ece": 0.11173228346456689, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.37401574803149606, "calib/gap": 0.23133385093167713, "calib/mean_conf": 0.6908661417322836, "calib/mu_c": 0.7546195652173913, "calib/mu_w": 0.5232857142857141, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.039094488188976304, "calib/std_conf": 0.284220230740669, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45716188524590157, "calib/step_q_c_n": 976.0, "calib/step_q_gap": 0.062108409310072665, "calib/step_q_w": 0.3950534759358289, "calib/step_q_w_n": 374.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 483.15625, "completions/mean_terminated_length": 483.15625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.055466666666666664, "grad_norm": 0.07565945386886597, "kl": 0.04888153076171875, "learning_rate": 4.111111111111111e-06, "loss": 0.0458, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03548673540353775, "mask/share_reasoning": 0.8450421094894409, "mask/share_step_conf": 0.11947111040353775, "num_tokens": 12363670.0, "reward": 1.001354455947876, "reward_std": 0.14560005068778992, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.8012363314628601, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8600664734840393, "step": 52 }, { "adv/mean_abs_final_conf": 0.7319698929786682, "adv/mean_abs_reasoning": 0.42775505781173706, "adv/mean_abs_step_conf": 0.7573376893997192, "adv/ratio_final_to_reasoning": 1.7111893351377314, "adv/ratio_step_to_reasoning": 1.7704938271776967, "adv/std_final_conf": 0.9073989391326904, "adv/std_reasoning": 0.7013720273971558, "adv/std_step_conf": 0.9335355162620544, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6764725195350975, "calib/avg_num_step_conf": 5.90625, "calib/ece": 0.22283464566929134, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5118110236220472, "calib/gap": 0.14950554862433507, "calib/mean_conf": 0.7622047244094489, "calib/mu_c": 0.8192993630573248, "calib/mu_w": 0.6697938144329897, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18346456692913385, "calib/std_conf": 0.27814320614979304, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4495724713242961, "calib/step_q_c_n": 959.0, "calib/step_q_gap": 0.028577896279088133, "calib/step_q_w": 0.420994575045208, "calib/step_q_w_n": 553.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1899.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 514.00390625, "completions/mean_terminated_length": 514.00390625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.05653333333333333, "grad_norm": 0.03649696707725525, "kl": 0.044811248779296875, "learning_rate": 4.083333333333334e-06, "loss": -0.0379, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0319090262055397, "mask/share_reasoning": 0.8450208306312561, "mask/share_step_conf": 0.12307015061378479, "num_tokens": 12601079.0, "reward": 0.9459646940231323, "reward_std": 0.1464155912399292, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7299094200134277, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8417074680328369, "step": 53 }, { "adv/mean_abs_final_conf": 0.6466980576515198, "adv/mean_abs_reasoning": 0.41618654131889343, "adv/mean_abs_step_conf": 0.7486047744750977, "adv/ratio_final_to_reasoning": 1.5538658592902508, "adv/ratio_step_to_reasoning": 1.7987241300566141, "adv/std_final_conf": 0.8837612867355347, "adv/std_reasoning": 0.7204023599624634, "adv/std_step_conf": 0.9337106347084045, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7693302779420461, "calib/avg_num_step_conf": 5.703125, "calib/ece": 0.17437007874015756, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7125984251968503, "calib/gap": 0.2069544648137196, "calib/mean_conf": 0.8731889763779528, "calib/mu_c": 0.9351123595505617, "calib/mu_w": 0.7281578947368421, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17338582677165362, "calib/std_conf": 0.21359794567870916, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4733155080213904, "calib/step_q_c_n": 935.0, "calib/step_q_gap": 0.06228693659281892, "calib/step_q_w": 0.41102857142857147, "calib/step_q_w_n": 525.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2332.0, "completions/max_terminated_length": 2332.0, "completions/mean_length": 472.18359375, "completions/mean_terminated_length": 472.18359375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.0576, "grad_norm": 0.03446084260940552, "kl": 0.042400360107421875, "learning_rate": 4.055555555555556e-06, "loss": 0.0397, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03629143908619881, "mask/share_reasoning": 0.832728922367096, "mask/share_step_conf": 0.13097967207431793, "num_tokens": 12828190.0, "reward": 0.9863956570625305, "reward_std": 0.1475645750761032, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7954957485198975, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8397955894470215, "step": 54 }, { "adv/mean_abs_final_conf": 0.710288405418396, "adv/mean_abs_reasoning": 0.4166402816772461, "adv/mean_abs_step_conf": 0.7546051740646362, "adv/ratio_final_to_reasoning": 1.7048001277241525, "adv/ratio_step_to_reasoning": 1.8111671080550908, "adv/std_final_conf": 0.8620692491531372, "adv/std_reasoning": 0.681664228439331, "adv/std_step_conf": 0.9338399171829224, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.831180984465656, "calib/avg_num_step_conf": 5.15625, "calib/ece": 0.3065354330708662, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6692913385826772, "calib/gap": 0.2554569842161083, "calib/mean_conf": 0.8459055118110237, "calib/mu_c": 0.9635766423357663, "calib/mu_w": 0.708119658119658, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3065354330708662, "calib/std_conf": 0.23156555280354305, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.49875362318840577, "calib/step_q_c_n": 690.0, "calib/step_q_gap": 0.06803933747412, "calib/step_q_w": 0.43071428571428577, "calib/step_q_w_n": 630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 471.10546875, "completions/mean_terminated_length": 472.9529724121094, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.058666666666666666, "grad_norm": 0.0477132648229599, "kl": 0.042133331298828125, "learning_rate": 4.027777777777779e-06, "loss": -0.0111, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034928515553474426, "mask/share_reasoning": 0.8399760723114014, "mask/share_step_conf": 0.12118920683860779, "num_tokens": 13056617.0, "reward": 0.9348831176757812, "reward_std": 0.1851484775543213, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.717585563659668, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8482744097709656, "step": 55 }, { "adv/mean_abs_final_conf": 0.6221961975097656, "adv/mean_abs_reasoning": 0.44493114948272705, "adv/mean_abs_step_conf": 0.7350568771362305, "adv/ratio_final_to_reasoning": 1.3984100646428674, "adv/ratio_step_to_reasoning": 1.6520688155702314, "adv/std_final_conf": 0.8303307890892029, "adv/std_reasoning": 0.7205959558486938, "adv/std_step_conf": 0.9340452551841736, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.760609149993628, "calib/avg_num_step_conf": 6.0078125, "calib/ece": 0.35442231075697206, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7370517928286853, "calib/gap": 0.1552262010959603, "calib/mean_conf": 0.8819123505976096, "calib/mu_c": 0.9548872180451129, "calib/mu_w": 0.7996610169491526, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3532270916334661, "calib/std_conf": 0.2205596112582244, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.49531598513011155, "calib/step_q_c_n": 807.0, "calib/step_q_gap": 0.05866755831752596, "calib/step_q_w": 0.4366484268125856, "calib/step_q_w_n": 731.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2173.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 518.59375, "completions/mean_terminated_length": 522.6771850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.05973333333333333, "grad_norm": 0.027666104957461357, "kl": 0.041835784912109375, "learning_rate": 4.000000000000001e-06, "loss": 0.0158, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03235214948654175, "mask/share_reasoning": 0.8360726833343506, "mask/share_step_conf": 0.12376265227794647, "num_tokens": 13296217.0, "reward": 0.8819053173065186, "reward_std": 0.1791996955871582, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6354690790176392, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8299038410186768, "step": 56 }, { "adv/mean_abs_final_conf": 0.6285187005996704, "adv/mean_abs_reasoning": 0.4043678939342499, "adv/mean_abs_step_conf": 0.7555187940597534, "adv/ratio_final_to_reasoning": 1.5543239461584637, "adv/ratio_step_to_reasoning": 1.86839461142432, "adv/std_final_conf": 0.8306865096092224, "adv/std_reasoning": 0.6815210580825806, "adv/std_step_conf": 0.9339905381202698, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7224432009708084, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.2949602362204724, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8110236220472441, "calib/gap": 0.1101253354007955, "calib/mean_conf": 0.9085830708661418, "calib/mu_c": 0.9480374233128834, "calib/mu_w": 0.8379120879120879, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2809055118110235, "calib/std_conf": 0.20481708814279678, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5285098952270082, "calib/step_q_c_n": 859.0, "calib/step_q_gap": 0.0643495898834967, "calib/step_q_w": 0.4641603053435115, "calib/step_q_w_n": 524.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2337.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 502.36328125, "completions/mean_terminated_length": 502.36328125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.0608, "grad_norm": 0.03946515545248985, "kl": 0.042720794677734375, "learning_rate": 3.972222222222223e-06, "loss": -0.0043, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03254619985818863, "mask/share_reasoning": 0.8481278419494629, "mask/share_step_conf": 0.11932602524757385, "num_tokens": 13531614.0, "reward": 0.9359113574028015, "reward_std": 0.17461207509040833, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7020390629768372, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8432211875915527, "step": 57 }, { "adv/mean_abs_final_conf": 0.702364444732666, "adv/mean_abs_reasoning": 0.6132348775863647, "adv/mean_abs_step_conf": 0.7652078866958618, "adv/ratio_final_to_reasoning": 1.14534327776187, "adv/ratio_step_to_reasoning": 1.2478218618413366, "adv/std_final_conf": 0.8918449878692627, "adv/std_reasoning": 0.8266597390174866, "adv/std_step_conf": 0.9347798824310303, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5812231404958678, "calib/avg_num_step_conf": 6.26171875, "calib/ece": 0.3894715447154472, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7520325203252033, "calib/gap": 0.08071999999999979, "calib/mean_conf": 0.8710162601626017, "calib/mu_c": 0.9107199999999999, "calib/mu_w": 0.8300000000000001, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3761788617886179, "calib/std_conf": 0.2487010640262768, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5189918256130791, "calib/step_q_c_n": 734.0, "calib/step_q_gap": 0.09607352872009867, "calib/step_q_w": 0.42291829689298044, "calib/step_q_w_n": 869.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 612.94921875, "completions/mean_terminated_length": 612.94921875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.06186666666666667, "grad_norm": 0.03811941668391228, "kl": 0.03516387939453125, "learning_rate": 3.944444444444445e-06, "loss": 0.0669, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02849685214459896, "mask/share_reasoning": 0.8575760722160339, "mask/share_step_conf": 0.11392708867788315, "num_tokens": 13794849.0, "reward": 0.8243527412414551, "reward_std": 0.2575843036174774, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5681480169296265, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.793057382106781, "step": 58 }, { "adv/mean_abs_final_conf": 0.6952832937240601, "adv/mean_abs_reasoning": 0.5429081916809082, "adv/mean_abs_step_conf": 0.7698467969894409, "adv/ratio_final_to_reasoning": 1.2806645845062319, "adv/ratio_step_to_reasoning": 1.4180054911418887, "adv/std_final_conf": 0.8595199584960938, "adv/std_reasoning": 0.7577628493309021, "adv/std_step_conf": 0.9347355961799622, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6182361439453639, "calib/avg_num_step_conf": 5.3984375, "calib/ece": 0.36128514056224903, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8192771084337349, "calib/gap": 0.07229905437352246, "calib/mean_conf": 0.9089959839357429, "calib/mu_c": 0.940354609929078, "calib/mu_w": 0.8680555555555556, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3520080321285141, "calib/std_conf": 0.2067461532203717, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5530294511378849, "calib/step_q_c_n": 747.0, "calib/step_q_gap": 0.039438899956782625, "calib/step_q_w": 0.5135905511811023, "calib/step_q_w_n": 635.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2113.0, "completions/max_terminated_length": 2113.0, "completions/mean_length": 560.06640625, "completions/mean_terminated_length": 560.06640625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.06293333333333333, "grad_norm": 0.048179443925619125, "kl": 0.040973663330078125, "learning_rate": 3.916666666666667e-06, "loss": 0.0553, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032195284962654114, "mask/share_reasoning": 0.8571901321411133, "mask/share_step_conf": 0.11061456054449081, "num_tokens": 14044474.0, "reward": 0.8571747541427612, "reward_std": 0.24229061603546143, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6124788522720337, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.796401858329773, "step": 59 }, { "adv/mean_abs_final_conf": 0.6338130235671997, "adv/mean_abs_reasoning": 0.5311764478683472, "adv/mean_abs_step_conf": 0.7529253959655762, "adv/ratio_final_to_reasoning": 1.1932250123489871, "adv/ratio_step_to_reasoning": 1.417467583487794, "adv/std_final_conf": 0.8211374878883362, "adv/std_reasoning": 0.7754148840904236, "adv/std_step_conf": 0.9346369504928589, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7307056579783853, "calib/avg_num_step_conf": 5.22265625, "calib/ece": 0.3291304347826087, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.766798418972332, "calib/gap": 0.16638461538461535, "calib/mean_conf": 0.8721343873517787, "calib/mu_c": 0.9444755244755245, "calib/mu_w": 0.7780909090909092, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3180237154150198, "calib/std_conf": 0.25232239424135916, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5669808541973491, "calib/step_q_c_n": 679.0, "calib/step_q_gap": 0.07766474477485669, "calib/step_q_w": 0.48931610942249243, "calib/step_q_w_n": 658.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2400.0, "completions/max_terminated_length": 2400.0, "completions/mean_length": 517.28515625, "completions/mean_terminated_length": 517.28515625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.064, "grad_norm": 0.02972288429737091, "kl": 0.043041229248046875, "learning_rate": 3.88888888888889e-06, "loss": 0.0674, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0328441746532917, "mask/share_reasoning": 0.8521276712417603, "mask/share_step_conf": 0.11502814292907715, "num_tokens": 14285755.0, "reward": 0.9059640169143677, "reward_std": 0.21950387954711914, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6663120985031128, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.837022066116333, "step": 60 }, { "adv/mean_abs_final_conf": 0.5326590538024902, "adv/mean_abs_reasoning": 0.4154004752635956, "adv/mean_abs_step_conf": 0.7765494585037231, "adv/ratio_final_to_reasoning": 1.2822783928316097, "adv/ratio_step_to_reasoning": 1.8693995427206906, "adv/std_final_conf": 0.7565453052520752, "adv/std_reasoning": 0.6815720200538635, "adv/std_step_conf": 0.9333240389823914, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5917389020225899, "calib/avg_num_step_conf": 5.38671875, "calib/ece": 0.3258984375, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8984375, "calib/gap": 0.0783412135539796, "calib/mean_conf": 0.9430859375, "calib/mu_c": 0.9718518518518519, "calib/mu_w": 0.8935106382978723, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3180859375, "calib/std_conf": 0.17727607033874057, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5699270072992701, "calib/step_q_c_n": 822.0, "calib/step_q_gap": 0.04366129814307629, "calib/step_q_w": 0.5262657091561939, "calib/step_q_w_n": 557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 433.859375, "completions/mean_terminated_length": 435.5608215332031, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.06506666666666666, "grad_norm": 0.033378370106220245, "kl": 0.0516357421875, "learning_rate": 3.861111111111112e-06, "loss": -0.0017, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0384836420416832, "mask/share_reasoning": 0.8277353048324585, "mask/share_step_conf": 0.1298747956752777, "num_tokens": 14500887.0, "reward": 0.9083299040794373, "reward_std": 0.1809770166873932, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6727949380874634, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8180835247039795, "step": 61 }, { "adv/mean_abs_final_conf": 0.6663320064544678, "adv/mean_abs_reasoning": 0.5563945770263672, "adv/mean_abs_step_conf": 0.7701988220214844, "adv/ratio_final_to_reasoning": 1.1975889664770956, "adv/ratio_step_to_reasoning": 1.3842673056552546, "adv/std_final_conf": 0.8596050143241882, "adv/std_reasoning": 0.7928605079650879, "adv/std_step_conf": 0.9351630806922913, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5797742721559573, "calib/avg_num_step_conf": 5.41796875, "calib/ece": 0.34231075697211155, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7768924302788844, "calib/gap": 0.10010773374374748, "calib/mean_conf": 0.8688446215139443, "calib/mu_c": 0.9139130434782609, "calib/mu_w": 0.8138053097345134, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.33067729083665337, "calib/std_conf": 0.25382778724686866, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5865571428571429, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.059089893948845895, "calib/step_q_w": 0.527467248908297, "calib/step_q_w_n": 687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 501.4609375, "completions/mean_terminated_length": 503.427490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.06613333333333334, "grad_norm": 0.04095854610204697, "kl": 0.042919158935546875, "learning_rate": 3.833333333333334e-06, "loss": -0.0457, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032502319663763046, "mask/share_reasoning": 0.850849986076355, "mask/share_step_conf": 0.11274144798517227, "num_tokens": 14736341.0, "reward": 0.8477140665054321, "reward_std": 0.2483111470937729, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6155894994735718, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7782761454582214, "step": 62 }, { "adv/mean_abs_final_conf": 0.6767548322677612, "adv/mean_abs_reasoning": 0.48050642013549805, "adv/mean_abs_step_conf": 0.7537362575531006, "adv/ratio_final_to_reasoning": 1.4084199584199584, "adv/ratio_step_to_reasoning": 1.5686289006098075, "adv/std_final_conf": 0.8813891410827637, "adv/std_reasoning": 0.739285409450531, "adv/std_step_conf": 0.9344167113304138, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7938900330774512, "calib/avg_num_step_conf": 5.0234375, "calib/ece": 0.20101562500000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.57421875, "calib/gap": 0.3144954128440365, "calib/mean_conf": 0.76609375, "calib/mu_c": 0.8999999999999999, "calib/mu_w": 0.5855045871559634, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19644531250000008, "calib/std_conf": 0.3045802295142242, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5879384203480589, "calib/step_q_c_n": 747.0, "calib/step_q_gap": 0.060153262648615535, "calib/step_q_w": 0.5277851576994433, "calib/step_q_w_n": 539.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 538.26953125, "completions/mean_terminated_length": 540.3804321289062, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.0672, "grad_norm": 0.041442278772592545, "kl": 0.0406951904296875, "learning_rate": 3.8055555555555556e-06, "loss": 0.0049, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03217097744345665, "mask/share_reasoning": 0.856690526008606, "mask/share_step_conf": 0.1072322428226471, "num_tokens": 14982778.0, "reward": 0.9666212201118469, "reward_std": 0.17020484805107117, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7759265899658203, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8432533740997314, "step": 63 }, { "adv/mean_abs_final_conf": 0.6773377060890198, "adv/mean_abs_reasoning": 0.43898671865463257, "adv/mean_abs_step_conf": 0.7621839046478271, "adv/ratio_final_to_reasoning": 1.5429571722918272, "adv/ratio_step_to_reasoning": 1.7362345425476664, "adv/std_final_conf": 0.8537333607673645, "adv/std_reasoning": 0.7014127969741821, "adv/std_step_conf": 0.934469997882843, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6689581856839122, "calib/avg_num_step_conf": 5.28515625, "calib/ece": 0.21031620553359676, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6245059288537549, "calib/gap": 0.15607228915662652, "calib/mean_conf": 0.7967984189723321, "calib/mu_c": 0.848, "calib/mu_w": 0.6919277108433735, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.16758893280632406, "calib/std_conf": 0.2851302089308167, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5487165178571428, "calib/step_q_c_n": 896.0, "calib/step_q_gap": 0.03657209772585179, "calib/step_q_w": 0.512144420131291, "calib/step_q_w_n": 457.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2263.0, "completions/max_terminated_length": 2263.0, "completions/mean_length": 500.83984375, "completions/mean_terminated_length": 500.83984375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.06826666666666667, "grad_norm": 0.047779619693756104, "kl": 0.044464111328125, "learning_rate": 3.777777777777778e-06, "loss": 0.0616, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0342421680688858, "mask/share_reasoning": 0.8493179678916931, "mask/share_step_conf": 0.11643985658884048, "num_tokens": 15214769.0, "reward": 0.939239501953125, "reward_std": 0.19172075390815735, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7383691072463989, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8112034201622009, "step": 64 }, { "adv/mean_abs_final_conf": 0.5757031440734863, "adv/mean_abs_reasoning": 0.29549628496170044, "adv/mean_abs_step_conf": 0.766379177570343, "adv/ratio_final_to_reasoning": 1.9482584836831494, "adv/ratio_step_to_reasoning": 2.593532360887969, "adv/std_final_conf": 0.8076551556587219, "adv/std_reasoning": 0.5959193110466003, "adv/std_step_conf": 0.9337574243545532, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.620545465892779, "calib/avg_num_step_conf": 5.0625, "calib/ece": 0.32832031250000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.7890625, "calib/gap": 0.12905510828184497, "calib/mean_conf": 0.8714453125000001, "calib/mu_c": 0.9263945578231293, "calib/mu_w": 0.7973394495412843, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3127734375000001, "calib/std_conf": 0.253972848837779, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5844356306892068, "calib/step_q_c_n": 769.0, "calib/step_q_gap": 0.04492898932298284, "calib/step_q_w": 0.539506641366224, "calib/step_q_w_n": 527.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 406.51171875, "completions/mean_terminated_length": 408.10589599609375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.06933333333333333, "grad_norm": 0.058045756071805954, "kl": 0.051605224609375, "learning_rate": 3.7500000000000005e-06, "loss": 0.0014, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.039061564952135086, "mask/share_reasoning": 0.8285530209541321, "mask/share_step_conf": 0.12847915291786194, "num_tokens": 15423860.0, "reward": 0.9013949632644653, "reward_std": 0.1410367488861084, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6657683849334717, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8221778273582458, "step": 65 }, { "adv/mean_abs_final_conf": 0.6945334672927856, "adv/mean_abs_reasoning": 0.49474918842315674, "adv/mean_abs_step_conf": 0.7819595336914062, "adv/ratio_final_to_reasoning": 1.4038092098874841, "adv/ratio_step_to_reasoning": 1.5805170619554403, "adv/std_final_conf": 0.8789015412330627, "adv/std_reasoning": 0.7575258612632751, "adv/std_step_conf": 0.9340898394584656, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7342580645161291, "calib/avg_num_step_conf": 5.5625, "calib/ece": 0.22863453815261048, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5180722891566265, "calib/gap": 0.2801380645161292, "calib/mean_conf": 0.7266265060240964, "calib/mu_c": 0.8672580645161292, "calib/mu_w": 0.58712, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.22863453815261048, "calib/std_conf": 0.3171102342449586, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5563850556438791, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.10730329463758981, "calib/step_q_w": 0.44908176100628927, "calib/step_q_w_n": 795.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 546.6328125, "completions/mean_terminated_length": 550.93701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.0704, "grad_norm": 0.04052841290831566, "kl": 0.0453948974609375, "learning_rate": 3.7222222222222225e-06, "loss": -0.0003, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032529592514038086, "mask/share_reasoning": 0.8484911322593689, "mask/share_step_conf": 0.1111668050289154, "num_tokens": 15670150.0, "reward": 0.9120385646820068, "reward_std": 0.20076248049736023, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7170792818069458, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8155916333198547, "step": 66 }, { "adv/mean_abs_final_conf": 0.5726084113121033, "adv/mean_abs_reasoning": 0.33323174715042114, "adv/mean_abs_step_conf": 0.7718336582183838, "adv/ratio_final_to_reasoning": 1.7183489154580078, "adv/ratio_step_to_reasoning": 2.3162068584959203, "adv/std_final_conf": 0.7796696424484253, "adv/std_reasoning": 0.6185460686683655, "adv/std_step_conf": 0.9334813356399536, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.789329290303949, "calib/avg_num_step_conf": 5.2421875, "calib/ece": 0.11793650793650802, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5992063492063492, "calib/gap": 0.3455620532813516, "calib/mean_conf": 0.7521428571428572, "calib/mu_c": 0.8632163742690059, "calib/mu_w": 0.5176543209876543, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09575396825396834, "calib/std_conf": 0.3248298874523601, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.565239179954442, "calib/step_q_c_n": 878.0, "calib/step_q_gap": 0.08791159374754537, "calib/step_q_w": 0.4773275862068966, "calib/step_q_w_n": 464.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2195.0, "completions/max_terminated_length": 2195.0, "completions/mean_length": 532.609375, "completions/mean_terminated_length": 532.609375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.07146666666666666, "grad_norm": 0.037965673953294754, "kl": 0.04518890380859375, "learning_rate": 3.694444444444445e-06, "loss": 0.0047, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032018158584833145, "mask/share_reasoning": 0.859516978263855, "mask/share_step_conf": 0.10846483707427979, "num_tokens": 15911506.0, "reward": 0.996482789516449, "reward_std": 0.13825537264347076, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.8088640570640564, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8536326885223389, "step": 67 }, { "adv/mean_abs_final_conf": 0.6434445977210999, "adv/mean_abs_reasoning": 0.508094310760498, "adv/mean_abs_step_conf": 0.7290732860565186, "adv/ratio_final_to_reasoning": 1.2663881175091571, "adv/ratio_step_to_reasoning": 1.4349172400007133, "adv/std_final_conf": 0.8660622239112854, "adv/std_reasoning": 0.7752746343612671, "adv/std_step_conf": 0.9345232844352722, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7749092761189279, "calib/avg_num_step_conf": 5.0859375, "calib/ece": 0.24063492063492076, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6746031746031746, "calib/gap": 0.34736614248424247, "calib/mean_conf": 0.7922222222222223, "calib/mu_c": 0.9479856115107912, "calib/mu_w": 0.6006194690265487, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.24063492063492076, "calib/std_conf": 0.3141248586275211, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.54993893129771, "calib/step_q_c_n": 655.0, "calib/step_q_gap": 0.1253021461354225, "calib/step_q_w": 0.4246367851622875, "calib/step_q_w_n": 647.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2022.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 489.25, "completions/mean_terminated_length": 489.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.07253333333333334, "grad_norm": 0.03003855049610138, "kl": 0.049896240234375, "learning_rate": 3.6666666666666666e-06, "loss": 0.0552, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03539380431175232, "mask/share_reasoning": 0.846167802810669, "mask/share_step_conf": 0.11843834817409515, "num_tokens": 16140842.0, "reward": 0.9478596448898315, "reward_std": 0.21063147485256195, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7486066222190857, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8432064056396484, "step": 68 }, { "adv/mean_abs_final_conf": 0.7356120347976685, "adv/mean_abs_reasoning": 0.5031900405883789, "adv/mean_abs_step_conf": 0.7626789808273315, "adv/ratio_final_to_reasoning": 1.4618970477585747, "adv/ratio_step_to_reasoning": 1.5156877507661575, "adv/std_final_conf": 0.9045613408088684, "adv/std_reasoning": 0.7393735647201538, "adv/std_step_conf": 0.9342234134674072, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7375190548780488, "calib/avg_num_step_conf": 5.1484375, "calib/ece": 0.19768924302788846, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4342629482071713, "calib/gap": 0.2881923272357725, "calib/mean_conf": 0.6307569721115538, "calib/mu_c": 0.7777235772357725, "calib/mu_w": 0.48953125, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16920318725099603, "calib/std_conf": 0.3507042213854929, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5171782178217821, "calib/step_q_c_n": 606.0, "calib/step_q_gap": 0.07135841445099556, "calib/step_q_w": 0.4458198033707865, "calib/step_q_w_n": 712.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2212.0, "completions/max_terminated_length": 2212.0, "completions/mean_length": 581.4296875, "completions/mean_terminated_length": 583.7098388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.0736, "grad_norm": 0.0420890748500824, "kl": 0.04071044921875, "learning_rate": 3.638888888888889e-06, "loss": 0.0318, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03073749504983425, "mask/share_reasoning": 0.8656871318817139, "mask/share_step_conf": 0.09966909885406494, "num_tokens": 16394184.0, "reward": 0.9349965453147888, "reward_std": 0.17003074288368225, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7366687059402466, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8411368131637573, "step": 69 }, { "adv/mean_abs_final_conf": 0.6535979509353638, "adv/mean_abs_reasoning": 0.4485911726951599, "adv/mean_abs_step_conf": 0.78276127576828, "adv/ratio_final_to_reasoning": 1.4570013649811968, "adv/ratio_step_to_reasoning": 1.7449324093147178, "adv/std_final_conf": 0.8590722680091858, "adv/std_reasoning": 0.7205913066864014, "adv/std_step_conf": 0.9336462020874023, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7910840932117528, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.14784552845528456, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.4377446808510638, "calib/mean_conf": 0.6422357723577237, "calib/mu_c": 0.8290780141843972, "calib/mu_w": 0.39133333333333337, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10845528455284556, "calib/std_conf": 0.3799853994050151, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5202646596858639, "calib/step_q_c_n": 764.0, "calib/step_q_gap": 0.11241115650115052, "calib/step_q_w": 0.40785350318471336, "calib/step_q_w_n": 628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2263.0, "completions/max_terminated_length": 2263.0, "completions/mean_length": 564.765625, "completions/mean_terminated_length": 564.765625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.07466666666666667, "grad_norm": 0.04838375374674797, "kl": 0.043308258056640625, "learning_rate": 3.6111111111111115e-06, "loss": 0.0411, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03303219750523567, "mask/share_reasoning": 0.8492827415466309, "mask/share_step_conf": 0.11768506467342377, "num_tokens": 16645756.0, "reward": 0.9579899311065674, "reward_std": 0.18355971574783325, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7846719026565552, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8297454118728638, "step": 70 }, { "adv/mean_abs_final_conf": 0.656448245048523, "adv/mean_abs_reasoning": 0.5046650171279907, "adv/mean_abs_step_conf": 0.7693231105804443, "adv/ratio_final_to_reasoning": 1.30076035145911, "adv/ratio_step_to_reasoning": 1.5244232995554203, "adv/std_final_conf": 0.8575155138969421, "adv/std_reasoning": 0.7753113508224487, "adv/std_step_conf": 0.9343364238739014, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6714285714285715, "calib/avg_num_step_conf": 5.66015625, "calib/ece": 0.2685826771653543, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5590551181102362, "calib/gap": 0.20466165413533832, "calib/mean_conf": 0.6938582677165355, "calib/mu_c": 0.7857142857142857, "calib/mu_w": 0.5810526315789474, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20562992125984247, "calib/std_conf": 0.3592556768226254, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5017489986648865, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.08003214152202931, "calib/step_q_w": 0.4217168571428572, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 526.7578125, "completions/mean_terminated_length": 528.8235473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.07573333333333333, "grad_norm": 0.05430913344025612, "kl": 0.04532623291015625, "learning_rate": 3.5833333333333335e-06, "loss": -0.0496, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03347496688365936, "mask/share_reasoning": 0.8458138704299927, "mask/share_step_conf": 0.11680489778518677, "num_tokens": 16885014.0, "reward": 0.913619875907898, "reward_std": 0.18851624429225922, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.69174724817276, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8292425274848938, "step": 71 }, { "adv/mean_abs_final_conf": 0.6513075828552246, "adv/mean_abs_reasoning": 0.4869433641433716, "adv/mean_abs_step_conf": 0.7313523292541504, "adv/ratio_final_to_reasoning": 1.337542783853317, "adv/ratio_step_to_reasoning": 1.5019248296785848, "adv/std_final_conf": 0.8733137845993042, "adv/std_reasoning": 0.7574735283851624, "adv/std_step_conf": 0.9336986541748047, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7550791474510781, "calib/avg_num_step_conf": 5.390625, "calib/ece": 0.2041568627450982, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5568627450980392, "calib/gap": 0.29419543811541826, "calib/mean_conf": 0.7094901960784314, "calib/mu_c": 0.8398591549295775, "calib/mu_w": 0.5456637168141593, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17839215686274526, "calib/std_conf": 0.3480626139607879, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48932885906040263, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.07074618189504822, "calib/step_q_w": 0.4185826771653544, "calib/step_q_w_n": 635.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 494.8828125, "completions/mean_terminated_length": 494.8828125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.0768, "grad_norm": 0.04387129843235016, "kl": 0.0503692626953125, "learning_rate": 3.555555555555556e-06, "loss": -0.0452, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.032157592475414276, "mask/share_reasoning": 0.8548599481582642, "mask/share_step_conf": 0.11298239976167679, "num_tokens": 17116112.0, "reward": 0.9658874273300171, "reward_std": 0.16145509481430054, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7510405778884888, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8705779314041138, "step": 72 }, { "adv/mean_abs_final_conf": 0.6699906587600708, "adv/mean_abs_reasoning": 0.5513850450515747, "adv/mean_abs_step_conf": 0.7873334884643555, "adv/ratio_final_to_reasoning": 1.2151048795627057, "adv/ratio_step_to_reasoning": 1.4279195555452742, "adv/std_final_conf": 0.8398651480674744, "adv/std_reasoning": 0.7753834128379822, "adv/std_step_conf": 0.9337665438652039, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7150241447323675, "calib/avg_num_step_conf": 5.3203125, "calib/ece": 0.1591015625, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.63671875, "calib/gap": 0.31899952390668573, "calib/mean_conf": 0.7627734374999999, "calib/mu_c": 0.8711834319526628, "calib/mu_w": 0.5521839080459771, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.130859375, "calib/std_conf": 0.3359183474736585, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5108766627771295, "calib/step_q_c_n": 857.0, "calib/step_q_gap": 0.08652022713356511, "calib/step_q_w": 0.4243564356435644, "calib/step_q_w_n": 505.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 474.89453125, "completions/mean_terminated_length": 476.75689697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.07786666666666667, "grad_norm": 0.042493823915719986, "kl": 0.0510711669921875, "learning_rate": 3.5277777777777784e-06, "loss": -0.0119, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033234454691410065, "mask/share_reasoning": 0.8480768203735352, "mask/share_step_conf": 0.11478252708911896, "num_tokens": 17344717.0, "reward": 0.9947052001953125, "reward_std": 0.1547623872756958, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7954136729240417, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.861965537071228, "step": 73 }, { "adv/mean_abs_final_conf": 0.6531205177307129, "adv/mean_abs_reasoning": 0.48068854212760925, "adv/mean_abs_step_conf": 0.7459403276443481, "adv/ratio_final_to_reasoning": 1.3587187138680077, "adv/ratio_step_to_reasoning": 1.5518163265192246, "adv/std_final_conf": 0.8750687837600708, "adv/std_reasoning": 0.7392587065696716, "adv/std_step_conf": 0.9338178038597107, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7663891132790654, "calib/avg_num_step_conf": 5.4296875, "calib/ece": 0.16885826771653542, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.48031496062992124, "calib/gap": 0.36025166221338456, "calib/mean_conf": 0.6433464566929135, "calib/mu_c": 0.8149624060150376, "calib/mu_w": 0.454710743801653, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14429133858267715, "calib/std_conf": 0.3650221410165784, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47199115044247786, "calib/step_q_c_n": 678.0, "calib/step_q_gap": 0.09902766729641044, "calib/step_q_w": 0.3729634831460674, "calib/step_q_w_n": 712.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1632.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 498.171875, "completions/mean_terminated_length": 498.171875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.07893333333333333, "grad_norm": 0.04365735128521919, "kl": 0.05310821533203125, "learning_rate": 3.5e-06, "loss": 0.0409, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03375405818223953, "mask/share_reasoning": 0.8440734148025513, "mask/share_step_conf": 0.12217249721288681, "num_tokens": 17576177.0, "reward": 0.9710075259208679, "reward_std": 0.15870296955108643, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7765917778015137, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8630794286727905, "step": 74 }, { "adv/mean_abs_final_conf": 0.5426826477050781, "adv/mean_abs_reasoning": 0.3250294327735901, "adv/mean_abs_step_conf": 0.744985818862915, "adv/ratio_final_to_reasoning": 1.6696415554559993, "adv/ratio_step_to_reasoning": 2.2920564839488224, "adv/std_final_conf": 0.7762205600738525, "adv/std_reasoning": 0.6184999346733093, "adv/std_step_conf": 0.9328890442848206, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8371071428571429, "calib/avg_num_step_conf": 5.3203125, "calib/ece": 0.12968627450980397, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7490196078431373, "calib/gap": 0.4910464285714284, "calib/mean_conf": 0.8121176470588235, "calib/mu_c": 0.9661714285714285, "calib/mu_w": 0.4751250000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.127764705882353, "calib/std_conf": 0.32543218004098545, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5055732758620689, "calib/step_q_c_n": 928.0, "calib/step_q_gap": 0.10974378277451124, "calib/step_q_w": 0.39582949308755766, "calib/step_q_w_n": 434.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 473.49609375, "completions/mean_terminated_length": 473.49609375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.08, "grad_norm": 0.06338401883840561, "kl": 0.07183837890625, "learning_rate": 3.4722222222222224e-06, "loss": 0.0384, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035120923072099686, "mask/share_reasoning": 0.8420246839523315, "mask/share_step_conf": 0.12285438925027847, "num_tokens": 17802144.0, "reward": 1.0387458801269531, "reward_std": 0.1380448341369629, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.87017422914505, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8721611499786377, "step": 75 }, { "adv/mean_abs_final_conf": 0.6495287418365479, "adv/mean_abs_reasoning": 0.4122047424316406, "adv/mean_abs_step_conf": 0.7574204802513123, "adv/ratio_final_to_reasoning": 1.5757430106332768, "adv/ratio_step_to_reasoning": 1.8374860895175695, "adv/std_final_conf": 0.848217785358429, "adv/std_reasoning": 0.6815527081489563, "adv/std_step_conf": 0.9339469075202942, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7127142857142856, "calib/avg_num_step_conf": 4.890625, "calib/ece": 0.1978039215686274, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5843137254901961, "calib/gap": 0.2991428571428571, "calib/mean_conf": 0.703294117647059, "calib/mu_c": 0.7971428571428572, "calib/mu_w": 0.49800000000000005, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1074117647058823, "calib/std_conf": 0.37153435405074947, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.48410377358490564, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.10021763497104424, "calib/step_q_w": 0.3838861386138614, "calib/step_q_w_n": 404.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1969.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 494.2109375, "completions/mean_terminated_length": 496.1490478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.08106666666666666, "grad_norm": 0.11171876639127731, "kl": 0.0479583740234375, "learning_rate": 3.444444444444445e-06, "loss": 0.0354, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03396555408835411, "mask/share_reasoning": 0.8541315793991089, "mask/share_step_conf": 0.10799665749073029, "num_tokens": 18031718.0, "reward": 0.987180769443512, "reward_std": 0.15297412872314453, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7706863284111023, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8685189485549927, "step": 76 }, { "adv/mean_abs_final_conf": 0.6657007932662964, "adv/mean_abs_reasoning": 0.45153507590293884, "adv/mean_abs_step_conf": 0.7501556873321533, "adv/ratio_final_to_reasoning": 1.4743058264855469, "adv/ratio_step_to_reasoning": 1.6613453247946688, "adv/std_final_conf": 0.8381210565567017, "adv/std_reasoning": 0.7014268636703491, "adv/std_step_conf": 0.9340924620628357, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6687104430379747, "calib/avg_num_step_conf": 5.26953125, "calib/ece": 0.2486614173228348, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6653543307086615, "calib/gap": 0.1927254746835444, "calib/mean_conf": 0.7751968503937008, "calib/mu_c": 0.8480379746835444, "calib/mu_w": 0.6553125, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20090551181102378, "calib/std_conf": 0.3336150131483204, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4864655172413793, "calib/step_q_c_n": 812.0, "calib/step_q_gap": 0.07531095485776662, "calib/step_q_w": 0.4111545623836127, "calib/step_q_w_n": 537.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 507.390625, "completions/mean_terminated_length": 507.390625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.08213333333333334, "grad_norm": 0.04007513448596001, "kl": 0.04627227783203125, "learning_rate": 3.416666666666667e-06, "loss": 0.0272, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03483344614505768, "mask/share_reasoning": 0.8469037413597107, "mask/share_step_conf": 0.11826279759407043, "num_tokens": 18266274.0, "reward": 0.9439308047294617, "reward_std": 0.17922864854335785, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7151319980621338, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8508545160293579, "step": 77 }, { "adv/mean_abs_final_conf": 0.6560389995574951, "adv/mean_abs_reasoning": 0.45338594913482666, "adv/mean_abs_step_conf": 0.7529264688491821, "adv/ratio_final_to_reasoning": 1.4469769096492315, "adv/ratio_step_to_reasoning": 1.660674465730474, "adv/std_final_conf": 0.8485627174377441, "adv/std_reasoning": 0.7205855250358582, "adv/std_step_conf": 0.9336483478546143, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6738016584642352, "calib/avg_num_step_conf": 5.3046875, "calib/ece": 0.23909448818897638, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7362204724409449, "calib/gap": 0.1971381379356838, "calib/mean_conf": 0.8309055118110237, "calib/mu_c": 0.9015337423312882, "calib/mu_w": 0.7043956043956044, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21413385826771655, "calib/std_conf": 0.29396215228749084, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5272615039281705, "calib/step_q_c_n": 891.0, "calib/step_q_gap": 0.06413516559840599, "calib/step_q_w": 0.4631263383297645, "calib/step_q_w_n": 467.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2029.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 541.21875, "completions/mean_terminated_length": 541.21875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.0832, "grad_norm": 0.045858126133680344, "kl": 0.04248809814453125, "learning_rate": 3.3888888888888893e-06, "loss": 0.0349, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02998296171426773, "mask/share_reasoning": 0.8644160032272339, "mask/share_step_conf": 0.10560107231140137, "num_tokens": 18512850.0, "reward": 0.9525356292724609, "reward_std": 0.19739927351474762, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7249546647071838, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8558976650238037, "step": 78 }, { "adv/mean_abs_final_conf": 0.5855348110198975, "adv/mean_abs_reasoning": 0.39158886671066284, "adv/mean_abs_step_conf": 0.7473605871200562, "adv/ratio_final_to_reasoning": 1.4952795158309171, "adv/ratio_step_to_reasoning": 1.9085337982100137, "adv/std_final_conf": 0.796627402305603, "adv/std_reasoning": 0.6815370917320251, "adv/std_step_conf": 0.933944821357727, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6948180379746836, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.26480314960629925, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7598425196850394, "calib/gap": 0.2132502637130802, "calib/mean_conf": 0.842755905511811, "calib/mu_c": 0.9233544303797467, "calib/mu_w": 0.7101041666666665, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24275590551181106, "calib/std_conf": 0.2932969717645023, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5471011025358324, "calib/step_q_c_n": 907.0, "calib/step_q_gap": 0.05951825653193382, "calib/step_q_w": 0.4875828460038986, "calib/step_q_w_n": 513.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 539.546875, "completions/mean_terminated_length": 539.546875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.08426666666666667, "grad_norm": 0.7947566509246826, "kl": 0.7817840576171875, "learning_rate": 3.3611111111111117e-06, "loss": 0.0028, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031161334365606308, "mask/share_reasoning": 0.8599764108657837, "mask/share_step_conf": 0.10886222869157791, "num_tokens": 18757350.0, "reward": 0.9420266151428223, "reward_std": 0.17382916808128357, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7247257828712463, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8374522924423218, "step": 79 }, { "adv/mean_abs_final_conf": 0.6251348853111267, "adv/mean_abs_reasoning": 0.5489984750747681, "adv/mean_abs_step_conf": 0.7228517532348633, "adv/ratio_final_to_reasoning": 1.1386823710684981, "adv/ratio_step_to_reasoning": 1.316673517419913, "adv/std_final_conf": 0.8623039126396179, "adv/std_reasoning": 0.8097235560417175, "adv/std_step_conf": 0.9342271685600281, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6589888405008165, "calib/avg_num_step_conf": 5.64453125, "calib/ece": 0.29584313725490197, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9098039215686274, "calib/gap": 0.12011771910724, "calib/mean_conf": 0.9419607843137255, "calib/mu_c": 0.9834131736526945, "calib/mu_w": 0.8632954545454545, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2914509803921569, "calib/std_conf": 0.18354269266510556, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.585968253968254, "calib/step_q_c_n": 945.0, "calib/step_q_gap": 0.0781834539682541, "calib/step_q_w": 0.5077847999999999, "calib/step_q_w_n": 500.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1930.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 490.2109375, "completions/mean_terminated_length": 492.13336181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.08533333333333333, "grad_norm": 0.047408103942871094, "kl": 0.050380706787109375, "learning_rate": 3.3333333333333333e-06, "loss": -0.0121, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.032809220254421234, "mask/share_reasoning": 0.8404864072799683, "mask/share_step_conf": 0.12279807031154633, "num_tokens": 18985004.0, "reward": 0.9354228973388672, "reward_std": 0.2329998016357422, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.705510139465332, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8364294767379761, "step": 80 }, { "adv/mean_abs_final_conf": 0.5626240968704224, "adv/mean_abs_reasoning": 0.41431060433387756, "adv/mean_abs_step_conf": 0.7470904588699341, "adv/ratio_final_to_reasoning": 1.357976578405472, "adv/ratio_step_to_reasoning": 1.803213461241464, "adv/std_final_conf": 0.7812780141830444, "adv/std_reasoning": 0.7013968825340271, "adv/std_step_conf": 0.9344127774238586, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6627988748241913, "calib/avg_num_step_conf": 5.5390625, "calib/ece": 0.2665274193548387, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8225806451612904, "calib/gap": 0.2256700421940927, "calib/mean_conf": 0.8794403225806452, "calib/mu_c": 0.9613367088607595, "calib/mu_w": 0.7356666666666668, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25443548387096776, "calib/std_conf": 0.26357985076429535, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6292937853107344, "calib/step_q_c_n": 708.0, "calib/step_q_gap": 0.18060364446566401, "calib/step_q_w": 0.4486901408450704, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2946.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 529.61328125, "completions/mean_terminated_length": 533.783447265625, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.0864, "grad_norm": 0.030594119802117348, "kl": 0.040287017822265625, "learning_rate": 3.3055555555555558e-06, "loss": 0.0861, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03291993588209152, "mask/share_reasoning": 0.8520771265029907, "mask/share_step_conf": 0.10719040036201477, "num_tokens": 19226833.0, "reward": 0.919786274433136, "reward_std": 0.21998167037963867, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7177573442459106, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8054088354110718, "step": 81 }, { "adv/mean_abs_final_conf": 0.6789511442184448, "adv/mean_abs_reasoning": 0.5512694120407104, "adv/mean_abs_step_conf": 0.7774230241775513, "adv/ratio_final_to_reasoning": 1.2316140336991983, "adv/ratio_step_to_reasoning": 1.4102415392496686, "adv/std_final_conf": 0.8529064059257507, "adv/std_reasoning": 0.792765200138092, "adv/std_step_conf": 0.9345331788063049, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6438387096774194, "calib/avg_num_step_conf": 4.5390625, "calib/ece": 0.31152941176470605, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8235294117647058, "calib/gap": 0.1329129032258064, "calib/mean_conf": 0.9054901960784314, "calib/mu_c": 0.9576129032258065, "calib/mu_w": 0.8247000000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3045882352941178, "calib/std_conf": 0.23330795709760863, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6615191740412979, "calib/step_q_c_n": 678.0, "calib/step_q_gap": 0.13408115751237237, "calib/step_q_w": 0.5274380165289255, "calib/step_q_w_n": 484.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 452.515625, "completions/mean_terminated_length": 452.515625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.08746666666666666, "grad_norm": 0.04082036763429642, "kl": 0.054443359375, "learning_rate": 3.277777777777778e-06, "loss": 0.054, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.036828212440013885, "mask/share_reasoning": 0.8555018901824951, "mask/share_step_conf": 0.1076698899269104, "num_tokens": 19448229.0, "reward": 0.9207143783569336, "reward_std": 0.23406952619552612, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6793046593666077, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8418115377426147, "step": 82 }, { "adv/mean_abs_final_conf": 0.5660687685012817, "adv/mean_abs_reasoning": 0.35447534918785095, "adv/mean_abs_step_conf": 0.7589709162712097, "adv/ratio_final_to_reasoning": 1.5969199827243807, "adv/ratio_step_to_reasoning": 2.141110568083537, "adv/std_final_conf": 0.7831708192825317, "adv/std_reasoning": 0.6612017154693604, "adv/std_step_conf": 0.934005618095398, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7029090430433066, "calib/avg_num_step_conf": 4.85546875, "calib/ece": 0.3333734939759037, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8152610441767069, "calib/gap": 0.17196261682242975, "calib/mean_conf": 0.8761044176706827, "calib/mu_c": 0.9499999999999998, "calib/mu_w": 0.7780373831775701, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3195983935742973, "calib/std_conf": 0.2749218310599134, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.631921921921922, "calib/step_q_c_n": 666.0, "calib/step_q_gap": 0.12538448691325654, "calib/step_q_w": 0.5065374350086654, "calib/step_q_w_n": 577.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 566.58203125, "completions/mean_terminated_length": 566.58203125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.08853333333333334, "grad_norm": 0.049747027456760406, "kl": 0.040142059326171875, "learning_rate": 3.2500000000000002e-06, "loss": 0.0908, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.031713858246803284, "mask/share_reasoning": 0.8682199716567993, "mask/share_step_conf": 0.10006619244813919, "num_tokens": 19700538.0, "reward": 0.8887869119644165, "reward_std": 0.1813461184501648, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6517887115478516, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8203163743019104, "step": 83 }, { "adv/mean_abs_final_conf": 0.645351767539978, "adv/mean_abs_reasoning": 0.5118788480758667, "adv/mean_abs_step_conf": 0.7618239521980286, "adv/ratio_final_to_reasoning": 1.2607509959941323, "adv/ratio_step_to_reasoning": 1.48828957293644, "adv/std_final_conf": 0.8241501450538635, "adv/std_reasoning": 0.7575427293777466, "adv/std_step_conf": 0.9342594146728516, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7364296636085628, "calib/avg_num_step_conf": 4.69140625, "calib/ece": 0.30798418972332026, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7628458498023716, "calib/gap": 0.20664946483180446, "calib/mean_conf": 0.8382608695652174, "calib/mu_c": 0.9272916666666668, "calib/mu_w": 0.7206422018348624, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28853754940711474, "calib/std_conf": 0.3089945110133803, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6382036775106082, "calib/step_q_c_n": 707.0, "calib/step_q_gap": 0.1228996289276123, "calib/step_q_w": 0.5153040485829959, "calib/step_q_w_n": 494.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2183.0, "completions/max_terminated_length": 2183.0, "completions/mean_length": 471.50390625, "completions/mean_terminated_length": 473.35296630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.0896, "grad_norm": 0.04613294452428818, "kl": 0.047542572021484375, "learning_rate": 3.2222222222222227e-06, "loss": 0.0052, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03553298860788345, "mask/share_reasoning": 0.850753128528595, "mask/share_step_conf": 0.10980760306119919, "num_tokens": 19927163.0, "reward": 0.9026806950569153, "reward_std": 0.21408578753471375, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6801788806915283, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8150261044502258, "step": 84 }, { "adv/mean_abs_final_conf": 0.6471362709999084, "adv/mean_abs_reasoning": 0.4608671963214874, "adv/mean_abs_step_conf": 0.7554687261581421, "adv/ratio_final_to_reasoning": 1.4041708244048794, "adv/ratio_step_to_reasoning": 1.639233020245488, "adv/std_final_conf": 0.8407363891601562, "adv/std_reasoning": 0.7206194400787354, "adv/std_step_conf": 0.9350539445877075, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7098214285714285, "calib/avg_num_step_conf": 4.59765625, "calib/ece": 0.27793522267206483, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7246963562753036, "calib/gap": 0.2943055555555555, "calib/mean_conf": 0.8121052631578948, "calib/mu_c": 0.9455555555555555, "calib/mu_w": 0.65125, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.27174089068825913, "calib/std_conf": 0.32700026162971557, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.6017341977309562, "calib/step_q_c_n": 617.0, "calib/step_q_gap": 0.08634134058809906, "calib/step_q_w": 0.5153928571428571, "calib/step_q_w_n": 560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 508.65625, "completions/mean_terminated_length": 512.6614379882812, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.09066666666666667, "grad_norm": 0.045355018228292465, "kl": 0.045810699462890625, "learning_rate": 3.1944444444444443e-06, "loss": -0.0691, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033476561307907104, "mask/share_reasoning": 0.8550655841827393, "mask/share_step_conf": 0.10364531725645065, "num_tokens": 20165203.0, "reward": 0.8950651288032532, "reward_std": 0.22769135236740112, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6913609504699707, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8011130094528198, "step": 85 }, { "adv/mean_abs_final_conf": 0.6400004625320435, "adv/mean_abs_reasoning": 0.39856961369514465, "adv/mean_abs_step_conf": 0.7425464391708374, "adv/ratio_final_to_reasoning": 1.6057432391761879, "adv/ratio_step_to_reasoning": 1.8630282230666775, "adv/std_final_conf": 0.8474387526512146, "adv/std_reasoning": 0.7012442946434021, "adv/std_step_conf": 0.9346654415130615, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7519546027742748, "calib/avg_num_step_conf": 4.81640625, "calib/ece": 0.2580158730158731, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6388888888888888, "calib/gap": 0.32935687263556124, "calib/mean_conf": 0.7628571428571429, "calib/mu_c": 0.9223076923076924, "calib/mu_w": 0.5929508196721311, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25250000000000006, "calib/std_conf": 0.341313655363902, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5728436018957346, "calib/step_q_c_n": 633.0, "calib/step_q_gap": 0.13711026856240127, "calib/step_q_w": 0.4357333333333333, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2174.0, "completions/max_terminated_length": 2174.0, "completions/mean_length": 499.2265625, "completions/mean_terminated_length": 501.1843566894531, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.09173333333333333, "grad_norm": 0.06268741190433502, "kl": 0.04718017578125, "learning_rate": 3.1666666666666667e-06, "loss": -0.0189, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03539302200078964, "mask/share_reasoning": 0.8536935448646545, "mask/share_step_conf": 0.10700717568397522, "num_tokens": 20398517.0, "reward": 0.9265288710594177, "reward_std": 0.18060728907585144, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7257484197616577, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8288718461990356, "step": 86 }, { "adv/mean_abs_final_conf": 0.6161288022994995, "adv/mean_abs_reasoning": 0.424224853515625, "adv/mean_abs_step_conf": 0.7731361389160156, "adv/ratio_final_to_reasoning": 1.4523637575534134, "adv/ratio_step_to_reasoning": 1.8224678080713619, "adv/std_final_conf": 0.8241699934005737, "adv/std_reasoning": 0.7013879418373108, "adv/std_step_conf": 0.9341001510620117, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6289682539682541, "calib/avg_num_step_conf": 4.453125, "calib/ece": 0.201771653543307, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7322834645669292, "calib/gap": 0.19870115995115967, "calib/mean_conf": 0.8480708661417322, "calib/mu_c": 0.9043956043956042, "calib/mu_w": 0.7056944444444445, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16665354330708657, "calib/std_conf": 0.28637535747981313, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5853639846743296, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.1364844328536013, "calib/step_q_w": 0.4488795518207283, "calib/step_q_w_n": 357.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 442.0234375, "completions/mean_terminated_length": 442.0234375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.0928, "grad_norm": 0.03550032898783684, "kl": 0.049495697021484375, "learning_rate": 3.138888888888889e-06, "loss": 0.1331, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03907443583011627, "mask/share_reasoning": 0.8512502312660217, "mask/share_step_conf": 0.1096753180027008, "num_tokens": 20617171.0, "reward": 0.9840619564056396, "reward_std": 0.17071446776390076, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7719812393188477, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8562988638877869, "step": 87 }, { "adv/mean_abs_final_conf": 0.6632359623908997, "adv/mean_abs_reasoning": 0.46517929434776306, "adv/mean_abs_step_conf": 0.7715968489646912, "adv/ratio_final_to_reasoning": 1.4257641525529112, "adv/ratio_step_to_reasoning": 1.6587084987231904, "adv/std_final_conf": 0.8690577149391174, "adv/std_reasoning": 0.7205847501754761, "adv/std_step_conf": 0.9343187212944031, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7721543659043659, "calib/avg_num_step_conf": 4.65625, "calib/ece": 0.1628571428571429, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5793650793650794, "calib/gap": 0.38315228690228686, "calib/mean_conf": 0.718968253968254, "calib/mu_c": 0.8770945945945946, "calib/mu_w": 0.49394230769230774, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14726190476190482, "calib/std_conf": 0.3641522648674075, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5372624113475177, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.14943900272328775, "calib/step_q_w": 0.38782340862423, "calib/step_q_w_n": 487.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 481.125, "completions/mean_terminated_length": 483.01177978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.09386666666666667, "grad_norm": 0.03591761738061905, "kl": 0.05088043212890625, "learning_rate": 3.1111111111111116e-06, "loss": 0.0273, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033284686505794525, "mask/share_reasoning": 0.8623548746109009, "mask/share_step_conf": 0.10045421123504639, "num_tokens": 20850187.0, "reward": 0.9585222005844116, "reward_std": 0.19591569900512695, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7771499752998352, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8273943662643433, "step": 88 }, { "adv/mean_abs_final_conf": 0.697031557559967, "adv/mean_abs_reasoning": 0.4544700086116791, "adv/mean_abs_step_conf": 0.7597097754478455, "adv/ratio_final_to_reasoning": 1.5337239957577578, "adv/ratio_step_to_reasoning": 1.6716389663833193, "adv/std_final_conf": 0.8651239275932312, "adv/std_reasoning": 0.7013769745826721, "adv/std_step_conf": 0.9346089959144592, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8085505403687223, "calib/avg_num_step_conf": 4.703125, "calib/ece": 0.13569721115537847, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.398406374501992, "calib/gap": 0.41586967577876677, "calib/mean_conf": 0.5728286852589641, "calib/mu_c": 0.7733076923076924, "calib/mu_w": 0.3574380165289256, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.0952988047808765, "calib/std_conf": 0.3946156112109845, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5142784380305603, "calib/step_q_c_n": 589.0, "calib/step_q_gap": 0.15806705591673914, "calib/step_q_w": 0.35621138211382114, "calib/step_q_w_n": 615.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2044.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 526.33203125, "completions/mean_terminated_length": 526.33203125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.09493333333333333, "grad_norm": 0.03845527768135071, "kl": 0.0560150146484375, "learning_rate": 3.0833333333333336e-06, "loss": -0.0427, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03411491960287094, "mask/share_reasoning": 0.8635823726654053, "mask/share_step_conf": 0.1023026555776596, "num_tokens": 21093816.0, "reward": 0.9638468027114868, "reward_std": 0.18518471717834473, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7836429476737976, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.846394419670105, "step": 89 }, { "adv/mean_abs_final_conf": 0.7023544311523438, "adv/mean_abs_reasoning": 0.39130640029907227, "adv/mean_abs_step_conf": 0.7519776821136475, "adv/ratio_final_to_reasoning": 1.7948963538943907, "adv/ratio_step_to_reasoning": 1.9217106634057535, "adv/std_final_conf": 0.8781015872955322, "adv/std_reasoning": 0.6613117456436157, "adv/std_step_conf": 0.9345213174819946, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7159850034083163, "calib/avg_num_step_conf": 5.1484375, "calib/ece": 0.21081686429512514, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5019762845849802, "calib/gap": 0.31590456714383097, "calib/mean_conf": 0.6464163372859025, "calib/mu_c": 0.7587934560327199, "calib/mu_w": 0.4428888888888889, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10648221343873517, "calib/std_conf": 0.39126191196681814, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4267635933806147, "calib/step_q_c_n": 846.0, "calib/step_q_gap": 0.05034410185519095, "calib/step_q_w": 0.37641949152542376, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 491.67578125, "completions/mean_terminated_length": 491.67578125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.096, "grad_norm": 0.04405174404382706, "kl": 0.0619354248046875, "learning_rate": 3.055555555555556e-06, "loss": 0.016, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03433162346482277, "mask/share_reasoning": 0.8532933592796326, "mask/share_step_conf": 0.11237501353025436, "num_tokens": 21323005.0, "reward": 0.9484930634498596, "reward_std": 0.16438844799995422, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7458431720733643, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8277053833007812, "step": 90 }, { "adv/mean_abs_final_conf": 0.7540473937988281, "adv/mean_abs_reasoning": 0.4702809453010559, "adv/mean_abs_step_conf": 0.7511721849441528, "adv/ratio_final_to_reasoning": 1.6033977164779998, "adv/ratio_step_to_reasoning": 1.5972839053967647, "adv/std_final_conf": 0.926612377166748, "adv/std_reasoning": 0.7392401695251465, "adv/std_step_conf": 0.9344815015792847, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7784903280067283, "calib/avg_num_step_conf": 4.0859375, "calib/ece": 0.15047808764940235, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.36254980079681276, "calib/gap": 0.38301233529576684, "calib/mean_conf": 0.5956573705179283, "calib/mu_c": 0.7284146341463416, "calib/mu_w": 0.3454022988505747, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.04637450199203186, "calib/std_conf": 0.371918502692295, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4663047001620746, "calib/step_q_c_n": 617.0, "calib/step_q_gap": 0.1303373341947086, "calib/step_q_w": 0.335967365967366, "calib/step_q_w_n": 429.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2184.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 485.08203125, "completions/mean_terminated_length": 485.08203125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.09706666666666666, "grad_norm": 0.09253852069377899, "kl": 0.06554412841796875, "learning_rate": 3.0277777777777776e-06, "loss": -0.0087, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03286009281873703, "mask/share_reasoning": 0.8777602910995483, "mask/share_step_conf": 0.08937962353229523, "num_tokens": 21554898.0, "reward": 0.9783428907394409, "reward_std": 0.17696067690849304, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7896254062652588, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8428416848182678, "step": 91 }, { "adv/mean_abs_final_conf": 0.6797538995742798, "adv/mean_abs_reasoning": 0.4213108718395233, "adv/mean_abs_step_conf": 0.7387102246284485, "adv/ratio_final_to_reasoning": 1.6134259640781285, "adv/ratio_step_to_reasoning": 1.7533614107870021, "adv/std_final_conf": 0.8790198564529419, "adv/std_reasoning": 0.7013704180717468, "adv/std_step_conf": 0.9345918297767639, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7346223201763173, "calib/avg_num_step_conf": 4.390625, "calib/ece": 0.21255905511811024, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.48031496062992124, "calib/gap": 0.3103419488412476, "calib/mean_conf": 0.6553149606299212, "calib/mu_c": 0.7689440993788821, "calib/mu_w": 0.4586021505376345, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11700787401574807, "calib/std_conf": 0.39278595482986994, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45211656441717785, "calib/step_q_c_n": 652.0, "calib/step_q_gap": 0.11465893729853377, "calib/step_q_w": 0.3374576271186441, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1998.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 456.37109375, "completions/mean_terminated_length": 456.37109375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.09813333333333334, "grad_norm": 0.040576785802841187, "kl": 0.06185150146484375, "learning_rate": 3e-06, "loss": 0.0198, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03687359765172005, "mask/share_reasoning": 0.8611937761306763, "mask/share_step_conf": 0.1019326001405716, "num_tokens": 21778449.0, "reward": 0.9587187767028809, "reward_std": 0.1689954549074173, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7444359064102173, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8503453135490417, "step": 92 }, { "adv/mean_abs_final_conf": 0.7539007067680359, "adv/mean_abs_reasoning": 0.5408545732498169, "adv/mean_abs_step_conf": 0.7772372961044312, "adv/ratio_final_to_reasoning": 1.3939065028850455, "adv/ratio_step_to_reasoning": 1.4370541260920997, "adv/std_final_conf": 0.906922459602356, "adv/std_reasoning": 0.8098655939102173, "adv/std_step_conf": 0.9350221753120422, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6648159164518476, "calib/avg_num_step_conf": 4.890625, "calib/ece": 0.2301626016260163, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.35772357723577236, "calib/gap": 0.23170731707317072, "calib/mean_conf": 0.5529268292682926, "calib/mu_c": 0.6687804878048781, "calib/mu_w": 0.43707317073170737, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.1415447154471545, "calib/std_conf": 0.38613728488210686, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.4331559633027523, "calib/step_q_c_n": 545.0, "calib/step_q_gap": 0.09087873557998, "calib/step_q_w": 0.3422772277227723, "calib/step_q_w_n": 707.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2665.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 507.9765625, "completions/mean_terminated_length": 507.9765625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.0992, "grad_norm": 0.035629406571388245, "kl": 0.06622314453125, "learning_rate": 2.9722222222222225e-06, "loss": -0.0348, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03448627144098282, "mask/share_reasoning": 0.8541302680969238, "mask/share_step_conf": 0.11138348281383514, "num_tokens": 22014267.0, "reward": 0.8872026205062866, "reward_std": 0.2100294828414917, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6782886981964111, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8093979358673096, "step": 93 }, { "adv/mean_abs_final_conf": 0.7159183025360107, "adv/mean_abs_reasoning": 0.5076972842216492, "adv/mean_abs_step_conf": 0.7585917115211487, "adv/ratio_final_to_reasoning": 1.4101282886190445, "adv/ratio_step_to_reasoning": 1.4941811490761583, "adv/std_final_conf": 0.9018604755401611, "adv/std_reasoning": 0.7752719521522522, "adv/std_step_conf": 0.9340033531188965, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7935536119209587, "calib/avg_num_step_conf": 4.51953125, "calib/ece": 0.18234126984126986, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3253968253968254, "calib/gap": 0.39174149659863944, "calib/mean_conf": 0.4933730158730159, "calib/mu_c": 0.6565986394557823, "calib/mu_w": 0.2648571428571429, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0461904761904762, "calib/std_conf": 0.39308321817996, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4559424012158054, "calib/step_q_c_n": 658.0, "calib/step_q_gap": 0.16514079800939258, "calib/step_q_w": 0.29080160320641285, "calib/step_q_w_n": 499.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2215.0, "completions/max_terminated_length": 2215.0, "completions/mean_length": 464.46484375, "completions/mean_terminated_length": 466.28631591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.10026666666666667, "grad_norm": 0.04146898537874222, "kl": 0.07958221435546875, "learning_rate": 2.944444444444445e-06, "loss": -0.0142, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03604136407375336, "mask/share_reasoning": 0.8550038933753967, "mask/share_step_conf": 0.10504850745201111, "num_tokens": 22241850.0, "reward": 0.96682208776474, "reward_std": 0.14535515010356903, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7686004042625427, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8541063070297241, "step": 94 }, { "adv/mean_abs_final_conf": 0.6310229301452637, "adv/mean_abs_reasoning": 0.401436448097229, "adv/mean_abs_step_conf": 0.7615891695022583, "adv/ratio_final_to_reasoning": 1.5719123989270356, "adv/ratio_step_to_reasoning": 1.8971599940964985, "adv/std_final_conf": 0.8184422254562378, "adv/std_reasoning": 0.6817244291305542, "adv/std_step_conf": 0.9347649812698364, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7642728256946951, "calib/avg_num_step_conf": 4.95703125, "calib/ece": 0.20983870967741944, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.49193548387096775, "calib/gap": 0.349446409238542, "calib/mean_conf": 0.6425, "calib/mu_c": 0.7622699386503068, "calib/mu_w": 0.4128235294117648, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.09754032258064524, "calib/std_conf": 0.3960309636803016, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.455609756097561, "calib/step_q_c_n": 820.0, "calib/step_q_gap": 0.12491933293497748, "calib/step_q_w": 0.3306904231625835, "calib/step_q_w_n": 449.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2478.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 499.47265625, "completions/mean_terminated_length": 503.405517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.10133333333333333, "grad_norm": 0.03742410987615585, "kl": 0.05938720703125, "learning_rate": 2.916666666666667e-06, "loss": 0.011, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03435906395316124, "mask/share_reasoning": 0.8466284871101379, "mask/share_step_conf": 0.11119996011257172, "num_tokens": 22475843.0, "reward": 0.952112078666687, "reward_std": 0.18023662269115448, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.750076949596405, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8346158862113953, "step": 95 }, { "adv/mean_abs_final_conf": 0.6180611848831177, "adv/mean_abs_reasoning": 0.3653485178947449, "adv/mean_abs_step_conf": 0.7559719681739807, "adv/ratio_final_to_reasoning": 1.6917030030519464, "adv/ratio_step_to_reasoning": 2.0691803336992667, "adv/std_final_conf": 0.8387512564659119, "adv/std_reasoning": 0.6612295508384705, "adv/std_step_conf": 0.9340872168540955, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8002291543465405, "calib/avg_num_step_conf": 4.27734375, "calib/ece": 0.1503543307086615, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5669291338582677, "calib/gap": 0.41226197516262564, "calib/mean_conf": 0.6899606299212598, "calib/mu_c": 0.813314606741573, "calib/mu_w": 0.4010526315789473, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.06976377952755913, "calib/std_conf": 0.3873303520247324, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4998544793087767, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.141871053894412, "calib/step_q_w": 0.3579834254143647, "calib/step_q_w_n": 362.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 422.55859375, "completions/mean_terminated_length": 422.55859375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.1024, "grad_norm": 0.06945142149925232, "kl": 0.07413482666015625, "learning_rate": 2.888888888888889e-06, "loss": 0.0411, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03754666820168495, "mask/share_reasoning": 0.858115553855896, "mask/share_step_conf": 0.10433775186538696, "num_tokens": 22689834.0, "reward": 0.9810190200805664, "reward_std": 0.16697180271148682, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.8019851446151733, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8248965740203857, "step": 96 }, { "adv/mean_abs_final_conf": 0.6564252972602844, "adv/mean_abs_reasoning": 0.4691106677055359, "adv/mean_abs_step_conf": 0.7472108602523804, "adv/ratio_final_to_reasoning": 1.3992973139385678, "adv/ratio_step_to_reasoning": 1.5928242772799401, "adv/std_final_conf": 0.8560405373573303, "adv/std_reasoning": 0.7207762598991394, "adv/std_step_conf": 0.93504798412323, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7486601307189543, "calib/avg_num_step_conf": 4.359375, "calib/ece": 0.1561904761904762, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5158730158730159, "calib/gap": 0.3863372549019608, "calib/mean_conf": 0.6734920634920636, "calib/mu_c": 0.8298666666666666, "calib/mu_w": 0.44352941176470584, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11722222222222226, "calib/std_conf": 0.37891934364008245, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5090322580645161, "calib/step_q_c_n": 651.0, "calib/step_q_gap": 0.169741935483871, "calib/step_q_w": 0.3392903225806451, "calib/step_q_w_n": 465.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 466.03515625, "completions/mean_terminated_length": 466.03515625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.10346666666666667, "grad_norm": 0.03814903274178505, "kl": 0.067413330078125, "learning_rate": 2.861111111111111e-06, "loss": 0.0792, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.036202527582645416, "mask/share_reasoning": 0.8588676452636719, "mask/share_step_conf": 0.10492978990077972, "num_tokens": 22914211.0, "reward": 0.9657089710235596, "reward_std": 0.19180062413215637, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7771941423416138, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8417237401008606, "step": 97 }, { "adv/mean_abs_final_conf": 0.6895825862884521, "adv/mean_abs_reasoning": 0.4966968595981598, "adv/mean_abs_step_conf": 0.7532524466514587, "adv/ratio_final_to_reasoning": 1.3883369160947419, "adv/ratio_step_to_reasoning": 1.516523473212331, "adv/std_final_conf": 0.8690649271011353, "adv/std_reasoning": 0.7576682567596436, "adv/std_step_conf": 0.9344282746315002, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7004211080405316, "calib/avg_num_step_conf": 3.98046875, "calib/ece": 0.252788844621514, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6254980079681275, "calib/gap": 0.2498420844848006, "calib/mean_conf": 0.7424302788844622, "calib/mu_c": 0.8439597315436241, "calib/mu_w": 0.5941176470588235, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.20079681274900407, "calib/std_conf": 0.3647987904168045, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5504595588235294, "calib/step_q_c_n": 544.0, "calib/step_q_gap": 0.1363332430340557, "calib/step_q_w": 0.4141263157894737, "calib/step_q_w_n": 475.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2112.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 484.59765625, "completions/mean_terminated_length": 484.59765625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.10453333333333334, "grad_norm": 0.1584102064371109, "kl": 0.27259063720703125, "learning_rate": 2.8333333333333335e-06, "loss": -0.0204, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.037299685180187225, "mask/share_reasoning": 0.8697381615638733, "mask/share_step_conf": 0.09296215325593948, "num_tokens": 23144452.0, "reward": 0.9168623685836792, "reward_std": 0.21771375834941864, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7018964886665344, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8216720223426819, "step": 98 }, { "adv/mean_abs_final_conf": 0.7243151664733887, "adv/mean_abs_reasoning": 0.5069431066513062, "adv/mean_abs_step_conf": 0.7548806071281433, "adv/ratio_final_to_reasoning": 1.4287898522932256, "adv/ratio_step_to_reasoning": 1.4890834833806657, "adv/std_final_conf": 0.8931503295898438, "adv/std_reasoning": 0.7577471137046814, "adv/std_step_conf": 0.9351648092269897, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7371172516803585, "calib/avg_num_step_conf": 4.03125, "calib/ece": 0.1979268292682927, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.3821138211382114, "calib/gap": 0.3625969176454613, "calib/mean_conf": 0.5547560975609757, "calib/mu_c": 0.7655339805825243, "calib/mu_w": 0.402937062937063, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.1669918699186992, "calib/std_conf": 0.40141789026576946, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5085817307692307, "calib/step_q_c_n": 416.0, "calib/step_q_gap": 0.12142263986013985, "calib/step_q_w": 0.3871590909090909, "calib/step_q_w_n": 616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 544.375, "completions/mean_terminated_length": 546.5098266601562, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.1056, "grad_norm": 0.04278083145618439, "kl": 0.06156158447265625, "learning_rate": 2.805555555555556e-06, "loss": -0.0756, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03205645829439163, "mask/share_reasoning": 0.8791579604148865, "mask/share_step_conf": 0.08487935364246368, "num_tokens": 23389612.0, "reward": 0.89930260181427, "reward_std": 0.2116352617740631, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.715812087059021, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.812480628490448, "step": 99 }, { "adv/mean_abs_final_conf": 0.7242751717567444, "adv/mean_abs_reasoning": 0.4599572420120239, "adv/mean_abs_step_conf": 0.7632794380187988, "adv/ratio_final_to_reasoning": 1.574657610756373, "adv/ratio_step_to_reasoning": 1.659457376255086, "adv/std_final_conf": 0.9085102677345276, "adv/std_reasoning": 0.7393200397491455, "adv/std_step_conf": 0.934822142124176, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8057424396873938, "calib/avg_num_step_conf": 4.38671875, "calib/ece": 0.192377049180328, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5245901639344263, "calib/gap": 0.4464532789670404, "calib/mean_conf": 0.661967213114754, "calib/mu_c": 0.8614074074074074, "calib/mu_w": 0.414954128440367, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.15053278688524602, "calib/std_conf": 0.404920836918677, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5353023255813953, "calib/step_q_c_n": 645.0, "calib/step_q_gap": 0.1520178067529434, "calib/step_q_w": 0.38328451882845194, "calib/step_q_w_n": 478.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2316.0, "completions/max_terminated_length": 2316.0, "completions/mean_length": 561.5078125, "completions/mean_terminated_length": 561.5078125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.10666666666666667, "grad_norm": 0.033582814037799835, "kl": 0.05889892578125, "learning_rate": 2.7777777777777783e-06, "loss": 0.0711, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03251664340496063, "mask/share_reasoning": 0.8713377714157104, "mask/share_step_conf": 0.09614555537700653, "num_tokens": 23640766.0, "reward": 0.9381765127182007, "reward_std": 0.2331477701663971, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7564589977264404, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8238001465797424, "step": 100 }, { "adv/mean_abs_final_conf": 0.7356261014938354, "adv/mean_abs_reasoning": 0.5457586050033569, "adv/mean_abs_step_conf": 0.7550551295280457, "adv/ratio_final_to_reasoning": 1.3478964779479943, "adv/ratio_step_to_reasoning": 1.383496517702, "adv/std_final_conf": 0.9189450740814209, "adv/std_reasoning": 0.7578188180923462, "adv/std_step_conf": 0.9353813529014587, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6785857104772062, "calib/avg_num_step_conf": 4.453125, "calib/ece": 0.25179591836734694, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4, "calib/gap": 0.222920554518795, "calib/mean_conf": 0.5887755102040816, "calib/mu_c": 0.6988709677419355, "calib/mu_w": 0.47595041322314047, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.16722448979591834, "calib/std_conf": 0.3875734698122289, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.4499836333878886, "calib/step_q_c_n": 611.0, "calib/step_q_gap": 0.013632026582595602, "calib/step_q_w": 0.436351606805293, "calib/step_q_w_n": 529.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 535.72265625, "completions/mean_terminated_length": 535.72265625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.10773333333333333, "grad_norm": 0.058739546686410904, "kl": 0.0675811767578125, "learning_rate": 2.7500000000000004e-06, "loss": -0.0847, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03145679086446762, "mask/share_reasoning": 0.8738486766815186, "mask/share_step_conf": 0.09469453990459442, "num_tokens": 23884903.0, "reward": 0.8747799396514893, "reward_std": 0.2343277633190155, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6691246032714844, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.793716549873352, "step": 101 }, { "adv/mean_abs_final_conf": 0.6112939119338989, "adv/mean_abs_reasoning": 0.3416619896888733, "adv/mean_abs_step_conf": 0.7474625706672668, "adv/ratio_final_to_reasoning": 1.7891774045177218, "adv/ratio_step_to_reasoning": 2.187725275931123, "adv/std_final_conf": 0.8120249509811401, "adv/std_reasoning": 0.6186192035675049, "adv/std_step_conf": 0.9343135356903076, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8078691875319367, "calib/avg_num_step_conf": 4.4140625, "calib/ece": 0.16984313725490194, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5843137254901961, "calib/gap": 0.3900453500255494, "calib/mean_conf": 0.7272549019607844, "calib/mu_c": 0.8848026315789475, "calib/mu_w": 0.49475728155339804, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1505098039215686, "calib/std_conf": 0.36354614255976975, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.582176, "calib/step_q_c_n": 625.0, "calib/step_q_gap": 0.13015619801980205, "calib/step_q_w": 0.452019801980198, "calib/step_q_w_n": 505.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1918.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 444.75390625, "completions/mean_terminated_length": 444.75390625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.1088, "grad_norm": 0.03678474575281143, "kl": 0.07524871826171875, "learning_rate": 2.7222222222222224e-06, "loss": 0.0463, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03831843286752701, "mask/share_reasoning": 0.8526753187179565, "mask/share_step_conf": 0.10900621861219406, "num_tokens": 24105456.0, "reward": 0.9682535529136658, "reward_std": 0.16872161626815796, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7835359573364258, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8381274342536926, "step": 102 }, { "adv/mean_abs_final_conf": 0.6591412425041199, "adv/mean_abs_reasoning": 0.47897109389305115, "adv/mean_abs_step_conf": 0.7331361174583435, "adv/ratio_final_to_reasoning": 1.376160797401479, "adv/ratio_step_to_reasoning": 1.5306479384788187, "adv/std_final_conf": 0.8299095034599304, "adv/std_reasoning": 0.7393709421157837, "adv/std_step_conf": 0.9341103434562683, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8233881578947366, "calib/avg_num_step_conf": 4.4375, "calib/ece": 0.18261507936507945, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5753968253968254, "calib/gap": 0.37095657894736844, "calib/mean_conf": 0.7305515873015873, "calib/mu_c": 0.8777565789473685, "calib/mu_w": 0.5068, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.15499603174603183, "calib/std_conf": 0.3600417983305542, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.578041074249605, "calib/step_q_c_n": 633.0, "calib/step_q_gap": 0.1569675156014937, "calib/step_q_w": 0.42107355864811136, "calib/step_q_w_n": 503.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 557.08203125, "completions/mean_terminated_length": 557.08203125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.10986666666666667, "grad_norm": 0.039522334933280945, "kl": 0.06630706787109375, "learning_rate": 2.6944444444444444e-06, "loss": -0.0137, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032912421971559525, "mask/share_reasoning": 0.8734282851219177, "mask/share_step_conf": 0.09365926682949066, "num_tokens": 24352621.0, "reward": 0.9609041213989258, "reward_std": 0.20713165402412415, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7670042514801025, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8430851101875305, "step": 103 }, { "adv/mean_abs_final_conf": 0.6705017685890198, "adv/mean_abs_reasoning": 0.47117331624031067, "adv/mean_abs_step_conf": 0.7483392953872681, "adv/ratio_final_to_reasoning": 1.4230469881852275, "adv/ratio_step_to_reasoning": 1.588246340770273, "adv/std_final_conf": 0.8786208033561707, "adv/std_reasoning": 0.7391869425773621, "adv/std_step_conf": 0.9346818327903748, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7892615384615386, "calib/avg_num_step_conf": 4.65625, "calib/ece": 0.23494117647058824, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5333333333333333, "calib/gap": 0.3396707692307692, "calib/mean_conf": 0.6882745098039217, "calib/mu_c": 0.86144, "calib/mu_w": 0.5217692307692308, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21650980392156863, "calib/std_conf": 0.37431055987321926, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5891935483870967, "calib/step_q_c_n": 558.0, "calib/step_q_gap": 0.16376768087921023, "calib/step_q_w": 0.42542586750788647, "calib/step_q_w_n": 634.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 491.43359375, "completions/mean_terminated_length": 493.3608093261719, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.11093333333333333, "grad_norm": 0.04268510267138481, "kl": 0.07636260986328125, "learning_rate": 2.666666666666667e-06, "loss": -0.0347, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03305329009890556, "mask/share_reasoning": 0.8618713617324829, "mask/share_step_conf": 0.10116907954216003, "num_tokens": 24585108.0, "reward": 0.9279952049255371, "reward_std": 0.17701026797294617, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7343195676803589, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8263583183288574, "step": 104 }, { "adv/mean_abs_final_conf": 0.7147749662399292, "adv/mean_abs_reasoning": 0.562514066696167, "adv/mean_abs_step_conf": 0.7437756061553955, "adv/ratio_final_to_reasoning": 1.2706792746322617, "adv/ratio_step_to_reasoning": 1.3222346785456194, "adv/std_final_conf": 0.8912851214408875, "adv/std_reasoning": 0.7929922938346863, "adv/std_step_conf": 0.9355881214141846, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.719147005444646, "calib/avg_num_step_conf": 4.55859375, "calib/ece": 0.25951807228915647, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5502008032128514, "calib/gap": 0.28966554316826554, "calib/mean_conf": 0.6885140562248996, "calib/mu_c": 0.8234586466165414, "calib/mu_w": 0.5337931034482759, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2069477911646585, "calib/std_conf": 0.39121759370677045, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5546408839779005, "calib/step_q_c_n": 543.0, "calib/step_q_gap": 0.13257357628559274, "calib/step_q_w": 0.4220673076923077, "calib/step_q_w_n": 624.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 523.828125, "completions/mean_terminated_length": 525.8823852539062, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.112, "grad_norm": 0.028385218232870102, "kl": 0.07004547119140625, "learning_rate": 2.6388888888888893e-06, "loss": -0.0132, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03344562277197838, "mask/share_reasoning": 0.8649731278419495, "mask/share_step_conf": 0.09767502546310425, "num_tokens": 24824968.0, "reward": 0.9044659733772278, "reward_std": 0.2518778443336487, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6948882341384888, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8163872957229614, "step": 105 }, { "adv/mean_abs_final_conf": 0.6365981101989746, "adv/mean_abs_reasoning": 0.4709445834159851, "adv/mean_abs_step_conf": 0.7567065954208374, "adv/ratio_final_to_reasoning": 1.3517473873070704, "adv/ratio_step_to_reasoning": 1.606784793939203, "adv/std_final_conf": 0.8377167582511902, "adv/std_reasoning": 0.7574999928474426, "adv/std_step_conf": 0.9346597194671631, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7537356321839083, "calib/avg_num_step_conf": 4.48046875, "calib/ece": 0.3117928286852589, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7689243027888446, "calib/gap": 0.2719022988505747, "calib/mean_conf": 0.8416733067729083, "calib/mu_c": 0.9673333333333334, "calib/mu_w": 0.6954310344827587, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.307808764940239, "calib/std_conf": 0.3089456250663391, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6424213836477988, "calib/step_q_c_n": 636.0, "calib/step_q_gap": 0.1433998572290121, "calib/step_q_w": 0.49902152641878667, "calib/step_q_w_n": 511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2232.0, "completions/max_terminated_length": 2232.0, "completions/mean_length": 493.40234375, "completions/mean_terminated_length": 493.40234375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.11306666666666666, "grad_norm": 0.046452466398477554, "kl": 0.0757904052734375, "learning_rate": 2.6111111111111113e-06, "loss": -0.0227, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033236440271139145, "mask/share_reasoning": 0.8643213510513306, "mask/share_step_conf": 0.10244220495223999, "num_tokens": 25055863.0, "reward": 0.8876609206199646, "reward_std": 0.22679907083511353, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6773473024368286, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7987557649612427, "step": 106 }, { "adv/mean_abs_final_conf": 0.6299135684967041, "adv/mean_abs_reasoning": 0.5634945034980774, "adv/mean_abs_step_conf": 0.726954996585846, "adv/ratio_final_to_reasoning": 1.117869942983132, "adv/ratio_step_to_reasoning": 1.2900835626133598, "adv/std_final_conf": 0.8326076865196228, "adv/std_reasoning": 0.8264657258987427, "adv/std_step_conf": 0.9348495602607727, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6389862409138111, "calib/avg_num_step_conf": 4.92578125, "calib/ece": 0.3347410358565736, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8167330677290837, "calib/gap": 0.15670430944963665, "calib/mean_conf": 0.8908366533864542, "calib/mu_c": 0.9576388888888889, "calib/mu_w": 0.8009345794392523, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.32593625498007955, "calib/std_conf": 0.25053507919555956, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.6303727714748785, "calib/step_q_c_n": 617.0, "calib/step_q_gap": 0.15694109445624488, "calib/step_q_w": 0.4734316770186336, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 486.8671875, "completions/mean_terminated_length": 486.8671875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.11413333333333334, "grad_norm": 212991.3125, "kl": 659456.0815200806, "learning_rate": 2.5833333333333337e-06, "loss": 14848.5596, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034113090485334396, "mask/share_reasoning": 0.855882465839386, "mask/share_step_conf": 0.11000443249940872, "num_tokens": 25285117.0, "reward": 0.8787997961044312, "reward_std": 0.23736529052257538, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6400562524795532, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8120745420455933, "step": 107 }, { "adv/mean_abs_final_conf": 0.6507050395011902, "adv/mean_abs_reasoning": 0.5196617245674133, "adv/mean_abs_step_conf": 0.7388439178466797, "adv/ratio_final_to_reasoning": 1.2521704192142733, "adv/ratio_step_to_reasoning": 1.4217785973398793, "adv/std_final_conf": 0.8490867018699646, "adv/std_reasoning": 0.7753031849861145, "adv/std_step_conf": 0.9350075125694275, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6293950719822813, "calib/avg_num_step_conf": 4.96875, "calib/ece": 0.29661417322834643, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7992125984251969, "calib/gap": 0.09142995570321144, "calib/mean_conf": 0.8880314960629921, "calib/mu_c": 0.9189880952380953, "calib/mu_w": 0.8275581395348839, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.26161417322834646, "calib/std_conf": 0.2564599686957617, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.644311814859927, "calib/step_q_c_n": 821.0, "calib/step_q_gap": 0.0921388658577097, "calib/step_q_w": 0.5521729490022173, "calib/step_q_w_n": 451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1779.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 509.99609375, "completions/mean_terminated_length": 511.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 88.0, "epoch": 0.1152, "grad_norm": 0.03606853261590004, "kl": 0.08648681640625, "learning_rate": 2.5555555555555557e-06, "loss": 0.0144, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.033813901245594025, "mask/share_reasoning": 0.8542295694351196, "mask/share_step_conf": 0.10805031657218933, "num_tokens": 25518908.0, "reward": 0.9265960454940796, "reward_std": 0.22009024024009705, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.6905062794685364, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8329982757568359, "step": 108 }, { "adv/mean_abs_final_conf": 0.637611448764801, "adv/mean_abs_reasoning": 0.47797125577926636, "adv/mean_abs_step_conf": 0.7113043069839478, "adv/ratio_final_to_reasoning": 1.333995383729223, "adv/ratio_step_to_reasoning": 1.488173814603692, "adv/std_final_conf": 0.8487848043441772, "adv/std_reasoning": 0.7752171754837036, "adv/std_step_conf": 0.9350650906562805, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8020752016653656, "calib/avg_num_step_conf": 5.02734375, "calib/ece": 0.263991935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6451612903225806, "calib/gap": 0.41911657559198556, "calib/mean_conf": 0.7417338709677419, "calib/mu_c": 0.954672131147541, "calib/mu_w": 0.5355555555555555, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.25689516129032264, "calib/std_conf": 0.3758589862733562, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6456351791530945, "calib/step_q_c_n": 614.0, "calib/step_q_gap": 0.1917124451263486, "calib/step_q_w": 0.45392273402674593, "calib/step_q_w_n": 673.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2007.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 518.18359375, "completions/mean_terminated_length": 518.18359375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.11626666666666667, "grad_norm": 0.03923717141151428, "kl": 0.0962982177734375, "learning_rate": 2.5277777777777778e-06, "loss": 0.0092, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.031188976019620895, "mask/share_reasoning": 0.8646241426467896, "mask/share_step_conf": 0.10418689250946045, "num_tokens": 25756163.0, "reward": 0.925527811050415, "reward_std": 0.2139080911874771, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7322777509689331, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8297152519226074, "step": 109 }, { "adv/mean_abs_final_conf": 0.7360544204711914, "adv/mean_abs_reasoning": 0.5907687544822693, "adv/mean_abs_step_conf": 0.752666175365448, "adv/ratio_final_to_reasoning": 1.2459264558029068, "adv/ratio_step_to_reasoning": 1.274045334413565, "adv/std_final_conf": 0.9061128497123718, "adv/std_reasoning": 0.8266500234603882, "adv/std_step_conf": 0.9360246062278748, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6578307712521352, "calib/avg_num_step_conf": 3.92578125, "calib/ece": 0.35874493927125506, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7044534412955465, "calib/gap": 0.14679477072657987, "calib/mean_conf": 0.8360728744939271, "calib/mu_c": 0.906201550387597, "calib/mu_w": 0.7594067796610171, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3362753036437247, "calib/std_conf": 0.2872816240754243, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.6610315789473684, "calib/step_q_c_n": 475.0, "calib/step_q_gap": 0.10944667328699098, "calib/step_q_w": 0.5515849056603774, "calib/step_q_w_n": 530.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 460.0546875, "completions/mean_terminated_length": 460.0546875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.11733333333333333, "grad_norm": 0.032077256590127945, "kl": 0.1114044189453125, "learning_rate": 2.5e-06, "loss": -0.1208, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0361558198928833, "mask/share_reasoning": 0.867607057094574, "mask/share_step_conf": 0.09623715281486511, "num_tokens": 25978857.0, "reward": 0.816313624382019, "reward_std": 0.2857520282268524, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6051039099693298, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7368983030319214, "step": 110 }, { "adv/mean_abs_final_conf": 0.725448727607727, "adv/mean_abs_reasoning": 0.6362229585647583, "adv/mean_abs_step_conf": 0.7552950382232666, "adv/ratio_final_to_reasoning": 1.1402429255999362, "adv/ratio_step_to_reasoning": 1.1871546413966583, "adv/std_final_conf": 0.9021727442741394, "adv/std_reasoning": 0.8748306632041931, "adv/std_step_conf": 0.9352055191993713, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6703575782202357, "calib/avg_num_step_conf": 3.89453125, "calib/ece": 0.28967346938775507, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.7428571428571429, "calib/gap": 0.2398273059731817, "calib/mean_conf": 0.8206938775510204, "calib/mu_c": 0.9254347826086956, "calib/mu_w": 0.6856074766355139, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.2735510204081632, "calib/std_conf": 0.3240142478652111, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.6638683127572016, "calib/step_q_c_n": 486.0, "calib/step_q_gap": 0.13273328340299417, "calib/step_q_w": 0.5311350293542074, "calib/step_q_w_n": 511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2159.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 498.30078125, "completions/mean_terminated_length": 498.30078125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.1184, "grad_norm": 0.04491841420531273, "kl": 0.11196136474609375, "learning_rate": 2.4722222222222226e-06, "loss": -0.0422, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.0340171717107296, "mask/share_reasoning": 0.8737939596176147, "mask/share_step_conf": 0.09218887984752655, "num_tokens": 26213830.0, "reward": 0.8551008105278015, "reward_std": 0.31432783603668213, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6475656032562256, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7696672081947327, "step": 111 }, { "adv/mean_abs_final_conf": 0.7325797080993652, "adv/mean_abs_reasoning": 0.6655578017234802, "adv/mean_abs_step_conf": 0.7522399425506592, "adv/ratio_final_to_reasoning": 1.1007003542026401, "adv/ratio_step_to_reasoning": 1.1302398388279924, "adv/std_final_conf": 0.8928104639053345, "adv/std_reasoning": 0.874744176864624, "adv/std_step_conf": 0.9354557394981384, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7793906196508046, "calib/avg_num_step_conf": 3.77734375, "calib/ece": 0.2210330578512397, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.5206611570247934, "calib/gap": 0.3675172885997943, "calib/mean_conf": 0.6674793388429752, "calib/mu_c": 0.8421259842519683, "calib/mu_w": 0.47460869565217395, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.18185950413223143, "calib/std_conf": 0.39885042659655895, "calib/step_conf_rate": 0.9296875, "calib/step_q_c": 0.6290947368421054, "calib/step_q_c_n": 475.0, "calib/step_q_gap": 0.17698091570389407, "calib/step_q_w": 0.4521138211382113, "calib/step_q_w_n": 492.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 548.1953125, "completions/mean_terminated_length": 552.5117797851562, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.11946666666666667, "grad_norm": 0.02175419218838215, "kl": 0.1053009033203125, "learning_rate": 2.4444444444444447e-06, "loss": -0.085, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.029369987547397614, "mask/share_reasoning": 0.8855926990509033, "mask/share_step_conf": 0.07722484320402145, "num_tokens": 26462088.0, "reward": 0.8555378317832947, "reward_std": 0.2852292060852051, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.685128927230835, "rewards/format_reward_step": 0.9140625, "rewards/step_l2_reward": 0.7431342005729675, "step": 112 }, { "adv/mean_abs_final_conf": 0.7447386384010315, "adv/mean_abs_reasoning": 0.5864541530609131, "adv/mean_abs_step_conf": 0.7527590394020081, "adv/ratio_final_to_reasoning": 1.2699008686595112, "adv/ratio_step_to_reasoning": 1.2835769607446559, "adv/std_final_conf": 0.911855161190033, "adv/std_reasoning": 0.8265925049781799, "adv/std_step_conf": 0.9350982904434204, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.749966078697422, "calib/avg_num_step_conf": 4.12890625, "calib/ece": 0.23784153005464478, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5778688524590164, "calib/gap": 0.2594970601537766, "calib/mean_conf": 0.7608743169398906, "calib/mu_c": 0.8778606965174128, "calib/mu_w": 0.6183636363636362, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.2247677595628415, "calib/std_conf": 0.322982121031701, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.6289224393132031, "calib/step_q_c_n": 563.0, "calib/step_q_gap": 0.14236373485976184, "calib/step_q_w": 0.4865587044534413, "calib/step_q_w_n": 494.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2107.0, "completions/max_terminated_length": 2107.0, "completions/mean_length": 461.3515625, "completions/mean_terminated_length": 461.3515625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.12053333333333334, "grad_norm": 0.02832408808171749, "kl": 0.132415771484375, "learning_rate": 2.4166666666666667e-06, "loss": -0.0059, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03460315614938736, "mask/share_reasoning": 0.8678891062736511, "mask/share_step_conf": 0.0975077673792839, "num_tokens": 26685394.0, "reward": 0.8980043530464172, "reward_std": 0.2618659436702728, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6935716867446899, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8071244955062866, "step": 113 }, { "adv/mean_abs_final_conf": 0.6925190091133118, "adv/mean_abs_reasoning": 0.548272967338562, "adv/mean_abs_step_conf": 0.7386399507522583, "adv/ratio_final_to_reasoning": 1.263091653916391, "adv/ratio_step_to_reasoning": 1.3472120544950075, "adv/std_final_conf": 0.8970476984977722, "adv/std_reasoning": 0.7929205298423767, "adv/std_step_conf": 0.9356428384780884, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7543706293706295, "calib/avg_num_step_conf": 4.05859375, "calib/ece": 0.23186234817813772, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.6720647773279352, "calib/gap": 0.34546328671328674, "calib/mean_conf": 0.7889473684210526, "calib/mu_c": 0.9344055944055945, "calib/mu_w": 0.5889423076923077, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.22093117408906887, "calib/std_conf": 0.33682205152856126, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.6588998357963876, "calib/step_q_c_n": 609.0, "calib/step_q_gap": 0.14983006835452717, "calib/step_q_w": 0.5090697674418604, "calib/step_q_w_n": 430.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 474.8203125, "completions/mean_terminated_length": 474.8203125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.1216, "grad_norm": 0.03796344995498657, "kl": 0.1222076416015625, "learning_rate": 2.388888888888889e-06, "loss": -0.075, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.035458482801914215, "mask/share_reasoning": 0.8631207346916199, "mask/share_step_conf": 0.1014208048582077, "num_tokens": 26911972.0, "reward": 0.9028903245925903, "reward_std": 0.2743861675262451, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7213600873947144, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.7859828472137451, "step": 114 }, { "adv/mean_abs_final_conf": 0.777995228767395, "adv/mean_abs_reasoning": 0.5828838348388672, "adv/mean_abs_step_conf": 0.742296040058136, "adv/ratio_final_to_reasoning": 1.3347346113698015, "adv/ratio_step_to_reasoning": 1.2734888080458378, "adv/std_final_conf": 0.921405017375946, "adv/std_reasoning": 0.8100579977035522, "adv/std_step_conf": 0.9360432624816895, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6451282051282051, "calib/avg_num_step_conf": 3.8359375, "calib/ece": 0.34812, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.616, "calib/gap": 0.1600448717948718, "calib/mean_conf": 0.7616400000000001, "calib/mu_c": 0.8384615384615385, "calib/mu_w": 0.6784166666666667, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.29488000000000003, "calib/std_conf": 0.3515487311881526, "calib/step_conf_rate": 0.9296875, "calib/step_q_c": 0.6039960238568589, "calib/step_q_c_n": 503.0, "calib/step_q_gap": 0.05746157709276711, "calib/step_q_w": 0.5465344467640918, "calib/step_q_w_n": 479.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 463.61328125, "completions/mean_terminated_length": 465.431396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.12266666666666666, "grad_norm": 0.034675490111112595, "kl": 0.124359130859375, "learning_rate": 2.361111111111111e-06, "loss": -0.1689, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03430056944489479, "mask/share_reasoning": 0.8711838722229004, "mask/share_step_conf": 0.09060931950807571, "num_tokens": 27135921.0, "reward": 0.8092306852340698, "reward_std": 0.3144776523113251, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5996066331863403, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7313545942306519, "step": 115 }, { "adv/mean_abs_final_conf": 0.7690123319625854, "adv/mean_abs_reasoning": 0.6008901000022888, "adv/mean_abs_step_conf": 0.7869776487350464, "adv/ratio_final_to_reasoning": 1.2797886534653446, "adv/ratio_step_to_reasoning": 1.3096864946386182, "adv/std_final_conf": 0.922232985496521, "adv/std_reasoning": 0.8430724143981934, "adv/std_step_conf": 0.9357346892356873, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7209302325581395, "calib/avg_num_step_conf": 3.8828125, "calib/ece": 0.28441250000000007, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.5125, "calib/gap": 0.25619589356798655, "calib/mean_conf": 0.7060875000000001, "calib/mu_c": 0.8437927927927928, "calib/mu_w": 0.5875968992248063, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.264, "calib/std_conf": 0.35327000510999607, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.6702894736842107, "calib/step_q_c_n": 380.0, "calib/step_q_gap": 0.15883996228355923, "calib/step_q_w": 0.5114495114006514, "calib/step_q_w_n": 614.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 531.0859375, "completions/mean_terminated_length": 533.1686401367188, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 0.12373333333333333, "grad_norm": 0.06119803339242935, "kl": 0.111297607421875, "learning_rate": 2.3333333333333336e-06, "loss": -0.0462, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.03346026688814163, "mask/share_reasoning": 0.8729349374771118, "mask/share_step_conf": 0.08969855308532715, "num_tokens": 27376399.0, "reward": 0.823566198348999, "reward_std": 0.31112343072891235, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6282831430435181, "rewards/format_reward_step": 0.90625, "rewards/step_l2_reward": 0.7500991821289062, "step": 116 }, { "adv/mean_abs_final_conf": 0.7061994075775146, "adv/mean_abs_reasoning": 0.5571328401565552, "adv/mean_abs_step_conf": 0.7145150303840637, "adv/ratio_final_to_reasoning": 1.2675601879420204, "adv/ratio_step_to_reasoning": 1.282485933127338, "adv/std_final_conf": 0.8947641849517822, "adv/std_reasoning": 0.7929291129112244, "adv/std_step_conf": 0.9358139038085938, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7025375939849624, "calib/avg_num_step_conf": 3.85546875, "calib/ece": 0.256, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.46938775510204084, "calib/gap": 0.28945018796992494, "calib/mean_conf": 0.6979591836734694, "calib/mu_c": 0.8550892857142858, "calib/mu_w": 0.5656390977443608, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.24840816326530615, "calib/std_conf": 0.34754873152265575, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.6135507246376812, "calib/step_q_c_n": 414.0, "calib/step_q_gap": 0.09999051521359748, "calib/step_q_w": 0.5135602094240838, "calib/step_q_w_n": 573.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1972.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 478.453125, "completions/mean_terminated_length": 478.453125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.1248, "grad_norm": 0.05510564520955086, "kl": 0.1254730224609375, "learning_rate": 2.305555555555556e-06, "loss": -0.0354, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03364469110965729, "mask/share_reasoning": 0.8776097893714905, "mask/share_step_conf": 0.08874553442001343, "num_tokens": 27605483.0, "reward": 0.8502526879310608, "reward_std": 0.25346314907073975, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.664110541343689, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7621760368347168, "step": 117 }, { "adv/mean_abs_final_conf": 0.7624117732048035, "adv/mean_abs_reasoning": 0.5823904871940613, "adv/mean_abs_step_conf": 0.7789553999900818, "adv/ratio_final_to_reasoning": 1.3091075317491516, "adv/ratio_step_to_reasoning": 1.337513948318531, "adv/std_final_conf": 0.9246702194213867, "adv/std_reasoning": 0.8266401290893555, "adv/std_step_conf": 0.9355471730232239, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7529248875043268, "calib/avg_num_step_conf": 4.453125, "calib/ece": 0.18322314049586783, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.4380165289256198, "calib/gap": 0.3315167878158532, "calib/mean_conf": 0.6451239669421488, "calib/mu_c": 0.7917037037037037, "calib/mu_w": 0.4601869158878505, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.13524793388429757, "calib/std_conf": 0.37549001748014366, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5426194398682043, "calib/step_q_c_n": 607.0, "calib/step_q_gap": 0.11141868939916116, "calib/step_q_w": 0.4312007504690431, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2711.0, "completions/max_terminated_length": 2711.0, "completions/mean_length": 522.21875, "completions/mean_terminated_length": 522.21875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.12586666666666665, "grad_norm": 0.027424253523349762, "kl": 0.1253204345703125, "learning_rate": 2.277777777777778e-06, "loss": -0.0927, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.032755836844444275, "mask/share_reasoning": 0.8705406785011292, "mask/share_step_conf": 0.09670349210500717, "num_tokens": 27843179.0, "reward": 0.8873322010040283, "reward_std": 0.26377955079078674, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7093136310577393, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7731631994247437, "step": 118 }, { "adv/mean_abs_final_conf": 0.7559334635734558, "adv/mean_abs_reasoning": 0.683125376701355, "adv/mean_abs_step_conf": 0.7430295944213867, "adv/ratio_final_to_reasoning": 1.1065808552211502, "adv/ratio_step_to_reasoning": 1.0876913956985386, "adv/std_final_conf": 0.9073584079742432, "adv/std_reasoning": 0.8749279379844666, "adv/std_step_conf": 0.936138391494751, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7755255255255257, "calib/avg_num_step_conf": 3.39453125, "calib/ece": 0.19614718614718613, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.354978354978355, "calib/gap": 0.3422342342342343, "calib/mean_conf": 0.5531168831168831, "calib/mu_c": 0.730900900900901, "calib/mu_w": 0.38866666666666666, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.90234375, "calib/pce": 0.13437229437229434, "calib/std_conf": 0.391311236963411, "calib/step_conf_rate": 0.90234375, "calib/step_q_c": 0.560694789081886, "calib/step_q_c_n": 403.0, "calib/step_q_gap": 0.1305231152621435, "calib/step_q_w": 0.4301716738197425, "calib/step_q_w_n": 466.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2184.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 526.33984375, "completions/mean_terminated_length": 526.33984375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.12693333333333334, "grad_norm": 0.035477571189403534, "kl": 0.1305389404296875, "learning_rate": 2.25e-06, "loss": -0.1697, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.031598348170518875, "mask/share_reasoning": 0.8906428217887878, "mask/share_step_conf": 0.07775881886482239, "num_tokens": 28082986.0, "reward": 0.8124796748161316, "reward_std": 0.3323761820793152, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.645050048828125, "rewards/format_reward_step": 0.8515625, "rewards/step_l2_reward": 0.7189717292785645, "step": 119 }, { "adv/mean_abs_final_conf": 0.7713155746459961, "adv/mean_abs_reasoning": 0.6857198476791382, "adv/mean_abs_step_conf": 0.7484744787216187, "adv/ratio_final_to_reasoning": 1.1248260893374489, "adv/ratio_step_to_reasoning": 1.0915164279623486, "adv/std_final_conf": 0.9290948510169983, "adv/std_reasoning": 0.8750848770141602, "adv/std_step_conf": 0.9358519315719604, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.7467312539382482, "calib/avg_num_step_conf": 3.28515625, "calib/ece": 0.18213043478260874, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.34347826086956523, "calib/gap": 0.3489855072463767, "calib/mean_conf": 0.5565652173913043, "calib/mu_c": 0.696159420289855, "calib/mu_w": 0.3471739130434783, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.89453125, "calib/pce": 0.0693478260869566, "calib/std_conf": 0.388440508128835, "calib/step_conf_rate": 0.89453125, "calib/step_q_c": 0.5378112449799196, "calib/step_q_c_n": 498.0, "calib/step_q_gap": 0.096907454892456, "calib/step_q_w": 0.4409037900874636, "calib/step_q_w_n": 343.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2358.0, "completions/max_terminated_length": 2358.0, "completions/mean_length": 455.015625, "completions/mean_terminated_length": 456.8000183105469, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.128, "grad_norm": 0.0382017083466053, "kl": 0.14501953125, "learning_rate": 2.222222222222222e-06, "loss": -0.2528, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.0328923761844635, "mask/share_reasoning": 0.8787045478820801, "mask/share_step_conf": 0.08449685573577881, "num_tokens": 28306158.0, "reward": 0.8309608697891235, "reward_std": 0.33806800842285156, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6615766286849976, "rewards/format_reward_step": 0.8515625, "rewards/step_l2_reward": 0.7190951108932495, "step": 120 }, { "adv/mean_abs_final_conf": 0.8134666681289673, "adv/mean_abs_reasoning": 0.725548267364502, "adv/mean_abs_step_conf": 0.7782011032104492, "adv/ratio_final_to_reasoning": 1.121175123308918, "adv/ratio_step_to_reasoning": 1.07256972170467, "adv/std_final_conf": 0.9362523555755615, "adv/std_reasoning": 0.8904957175254822, "adv/std_step_conf": 0.9359607696533203, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6404618904618905, "calib/avg_num_step_conf": 3.74609375, "calib/ece": 0.23759493670886078, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.19831223628691982, "calib/gap": 0.17561347061347055, "calib/mean_conf": 0.4868776371308017, "calib/mu_c": 0.569126984126984, "calib/mu_w": 0.3935135135135135, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.09641350210970469, "calib/std_conf": 0.35624441339302676, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.4792, "calib/step_q_c_n": 494.0, "calib/step_q_gap": 0.0198236559139785, "calib/step_q_w": 0.4593763440860215, "calib/step_q_w_n": 465.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2327.0, "completions/max_terminated_length": 2327.0, "completions/mean_length": 548.76171875, "completions/mean_terminated_length": 553.0827026367188, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.12906666666666666, "grad_norm": 0.025058355182409286, "kl": 0.12908935546875, "learning_rate": 2.1944444444444445e-06, "loss": -0.2633, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.029778383672237396, "mask/share_reasoning": 0.8835940361022949, "mask/share_step_conf": 0.0788150429725647, "num_tokens": 28551697.0, "reward": 0.8246980309486389, "reward_std": 0.3075857162475586, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6279492378234863, "rewards/format_reward_step": 0.890625, "rewards/step_l2_reward": 0.7425405383110046, "step": 121 }, { "adv/mean_abs_final_conf": 0.7894877791404724, "adv/mean_abs_reasoning": 0.7466657161712646, "adv/mean_abs_step_conf": 0.7526766657829285, "adv/ratio_final_to_reasoning": 1.0573510501979249, "adv/ratio_step_to_reasoning": 1.0080503891922166, "adv/std_final_conf": 0.9356542825698853, "adv/std_reasoning": 0.9207422733306885, "adv/std_step_conf": 0.9358314871788025, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.782716049382716, "calib/avg_num_step_conf": 3.5703125, "calib/ece": 0.16097692307692313, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.8828125, "calib/frac_conf_gt_0.9": 0.2948717948717949, "calib/gap": 0.3536087542087542, "calib/mean_conf": 0.563211111111111, "calib/mu_c": 0.7128148148148148, "calib/mu_w": 0.3592060606060606, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.07363247863247867, "calib/std_conf": 0.3718917056499702, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.5741482965931863, "calib/step_q_c_n": 499.0, "calib/step_q_gap": 0.13718444117149964, "calib/step_q_w": 0.4369638554216867, "calib/step_q_w_n": 415.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 506.76171875, "completions/mean_terminated_length": 506.76171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.13013333333333332, "grad_norm": 0.030653269961476326, "kl": 0.1387176513671875, "learning_rate": 2.166666666666667e-06, "loss": -0.1761, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.03297232836484909, "mask/share_reasoning": 0.8866724371910095, "mask/share_step_conf": 0.08035525679588318, "num_tokens": 28788772.0, "reward": 0.8681167364120483, "reward_std": 0.32529065012931824, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6960824728012085, "rewards/format_reward_step": 0.8828125, "rewards/step_l2_reward": 0.7573385238647461, "step": 122 }, { "adv/mean_abs_final_conf": 0.7737333178520203, "adv/mean_abs_reasoning": 0.6747977137565613, "adv/mean_abs_step_conf": 0.7366330623626709, "adv/ratio_final_to_reasoning": 1.1466152034580703, "adv/ratio_step_to_reasoning": 1.0916353854577776, "adv/std_final_conf": 0.9265235066413879, "adv/std_reasoning": 0.8904935121536255, "adv/std_step_conf": 0.9361698627471924, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.7301136363636364, "calib/avg_num_step_conf": 3.66796875, "calib/ece": 0.15086956521739134, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.19130434782608696, "calib/gap": 0.28080303030303033, "calib/mean_conf": 0.47313043478260874, "calib/mu_c": 0.6196363636363637, "calib/mu_w": 0.3388333333333333, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 0.9453125, "calib/nonempty_step_conf_rate": 0.8984375, "calib/pce": 0.07286956521739132, "calib/std_conf": 0.34455734732474236, "calib/step_conf_rate": 0.8984375, "calib/step_q_c": 0.5267430025445292, "calib/step_q_c_n": 393.0, "calib/step_q_gap": 0.11906900987053654, "calib/step_q_w": 0.4076739926739927, "calib/step_q_w_n": 546.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 550.48046875, "completions/mean_terminated_length": 552.6392211914062, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.1312, "grad_norm": 0.034598927944898605, "kl": 0.1342010498046875, "learning_rate": 2.138888888888889e-06, "loss": -0.3169, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.029137644916772842, "mask/share_reasoning": 0.8917558193206787, "mask/share_step_conf": 0.07520025223493576, "num_tokens": 29034983.0, "reward": 0.816436767578125, "reward_std": 0.31139829754829407, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.6586597561836243, "rewards/format_reward_step": 0.8515625, "rewards/step_l2_reward": 0.7171823978424072, "step": 123 }, { "adv/mean_abs_final_conf": 0.777886152267456, "adv/mean_abs_reasoning": 0.6544324159622192, "adv/mean_abs_step_conf": 0.7390900254249573, "adv/ratio_final_to_reasoning": 1.188642453054104, "adv/ratio_step_to_reasoning": 1.1293603547102187, "adv/std_final_conf": 0.9364475607872009, "adv/std_reasoning": 0.8749727010726929, "adv/std_step_conf": 0.9361278414726257, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7035381610576923, "calib/avg_num_step_conf": 3.421875, "calib/ece": 0.19879310344827583, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.22844827586206898, "calib/gap": 0.24277043269230764, "calib/mean_conf": 0.495, "calib/mu_c": 0.6038281249999999, "calib/mu_w": 0.3610576923076923, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.8828125, "calib/pce": 0.07103448275862068, "calib/std_conf": 0.35718004732473024, "calib/step_conf_rate": 0.8828125, "calib/step_q_c": 0.5541610738255034, "calib/step_q_c_n": 447.0, "calib/step_q_gap": 0.09642214608657562, "calib/step_q_w": 0.45773892773892777, "calib/step_q_w_n": 429.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2212.0, "completions/max_terminated_length": 2212.0, "completions/mean_length": 500.01953125, "completions/mean_terminated_length": 500.01953125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.13226666666666667, "grad_norm": 0.04210676625370979, "kl": 0.1402130126953125, "learning_rate": 2.1111111111111114e-06, "loss": -0.2335, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.03087177686393261, "mask/share_reasoning": 0.8886597156524658, "mask/share_step_conf": 0.08046852797269821, "num_tokens": 29269804.0, "reward": 0.80426025390625, "reward_std": 0.3211674094200134, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6273288726806641, "rewards/format_reward_step": 0.8515625, "rewards/step_l2_reward": 0.7061915993690491, "step": 124 }, { "adv/mean_abs_final_conf": 0.7988862991333008, "adv/mean_abs_reasoning": 0.6417537927627563, "adv/mean_abs_step_conf": 0.7917139530181885, "adv/ratio_final_to_reasoning": 1.2448485823419717, "adv/ratio_step_to_reasoning": 1.2336724175946856, "adv/std_final_conf": 0.9355748295783997, "adv/std_reasoning": 0.8432921767234802, "adv/std_step_conf": 0.9359493851661682, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.6413651188072682, "calib/avg_num_step_conf": 3.40625, "calib/ece": 0.25601731601731603, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.8828125, "calib/frac_conf_gt_0.9": 0.22510822510822512, "calib/gap": 0.16824662214629604, "calib/mean_conf": 0.4827705627705628, "calib/mu_c": 0.5825531914893618, "calib/mu_w": 0.41430656934306576, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.90625, "calib/pce": 0.16593073593073593, "calib/std_conf": 0.36907528555320096, "calib/step_conf_rate": 0.90625, "calib/step_q_c": 0.5112790697674419, "calib/step_q_c_n": 344.0, "calib/step_q_gap": 0.0478889182522903, "calib/step_q_w": 0.46339015151515156, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 551.2265625, "completions/mean_terminated_length": 551.2265625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.13333333333333333, "grad_norm": 0.04054722189903259, "kl": 0.340728759765625, "learning_rate": 2.0833333333333334e-06, "loss": -0.2266, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.031965482980012894, "mask/share_reasoning": 0.8885842561721802, "mask/share_step_conf": 0.07945021241903305, "num_tokens": 29515726.0, "reward": 0.7934260964393616, "reward_std": 0.30349084734916687, "rewards/accuracy_reward_step": 0.375, "rewards/final_brier_reward_step": 0.6105921864509583, "rewards/format_reward_step": 0.8828125, "rewards/step_l2_reward": 0.724697470664978, "step": 125 }, { "adv/mean_abs_final_conf": 0.7436902523040771, "adv/mean_abs_reasoning": 0.5532766580581665, "adv/mean_abs_step_conf": 0.8003737926483154, "adv/ratio_final_to_reasoning": 1.3441562037231152, "adv/ratio_step_to_reasoning": 1.4466068304008795, "adv/std_final_conf": 0.8976423144340515, "adv/std_reasoning": 0.7931344509124756, "adv/std_step_conf": 0.934795618057251, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7650679117147707, "calib/avg_num_step_conf": 3.75390625, "calib/ece": 0.19609243697478987, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.36134453781512604, "calib/gap": 0.36420486700622534, "calib/mean_conf": 0.5386134453781513, "calib/mu_c": 0.7130645161290323, "calib/mu_w": 0.348859649122807, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.10684873949579826, "calib/std_conf": 0.4026552753207035, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.5576223776223777, "calib/step_q_c_n": 429.0, "calib/step_q_gap": 0.15790433250959574, "calib/step_q_w": 0.3997180451127819, "calib/step_q_w_n": 532.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 532.91015625, "completions/mean_terminated_length": 532.91015625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.1344, "grad_norm": 0.0251851137727499, "kl": 0.1164093017578125, "learning_rate": 2.0555555555555555e-06, "loss": -0.0353, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.033219676464796066, "mask/share_reasoning": 0.8798425197601318, "mask/share_step_conf": 0.08693777024745941, "num_tokens": 29757615.0, "reward": 0.8525570034980774, "reward_std": 0.25820255279541016, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6813836097717285, "rewards/format_reward_step": 0.890625, "rewards/step_l2_reward": 0.7487304210662842, "step": 126 }, { "adv/mean_abs_final_conf": 0.7592334151268005, "adv/mean_abs_reasoning": 0.6342440843582153, "adv/mean_abs_step_conf": 0.7591673135757446, "adv/ratio_final_to_reasoning": 1.1970681853423364, "adv/ratio_step_to_reasoning": 1.1969639643449537, "adv/std_final_conf": 0.9183486700057983, "adv/std_reasoning": 0.8749744296073914, "adv/std_step_conf": 0.9358720779418945, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.7597771546635183, "calib/avg_num_step_conf": 3.70703125, "calib/ece": 0.18613733905579397, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.88671875, "calib/frac_conf_gt_0.9": 0.24034334763948498, "calib/gap": 0.33792355371900823, "calib/mean_conf": 0.4932618025751073, "calib/mu_c": 0.66875, "calib/mu_w": 0.3308264462809917, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 0.9453125, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.09935622317596567, "calib/std_conf": 0.3843757607482917, "calib/step_conf_rate": 0.91796875, "calib/step_q_c": 0.5411187214611872, "calib/step_q_c_n": 438.0, "calib/step_q_gap": 0.13560013046314423, "calib/step_q_w": 0.405518590998043, "calib/step_q_w_n": 511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2380.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 515.1875, "completions/mean_terminated_length": 517.2078857421875, "completions/min_length": 0.0, "completions/min_terminated_length": 66.0, "epoch": 0.13546666666666668, "grad_norm": 0.028686698526144028, "kl": 0.1333465576171875, "learning_rate": 2.027777777777778e-06, "loss": -0.3105, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.033604905009269714, "mask/share_reasoning": 0.8754376173019409, "mask/share_step_conf": 0.0870511457324028, "num_tokens": 29993175.0, "reward": 0.8517680764198303, "reward_std": 0.29256191849708557, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6856671571731567, "rewards/format_reward_step": 0.88671875, "rewards/step_l2_reward": 0.7530252933502197, "step": 127 }, { "adv/mean_abs_final_conf": 0.780434787273407, "adv/mean_abs_reasoning": 0.685494065284729, "adv/mean_abs_step_conf": 0.7519875764846802, "adv/ratio_final_to_reasoning": 1.1384996994091308, "adv/ratio_step_to_reasoning": 1.0970008561231412, "adv/std_final_conf": 0.9359096884727478, "adv/std_reasoning": 0.8904690146446228, "adv/std_step_conf": 0.9361345767974854, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.7290708736324719, "calib/avg_num_step_conf": 3.21875, "calib/ece": 0.1712888888888889, "calib/final_conf_rate": 0.87890625, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.2311111111111111, "calib/gap": 0.29680910099889013, "calib/mean_conf": 0.4968888888888889, "calib/mu_c": 0.6538679245283018, "calib/mu_w": 0.35705882352941165, "calib/nonempty_final_conf_rate": 0.87890625, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.8984375, "calib/pce": 0.09853333333333336, "calib/std_conf": 0.368311596476089, "calib/step_conf_rate": 0.8984375, "calib/step_q_c": 0.5329842931937173, "calib/step_q_c_n": 382.0, "calib/step_q_gap": 0.09056347871407927, "calib/step_q_w": 0.44242081447963805, "calib/step_q_w_n": 442.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2394.0, "completions/max_terminated_length": 2394.0, "completions/mean_length": 554.72265625, "completions/mean_terminated_length": 556.8980712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.13653333333333334, "grad_norm": 0.03147805482149124, "kl": 0.124420166015625, "learning_rate": 2.0000000000000003e-06, "loss": -0.2282, "mask/has_final_conf_rate": 0.87890625, "mask/share_final_conf": 0.032010771334171295, "mask/share_reasoning": 0.8855876922607422, "mask/share_step_conf": 0.07849524170160294, "num_tokens": 30241848.0, "reward": 0.8135828971862793, "reward_std": 0.3208565413951874, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.655642569065094, "rewards/format_reward_step": 0.8515625, "rewards/step_l2_reward": 0.7176169157028198, "step": 128 }, { "adv/mean_abs_final_conf": 0.7757356762886047, "adv/mean_abs_reasoning": 0.6284763813018799, "adv/mean_abs_step_conf": 0.7494632005691528, "adv/ratio_final_to_reasoning": 1.2343115817362609, "adv/ratio_step_to_reasoning": 1.1925081401096578, "adv/std_final_conf": 0.9355270862579346, "adv/std_reasoning": 0.8591558337211609, "adv/std_step_conf": 0.9356926083564758, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7336753731343283, "calib/avg_num_step_conf": 3.76953125, "calib/ece": 0.13034453781512606, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.25210084033613445, "calib/gap": 0.3000838117106774, "calib/mean_conf": 0.5786470588235293, "calib/mu_c": 0.7097761194029851, "calib/mu_w": 0.40969230769230774, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.07298319327731094, "calib/std_conf": 0.3480800180446124, "calib/step_conf_rate": 0.9296875, "calib/step_q_c": 0.5702333931777379, "calib/step_q_c_n": 557.0, "calib/step_q_gap": 0.06636084415813015, "calib/step_q_w": 0.5038725490196078, "calib/step_q_w_n": 408.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1992.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 473.734375, "completions/mean_terminated_length": 473.734375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.1376, "grad_norm": 0.03528955578804016, "kl": 0.1468353271484375, "learning_rate": 1.9722222222222224e-06, "loss": -0.1563, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.03455173224210739, "mask/share_reasoning": 0.8706372976303101, "mask/share_step_conf": 0.09481099247932434, "num_tokens": 30465508.0, "reward": 0.8781849145889282, "reward_std": 0.27595558762550354, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7109373807907104, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": 0.7579324841499329, "step": 129 }, { "adv/mean_abs_final_conf": 0.7660256028175354, "adv/mean_abs_reasoning": 0.5722837448120117, "adv/mean_abs_step_conf": 0.7249982357025146, "adv/ratio_final_to_reasoning": 1.3385416059111823, "adv/ratio_step_to_reasoning": 1.2668510022780182, "adv/std_final_conf": 0.9354217648506165, "adv/std_reasoning": 0.8429265022277832, "adv/std_step_conf": 0.9357032179832458, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6923210321864595, "calib/avg_num_step_conf": 3.42578125, "calib/ece": 0.20442148760330575, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.2603305785123967, "calib/gap": 0.24038013318534956, "calib/mean_conf": 0.5510330578512398, "calib/mu_c": 0.6563235294117646, "calib/mu_w": 0.4159433962264151, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.09673553719008264, "calib/std_conf": 0.36934295992704985, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.5635294117647058, "calib/step_q_c_n": 442.0, "calib/step_q_gap": 0.10437998647734953, "calib/step_q_w": 0.4591494252873563, "calib/step_q_w_n": 435.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 449.43359375, "completions/mean_terminated_length": 451.19610595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.13866666666666666, "grad_norm": 0.03311045095324516, "kl": 0.144561767578125, "learning_rate": 1.944444444444445e-06, "loss": -0.1809, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.034811727702617645, "mask/share_reasoning": 0.875035285949707, "mask/share_step_conf": 0.08624675869941711, "num_tokens": 30685851.0, "reward": 0.8714255094528198, "reward_std": 0.2791735529899597, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6772284507751465, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.7749974727630615, "step": 130 }, { "adv/mean_abs_final_conf": 0.7358200550079346, "adv/mean_abs_reasoning": 0.5314429998397827, "adv/mean_abs_step_conf": 0.737650990486145, "adv/ratio_final_to_reasoning": 1.3845700389877496, "adv/ratio_step_to_reasoning": 1.3880152541449018, "adv/std_final_conf": 0.9355961084365845, "adv/std_reasoning": 0.7928544282913208, "adv/std_step_conf": 0.9357744455337524, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.8268200897380228, "calib/avg_num_step_conf": 3.73046875, "calib/ece": 0.11046025104602511, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.24686192468619247, "calib/gap": 0.4177652337530756, "calib/mean_conf": 0.49271966527196653, "calib/mu_c": 0.7391836734693877, "calib/mu_w": 0.3214184397163121, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.9140625, "calib/pce": 0.09656903765690375, "calib/std_conf": 0.37007028744111553, "calib/step_conf_rate": 0.9140625, "calib/step_q_c": 0.5777211796246648, "calib/step_q_c_n": 373.0, "calib/step_q_gap": 0.15392392876555838, "calib/step_q_w": 0.42379725085910647, "calib/step_q_w_n": 582.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 502.39453125, "completions/mean_terminated_length": 502.39453125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.13973333333333332, "grad_norm": 0.028213733807206154, "kl": 0.1276397705078125, "learning_rate": 1.916666666666667e-06, "loss": -0.168, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.031309232115745544, "mask/share_reasoning": 0.8836838006973267, "mask/share_step_conf": 0.08500701189041138, "num_tokens": 30920672.0, "reward": 0.8651392459869385, "reward_std": 0.3005208969116211, "rewards/accuracy_reward_step": 0.38671875, "rewards/final_brier_reward_step": 0.7245796322822571, "rewards/format_reward_step": 0.890625, "rewards/step_l2_reward": 0.7502299547195435, "step": 131 }, { "adv/mean_abs_final_conf": 0.716113805770874, "adv/mean_abs_reasoning": 0.6242179870605469, "adv/mean_abs_step_conf": 0.7061994671821594, "adv/ratio_final_to_reasoning": 1.1472175115348184, "adv/ratio_step_to_reasoning": 1.131334696886363, "adv/std_final_conf": 0.9199298024177551, "adv/std_reasoning": 0.8749078512191772, "adv/std_step_conf": 0.9358008503913879, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7914211560044894, "calib/avg_num_step_conf": 3.94140625, "calib/ece": 0.14213991769547324, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.3991769547325103, "calib/gap": 0.3619949494949495, "calib/mean_conf": 0.6205761316872428, "calib/mu_c": 0.7680555555555555, "calib/mu_w": 0.406060606060606, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.08506172839506172, "calib/std_conf": 0.3684698609691834, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.6124952015355086, "calib/step_q_c_n": 521.0, "calib/step_q_gap": 0.16530257858468894, "calib/step_q_w": 0.4471926229508197, "calib/step_q_w_n": 488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2177.0, "completions/max_terminated_length": 2177.0, "completions/mean_length": 498.53125, "completions/mean_terminated_length": 498.53125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.1408, "grad_norm": 0.024374086409807205, "kl": 0.1346588134765625, "learning_rate": 1.888888888888889e-06, "loss": -0.1078, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.034363698214292526, "mask/share_reasoning": 0.8720892071723938, "mask/share_step_conf": 0.09354706108570099, "num_tokens": 31153888.0, "reward": 0.9181256294250488, "reward_std": 0.278538316488266, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7419136762619019, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7943376302719116, "step": 132 }, { "adv/mean_abs_final_conf": 0.7902538180351257, "adv/mean_abs_reasoning": 0.6631218791007996, "adv/mean_abs_step_conf": 0.743695855140686, "adv/ratio_final_to_reasoning": 1.191717304074959, "adv/ratio_step_to_reasoning": 1.1215070390214625, "adv/std_final_conf": 0.9299471974372864, "adv/std_reasoning": 0.8905849456787109, "adv/std_step_conf": 0.9361699223518372, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.7090786932056772, "calib/avg_num_step_conf": 4.1484375, "calib/ece": 0.16894736842105262, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.19298245614035087, "calib/gap": 0.26773998488284206, "calib/mean_conf": 0.438859649122807, "calib/mu_c": 0.6114814814814815, "calib/mu_w": 0.34374149659863945, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.8984375, "calib/pce": 0.1262719298245614, "calib/std_conf": 0.35228143347585994, "calib/step_conf_rate": 0.8984375, "calib/step_q_c": 0.4963235294117647, "calib/step_q_c_n": 340.0, "calib/step_q_gap": 0.10362269838683402, "calib/step_q_w": 0.3927008310249307, "calib/step_q_w_n": 722.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 610.578125, "completions/mean_terminated_length": 610.578125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.14186666666666667, "grad_norm": 0.03473455831408501, "kl": 0.115020751953125, "learning_rate": 1.8611111111111113e-06, "loss": -0.3156, "mask/has_final_conf_rate": 0.890625, "mask/share_final_conf": 0.02674572914838791, "mask/share_reasoning": 0.8993955850601196, "mask/share_step_conf": 0.07385867089033127, "num_tokens": 31416540.0, "reward": 0.8048692345619202, "reward_std": 0.3397502303123474, "rewards/accuracy_reward_step": 0.3203125, "rewards/final_brier_reward_step": 0.6518843770027161, "rewards/format_reward_step": 0.8515625, "rewards/step_l2_reward": 0.723479151725769, "step": 133 }, { "adv/mean_abs_final_conf": 0.8093454241752625, "adv/mean_abs_reasoning": 0.658073902130127, "adv/mean_abs_step_conf": 0.7692813873291016, "adv/ratio_final_to_reasoning": 1.2298701126962839, "adv/ratio_step_to_reasoning": 1.1689893564218332, "adv/std_final_conf": 0.936396598815918, "adv/std_reasoning": 0.8594855666160583, "adv/std_step_conf": 0.9363592863082886, "calib/answer_extract_rate": 0.875, "calib/auroc": 0.7485759758753561, "calib/avg_num_step_conf": 3.4296875, "calib/ece": 0.16405731523378586, "calib/final_conf_rate": 0.86328125, "calib/format_rate": 0.83203125, "calib/frac_conf_gt_0.9": 0.2171945701357466, "calib/gap": 0.3283059697325068, "calib/mean_conf": 0.4882503770739065, "calib/mu_c": 0.6769148936170213, "calib/mu_w": 0.3486089238845145, "calib/nonempty_final_conf_rate": 0.86328125, "calib/nonempty_reasoning_rate": 0.91796875, "calib/nonempty_step_conf_rate": 0.87890625, "calib/pce": 0.11348416289592761, "calib/std_conf": 0.36948397379862874, "calib/step_conf_rate": 0.87890625, "calib/step_q_c": 0.5640285714285714, "calib/step_q_c_n": 350.0, "calib/step_q_gap": 0.14052478354978354, "calib/step_q_w": 0.4235037878787879, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2195.0, "completions/max_terminated_length": 2195.0, "completions/mean_length": 588.9921875, "completions/mean_terminated_length": 591.302001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.14293333333333333, "grad_norm": 0.02548646554350853, "kl": 0.1108551025390625, "learning_rate": 1.8333333333333333e-06, "loss": -0.2445, "mask/has_final_conf_rate": 0.86328125, "mask/share_final_conf": 0.02605942077934742, "mask/share_reasoning": 0.9032557010650635, "mask/share_step_conf": 0.06677865236997604, "num_tokens": 31676274.0, "reward": 0.7943482398986816, "reward_std": 0.3597278892993927, "rewards/accuracy_reward_step": 0.37109375, "rewards/final_brier_reward_step": 0.6471452713012695, "rewards/format_reward_step": 0.83203125, "rewards/step_l2_reward": 0.7009261846542358, "step": 134 }, { "adv/mean_abs_final_conf": 0.8014721870422363, "adv/mean_abs_reasoning": 0.6012312173843384, "adv/mean_abs_step_conf": 0.7397862076759338, "adv/ratio_final_to_reasoning": 1.33305151806496, "adv/ratio_step_to_reasoning": 1.2304520894546696, "adv/std_final_conf": 0.9359004497528076, "adv/std_reasoning": 0.8432109951972961, "adv/std_step_conf": 0.936022937297821, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6532982456140352, "calib/avg_num_step_conf": 3.859375, "calib/ece": 0.21463012552301258, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.27615062761506276, "calib/gap": 0.208172098245614, "calib/mean_conf": 0.5188426778242677, "calib/mu_c": 0.627719298245614, "calib/mu_w": 0.4195472, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.12824267782426782, "calib/std_conf": 0.3687955259163612, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.543871921182266, "calib/step_q_c_n": 406.0, "calib/step_q_gap": 0.10062449850185362, "calib/step_q_w": 0.4432474226804124, "calib/step_q_w_n": 582.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2138.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 559.4375, "completions/mean_terminated_length": 559.4375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.144, "grad_norm": 0.04485005512833595, "kl": 0.1193695068359375, "learning_rate": 1.8055555555555557e-06, "loss": -0.0281, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.0306907556951046, "mask/share_reasoning": 0.8863241672515869, "mask/share_step_conf": 0.082985058426857, "num_tokens": 31925370.0, "reward": 0.835256814956665, "reward_std": 0.29443052411079407, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.6450504660606384, "rewards/format_reward_step": 0.8984375, "rewards/step_l2_reward": 0.7567132711410522, "step": 135 }, { "adv/mean_abs_final_conf": 0.7456756830215454, "adv/mean_abs_reasoning": 0.591724157333374, "adv/mean_abs_step_conf": 0.7272220849990845, "adv/ratio_final_to_reasoning": 1.2601744812683657, "adv/ratio_step_to_reasoning": 1.2289883317867851, "adv/std_final_conf": 0.9112811088562012, "adv/std_reasoning": 0.8431320190429688, "adv/std_step_conf": 0.9358429312705994, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8196135066364074, "calib/avg_num_step_conf": 4.52734375, "calib/ece": 0.135206611570248, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.2727272727272727, "calib/gap": 0.4178254590468332, "calib/mean_conf": 0.4905785123966942, "calib/mu_c": 0.7167567567567569, "calib/mu_w": 0.29893129770992366, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.08355371900826451, "calib/std_conf": 0.3780961387224092, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5441613588110402, "calib/step_q_c_n": 471.0, "calib/step_q_gap": 0.16553913658881803, "calib/step_q_w": 0.3786222222222222, "calib/step_q_w_n": 675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 527.8125, "completions/mean_terminated_length": 527.8125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.14506666666666668, "grad_norm": 0.024483520537614822, "kl": 0.127044677734375, "learning_rate": 1.777777777777778e-06, "loss": -0.1758, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.033305633813142776, "mask/share_reasoning": 0.8706825971603394, "mask/share_step_conf": 0.09601178765296936, "num_tokens": 32168978.0, "reward": 0.8965328931808472, "reward_std": 0.2803770899772644, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.7379539012908936, "rewards/format_reward_step": 0.90625, "rewards/step_l2_reward": 0.7863619327545166, "step": 136 }, { "adv/mean_abs_final_conf": 0.7307909727096558, "adv/mean_abs_reasoning": 0.5491311550140381, "adv/mean_abs_step_conf": 0.7555731534957886, "adv/ratio_final_to_reasoning": 1.3308131691981193, "adv/ratio_step_to_reasoning": 1.375942972087375, "adv/std_final_conf": 0.907728374004364, "adv/std_reasoning": 0.8099531531333923, "adv/std_step_conf": 0.9357438683509827, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7726775956284153, "calib/avg_num_step_conf": 4.1015625, "calib/ece": 0.16673512396694207, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.3512396694214876, "calib/gap": 0.34956230874316946, "calib/mean_conf": 0.5520252066115703, "calib/mu_c": 0.7282508333333334, "calib/mu_w": 0.37868852459016394, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.11144628099173544, "calib/std_conf": 0.3776974578334926, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.5540661478599223, "calib/step_q_c_n": 514.0, "calib/step_q_gap": 0.14429002845693722, "calib/step_q_w": 0.40977611940298503, "calib/step_q_w_n": 536.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1913.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 514.43359375, "completions/mean_terminated_length": 516.4509887695312, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.14613333333333334, "grad_norm": 0.04350460320711136, "kl": 0.1253662109375, "learning_rate": 1.75e-06, "loss": -0.1918, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.030667105689644814, "mask/share_reasoning": 0.8750026822090149, "mask/share_step_conf": 0.09042397141456604, "num_tokens": 32407657.0, "reward": 0.8747584223747253, "reward_std": 0.2750248908996582, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7067804336547852, "rewards/format_reward_step": 0.90625, "rewards/step_l2_reward": 0.766173779964447, "step": 137 }, { "adv/mean_abs_final_conf": 0.7335097789764404, "adv/mean_abs_reasoning": 0.662164568901062, "adv/mean_abs_step_conf": 0.7231161594390869, "adv/ratio_final_to_reasoning": 1.1077454358420051, "adv/ratio_step_to_reasoning": 1.0920490062450503, "adv/std_final_conf": 0.9345899224281311, "adv/std_reasoning": 0.8592326045036316, "adv/std_step_conf": 0.9355090856552124, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7949413808076423, "calib/avg_num_step_conf": 3.98046875, "calib/ece": 0.14609958506224066, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.2946058091286307, "calib/gap": 0.38248588797221017, "calib/mean_conf": 0.554896265560166, "calib/mu_c": 0.7040816326530612, "calib/mu_w": 0.32159574468085106, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.045518672199170135, "calib/std_conf": 0.36411902667632423, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5553217391304349, "calib/step_q_c_n": 575.0, "calib/step_q_gap": 0.1506370544457502, "calib/step_q_w": 0.4046846846846847, "calib/step_q_w_n": 444.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 499.91015625, "completions/mean_terminated_length": 501.87060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.1472, "grad_norm": 0.0327833816409111, "kl": 0.13421630859375, "learning_rate": 1.7222222222222224e-06, "loss": -0.1421, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03305238112807274, "mask/share_reasoning": 0.8726654052734375, "mask/share_step_conf": 0.09037593007087708, "num_tokens": 32639970.0, "reward": 0.915382444858551, "reward_std": 0.26028817892074585, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7430488467216492, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.7877160906791687, "step": 138 }, { "adv/mean_abs_final_conf": 0.7603707313537598, "adv/mean_abs_reasoning": 0.6023674011230469, "adv/mean_abs_step_conf": 0.7213539481163025, "adv/ratio_final_to_reasoning": 1.2623039193955936, "adv/ratio_step_to_reasoning": 1.197531517760454, "adv/std_final_conf": 0.924269437789917, "adv/std_reasoning": 0.843031644821167, "adv/std_step_conf": 0.9355605244636536, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7862875328628753, "calib/avg_num_step_conf": 3.94921875, "calib/ece": 0.12224489795918367, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.2653061224489796, "calib/gap": 0.35155597066555966, "calib/mean_conf": 0.5563673469387755, "calib/mu_c": 0.6984246575342465, "calib/mu_w": 0.3468686868686869, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.04134693877551021, "calib/std_conf": 0.35494907331624814, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5624642857142857, "calib/step_q_c_n": 560.0, "calib/step_q_gap": 0.12816273360785557, "calib/step_q_w": 0.4343015521064302, "calib/step_q_w_n": 451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1795.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 460.72265625, "completions/mean_terminated_length": 460.72265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.14826666666666666, "grad_norm": 0.04797978699207306, "kl": 0.1426239013671875, "learning_rate": 1.6944444444444446e-06, "loss": -0.1653, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.0341273695230484, "mask/share_reasoning": 0.8714209794998169, "mask/share_step_conf": 0.09445163607597351, "num_tokens": 32861011.0, "reward": 0.9247560501098633, "reward_std": 0.2628259062767029, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7417035102844238, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.8062459826469421, "step": 139 }, { "adv/mean_abs_final_conf": 0.783392071723938, "adv/mean_abs_reasoning": 0.5787963271141052, "adv/mean_abs_step_conf": 0.761437177658081, "adv/ratio_final_to_reasoning": 1.3534848702823545, "adv/ratio_step_to_reasoning": 1.3155528844742816, "adv/std_final_conf": 0.9139004349708557, "adv/std_reasoning": 0.7930855751037598, "adv/std_step_conf": 0.9354227781295776, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7346195358877496, "calib/avg_num_step_conf": 4.3515625, "calib/ece": 0.1804897959183673, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.42448979591836733, "calib/gap": 0.3148880194279548, "calib/mean_conf": 0.6738775510204082, "calib/mu_c": 0.8139705882352942, "calib/mu_w": 0.4990825688073394, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.14963265306122445, "calib/std_conf": 0.3511556217982749, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5867399334442596, "calib/step_q_c_n": 601.0, "calib/step_q_gap": 0.14962492369767083, "calib/step_q_w": 0.43711500974658873, "calib/step_q_w_n": 513.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1658.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 497.359375, "completions/mean_terminated_length": 497.359375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.14933333333333335, "grad_norm": 0.02688215859234333, "kl": 0.154388427734375, "learning_rate": 1.6666666666666667e-06, "loss": -0.111, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03305169939994812, "mask/share_reasoning": 0.8668557405471802, "mask/share_step_conf": 0.1000925675034523, "num_tokens": 33093351.0, "reward": 0.9080557227134705, "reward_std": 0.26558494567871094, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7214667797088623, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.800113320350647, "step": 140 }, { "adv/mean_abs_final_conf": 0.7365533113479614, "adv/mean_abs_reasoning": 0.5491877794265747, "adv/mean_abs_step_conf": 0.7725829482078552, "adv/ratio_final_to_reasoning": 1.341168428978921, "adv/ratio_step_to_reasoning": 1.406773743244132, "adv/std_final_conf": 0.9102948307991028, "adv/std_reasoning": 0.7931224703788757, "adv/std_step_conf": 0.9355855584144592, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.8940781681800237, "calib/avg_num_step_conf": 3.8125, "calib/ece": 0.07226495726495723, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.88671875, "calib/frac_conf_gt_0.9": 0.42735042735042733, "calib/gap": 0.547531780497434, "calib/mean_conf": 0.6397008547008547, "calib/mu_c": 0.8385906040268457, "calib/mu_w": 0.2910588235294117, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.03760683760683757, "calib/std_conf": 0.3746809415329093, "calib/step_conf_rate": 0.91796875, "calib/step_q_c": 0.6011498257839721, "calib/step_q_c_n": 574.0, "calib/step_q_gap": 0.20943340787352444, "calib/step_q_w": 0.3917164179104477, "calib/step_q_w_n": 402.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1963.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 550.11328125, "completions/mean_terminated_length": 552.2706298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.1504, "grad_norm": 0.03140458092093468, "kl": 0.117950439453125, "learning_rate": 1.638888888888889e-06, "loss": -0.2438, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.02958410233259201, "mask/share_reasoning": 0.8877764940261841, "mask/share_step_conf": 0.0787331610918045, "num_tokens": 33341276.0, "reward": 0.9270013570785522, "reward_std": 0.29023104906082153, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7821683883666992, "rewards/format_reward_step": 0.88671875, "rewards/step_l2_reward": 0.7773030996322632, "step": 141 }, { "adv/mean_abs_final_conf": 0.7450395226478577, "adv/mean_abs_reasoning": 0.6056081056594849, "adv/mean_abs_step_conf": 0.7474773526191711, "adv/ratio_final_to_reasoning": 1.2302337364466698, "adv/ratio_step_to_reasoning": 1.2342591613848959, "adv/std_final_conf": 0.9208177328109741, "adv/std_reasoning": 0.8590512275695801, "adv/std_step_conf": 0.9352788329124451, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7868398935425129, "calib/avg_num_step_conf": 4.1484375, "calib/ece": 0.14184100418410048, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.3305439330543933, "calib/gap": 0.38385558201428766, "calib/mean_conf": 0.5588284518828452, "calib/mu_c": 0.7483471074380165, "calib/mu_w": 0.3644915254237288, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.09719665271966534, "calib/std_conf": 0.38254855554489575, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.5839204545454546, "calib/step_q_c_n": 528.0, "calib/step_q_gap": 0.15360210248552947, "calib/step_q_w": 0.43031835205992514, "calib/step_q_w_n": 534.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 580.203125, "completions/mean_terminated_length": 582.4784545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 57.0, "epoch": 0.15146666666666667, "grad_norm": 0.02812611497938633, "kl": 0.115814208984375, "learning_rate": 1.6111111111111113e-06, "loss": -0.1799, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.02923966571688652, "mask/share_reasoning": 0.8787957429885864, "mask/share_step_conf": 0.08805837482213974, "num_tokens": 33594968.0, "reward": 0.8962991237640381, "reward_std": 0.2742632031440735, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7293863296508789, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.7850868701934814, "step": 142 }, { "adv/mean_abs_final_conf": 0.7679699063301086, "adv/mean_abs_reasoning": 0.570601224899292, "adv/mean_abs_step_conf": 0.757947564125061, "adv/ratio_final_to_reasoning": 1.3458960002507026, "adv/ratio_step_to_reasoning": 1.328331470474559, "adv/std_final_conf": 0.9319179058074951, "adv/std_reasoning": 0.8098957538604736, "adv/std_step_conf": 0.9355419874191284, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7768035210783304, "calib/avg_num_step_conf": 4.66015625, "calib/ece": 0.15472451790633607, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.3512396694214876, "calib/gap": 0.34518189945670874, "calib/mean_conf": 0.5920798898071625, "calib/mu_c": 0.7504071246819339, "calib/mu_w": 0.4052252252252252, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.10274104683195591, "calib/std_conf": 0.3625200530213408, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5489919237147595, "calib/step_q_c_n": 603.0, "calib/step_q_gap": 0.13873768642662387, "calib/step_q_w": 0.4102542372881356, "calib/step_q_w_n": 590.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3050.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 565.94140625, "completions/mean_terminated_length": 568.1608276367188, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.15253333333333333, "grad_norm": 0.0339367501437664, "kl": 0.116668701171875, "learning_rate": 1.5833333333333333e-06, "loss": -0.1496, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.029600488021969795, "mask/share_reasoning": 0.8680211305618286, "mask/share_step_conf": 0.09847214818000793, "num_tokens": 33847185.0, "reward": 0.8974969387054443, "reward_std": 0.24187132716178894, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7286787033081055, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.7780337333679199, "step": 143 }, { "adv/mean_abs_final_conf": 0.7534208297729492, "adv/mean_abs_reasoning": 0.567805290222168, "adv/mean_abs_step_conf": 0.754891037940979, "adv/ratio_final_to_reasoning": 1.3268999827003276, "adv/ratio_step_to_reasoning": 1.3294892649654761, "adv/std_final_conf": 0.9313053488731384, "adv/std_reasoning": 0.8100166320800781, "adv/std_step_conf": 0.9351264238357544, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7182993197278912, "calib/avg_num_step_conf": 4.96875, "calib/ece": 0.18710204081632648, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.4204081632653061, "calib/gap": 0.32078571428571434, "calib/mean_conf": 0.6030204081632653, "calib/mu_c": 0.7405, "calib/mu_w": 0.4197142857142857, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.10934693877551015, "calib/std_conf": 0.38181181868772646, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5480421686746988, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.12251585288522504, "calib/step_q_w": 0.42552631578947375, "calib/step_q_w_n": 608.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 558.953125, "completions/mean_terminated_length": 558.953125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.1536, "grad_norm": 0.04941343888640404, "kl": 0.14173126220703125, "learning_rate": 1.5555555555555558e-06, "loss": 0.0236, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.031576041132211685, "mask/share_reasoning": 0.8657428026199341, "mask/share_step_conf": 0.10268114507198334, "num_tokens": 34094405.0, "reward": 0.903618335723877, "reward_std": 0.22886237502098083, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7181081771850586, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7914721965789795, "step": 144 }, { "adv/mean_abs_final_conf": 0.752514123916626, "adv/mean_abs_reasoning": 0.6453818082809448, "adv/mean_abs_step_conf": 0.7151585817337036, "adv/ratio_final_to_reasoning": 1.1659983505284128, "adv/ratio_step_to_reasoning": 1.1081170441395272, "adv/std_final_conf": 0.917668879032135, "adv/std_reasoning": 0.8748682737350464, "adv/std_step_conf": 0.9354785680770874, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7588111888111888, "calib/avg_num_step_conf": 4.94921875, "calib/ece": 0.1354732510288065, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.29730209790209794, "calib/mean_conf": 0.6355555555555555, "calib/mu_c": 0.757902097902098, "calib/mu_w": 0.4606, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.091275720164609, "calib/std_conf": 0.33938457979719, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5594109195402299, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.12373316122149086, "calib/step_q_w": 0.43567775831873906, "calib/step_q_w_n": 571.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1692.0, "completions/max_terminated_length": 1692.0, "completions/mean_length": 513.16015625, "completions/mean_terminated_length": 515.172607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.15466666666666667, "grad_norm": 0.02969875931739807, "kl": 0.123565673828125, "learning_rate": 1.527777777777778e-06, "loss": -0.1137, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03126313537359238, "mask/share_reasoning": 0.8572871685028076, "mask/share_step_conf": 0.1075434759259224, "num_tokens": 34328478.0, "reward": 0.908592164516449, "reward_std": 0.2451344132423401, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7336207032203674, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7843448519706726, "step": 145 }, { "adv/mean_abs_final_conf": 0.7495306134223938, "adv/mean_abs_reasoning": 0.5869273543357849, "adv/mean_abs_step_conf": 0.7415024042129517, "adv/ratio_final_to_reasoning": 1.2770415416582928, "adv/ratio_step_to_reasoning": 1.2633631721801355, "adv/std_final_conf": 0.9171761870384216, "adv/std_reasoning": 0.8100182414054871, "adv/std_step_conf": 0.9355935454368591, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7831020048468825, "calib/avg_num_step_conf": 4.828125, "calib/ece": 0.24512396694214883, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.36363636363636365, "calib/gap": 0.36282587941543665, "calib/mean_conf": 0.6061157024793389, "calib/mu_c": 0.8355056179775281, "calib/mu_w": 0.47267973856209144, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.24173553719008273, "calib/std_conf": 0.365805737861786, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5575, "calib/step_q_c_n": 464.0, "calib/step_q_gap": 0.12269430051813468, "calib/step_q_w": 0.4348056994818653, "calib/step_q_w_n": 772.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 552.859375, "completions/mean_terminated_length": 552.859375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.15573333333333333, "grad_norm": 0.04507608339190483, "kl": 0.1235198974609375, "learning_rate": 1.5e-06, "loss": -0.0665, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.030464837327599525, "mask/share_reasoning": 0.8707108497619629, "mask/share_step_conf": 0.09882433712482452, "num_tokens": 34577226.0, "reward": 0.855940043926239, "reward_std": 0.29292845726013184, "rewards/accuracy_reward_step": 0.3515625, "rewards/final_brier_reward_step": 0.6881687641143799, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7690237760543823, "step": 146 }, { "adv/mean_abs_final_conf": 0.7158942222595215, "adv/mean_abs_reasoning": 0.5292023420333862, "adv/mean_abs_step_conf": 0.7680728435516357, "adv/ratio_final_to_reasoning": 1.3527797694711587, "adv/ratio_step_to_reasoning": 1.451378390731271, "adv/std_final_conf": 0.8983597159385681, "adv/std_reasoning": 0.7927306890487671, "adv/std_step_conf": 0.93496173620224, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7852531430513082, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.20327868852459016, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.430327868852459, "calib/gap": 0.36922324159021414, "calib/mean_conf": 0.6336065573770493, "calib/mu_c": 0.8378899082568808, "calib/mu_w": 0.4686666666666667, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.19508196721311477, "calib/std_conf": 0.37252256081700147, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.6004770318021202, "calib/step_q_c_n": 566.0, "calib/step_q_gap": 0.17474951857460697, "calib/step_q_w": 0.42572751322751323, "calib/step_q_w_n": 756.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2424.0, "completions/max_terminated_length": 2424.0, "completions/mean_length": 578.70703125, "completions/mean_terminated_length": 580.9765014648438, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.1568, "grad_norm": 0.028052711859345436, "kl": 0.11347198486328125, "learning_rate": 1.4722222222222225e-06, "loss": -0.0631, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.02955162152647972, "mask/share_reasoning": 0.8645089864730835, "mask/share_step_conf": 0.10203312337398529, "num_tokens": 34829055.0, "reward": 0.8883394002914429, "reward_std": 0.24087491631507874, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.7088976502418518, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.795124888420105, "step": 147 }, { "adv/mean_abs_final_conf": 0.7483645081520081, "adv/mean_abs_reasoning": 0.632530927658081, "adv/mean_abs_step_conf": 0.765121340751648, "adv/ratio_final_to_reasoning": 1.1831271411862752, "adv/ratio_step_to_reasoning": 1.2096188617756245, "adv/std_final_conf": 0.9116754531860352, "adv/std_reasoning": 0.8267991542816162, "adv/std_step_conf": 0.9356098771095276, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7764132014114716, "calib/avg_num_step_conf": 4.5078125, "calib/ece": 0.16439024390243906, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.4959349593495935, "calib/gap": 0.34411125717844054, "calib/mean_conf": 0.706260162601626, "calib/mu_c": 0.8419463087248322, "calib/mu_w": 0.4978350515463917, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.13247967479674802, "calib/std_conf": 0.352635601258254, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5817985611510792, "calib/step_q_c_n": 695.0, "calib/step_q_gap": 0.13275716681556726, "calib/step_q_w": 0.44904139433551193, "calib/step_q_w_n": 459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2280.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 517.2265625, "completions/mean_terminated_length": 519.2549438476562, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.15786666666666666, "grad_norm": 0.030412495136260986, "kl": 0.1313934326171875, "learning_rate": 1.4444444444444445e-06, "loss": -0.1081, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03274453431367874, "mask/share_reasoning": 0.8602826595306396, "mask/share_step_conf": 0.10306654870510101, "num_tokens": 35066577.0, "reward": 0.907225489616394, "reward_std": 0.2797033190727234, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7349027395248413, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7772043943405151, "step": 148 }, { "adv/mean_abs_final_conf": 0.7614589929580688, "adv/mean_abs_reasoning": 0.6402909755706787, "adv/mean_abs_step_conf": 0.7309304475784302, "adv/ratio_final_to_reasoning": 1.1892389897880342, "adv/ratio_step_to_reasoning": 1.1415598149372108, "adv/std_final_conf": 0.9285000562667847, "adv/std_reasoning": 0.859099268913269, "adv/std_step_conf": 0.9354667067527771, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8360772357723576, "calib/avg_num_step_conf": 5.36328125, "calib/ece": 0.16386831275720157, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.3991769547325103, "calib/gap": 0.4133841463414635, "calib/mean_conf": 0.6461728395061728, "calib/mu_c": 0.8554166666666667, "calib/mu_w": 0.44203252032520324, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.1581069958847736, "calib/std_conf": 0.35694947101657876, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5887889273356403, "calib/step_q_c_n": 578.0, "calib/step_q_gap": 0.1774052795368981, "calib/step_q_w": 0.41138364779874215, "calib/step_q_w_n": 795.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 613.9609375, "completions/mean_terminated_length": 613.9609375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.15893333333333334, "grad_norm": 0.024701019749045372, "kl": 0.1152801513671875, "learning_rate": 1.4166666666666667e-06, "loss": -0.0094, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.027972478419542313, "mask/share_reasoning": 0.8716949224472046, "mask/share_step_conf": 0.10033257305622101, "num_tokens": 35328207.0, "reward": 0.9243149161338806, "reward_std": 0.2653235197067261, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7544609308242798, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8129189014434814, "step": 149 }, { "adv/mean_abs_final_conf": 0.7447877526283264, "adv/mean_abs_reasoning": 0.5853399038314819, "adv/mean_abs_step_conf": 0.7393544316291809, "adv/ratio_final_to_reasoning": 1.272402150875313, "adv/ratio_step_to_reasoning": 1.2631198160070074, "adv/std_final_conf": 0.9059438705444336, "adv/std_reasoning": 0.8266521692276001, "adv/std_step_conf": 0.9355303645133972, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7687435098650052, "calib/avg_num_step_conf": 4.73046875, "calib/ece": 0.24756198347107436, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5909090909090909, "calib/gap": 0.27484735202492194, "calib/mean_conf": 0.7878099173553718, "calib/mu_c": 0.9093333333333332, "calib/mu_w": 0.6344859813084113, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.23876033057851237, "calib/std_conf": 0.3046861225754724, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.60217503900156, "calib/step_q_c_n": 641.0, "calib/step_q_gap": 0.11470135479103366, "calib/step_q_w": 0.48747368421052634, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 486.609375, "completions/mean_terminated_length": 486.609375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.16, "grad_norm": 0.0389578640460968, "kl": 0.1333160400390625, "learning_rate": 1.3888888888888892e-06, "loss": -0.0673, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03438667953014374, "mask/share_reasoning": 0.8491482734680176, "mask/share_step_conf": 0.11646504700183868, "num_tokens": 35557739.0, "reward": 0.889436662197113, "reward_std": 0.26576003432273865, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.69478440284729, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.791120171546936, "step": 150 }, { "adv/mean_abs_final_conf": 0.7764695882797241, "adv/mean_abs_reasoning": 0.6994028091430664, "adv/mean_abs_step_conf": 0.7512112855911255, "adv/ratio_final_to_reasoning": 1.110189404630906, "adv/ratio_step_to_reasoning": 1.0740753050613805, "adv/std_final_conf": 0.9359955787658691, "adv/std_reasoning": 0.8903999924659729, "adv/std_step_conf": 0.9358260035514832, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.7411612193588937, "calib/avg_num_step_conf": 4.6015625, "calib/ece": 0.25863247863247874, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.3888888888888889, "calib/gap": 0.3447061596480202, "calib/mean_conf": 0.6183760683760683, "calib/mu_c": 0.8363953488372093, "calib/mu_w": 0.49168918918918914, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.2547435897435898, "calib/std_conf": 0.3808581342201271, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5588489208633094, "calib/step_q_c_n": 417.0, "calib/step_q_gap": 0.13473591166488624, "calib/step_q_w": 0.42411300919842315, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 573.1015625, "completions/mean_terminated_length": 575.3490600585938, "completions/min_length": 0.0, "completions/min_terminated_length": 82.0, "epoch": 0.16106666666666666, "grad_norm": 0.03305526822805405, "kl": 0.1082305908203125, "learning_rate": 1.3611111111111112e-06, "loss": -0.2121, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.027798429131507874, "mask/share_reasoning": 0.8735820055007935, "mask/share_step_conf": 0.09471327811479568, "num_tokens": 35811477.0, "reward": 0.8235229849815369, "reward_std": 0.28725647926330566, "rewards/accuracy_reward_step": 0.33984375, "rewards/final_brier_reward_step": 0.6552902460098267, "rewards/format_reward_step": 0.90234375, "rewards/step_l2_reward": 0.7433182001113892, "step": 151 }, { "adv/mean_abs_final_conf": 0.8092677593231201, "adv/mean_abs_reasoning": 0.6949714422225952, "adv/mean_abs_step_conf": 0.7740131616592407, "adv/ratio_final_to_reasoning": 1.164461890311626, "adv/ratio_step_to_reasoning": 1.1137337660722595, "adv/std_final_conf": 0.9358307123184204, "adv/std_reasoning": 0.8591673374176025, "adv/std_step_conf": 0.9356124401092529, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.6775272727272726, "calib/avg_num_step_conf": 4.77734375, "calib/ece": 0.23902127659574468, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.39148936170212767, "calib/gap": 0.25546181818181823, "calib/mean_conf": 0.6322978723404256, "calib/mu_c": 0.7681818181818182, "calib/mu_w": 0.51272, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.20161702127659575, "calib/std_conf": 0.3734107267427342, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5278633975481611, "calib/step_q_c_n": 571.0, "calib/step_q_gap": 0.10728057546227154, "calib/step_q_w": 0.4205828220858896, "calib/step_q_w_n": 652.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 567.6328125, "completions/mean_terminated_length": 567.6328125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.16213333333333332, "grad_norm": 0.03677584230899811, "kl": 0.1179046630859375, "learning_rate": 1.3333333333333334e-06, "loss": -0.1297, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.02956000715494156, "mask/share_reasoning": 0.86711585521698, "mask/share_step_conf": 0.10332408547401428, "num_tokens": 36062183.0, "reward": 0.8294390439987183, "reward_std": 0.3084717392921448, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6317847967147827, "rewards/format_reward_step": 0.89453125, "rewards/step_l2_reward": 0.7606871128082275, "step": 152 }, { "adv/mean_abs_final_conf": 0.7374790906906128, "adv/mean_abs_reasoning": 0.5551480650901794, "adv/mean_abs_step_conf": 0.7407870292663574, "adv/ratio_final_to_reasoning": 1.3284367487992867, "adv/ratio_step_to_reasoning": 1.3343954088104808, "adv/std_final_conf": 0.9241796731948853, "adv/std_reasoning": 0.826519787311554, "adv/std_step_conf": 0.9351882338523865, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.719258064516129, "calib/avg_num_step_conf": 4.8203125, "calib/ece": 0.20381526104417674, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.357429718875502, "calib/gap": 0.28110064516129024, "calib/mean_conf": 0.5935341365461848, "calib/mu_c": 0.73352, "calib/mu_w": 0.4524193548387097, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.14767068273092374, "calib/std_conf": 0.37711699610913185, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5223010380622837, "calib/step_q_c_n": 578.0, "calib/step_q_gap": 0.14027359903789338, "calib/step_q_w": 0.3820274390243903, "calib/step_q_w_n": 656.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1892.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 541.921875, "completions/mean_terminated_length": 541.921875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.1632, "grad_norm": 0.03512969985604286, "kl": 0.1247406005859375, "learning_rate": 1.3055555555555556e-06, "loss": -0.0449, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0296354778110981, "mask/share_reasoning": 0.8736000657081604, "mask/share_step_conf": 0.09676443040370941, "num_tokens": 36308235.0, "reward": 0.8974446654319763, "reward_std": 0.22768069803714752, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7062948942184448, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8010944128036499, "step": 153 }, { "adv/mean_abs_final_conf": 0.7104947566986084, "adv/mean_abs_reasoning": 0.5184451937675476, "adv/mean_abs_step_conf": 0.7680416107177734, "adv/ratio_final_to_reasoning": 1.3704336837138642, "adv/ratio_step_to_reasoning": 1.4814325987601613, "adv/std_final_conf": 0.9080306887626648, "adv/std_reasoning": 0.7754558324813843, "adv/std_step_conf": 0.9346005916595459, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7593478260869566, "calib/avg_num_step_conf": 4.74609375, "calib/ece": 0.2056862745098039, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4117647058823529, "calib/gap": 0.36628260869565216, "calib/mean_conf": 0.609686274509804, "calib/mu_c": 0.8107826086956521, "calib/mu_w": 0.44449999999999995, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.18219607843137253, "calib/std_conf": 0.38449240744160135, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5532495164410057, "calib/step_q_c_n": 517.0, "calib/step_q_gap": 0.13836412962152145, "calib/step_q_w": 0.4148853868194843, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1379.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 518.19140625, "completions/mean_terminated_length": 520.2235717773438, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.16426666666666667, "grad_norm": 0.03980162367224693, "kl": 0.1353759765625, "learning_rate": 1.2777777777777779e-06, "loss": -0.0146, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031303271651268005, "mask/share_reasoning": 0.8627222180366516, "mask/share_step_conf": 0.10206824541091919, "num_tokens": 36545332.0, "reward": 0.9230555295944214, "reward_std": 0.19609469175338745, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7388566136360168, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8236606121063232, "step": 154 }, { "adv/mean_abs_final_conf": 0.7740155458450317, "adv/mean_abs_reasoning": 0.6197652220726013, "adv/mean_abs_step_conf": 0.7698578238487244, "adv/ratio_final_to_reasoning": 1.248885091126267, "adv/ratio_step_to_reasoning": 1.2421765475548752, "adv/std_final_conf": 0.936392068862915, "adv/std_reasoning": 0.826673686504364, "adv/std_step_conf": 0.9354244470596313, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6865111022470416, "calib/avg_num_step_conf": 4.83984375, "calib/ece": 0.20919028340080975, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.22672064777327935, "calib/gap": 0.25856069671586235, "calib/mean_conf": 0.48489878542510123, "calib/mu_c": 0.6293577981651377, "calib/mu_w": 0.37079710144927536, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.12639676113360326, "calib/std_conf": 0.3776904697899919, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.4596691176470588, "calib/step_q_c_n": 544.0, "calib/step_q_gap": 0.061090700380871776, "calib/step_q_w": 0.39857841726618704, "calib/step_q_w_n": 695.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 509.3359375, "completions/mean_terminated_length": 509.3359375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.16533333333333333, "grad_norm": 0.046036504209041595, "kl": 0.1375579833984375, "learning_rate": 1.25e-06, "loss": -0.0607, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03247381001710892, "mask/share_reasoning": 0.8612602949142456, "mask/share_step_conf": 0.10626588761806488, "num_tokens": 36782938.0, "reward": 0.879797637462616, "reward_std": 0.24189157783985138, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.6951472759246826, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7894479632377625, "step": 155 }, { "adv/mean_abs_final_conf": 0.7444911003112793, "adv/mean_abs_reasoning": 0.5075423717498779, "adv/mean_abs_step_conf": 0.7232406139373779, "adv/ratio_final_to_reasoning": 1.4668550681679284, "adv/ratio_step_to_reasoning": 1.4249856843357274, "adv/std_final_conf": 0.9285725355148315, "adv/std_reasoning": 0.7755016088485718, "adv/std_step_conf": 0.9355357885360718, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7278023302531136, "calib/avg_num_step_conf": 5.05078125, "calib/ece": 0.24865306122448982, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.42857142857142855, "calib/gap": 0.29628230882549883, "calib/mean_conf": 0.6317551020408164, "calib/mu_c": 0.7901754385964912, "calib/mu_w": 0.49389312977099237, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.2075510204081633, "calib/std_conf": 0.3830433731828229, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.504851294498382, "calib/step_q_c_n": 618.0, "calib/step_q_gap": 0.09015499820208561, "calib/step_q_w": 0.41469629629629634, "calib/step_q_w_n": 675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2218.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 526.1875, "completions/mean_terminated_length": 528.2510375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 83.0, "epoch": 0.1664, "grad_norm": 0.04315432161092758, "kl": 0.126495361328125, "learning_rate": 1.2222222222222223e-06, "loss": -0.0911, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.031666506081819534, "mask/share_reasoning": 0.8552207946777344, "mask/share_step_conf": 0.1092064157128334, "num_tokens": 37022402.0, "reward": 0.8750288486480713, "reward_std": 0.24085386097431183, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.6780582070350647, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7946557998657227, "step": 156 }, { "adv/mean_abs_final_conf": 0.708011269569397, "adv/mean_abs_reasoning": 0.5233821868896484, "adv/mean_abs_step_conf": 0.7319657206535339, "adv/ratio_final_to_reasoning": 1.3527614949545013, "adv/ratio_step_to_reasoning": 1.3985300588914844, "adv/std_final_conf": 0.9187631011009216, "adv/std_reasoning": 0.7928596138954163, "adv/std_step_conf": 0.9353150725364685, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8377544529262086, "calib/avg_num_step_conf": 5.3046875, "calib/ece": 0.13828685258964146, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4302788844621514, "calib/gap": 0.4907907124681934, "calib/mean_conf": 0.6067330677290836, "calib/mu_c": 0.8413740458015267, "calib/mu_w": 0.35058333333333336, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11155378486055781, "calib/std_conf": 0.39990298183762135, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5631897711978466, "calib/step_q_c_n": 743.0, "calib/step_q_gap": 0.1772710720108548, "calib/step_q_w": 0.38591869918699184, "calib/step_q_w_n": 615.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1956.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 547.1328125, "completions/mean_terminated_length": 547.1328125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.16746666666666668, "grad_norm": 0.036939799785614014, "kl": 0.121429443359375, "learning_rate": 1.1944444444444446e-06, "loss": -0.0261, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03212355449795723, "mask/share_reasoning": 0.8532459735870361, "mask/share_step_conf": 0.11463050544261932, "num_tokens": 37266196.0, "reward": 0.970373272895813, "reward_std": 0.21288573741912842, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.8059629201889038, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8379085063934326, "step": 157 }, { "adv/mean_abs_final_conf": 0.7646756768226624, "adv/mean_abs_reasoning": 0.6326032876968384, "adv/mean_abs_step_conf": 0.7455942034721375, "adv/ratio_final_to_reasoning": 1.208776008115084, "adv/ratio_step_to_reasoning": 1.178612596508426, "adv/std_final_conf": 0.9190730452537537, "adv/std_reasoning": 0.8590472340583801, "adv/std_step_conf": 0.935570478439331, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6551655891278533, "calib/avg_num_step_conf": 4.93359375, "calib/ece": 0.24987951807228914, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4859437751004016, "calib/gap": 0.18371684918854747, "calib/mean_conf": 0.7106024096385543, "calib/mu_c": 0.788811188811189, "calib/mu_w": 0.6050943396226415, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.19309236947791164, "calib/std_conf": 0.34552218579669786, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5519859154929578, "calib/step_q_c_n": 710.0, "calib/step_q_gap": 0.0916965845707155, "calib/step_q_w": 0.46028933092224233, "calib/step_q_w_n": 553.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 517.7265625, "completions/mean_terminated_length": 517.7265625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.16853333333333334, "grad_norm": 0.0262977983802557, "kl": 0.1364593505859375, "learning_rate": 1.1666666666666668e-06, "loss": 0.0338, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03399697691202164, "mask/share_reasoning": 0.8510940670967102, "mask/share_step_conf": 0.11490896344184875, "num_tokens": 37503974.0, "reward": 0.8746780157089233, "reward_std": 0.24372267723083496, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.674838662147522, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.771392285823822, "step": 158 }, { "adv/mean_abs_final_conf": 0.7417625188827515, "adv/mean_abs_reasoning": 0.5368902683258057, "adv/mean_abs_step_conf": 0.7408811450004578, "adv/ratio_final_to_reasoning": 1.3815905458592173, "adv/ratio_step_to_reasoning": 1.3799489182598903, "adv/std_final_conf": 0.9166836738586426, "adv/std_reasoning": 0.7928199768066406, "adv/std_step_conf": 0.9350181221961975, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7813659247482776, "calib/avg_num_step_conf": 4.91796875, "calib/ece": 0.16489878542510122, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.3805668016194332, "calib/gap": 0.37062930577636466, "calib/mean_conf": 0.6106477732793523, "calib/mu_c": 0.7772058823529412, "calib/mu_w": 0.40657657657657653, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.1124696356275304, "calib/std_conf": 0.3754937173998822, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5579135618479881, "calib/step_q_c_n": 671.0, "calib/step_q_gap": 0.14823669109968884, "calib/step_q_w": 0.4096768707482993, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1715.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 493.08203125, "completions/mean_terminated_length": 495.0157165527344, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.1696, "grad_norm": 0.04497016221284866, "kl": 0.1365509033203125, "learning_rate": 1.138888888888889e-06, "loss": -0.0894, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.032456062734127045, "mask/share_reasoning": 0.8510378003120422, "mask/share_step_conf": 0.11259990930557251, "num_tokens": 37734987.0, "reward": 0.9372793436050415, "reward_std": 0.2131517231464386, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7540343999862671, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8220866918563843, "step": 159 }, { "adv/mean_abs_final_conf": 0.7424492835998535, "adv/mean_abs_reasoning": 0.5817896127700806, "adv/mean_abs_step_conf": 0.7358402013778687, "adv/ratio_final_to_reasoning": 1.2761473689171288, "adv/ratio_step_to_reasoning": 1.2647874510414607, "adv/std_final_conf": 0.9117441177368164, "adv/std_reasoning": 0.8267434239387512, "adv/std_step_conf": 0.9357884526252747, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7352083747432585, "calib/avg_num_step_conf": 4.9609375, "calib/ece": 0.19443089430894323, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.3699186991869919, "calib/gap": 0.3141780958060029, "calib/mean_conf": 0.5548373983739838, "calib/mu_c": 0.7042635658914729, "calib/mu_w": 0.39008547008546995, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.11243902439024402, "calib/std_conf": 0.39687572288358697, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5302148760330578, "calib/step_q_c_n": 605.0, "calib/step_q_gap": 0.1507111166345616, "calib/step_q_w": 0.37950375939849623, "calib/step_q_w_n": 665.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2315.0, "completions/max_terminated_length": 2315.0, "completions/mean_length": 554.3203125, "completions/mean_terminated_length": 554.3203125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.17066666666666666, "grad_norm": 0.02868187241256237, "kl": 0.1332244873046875, "learning_rate": 1.111111111111111e-06, "loss": -0.0085, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.029985826462507248, "mask/share_reasoning": 0.8714199662208557, "mask/share_step_conf": 0.09859418123960495, "num_tokens": 37981733.0, "reward": 0.8914778828620911, "reward_std": 0.25153106451034546, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6966515779495239, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7980228066444397, "step": 160 }, { "adv/mean_abs_final_conf": 0.7147070169448853, "adv/mean_abs_reasoning": 0.47162288427352905, "adv/mean_abs_step_conf": 0.7598281502723694, "adv/ratio_final_to_reasoning": 1.515420563287115, "adv/ratio_step_to_reasoning": 1.6110926242325612, "adv/std_final_conf": 0.8813939690589905, "adv/std_reasoning": 0.7208422422409058, "adv/std_step_conf": 0.9348978996276855, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.786046511627907, "calib/avg_num_step_conf": 4.71484375, "calib/ece": 0.18036585365853652, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.3699186991869919, "calib/gap": 0.39735755813953494, "calib/mean_conf": 0.5832113821138211, "calib/mu_c": 0.722125, "calib/mu_w": 0.3247674418604651, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.056585365853658504, "calib/std_conf": 0.3924985785698982, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5363151041666666, "calib/step_q_c_n": 768.0, "calib/step_q_gap": 0.12002808822133632, "calib/step_q_w": 0.4162870159453303, "calib/step_q_w_n": 439.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2155.0, "completions/max_terminated_length": 2155.0, "completions/mean_length": 510.08203125, "completions/mean_terminated_length": 510.08203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.17173333333333332, "grad_norm": 0.04517321288585663, "kl": 0.13018798828125, "learning_rate": 1.0833333333333335e-06, "loss": -0.0389, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03298802673816681, "mask/share_reasoning": 0.8571385741233826, "mask/share_step_conf": 0.10987342894077301, "num_tokens": 38216234.0, "reward": 0.9338958859443665, "reward_std": 0.20754006505012512, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.756743311882019, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7946420907974243, "step": 161 }, { "adv/mean_abs_final_conf": 0.7190934419631958, "adv/mean_abs_reasoning": 0.580523669719696, "adv/mean_abs_step_conf": 0.7531288862228394, "adv/ratio_final_to_reasoning": 1.2386978851532577, "adv/ratio_step_to_reasoning": 1.2973267508394364, "adv/std_final_conf": 0.8893725872039795, "adv/std_reasoning": 0.809906005859375, "adv/std_step_conf": 0.9347851276397705, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7156410071423418, "calib/avg_num_step_conf": 4.7109375, "calib/ece": 0.21260000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.452, "calib/gap": 0.275508982035928, "calib/mean_conf": 0.6440400000000001, "calib/mu_c": 0.7355089820359281, "calib/mu_w": 0.4600000000000001, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.09432000000000004, "calib/std_conf": 0.3745665206608835, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5134777376654633, "calib/step_q_c_n": 831.0, "calib/step_q_gap": 0.062251070998796676, "calib/step_q_w": 0.45122666666666666, "calib/step_q_w_n": 375.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1795.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 495.19921875, "completions/mean_terminated_length": 495.19921875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.1728, "grad_norm": 0.027035508304834366, "kl": 0.135833740234375, "learning_rate": 1.0555555555555557e-06, "loss": -0.0237, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03328103572130203, "mask/share_reasoning": 0.8577634692192078, "mask/share_step_conf": 0.1089554876089096, "num_tokens": 38447149.0, "reward": 0.9405144453048706, "reward_std": 0.21334876120090485, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7292284965515137, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8291440010070801, "step": 162 }, { "adv/mean_abs_final_conf": 0.7169132232666016, "adv/mean_abs_reasoning": 0.6146094799041748, "adv/mean_abs_step_conf": 0.7588496208190918, "adv/ratio_final_to_reasoning": 1.1664532466670985, "adv/ratio_step_to_reasoning": 1.2346858381315657, "adv/std_final_conf": 0.9145965576171875, "adv/std_reasoning": 0.8429942727088928, "adv/std_step_conf": 0.9352772831916809, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7947131608548932, "calib/avg_num_step_conf": 5.6796875, "calib/ece": 0.17284552845528456, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.34959349593495936, "calib/gap": 0.3886931780586249, "calib/mean_conf": 0.5401626016260163, "calib/mu_c": 0.7281889763779527, "calib/mu_w": 0.33949579831932775, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.0983739837398374, "calib/std_conf": 0.3966280306520489, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5100657894736842, "calib/step_q_c_n": 608.0, "calib/step_q_gap": 0.18380101405997262, "calib/step_q_w": 0.32626477541371157, "calib/step_q_w_n": 846.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2221.0, "completions/max_terminated_length": 2221.0, "completions/mean_length": 586.7109375, "completions/mean_terminated_length": 586.7109375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.17386666666666667, "grad_norm": 0.028267567977309227, "kl": 0.12030029296875, "learning_rate": 1.0277777777777777e-06, "loss": -0.0168, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.031356584280729294, "mask/share_reasoning": 0.8597544431686401, "mask/share_step_conf": 0.10888896882534027, "num_tokens": 38702179.0, "reward": 0.9220717549324036, "reward_std": 0.22672028839588165, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7462793588638306, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8088016510009766, "step": 163 }, { "adv/mean_abs_final_conf": 0.7333135008811951, "adv/mean_abs_reasoning": 0.6133211255073547, "adv/mean_abs_step_conf": 0.763792872428894, "adv/ratio_final_to_reasoning": 1.1956436365608303, "adv/ratio_step_to_reasoning": 1.245339253228992, "adv/std_final_conf": 0.899974524974823, "adv/std_reasoning": 0.8268165588378906, "adv/std_step_conf": 0.9351148009300232, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8030457197123865, "calib/avg_num_step_conf": 5.27734375, "calib/ece": 0.1518518518518519, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.3004115226337449, "calib/gap": 0.38663614163614163, "calib/mean_conf": 0.5279835390946501, "calib/mu_c": 0.7284615384615385, "calib/mu_w": 0.34182539682539687, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.09917695473251034, "calib/std_conf": 0.3845199665392035, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5197297297297298, "calib/step_q_c_n": 592.0, "calib/step_q_gap": 0.1706783463305202, "calib/step_q_w": 0.34905138339920955, "calib/step_q_w_n": 759.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 588.58984375, "completions/mean_terminated_length": 588.58984375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.17493333333333333, "grad_norm": 0.023231087252497673, "kl": 0.1348876953125, "learning_rate": 1.0000000000000002e-06, "loss": -0.0142, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.027646934613585472, "mask/share_reasoning": 0.8728476166725159, "mask/share_step_conf": 0.09950542449951172, "num_tokens": 38958994.0, "reward": 0.9149197340011597, "reward_std": 0.24809305369853973, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7365156412124634, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.8151988387107849, "step": 164 }, { "adv/mean_abs_final_conf": 0.747089684009552, "adv/mean_abs_reasoning": 0.5145565271377563, "adv/mean_abs_step_conf": 0.730810284614563, "adv/ratio_final_to_reasoning": 1.4519098380993662, "adv/ratio_step_to_reasoning": 1.4202721101989082, "adv/std_final_conf": 0.9169217348098755, "adv/std_reasoning": 0.7754154801368713, "adv/std_step_conf": 0.9349702000617981, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7628610261637784, "calib/avg_num_step_conf": 4.93359375, "calib/ece": 0.18192622950819667, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.35655737704918034, "calib/gap": 0.39493510023785255, "calib/mean_conf": 0.5325, "calib/mu_c": 0.7510091743119266, "calib/mu_w": 0.35607407407407404, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.13385245901639342, "calib/std_conf": 0.4102656826105114, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5064244741873806, "calib/step_q_c_n": 523.0, "calib/step_q_gap": 0.11672177148467788, "calib/step_q_w": 0.3897027027027027, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1980.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 564.90625, "completions/mean_terminated_length": 564.90625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.176, "grad_norm": 0.028785517439246178, "kl": 0.135772705078125, "learning_rate": 9.722222222222224e-07, "loss": -0.0701, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.02926689013838768, "mask/share_reasoning": 0.8692151308059692, "mask/share_step_conf": 0.10151800513267517, "num_tokens": 39209186.0, "reward": 0.9045617580413818, "reward_std": 0.2308696061372757, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.7248390913009644, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8108468651771545, "step": 165 }, { "adv/mean_abs_final_conf": 0.7528688907623291, "adv/mean_abs_reasoning": 0.48331791162490845, "adv/mean_abs_step_conf": 0.7518739700317383, "adv/ratio_final_to_reasoning": 1.5577094758007082, "adv/ratio_step_to_reasoning": 1.555650953435489, "adv/std_final_conf": 0.9187546372413635, "adv/std_reasoning": 0.7395718693733215, "adv/std_step_conf": 0.9352030754089355, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7663355408388521, "calib/avg_num_step_conf": 5.38671875, "calib/ece": 0.1821991701244813, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.4190871369294606, "calib/gap": 0.4205172921265635, "calib/mean_conf": 0.5820331950207468, "calib/mu_c": 0.7390728476821191, "calib/mu_w": 0.3185555555555556, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.06883817427385891, "calib/std_conf": 0.4090059994814056, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.4997358943577431, "calib/step_q_c_n": 833.0, "calib/step_q_gap": 0.13204358666543548, "calib/step_q_w": 0.36769230769230765, "calib/step_q_w_n": 546.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2181.0, "completions/max_terminated_length": 2181.0, "completions/mean_length": 581.94921875, "completions/mean_terminated_length": 581.94921875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.17706666666666668, "grad_norm": 0.04616737365722656, "kl": 0.11749267578125, "learning_rate": 9.444444444444445e-07, "loss": -0.0654, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.02924707904458046, "mask/share_reasoning": 0.8627008199691772, "mask/share_step_conf": 0.1080520898103714, "num_tokens": 39464349.0, "reward": 0.924762487411499, "reward_std": 0.23554068803787231, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7360988855361938, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.80951988697052, "step": 166 }, { "adv/mean_abs_final_conf": 0.7144575119018555, "adv/mean_abs_reasoning": 0.5261666178703308, "adv/mean_abs_step_conf": 0.7523694634437561, "adv/ratio_final_to_reasoning": 1.3578541238393944, "adv/ratio_step_to_reasoning": 1.4299072535026747, "adv/std_final_conf": 0.882361888885498, "adv/std_reasoning": 0.7754969596862793, "adv/std_step_conf": 0.9349962472915649, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7076388888888889, "calib/avg_num_step_conf": 5.01171875, "calib/ece": 0.19746987951807232, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.46586345381526106, "calib/gap": 0.286376984126984, "calib/mean_conf": 0.677710843373494, "calib/mu_c": 0.7984722222222221, "calib/mu_w": 0.5120952380952382, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1484337349397591, "calib/std_conf": 0.36235727106055615, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.512751677852349, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.07468476335420776, "calib/step_q_w": 0.4380669144981412, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 519.20703125, "completions/mean_terminated_length": 519.20703125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.17813333333333334, "grad_norm": 0.046507444232702255, "kl": 0.1273956298828125, "learning_rate": 9.166666666666666e-07, "loss": -0.072, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03145875409245491, "mask/share_reasoning": 0.864819347858429, "mask/share_step_conf": 0.10372191667556763, "num_tokens": 39702874.0, "reward": 0.9286771416664124, "reward_std": 0.20551800727844238, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7300738096237183, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8218116760253906, "step": 167 }, { "adv/mean_abs_final_conf": 0.7601954936981201, "adv/mean_abs_reasoning": 0.7227966785430908, "adv/mean_abs_step_conf": 0.733101487159729, "adv/ratio_final_to_reasoning": 1.0517418193321149, "adv/ratio_step_to_reasoning": 1.0142568566272456, "adv/std_final_conf": 0.9152436256408691, "adv/std_reasoning": 0.9055777788162231, "adv/std_step_conf": 0.935141921043396, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7616805635422657, "calib/avg_num_step_conf": 5.75390625, "calib/ece": 0.167603305785124, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.34710743801652894, "calib/gap": 0.35553910293271984, "calib/mean_conf": 0.5681818181818182, "calib/mu_c": 0.7062837837837838, "calib/mu_w": 0.3507446808510639, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.062107438016528946, "calib/std_conf": 0.386343314973604, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5048571428571428, "calib/step_q_c_n": 805.0, "calib/step_q_gap": 0.1718331907613344, "calib/step_q_w": 0.3330239520958084, "calib/step_q_w_n": 668.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2926.0, "completions/max_terminated_length": 2926.0, "completions/mean_length": 628.80078125, "completions/mean_terminated_length": 628.80078125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.1792, "grad_norm": 0.02880941890180111, "kl": 0.1226959228515625, "learning_rate": 8.88888888888889e-07, "loss": -0.0246, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.02778783068060875, "mask/share_reasoning": 0.8679161667823792, "mask/share_step_conf": 0.104296013712883, "num_tokens": 39968519.0, "reward": 0.9214756488800049, "reward_std": 0.24216872453689575, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7268156409263611, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8130106925964355, "step": 168 }, { "adv/mean_abs_final_conf": 0.7137185335159302, "adv/mean_abs_reasoning": 0.45779332518577576, "adv/mean_abs_step_conf": 0.7305762767791748, "adv/ratio_final_to_reasoning": 1.5590409345227962, "adv/ratio_step_to_reasoning": 1.5958648512026727, "adv/std_final_conf": 0.8747919797897339, "adv/std_reasoning": 0.7394520044326782, "adv/std_step_conf": 0.9345008134841919, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7390500767192077, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.203402489626556, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.42738589211618255, "calib/gap": 0.3206653647649603, "calib/mean_conf": 0.6016597510373444, "calib/mu_c": 0.7440298507462687, "calib/mu_w": 0.4233644859813084, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.12452282157676348, "calib/std_conf": 0.3937295662857602, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5413274336283186, "calib/step_q_c_n": 678.0, "calib/step_q_gap": 0.16220978656949508, "calib/step_q_w": 0.3791176470588235, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2258.0, "completions/max_terminated_length": 2258.0, "completions/mean_length": 574.625, "completions/mean_terminated_length": 574.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.18026666666666666, "grad_norm": 0.03632810339331627, "kl": 0.124847412109375, "learning_rate": 8.611111111111112e-07, "loss": -0.035, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.02968929149210453, "mask/share_reasoning": 0.8687570691108704, "mask/share_step_conf": 0.10155363380908966, "num_tokens": 40219807.0, "reward": 0.9066611528396606, "reward_std": 0.2341693490743637, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7024269700050354, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8194890022277832, "step": 169 }, { "adv/mean_abs_final_conf": 0.7154579162597656, "adv/mean_abs_reasoning": 0.5741154551506042, "adv/mean_abs_step_conf": 0.7530225515365601, "adv/ratio_final_to_reasoning": 1.246191702106476, "adv/ratio_step_to_reasoning": 1.3116221567994963, "adv/std_final_conf": 0.8930612802505493, "adv/std_reasoning": 0.8098737001419067, "adv/std_step_conf": 0.9352511763572693, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7849570585077832, "calib/avg_num_step_conf": 4.94140625, "calib/ece": 0.15979674796747967, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4268292682926829, "calib/gap": 0.41097826086956524, "calib/mean_conf": 0.6163821138211382, "calib/mu_c": 0.7968115942028986, "calib/mu_w": 0.38583333333333336, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.10760162601626017, "calib/std_conf": 0.39772973665533484, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5032117812061712, "calib/step_q_c_n": 713.0, "calib/step_q_gap": 0.12346540439457693, "calib/step_q_w": 0.37974637681159423, "calib/step_q_w_n": 552.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2128.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 582.06640625, "completions/mean_terminated_length": 582.06640625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.18133333333333335, "grad_norm": 0.030894074589014053, "kl": 0.1164398193359375, "learning_rate": 8.333333333333333e-07, "loss": -0.083, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.028358610346913338, "mask/share_reasoning": 0.8738337755203247, "mask/share_step_conf": 0.09780760109424591, "num_tokens": 40472968.0, "reward": 0.9447938799858093, "reward_std": 0.2374449223279953, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7596187591552734, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8315315246582031, "step": 170 }, { "adv/mean_abs_final_conf": 0.7335254549980164, "adv/mean_abs_reasoning": 0.5795778036117554, "adv/mean_abs_step_conf": 0.7598888874053955, "adv/ratio_final_to_reasoning": 1.2656203367811971, "adv/ratio_step_to_reasoning": 1.3111076419248553, "adv/std_final_conf": 0.9092419743537903, "adv/std_reasoning": 0.8100183010101318, "adv/std_step_conf": 0.9349356889724731, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7129726205997393, "calib/avg_num_step_conf": 4.97265625, "calib/ece": 0.2564112903225807, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.3870967741935484, "calib/gap": 0.28493741851368964, "calib/mean_conf": 0.5499596774193548, "calib/mu_c": 0.699322033898305, "calib/mu_w": 0.4143846153846154, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.16528225806451619, "calib/std_conf": 0.41306632481560407, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5213624124932688, "calib/step_q_c_n": 619.0, "calib/step_q_gap": 0.13478748894586817, "calib/step_q_w": 0.3865749235474006, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1631.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 512.66796875, "completions/mean_terminated_length": 512.66796875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1824, "grad_norm": 0.02686500735580921, "kl": 0.128143310546875, "learning_rate": 8.055555555555557e-07, "loss": -0.0714, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.031494684517383575, "mask/share_reasoning": 0.863057553768158, "mask/share_step_conf": 0.10544778406620026, "num_tokens": 40711107.0, "reward": 0.8865392208099365, "reward_std": 0.2288045734167099, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6851121187210083, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8028100728988647, "step": 171 }, { "adv/mean_abs_final_conf": 0.6808489561080933, "adv/mean_abs_reasoning": 0.5288216471672058, "adv/mean_abs_step_conf": 0.7524287700653076, "adv/ratio_final_to_reasoning": 1.2874831424834214, "adv/ratio_step_to_reasoning": 1.422840335859777, "adv/std_final_conf": 0.8914951682090759, "adv/std_reasoning": 0.7754030823707581, "adv/std_step_conf": 0.9347220063209534, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7453017832647463, "calib/avg_num_step_conf": 4.90234375, "calib/ece": 0.1977380952380953, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.48412698412698413, "calib/gap": 0.28341975308641987, "calib/mean_conf": 0.7096428571428572, "calib/mu_c": 0.8108641975308643, "calib/mu_w": 0.5274444444444444, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.13226190476190483, "calib/std_conf": 0.3410526879252575, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5031578947368421, "calib/step_q_c_n": 798.0, "calib/step_q_gap": 0.06532419670620754, "calib/step_q_w": 0.4378336980306346, "calib/step_q_w_n": 457.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 500.8203125, "completions/mean_terminated_length": 500.8203125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.18346666666666667, "grad_norm": 0.039231326431035995, "kl": 0.140899658203125, "learning_rate": 7.777777777777779e-07, "loss": -0.0453, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033453211188316345, "mask/share_reasoning": 0.855216383934021, "mask/share_step_conf": 0.11133037507534027, "num_tokens": 40942669.0, "reward": 0.9580105543136597, "reward_std": 0.19059154391288757, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7589675784111023, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8359596729278564, "step": 172 }, { "adv/mean_abs_final_conf": 0.7563506960868835, "adv/mean_abs_reasoning": 0.640105128288269, "adv/mean_abs_step_conf": 0.7533974051475525, "adv/ratio_final_to_reasoning": 1.1816038688980222, "adv/ratio_step_to_reasoning": 1.17699010967502, "adv/std_final_conf": 0.9212831854820251, "adv/std_reasoning": 0.8430480360984802, "adv/std_step_conf": 0.9354016780853271, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7427426416110999, "calib/avg_num_step_conf": 5.15234375, "calib/ece": 0.21116935483870966, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6129032258064516, "calib/gap": 0.2908378797063381, "calib/mean_conf": 0.7775403225806451, "calib/mu_c": 0.8959863945578231, "calib/mu_w": 0.605148514851485, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.1979838709677419, "calib/std_conf": 0.3265669397168442, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5286137281292059, "calib/step_q_c_n": 743.0, "calib/step_q_gap": 0.0913915059069837, "calib/step_q_w": 0.43722222222222223, "calib/step_q_w_n": 576.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1986.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 547.046875, "completions/mean_terminated_length": 547.046875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.18453333333333333, "grad_norm": 0.0344572588801384, "kl": 0.128143310546875, "learning_rate": 7.5e-07, "loss": -0.0827, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03216134011745453, "mask/share_reasoning": 0.8585292100906372, "mask/share_step_conf": 0.10930944979190826, "num_tokens": 41185873.0, "reward": 0.9170801639556885, "reward_std": 0.2532092332839966, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7265589237213135, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8013514280319214, "step": 173 }, { "adv/mean_abs_final_conf": 0.7960557341575623, "adv/mean_abs_reasoning": 0.6628378629684448, "adv/mean_abs_step_conf": 0.7811141610145569, "adv/ratio_final_to_reasoning": 1.2009810824513194, "adv/ratio_step_to_reasoning": 1.1784392604194711, "adv/std_final_conf": 0.9070166349411011, "adv/std_reasoning": 0.8432024717330933, "adv/std_step_conf": 0.9354541897773743, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5976460331299042, "calib/avg_num_step_conf": 5.31640625, "calib/ece": 0.3175319148936169, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.3574468085106383, "calib/gap": 0.12810883464109252, "calib/mean_conf": 0.5589787234042554, "calib/mu_c": 0.6265765765765764, "calib/mu_w": 0.4984677419354839, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.2020851063829786, "calib/std_conf": 0.39205923832479334, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.46542757417102976, "calib/step_q_c_n": 573.0, "calib/step_q_gap": 0.06306716807965917, "calib/step_q_w": 0.4023604060913706, "calib/step_q_w_n": 788.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2184.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 612.0703125, "completions/mean_terminated_length": 614.4706420898438, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.1856, "grad_norm": 0.03898928686976433, "kl": 0.1152191162109375, "learning_rate": 7.222222222222222e-07, "loss": -0.0874, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.026619601994752884, "mask/share_reasoning": 0.8688848614692688, "mask/share_step_conf": 0.10058927536010742, "num_tokens": 41446795.0, "reward": 0.7974745035171509, "reward_std": 0.2710039019584656, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.5851151943206787, "rewards/format_reward_step": 0.8984375, "rewards/step_l2_reward": 0.7434275150299072, "step": 174 }, { "adv/mean_abs_final_conf": 0.7434352040290833, "adv/mean_abs_reasoning": 0.5331301689147949, "adv/mean_abs_step_conf": 0.7573065757751465, "adv/ratio_final_to_reasoning": 1.3944722084333956, "adv/ratio_step_to_reasoning": 1.4204909418588532, "adv/std_final_conf": 0.9098041653633118, "adv/std_reasoning": 0.7928568720817566, "adv/std_step_conf": 0.9353412985801697, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7717921146953404, "calib/avg_num_step_conf": 5.40625, "calib/ece": 0.19048979591836732, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.3346938775510204, "calib/gap": 0.41721863799283143, "calib/mean_conf": 0.4824897959183674, "calib/mu_c": 0.7464444444444444, "calib/mu_w": 0.3292258064516129, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.15281632653061222, "calib/std_conf": 0.41405107258912305, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5602347417840375, "calib/step_q_c_n": 426.0, "calib/step_q_gap": 0.19993202779656366, "calib/step_q_w": 0.3603027139874739, "calib/step_q_w_n": 958.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2505.0, "completions/max_terminated_length": 2505.0, "completions/mean_length": 574.67578125, "completions/mean_terminated_length": 576.929443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.18666666666666668, "grad_norm": 0.033894505351781845, "kl": 0.120086669921875, "learning_rate": 6.944444444444446e-07, "loss": -0.0982, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.028536062687635422, "mask/share_reasoning": 0.8603699803352356, "mask/share_step_conf": 0.10718771815299988, "num_tokens": 41699736.0, "reward": 0.903607964515686, "reward_std": 0.2339988648891449, "rewards/accuracy_reward_step": 0.35546875, "rewards/final_brier_reward_step": 0.7357914447784424, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.810486912727356, "step": 175 }, { "adv/mean_abs_final_conf": 0.7065272927284241, "adv/mean_abs_reasoning": 0.5671974420547485, "adv/mean_abs_step_conf": 0.7515043020248413, "adv/ratio_final_to_reasoning": 1.245646119575107, "adv/ratio_step_to_reasoning": 1.3249430380052782, "adv/std_final_conf": 0.8785935640335083, "adv/std_reasoning": 0.8099661469459534, "adv/std_step_conf": 0.9349579215049744, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7959815546772069, "calib/avg_num_step_conf": 5.3359375, "calib/ece": 0.1721862348178138, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.48582995951417, "calib/gap": 0.39831620553359687, "calib/mean_conf": 0.6418218623481781, "calib/mu_c": 0.8272727272727273, "calib/mu_w": 0.4289565217391304, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.13979757085020245, "calib/std_conf": 0.39943421606484, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5598706896551724, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.16825874935666496, "calib/step_q_w": 0.39161194029850743, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2210.0, "completions/max_terminated_length": 2210.0, "completions/mean_length": 547.34375, "completions/mean_terminated_length": 547.34375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.18773333333333334, "grad_norm": 0.02775077521800995, "kl": 0.1219940185546875, "learning_rate": 6.666666666666667e-07, "loss": -0.048, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.031332723796367645, "mask/share_reasoning": 0.8548117876052856, "mask/share_step_conf": 0.1138555184006691, "num_tokens": 41943920.0, "reward": 0.9143821001052856, "reward_std": 0.24923500418663025, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7271945476531982, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8101633787155151, "step": 176 }, { "adv/mean_abs_final_conf": 0.7506808042526245, "adv/mean_abs_reasoning": 0.5949095487594604, "adv/mean_abs_step_conf": 0.7533080577850342, "adv/ratio_final_to_reasoning": 1.2618402340624508, "adv/ratio_step_to_reasoning": 1.2662564575671635, "adv/std_final_conf": 0.9032699465751648, "adv/std_reasoning": 0.8100223541259766, "adv/std_step_conf": 0.9353511333465576, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7749007936507937, "calib/avg_num_step_conf": 5.15625, "calib/ece": 0.17178861788617883, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.4186991869918699, "calib/gap": 0.40023412698412675, "calib/mean_conf": 0.5900813008130081, "calib/mu_c": 0.7853174603174601, "calib/mu_w": 0.38508333333333333, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.12483739837398371, "calib/std_conf": 0.4068688607069138, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5241347626339969, "calib/step_q_c_n": 653.0, "calib/step_q_gap": 0.12902231885588605, "calib/step_q_w": 0.3951124437781109, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 542.515625, "completions/mean_terminated_length": 546.7874145507812, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.1888, "grad_norm": 0.04885758087038994, "kl": 0.118408203125, "learning_rate": 6.388888888888889e-07, "loss": -0.1204, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03018646314740181, "mask/share_reasoning": 0.8533045053482056, "mask/share_step_conf": 0.10869648307561874, "num_tokens": 42186636.0, "reward": 0.9126665592193604, "reward_std": 0.23725032806396484, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.733467161655426, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8043658137321472, "step": 177 }, { "adv/mean_abs_final_conf": 0.712067186832428, "adv/mean_abs_reasoning": 0.5390047430992126, "adv/mean_abs_step_conf": 0.7495079636573792, "adv/ratio_final_to_reasoning": 1.3210777751938267, "adv/ratio_step_to_reasoning": 1.390540571772705, "adv/std_final_conf": 0.8925026059150696, "adv/std_reasoning": 0.7754620909690857, "adv/std_step_conf": 0.9346250891685486, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8412121212121212, "calib/avg_num_step_conf": 5.3125, "calib/ece": 0.12073469387755098, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.44081632653061226, "calib/gap": 0.47537710437710434, "calib/mean_conf": 0.6293061224489797, "calib/mu_c": 0.8427407407407408, "calib/mu_w": 0.36736363636363645, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09951020408163262, "calib/std_conf": 0.38333542641685114, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5588975155279502, "calib/step_q_c_n": 644.0, "calib/step_q_gap": 0.19666287865644178, "calib/step_q_w": 0.3622346368715084, "calib/step_q_w_n": 716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2247.0, "completions/max_terminated_length": 2247.0, "completions/mean_length": 527.1328125, "completions/mean_terminated_length": 527.1328125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.18986666666666666, "grad_norm": 0.03510009124875069, "kl": 0.1291351318359375, "learning_rate": 6.111111111111112e-07, "loss": 0.0409, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03149116411805153, "mask/share_reasoning": 0.8561463356018066, "mask/share_step_conf": 0.11236252635717392, "num_tokens": 42427654.0, "reward": 0.9619373083114624, "reward_std": 0.21293267607688904, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7950069904327393, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8304301500320435, "step": 178 }, { "adv/mean_abs_final_conf": 0.7379262447357178, "adv/mean_abs_reasoning": 0.6535248756408691, "adv/mean_abs_step_conf": 0.723167359828949, "adv/ratio_final_to_reasoning": 1.1291479058269691, "adv/ratio_step_to_reasoning": 1.1065643968330754, "adv/std_final_conf": 0.9199758172035217, "adv/std_reasoning": 0.8747087717056274, "adv/std_step_conf": 0.9348064064979553, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7792224935647811, "calib/avg_num_step_conf": 5.5234375, "calib/ece": 0.13942204301075267, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4153225806451613, "calib/gap": 0.4033064924207423, "calib/mean_conf": 0.6219489247311828, "calib/mu_c": 0.7992086330935252, "calib/mu_w": 0.3959021406727829, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.10044354838709676, "calib/std_conf": 0.3861761245642092, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.503463476070529, "calib/step_q_c_n": 794.0, "calib/step_q_gap": 0.11443121800601286, "calib/step_q_w": 0.38903225806451613, "calib/step_q_w_n": 620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1900.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 550.66796875, "completions/mean_terminated_length": 552.8274536132812, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.19093333333333334, "grad_norm": 0.03115103580057621, "kl": 0.1255645751953125, "learning_rate": 5.833333333333334e-07, "loss": -0.0055, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.030252009630203247, "mask/share_reasoning": 0.8536373972892761, "mask/share_step_conf": 0.11220435798168182, "num_tokens": 42674889.0, "reward": 0.9555507302284241, "reward_std": 0.22918008267879486, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.770561695098877, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8389773368835449, "step": 179 }, { "adv/mean_abs_final_conf": 0.7011657953262329, "adv/mean_abs_reasoning": 0.5596412420272827, "adv/mean_abs_step_conf": 0.7473223209381104, "adv/ratio_final_to_reasoning": 1.2528844242898933, "adv/ratio_step_to_reasoning": 1.3353596283057318, "adv/std_final_conf": 0.8937993049621582, "adv/std_reasoning": 0.8098291158676147, "adv/std_step_conf": 0.9352113604545593, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7252700210748156, "calib/avg_num_step_conf": 5.0703125, "calib/ece": 0.20264, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.436, "calib/gap": 0.2982336670179136, "calib/mean_conf": 0.62888, "calib/mu_c": 0.7529452054794521, "calib/mu_w": 0.4547115384615385, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.12376000000000001, "calib/std_conf": 0.38166208300013243, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5047162162162162, "calib/step_q_c_n": 740.0, "calib/step_q_gap": 0.07845367141334875, "calib/step_q_w": 0.42626254480286746, "calib/step_q_w_n": 558.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 600.7265625, "completions/mean_terminated_length": 603.0823974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 210.0, "epoch": 0.192, "grad_norm": 0.030468562617897987, "kl": 0.122283935546875, "learning_rate": 5.555555555555555e-07, "loss": -0.1283, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02861013635993004, "mask/share_reasoning": 0.8696733117103577, "mask/share_step_conf": 0.09781032800674438, "num_tokens": 42932531.0, "reward": 0.9047421216964722, "reward_std": 0.2203877568244934, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7101773023605347, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7954006195068359, "step": 180 }, { "adv/mean_abs_final_conf": 0.7361574172973633, "adv/mean_abs_reasoning": 0.5702579021453857, "adv/mean_abs_step_conf": 0.7411701083183289, "adv/ratio_final_to_reasoning": 1.290920151264615, "adv/ratio_step_to_reasoning": 1.299710368817247, "adv/std_final_conf": 0.9074108600616455, "adv/std_reasoning": 0.8100024461746216, "adv/std_step_conf": 0.9351939558982849, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.8395074432324895, "calib/avg_num_step_conf": 4.76171875, "calib/ece": 0.14921487603305783, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.384297520661157, "calib/gap": 0.4427721753447212, "calib/mean_conf": 0.590702479338843, "calib/mu_c": 0.8267256637168142, "calib/mu_w": 0.38395348837209303, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.13648760330578508, "calib/std_conf": 0.39298066624636846, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5355160142348754, "calib/step_q_c_n": 562.0, "calib/step_q_gap": 0.1532176885118921, "calib/step_q_w": 0.3822983257229833, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2152.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 516.21875, "completions/mean_terminated_length": 516.21875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.19306666666666666, "grad_norm": 0.03454497084021568, "kl": 0.1400146484375, "learning_rate": 5.277777777777779e-07, "loss": -0.1265, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.030553974211215973, "mask/share_reasoning": 0.8623343706130981, "mask/share_step_conf": 0.10711166262626648, "num_tokens": 43170947.0, "reward": 0.9192020297050476, "reward_std": 0.24850967526435852, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.7540082335472107, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8078334331512451, "step": 181 }, { "adv/mean_abs_final_conf": 0.749147891998291, "adv/mean_abs_reasoning": 0.5334514379501343, "adv/mean_abs_step_conf": 0.7665261030197144, "adv/ratio_final_to_reasoning": 1.4043413115109449, "adv/ratio_step_to_reasoning": 1.4369182431398138, "adv/std_final_conf": 0.8957116007804871, "adv/std_reasoning": 0.7754759192466736, "adv/std_step_conf": 0.9350084662437439, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7560219794262346, "calib/avg_num_step_conf": 5.21484375, "calib/ece": 0.17150793650793655, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4642857142857143, "calib/gap": 0.34841863139735474, "calib/mean_conf": 0.6640476190476191, "calib/mu_c": 0.8175177304964538, "calib/mu_w": 0.4690990990990991, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1380158730158731, "calib/std_conf": 0.37316679655334634, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5327656675749318, "calib/step_q_c_n": 734.0, "calib/step_q_gap": 0.12168413679290185, "calib/step_q_w": 0.41108153078202997, "calib/step_q_w_n": 601.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2878.0, "completions/max_terminated_length": 2878.0, "completions/mean_length": 523.50390625, "completions/mean_terminated_length": 523.50390625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.19413333333333332, "grad_norm": 0.035674456506967545, "kl": 0.13336181640625, "learning_rate": 5.000000000000001e-07, "loss": -0.0079, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03135332465171814, "mask/share_reasoning": 0.8553948998451233, "mask/share_step_conf": 0.11325179040431976, "num_tokens": 43411124.0, "reward": 0.9420223236083984, "reward_std": 0.22298182547092438, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7525800466537476, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8275582194328308, "step": 182 }, { "adv/mean_abs_final_conf": 0.7440370321273804, "adv/mean_abs_reasoning": 0.666670560836792, "adv/mean_abs_step_conf": 0.7296310663223267, "adv/ratio_final_to_reasoning": 1.1160490290638896, "adv/ratio_step_to_reasoning": 1.0944402065789554, "adv/std_final_conf": 0.9097049236297607, "adv/std_reasoning": 0.8749170899391174, "adv/std_step_conf": 0.935576319694519, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7327666666666666, "calib/avg_num_step_conf": 4.421875, "calib/ece": 0.2053061224489796, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.39591836734693875, "calib/gap": 0.31537, "calib/mean_conf": 0.5926530612244898, "calib/mu_c": 0.74712, "calib/mu_w": 0.43175, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.1438775510204082, "calib/std_conf": 0.391795320940714, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.525043630017452, "calib/step_q_c_n": 573.0, "calib/step_q_gap": 0.10416706472228204, "calib/step_q_w": 0.42087656529517, "calib/step_q_w_n": 559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 562.40625, "completions/mean_terminated_length": 564.61181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.1952, "grad_norm": 0.041152384132146835, "kl": 0.1201171875, "learning_rate": 4.7222222222222226e-07, "loss": -0.1033, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.029298175126314163, "mask/share_reasoning": 0.8750410079956055, "mask/share_step_conf": 0.09175451099872589, "num_tokens": 43661780.0, "reward": 0.8991619944572449, "reward_std": 0.2737298905849457, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7152284979820251, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7948142290115356, "step": 183 }, { "adv/mean_abs_final_conf": 0.7543465495109558, "adv/mean_abs_reasoning": 0.5843380689620972, "adv/mean_abs_step_conf": 0.7728927731513977, "adv/ratio_final_to_reasoning": 1.290941989884089, "adv/ratio_step_to_reasoning": 1.3226808489890312, "adv/std_final_conf": 0.9035541415214539, "adv/std_reasoning": 0.7930352687835693, "adv/std_step_conf": 0.9352104663848877, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6918325326012355, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.2148995983935742, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5742971887550201, "calib/gap": 0.25845710363761143, "calib/mean_conf": 0.7393975903614458, "calib/mu_c": 0.8369677419354838, "calib/mu_w": 0.5785106382978724, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.16590361445783122, "calib/std_conf": 0.3532716262621504, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5172530120481927, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.1111004696753114, "calib/step_q_w": 0.40615254237288134, "calib/step_q_w_n": 590.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2478.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 557.3359375, "completions/mean_terminated_length": 557.3359375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.19626666666666667, "grad_norm": 0.042663298547267914, "kl": 0.132781982421875, "learning_rate": 4.444444444444445e-07, "loss": -0.0198, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.029847152531147003, "mask/share_reasoning": 0.8627386093139648, "mask/share_step_conf": 0.10741420835256577, "num_tokens": 43909738.0, "reward": 0.9294754266738892, "reward_std": 0.23524896800518036, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7246004343032837, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8202879428863525, "step": 184 }, { "adv/mean_abs_final_conf": 0.7113676071166992, "adv/mean_abs_reasoning": 0.5241381525993347, "adv/mean_abs_step_conf": 0.7602637410163879, "adv/ratio_final_to_reasoning": 1.3572139398531589, "adv/ratio_step_to_reasoning": 1.4505025769371038, "adv/std_final_conf": 0.9085453748703003, "adv/std_reasoning": 0.7929010391235352, "adv/std_step_conf": 0.9355183839797974, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.8112326667131209, "calib/avg_num_step_conf": 5.34375, "calib/ece": 0.19073611111111122, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.5333333333333333, "calib/gap": 0.41783127772745216, "calib/mean_conf": 0.681013888888889, "calib/mu_c": 0.8777427821522309, "calib/mu_w": 0.45991150442477874, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.1712916666666668, "calib/std_conf": 0.3895297970246458, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5331184407796102, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.1487104521918784, "calib/step_q_w": 0.38440798858773184, "calib/step_q_w_n": 701.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2726.0, "completions/max_terminated_length": 2726.0, "completions/mean_length": 575.1015625, "completions/mean_terminated_length": 577.3568725585938, "completions/min_length": 0.0, "completions/min_terminated_length": 32.0, "epoch": 0.19733333333333333, "grad_norm": 0.04241366311907768, "kl": 0.11263275146484375, "learning_rate": 4.1666666666666667e-07, "loss": 0.024, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.029185505583882332, "mask/share_reasoning": 0.8618265986442566, "mask/share_step_conf": 0.10508161783218384, "num_tokens": 44163884.0, "reward": 0.8901246786117554, "reward_std": 0.2551291584968567, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7246838212013245, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7719718217849731, "step": 185 }, { "adv/mean_abs_final_conf": 0.7126970291137695, "adv/mean_abs_reasoning": 0.5148271918296814, "adv/mean_abs_step_conf": 0.7509140968322754, "adv/ratio_final_to_reasoning": 1.3843422422597071, "adv/ratio_step_to_reasoning": 1.4585750495492433, "adv/std_final_conf": 0.8954695463180542, "adv/std_reasoning": 0.7754849195480347, "adv/std_step_conf": 0.9343070387840271, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7468113975576662, "calib/avg_num_step_conf": 5.828125, "calib/ece": 0.2148360655737705, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.3423364993215739, "calib/mean_conf": 0.6474590163934427, "calib/mu_c": 0.8017910447761194, "calib/mu_w": 0.4594545454545455, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.15655737704918035, "calib/std_conf": 0.39706414701523246, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5039824120603015, "calib/step_q_c_n": 796.0, "calib/step_q_gap": 0.10428413619823246, "calib/step_q_w": 0.399698275862069, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2154.0, "completions/max_terminated_length": 2154.0, "completions/mean_length": 559.98046875, "completions/mean_terminated_length": 562.176513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 28.0, "epoch": 0.1984, "grad_norm": 0.0349576398730278, "kl": 0.1226654052734375, "learning_rate": 3.8888888888888895e-07, "loss": 0.0252, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.02924538031220436, "mask/share_reasoning": 0.8526691198348999, "mask/share_step_conf": 0.11417928338050842, "num_tokens": 44412279.0, "reward": 0.9230327010154724, "reward_std": 0.2111315280199051, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7192398309707642, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8315130472183228, "step": 186 }, { "adv/mean_abs_final_conf": 0.7762144804000854, "adv/mean_abs_reasoning": 0.705559492111206, "adv/mean_abs_step_conf": 0.7481155395507812, "adv/ratio_final_to_reasoning": 1.1001403695632561, "adv/ratio_step_to_reasoning": 1.0603153212668674, "adv/std_final_conf": 0.9334425926208496, "adv/std_reasoning": 0.8903596997261047, "adv/std_step_conf": 0.9350556135177612, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6508196721311476, "calib/avg_num_step_conf": 5.796875, "calib/ece": 0.29305785123966943, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.4834710743801653, "calib/gap": 0.18424316939890717, "calib/mean_conf": 0.6740495867768596, "calib/mu_c": 0.7654098360655739, "calib/mu_w": 0.5811666666666667, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2314876033057851, "calib/std_conf": 0.37280504402024484, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.52334375, "calib/step_q_c_n": 640.0, "calib/step_q_gap": 0.12317787322274881, "calib/step_q_w": 0.4001658767772512, "calib/step_q_w_n": 844.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2330.0, "completions/max_terminated_length": 2330.0, "completions/mean_length": 618.01953125, "completions/mean_terminated_length": 618.01953125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.19946666666666665, "grad_norm": 0.034964174032211304, "kl": 0.1192626953125, "learning_rate": 3.611111111111111e-07, "loss": 0.043, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.027607450261712074, "mask/share_reasoning": 0.8714186549186707, "mask/share_step_conf": 0.10097391903400421, "num_tokens": 44672036.0, "reward": 0.8439823389053345, "reward_std": 0.2684392035007477, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6231184005737305, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7828149199485779, "step": 187 }, { "adv/mean_abs_final_conf": 0.732832670211792, "adv/mean_abs_reasoning": 0.6510793566703796, "adv/mean_abs_step_conf": 0.7332849502563477, "adv/ratio_final_to_reasoning": 1.1255658203625114, "adv/ratio_step_to_reasoning": 1.1262604822956874, "adv/std_final_conf": 0.9215667247772217, "adv/std_reasoning": 0.8902121186256409, "adv/std_step_conf": 0.9355761408805847, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7742424242424243, "calib/avg_num_step_conf": 5.515625, "calib/ece": 0.19988980716253446, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.47520661157024796, "calib/gap": 0.36629797979797984, "calib/mean_conf": 0.6526170798898072, "calib/mu_c": 0.8191161616161616, "calib/mu_w": 0.4528181818181818, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.15352617079889808, "calib/std_conf": 0.39370417374071326, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5479680696661829, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.14402616095249, "calib/step_q_w": 0.40394190871369295, "calib/step_q_w_n": 723.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2294.0, "completions/max_terminated_length": 2294.0, "completions/mean_length": 588.5859375, "completions/mean_terminated_length": 593.220458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.20053333333333334, "grad_norm": 0.029144667088985443, "kl": 0.112518310546875, "learning_rate": 3.3333333333333335e-07, "loss": -0.0333, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.029021695256233215, "mask/share_reasoning": 0.8537598848342896, "mask/share_step_conf": 0.10940589010715485, "num_tokens": 44926786.0, "reward": 0.8943223357200623, "reward_std": 0.26499199867248535, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7098451852798462, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7897369265556335, "step": 188 }, { "adv/mean_abs_final_conf": 0.6746816635131836, "adv/mean_abs_reasoning": 0.500690758228302, "adv/mean_abs_step_conf": 0.7490466833114624, "adv/ratio_final_to_reasoning": 1.347501731209399, "adv/ratio_step_to_reasoning": 1.4960265812813676, "adv/std_final_conf": 0.8674516081809998, "adv/std_reasoning": 0.7394338846206665, "adv/std_step_conf": 0.9350956082344055, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.749208547685002, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.2093117408906882, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.41700404858299595, "calib/gap": 0.35126566416040106, "calib/mean_conf": 0.5776518218623482, "calib/mu_c": 0.7397744360902256, "calib/mu_w": 0.3885087719298246, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.12425101214574895, "calib/std_conf": 0.4089823356048055, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5151793400286945, "calib/step_q_c_n": 697.0, "calib/step_q_gap": 0.14206376213924726, "calib/step_q_w": 0.3731155778894472, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1883.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 531.33984375, "completions/mean_terminated_length": 533.423583984375, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.2016, "grad_norm": 0.03737876936793327, "kl": 0.137725830078125, "learning_rate": 3.055555555555556e-07, "loss": -0.0113, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.030570272356271744, "mask/share_reasoning": 0.8609588146209717, "mask/share_step_conf": 0.10456467419862747, "num_tokens": 45170577.0, "reward": 0.9082374572753906, "reward_std": 0.20283764600753784, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7151933312416077, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8059688806533813, "step": 189 }, { "adv/mean_abs_final_conf": 0.7458434700965881, "adv/mean_abs_reasoning": 0.6624995470046997, "adv/mean_abs_step_conf": 0.7595421075820923, "adv/ratio_final_to_reasoning": 1.12580223408258, "adv/ratio_step_to_reasoning": 1.146479436878323, "adv/std_final_conf": 0.9165393710136414, "adv/std_reasoning": 0.8747674822807312, "adv/std_step_conf": 0.9350847005844116, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.79893524409027, "calib/avg_num_step_conf": 5.69140625, "calib/ece": 0.14926829268292682, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.36585365853658536, "calib/gap": 0.39336101252260086, "calib/mean_conf": 0.5851219512195123, "calib/mu_c": 0.7594160583941605, "calib/mu_w": 0.3660550458715596, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08873983739837399, "calib/std_conf": 0.3849577441781069, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.49921896792189685, "calib/step_q_c_n": 717.0, "calib/step_q_gap": 0.14427302197595093, "calib/step_q_w": 0.3549459459459459, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 589.5859375, "completions/mean_terminated_length": 591.8980712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 49.0, "epoch": 0.20266666666666666, "grad_norm": 0.07689023017883301, "kl": 0.1240234375, "learning_rate": 2.7777777777777776e-07, "loss": -0.0043, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02882501296699047, "mask/share_reasoning": 0.8632330894470215, "mask/share_step_conf": 0.1040356308221817, "num_tokens": 45427119.0, "reward": 0.9469718933105469, "reward_std": 0.21352709829807281, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7646961212158203, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8308101296424866, "step": 190 }, { "adv/mean_abs_final_conf": 0.7412534356117249, "adv/mean_abs_reasoning": 0.5375125408172607, "adv/mean_abs_step_conf": 0.7689924240112305, "adv/ratio_final_to_reasoning": 1.3790439837639628, "adv/ratio_step_to_reasoning": 1.4306501999786205, "adv/std_final_conf": 0.8983718752861023, "adv/std_reasoning": 0.7928674817085266, "adv/std_step_conf": 0.9350568652153015, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7611643281165679, "calib/avg_num_step_conf": 6.05078125, "calib/ece": 0.2628979591836735, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5510204081632653, "calib/gap": 0.3385462763086886, "calib/mean_conf": 0.6828979591836735, "calib/mu_c": 0.8708256880733946, "calib/mu_w": 0.532279411764706, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.25044897959183676, "calib/std_conf": 0.38765460351772957, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5619716088328075, "calib/step_q_c_n": 634.0, "calib/step_q_gap": 0.15599346675630477, "calib/step_q_w": 0.40597814207650273, "calib/step_q_w_n": 915.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2208.0, "completions/max_terminated_length": 2208.0, "completions/mean_length": 563.8671875, "completions/mean_terminated_length": 566.0784912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.20373333333333332, "grad_norm": 0.056538574397563934, "kl": 0.1218719482421875, "learning_rate": 2.5000000000000004e-07, "loss": -0.0491, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.030824054032564163, "mask/share_reasoning": 0.8450208902359009, "mask/share_step_conf": 0.12024883925914764, "num_tokens": 45675637.0, "reward": 0.8770207166671753, "reward_std": 0.24598871171474457, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.6788250207901001, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7994351387023926, "step": 191 }, { "adv/mean_abs_final_conf": 0.7328107357025146, "adv/mean_abs_reasoning": 0.6302567720413208, "adv/mean_abs_step_conf": 0.7598267793655396, "adv/ratio_final_to_reasoning": 1.1627177496705585, "adv/ratio_step_to_reasoning": 1.2055828879149655, "adv/std_final_conf": 0.9117442965507507, "adv/std_reasoning": 0.843053936958313, "adv/std_step_conf": 0.9347034692764282, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7464337604015322, "calib/avg_num_step_conf": 5.1796875, "calib/ece": 0.1904453441295546, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.44534412955465585, "calib/gap": 0.36322480517765154, "calib/mean_conf": 0.6183805668016195, "calib/mu_c": 0.7845522388059701, "calib/mu_w": 0.4213274336283186, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1331578947368421, "calib/std_conf": 0.4011287324877253, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5290642458100558, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.10758883597399022, "calib/step_q_w": 0.42147540983606563, "calib/step_q_w_n": 610.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2437.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 567.3671875, "completions/mean_terminated_length": 569.5921630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.2048, "grad_norm": 0.04736936837434769, "kl": 0.1262664794921875, "learning_rate": 2.2222222222222224e-07, "loss": -0.0253, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.030947385355830193, "mask/share_reasoning": 0.8560739159584045, "mask/share_step_conf": 0.10907246917486191, "num_tokens": 45925859.0, "reward": 0.929151177406311, "reward_std": 0.2332858443260193, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7307624816894531, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.832227349281311, "step": 192 }, { "adv/mean_abs_final_conf": 0.7735573053359985, "adv/mean_abs_reasoning": 0.6546435356140137, "adv/mean_abs_step_conf": 0.7471445798873901, "adv/ratio_final_to_reasoning": 1.1816465958232543, "adv/ratio_step_to_reasoning": 1.1412998666314125, "adv/std_final_conf": 0.9217936992645264, "adv/std_reasoning": 0.8267903923988342, "adv/std_step_conf": 0.9356114268302917, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6999547832827338, "calib/avg_num_step_conf": 5.25, "calib/ece": 0.23779599999999992, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.332, "calib/gap": 0.24896563529487753, "calib/mean_conf": 0.570204, "calib/mu_c": 0.7066371681415928, "calib/mu_w": 0.4576715328467153, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17799999999999994, "calib/std_conf": 0.3786099343440422, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5068074324324324, "calib/step_q_c_n": 592.0, "calib/step_q_gap": 0.09998562392179411, "calib/step_q_w": 0.4068218085106383, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 560.5546875, "completions/mean_terminated_length": 562.7529907226562, "completions/min_length": 0.0, "completions/min_terminated_length": 215.0, "epoch": 0.20586666666666667, "grad_norm": 0.03374604508280754, "kl": 0.1214447021484375, "learning_rate": 1.9444444444444447e-07, "loss": -0.0719, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02906222455203533, "mask/share_reasoning": 0.8611117601394653, "mask/share_step_conf": 0.1059197410941124, "num_tokens": 46175073.0, "reward": 0.8976184725761414, "reward_std": 0.24325111508369446, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7013315558433533, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8095303773880005, "step": 193 }, { "adv/mean_abs_final_conf": 0.7530584335327148, "adv/mean_abs_reasoning": 0.595274806022644, "adv/mean_abs_step_conf": 0.7521034479141235, "adv/ratio_final_to_reasoning": 1.2650601468661329, "adv/ratio_step_to_reasoning": 1.2634558699692622, "adv/std_final_conf": 0.919442355632782, "adv/std_reasoning": 0.8100360035896301, "adv/std_step_conf": 0.9344271421432495, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7624487967229903, "calib/avg_num_step_conf": 5.046875, "calib/ece": 0.21579640000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.468, "calib/gap": 0.38410724526369705, "calib/mean_conf": 0.6102835999999999, "calib/mu_c": 0.8008007936507938, "calib/mu_w": 0.41669354838709677, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.16104000000000004, "calib/std_conf": 0.41774921042539387, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5432770061728395, "calib/step_q_c_n": 648.0, "calib/step_q_gap": 0.13268508070700097, "calib/step_q_w": 0.4105919254658385, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 529.07421875, "completions/mean_terminated_length": 529.07421875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.20693333333333333, "grad_norm": 0.04108177125453949, "kl": 0.1233978271484375, "learning_rate": 1.6666666666666668e-07, "loss": -0.0005, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03036157228052616, "mask/share_reasoning": 0.864544153213501, "mask/share_step_conf": 0.10509428381919861, "num_tokens": 46416460.0, "reward": 0.9311327934265137, "reward_std": 0.2457653433084488, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7367550134658813, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8325417637825012, "step": 194 }, { "adv/mean_abs_final_conf": 0.7122936844825745, "adv/mean_abs_reasoning": 0.6348901391029358, "adv/mean_abs_step_conf": 0.7122489213943481, "adv/ratio_final_to_reasoning": 1.1219164397308259, "adv/ratio_step_to_reasoning": 1.121845934480438, "adv/std_final_conf": 0.9117943048477173, "adv/std_reasoning": 0.8591422438621521, "adv/std_step_conf": 0.9355720281600952, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.789870541247762, "calib/avg_num_step_conf": 5.28125, "calib/ece": 0.1592181069958848, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.42386831275720166, "calib/gap": 0.41256369646054264, "calib/mean_conf": 0.592880658436214, "calib/mu_c": 0.7728467153284672, "calib/mu_w": 0.3602830188679245, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.09415637860082306, "calib/std_conf": 0.40700139805775315, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.5230487804878049, "calib/step_q_c_n": 738.0, "calib/step_q_gap": 0.1276579010089775, "calib/step_q_w": 0.3953908794788274, "calib/step_q_w_n": 614.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 574.4609375, "completions/mean_terminated_length": 574.4609375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.208, "grad_norm": 0.031227953732013702, "kl": 0.122222900390625, "learning_rate": 1.3888888888888888e-07, "loss": -0.0479, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.02997458539903164, "mask/share_reasoning": 0.8616489171981812, "mask/share_step_conf": 0.10837653279304504, "num_tokens": 46669506.0, "reward": 0.9046463966369629, "reward_std": 0.2648537755012512, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7336082458496094, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.7834970951080322, "step": 195 }, { "adv/mean_abs_final_conf": 0.7024111151695251, "adv/mean_abs_reasoning": 0.5777875185012817, "adv/mean_abs_step_conf": 0.7787004709243774, "adv/ratio_final_to_reasoning": 1.21569105021774, "adv/ratio_step_to_reasoning": 1.347728093788253, "adv/std_final_conf": 0.9007546901702881, "adv/std_reasoning": 0.8098658919334412, "adv/std_step_conf": 0.9352697134017944, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7357881136950903, "calib/avg_num_step_conf": 5.28125, "calib/ece": 0.24082156862745094, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5254901960784314, "calib/gap": 0.31706367663344404, "calib/mean_conf": 0.7101588235294118, "calib/mu_c": 0.8668255813953489, "calib/mu_w": 0.5497619047619049, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2225490196078431, "calib/std_conf": 0.3690129591095315, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5717097791798108, "calib/step_q_c_n": 634.0, "calib/step_q_gap": 0.1130997513246576, "calib/step_q_w": 0.4586100278551532, "calib/step_q_w_n": 718.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 465.85546875, "completions/mean_terminated_length": 467.682373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.20906666666666668, "grad_norm": 0.028292890638113022, "kl": 0.14324951171875, "learning_rate": 1.1111111111111112e-07, "loss": 0.031, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03412295877933502, "mask/share_reasoning": 0.8412783145904541, "mask/share_step_conf": 0.12069250643253326, "num_tokens": 46891309.0, "reward": 0.9191616773605347, "reward_std": 0.21659040451049805, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7205460667610168, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8209022283554077, "step": 196 }, { "adv/mean_abs_final_conf": 0.7571711540222168, "adv/mean_abs_reasoning": 0.5579431653022766, "adv/mean_abs_step_conf": 0.7533473372459412, "adv/ratio_final_to_reasoning": 1.357075776010276, "adv/ratio_step_to_reasoning": 1.3502223597233252, "adv/std_final_conf": 0.9181612133979797, "adv/std_reasoning": 0.7928898334503174, "adv/std_step_conf": 0.9353706240653992, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7829932851239669, "calib/avg_num_step_conf": 5.61328125, "calib/ece": 0.20963855421686742, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.41365461847389556, "calib/gap": 0.3733709969008265, "calib/mean_conf": 0.5954216867469879, "calib/mu_c": 0.7873553719008265, "calib/mu_w": 0.413984375, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.15955823293172686, "calib/std_conf": 0.4018290547590611, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5099694189602446, "calib/step_q_c_n": 654.0, "calib/step_q_gap": 0.11003327592065332, "calib/step_q_w": 0.3999361430395913, "calib/step_q_w_n": 783.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1864.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 568.50390625, "completions/mean_terminated_length": 568.50390625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.21013333333333334, "grad_norm": 0.025571517646312714, "kl": 0.12994384765625, "learning_rate": 8.333333333333334e-08, "loss": 0.0078, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.028912434354424477, "mask/share_reasoning": 0.8599445819854736, "mask/share_step_conf": 0.11114296317100525, "num_tokens": 47141902.0, "reward": 0.9254905581474304, "reward_std": 0.2386348396539688, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7346000075340271, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8280998468399048, "step": 197 }, { "adv/mean_abs_final_conf": 0.7414839267730713, "adv/mean_abs_reasoning": 0.5541132688522339, "adv/mean_abs_step_conf": 0.7658023238182068, "adv/ratio_final_to_reasoning": 1.3381450480493795, "adv/ratio_step_to_reasoning": 1.3820320986076662, "adv/std_final_conf": 0.9041799306869507, "adv/std_reasoning": 0.7755098342895508, "adv/std_step_conf": 0.9351991415023804, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8141909469302809, "calib/avg_num_step_conf": 5.61328125, "calib/ece": 0.16147849462365585, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4314516129032258, "calib/gap": 0.4539784946236559, "calib/mean_conf": 0.6096505376344086, "calib/mu_c": 0.8366397849462365, "calib/mu_w": 0.38266129032258056, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1355645161290322, "calib/std_conf": 0.4050996201538763, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5397018813314037, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.1324096561303314, "calib/step_q_w": 0.40729222520107233, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 507.2734375, "completions/mean_terminated_length": 507.2734375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.2112, "grad_norm": 0.03272956982254982, "kl": 0.1305694580078125, "learning_rate": 5.555555555555556e-08, "loss": -0.0318, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.033308904618024826, "mask/share_reasoning": 0.8394599556922913, "mask/share_step_conf": 0.127231165766716, "num_tokens": 47377148.0, "reward": 0.9434477090835571, "reward_std": 0.203046053647995, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7719271183013916, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8235619068145752, "step": 198 }, { "adv/mean_abs_final_conf": 0.794552206993103, "adv/mean_abs_reasoning": 0.6170809864997864, "adv/mean_abs_step_conf": 0.7630276679992676, "adv/ratio_final_to_reasoning": 1.2875979399397328, "adv/ratio_step_to_reasoning": 1.2365113894163577, "adv/std_final_conf": 0.9217703342437744, "adv/std_reasoning": 0.8431092500686646, "adv/std_step_conf": 0.9354465007781982, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.69229709246245, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.2812033195020747, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.4730290456431535, "calib/gap": 0.22925795783381575, "calib/mean_conf": 0.6927385892116182, "calib/mu_c": 0.8097457627118645, "calib/mu_w": 0.5804878048780487, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.24215767634854776, "calib/std_conf": 0.3642794999324539, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.554266441821248, "calib/step_q_c_n": 593.0, "calib/step_q_gap": 0.1362556163814645, "calib/step_q_w": 0.41801082543978346, "calib/step_q_w_n": 739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 583.765625, "completions/mean_terminated_length": 583.765625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.21226666666666666, "grad_norm": 0.04038415104150772, "kl": 0.1252593994140625, "learning_rate": 2.777777777777778e-08, "loss": -0.0451, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.030942562967538834, "mask/share_reasoning": 0.8595815300941467, "mask/share_step_conf": 0.10947591811418533, "num_tokens": 47630792.0, "reward": 0.8547995090484619, "reward_std": 0.28671619296073914, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.6443082094192505, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.7863845825195312, "step": 199 }, { "adv/mean_abs_final_conf": 0.6942980289459229, "adv/mean_abs_reasoning": 0.4747547507286072, "adv/mean_abs_step_conf": 0.7476445436477661, "adv/ratio_final_to_reasoning": 1.4624351370478803, "adv/ratio_step_to_reasoning": 1.57480160546125, "adv/std_final_conf": 0.884438693523407, "adv/std_reasoning": 0.7394700646400452, "adv/std_step_conf": 0.9350288510322571, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8390983153824909, "calib/avg_num_step_conf": 5.54296875, "calib/ece": 0.16172131147540994, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5368852459016393, "calib/gap": 0.45574150787075374, "calib/mean_conf": 0.6743442622950819, "calib/mu_c": 0.8648591549295773, "calib/mu_w": 0.4091176470588236, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12704918032786897, "calib/std_conf": 0.39565568676196067, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5899313186813188, "calib/step_q_c_n": 728.0, "calib/step_q_gap": 0.22475042143095697, "calib/step_q_w": 0.3651808972503618, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 578.265625, "completions/mean_terminated_length": 580.5333862304688, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.21333333333333335, "grad_norm": 0.03473372384905815, "kl": 0.1212615966796875, "learning_rate": 0.0, "loss": 0.0451, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.028863336890935898, "mask/share_reasoning": 0.8620182275772095, "mask/share_step_conf": 0.10521218180656433, "num_tokens": 47886876.0, "reward": 0.9555322527885437, "reward_std": 0.22584620118141174, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7749546766281128, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8361097574234009, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 74.20911938300357, "train_runtime": 12329.5189, "train_samples_per_second": 4.153, "train_steps_per_second": 0.016 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 47886876, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }