{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.47760647535324097, "adv/mean_abs_reasoning": 0.4569147527217865, "adv/mean_abs_step_conf": 0.7636127471923828, "adv/ratio_final_to_reasoning": 1.0452857398632815, "adv/ratio_step_to_reasoning": 1.6712367955808674, "adv/std_final_conf": 0.7227410674095154, "adv/std_reasoning": 0.7206857204437256, "adv/std_step_conf": 0.9327075481414795, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5086206896551725, "calib/avg_num_step_conf": 7.875, "calib/ece": 0.2888991935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001713264989126051, "calib/mean_conf": 0.9905120967741936, "calib/mu_c": 0.9905632183908043, "calib/mu_w": 0.9903918918918917, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2888991935483871, "calib/std_conf": 0.0021794159006610276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9119477557027226, "calib/step_q_c_n": 1359.0, "calib/step_q_gap": 0.0056311651395566376, "calib/step_q_w": 0.9063165905631659, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 755.49609375, "completions/mean_terminated_length": 776.7349243164062, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.0010666666666666667, "grad_norm": 0.02494852803647518, "kl": 0.0005849599838256836, "learning_rate": 2.5000000000000004e-07, "loss": -0.1254, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018737709149718285, "mask/share_reasoning": 0.845859944820404, "mask/share_step_conf": 0.10805858671665192, "num_tokens": 300991.0, "reward": 0.8340441584587097, "reward_std": 0.239455908536911, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6851503849029541, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6532504558563232, "step": 1 }, { "adv/mean_abs_final_conf": 0.437887966632843, "adv/mean_abs_reasoning": 0.4207462966442108, "adv/mean_abs_step_conf": 0.7377474308013916, "adv/ratio_final_to_reasoning": 1.0407411072310102, "adv/ratio_step_to_reasoning": 1.7534258451839484, "adv/std_final_conf": 0.6832791566848755, "adv/std_reasoning": 0.6817297339439392, "adv/std_step_conf": 0.9317809343338013, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4872611464968153, "calib/avg_num_step_conf": 7.6953125, "calib/ece": 0.36465737051792824, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024481637078155316, "calib/mean_conf": 0.9901553784860557, "calib/mu_c": 0.990063694267516, "calib/mu_w": 0.9903085106382975, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36465737051792824, "calib/std_conf": 0.001222205307190084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9075405636208369, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": -0.003804868168900244, "calib/step_q_w": 0.9113454317897371, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 840.640625, "completions/mean_terminated_length": 850.6087036132812, "completions/min_length": 0.0, "completions/min_terminated_length": 466.0, "epoch": 0.0021333333333333334, "grad_norm": 0.02235162816941738, "kl": 0.0016820430755615234, "learning_rate": 5.000000000000001e-07, "loss": -0.0082, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01788702979683876, "mask/share_reasoning": 0.8706268668174744, "mask/share_step_conf": 0.09976735711097717, "num_tokens": 619483.0, "reward": 0.7749876976013184, "reward_std": 0.22135242819786072, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6203019618988037, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6109234094619751, "step": 2 }, { "adv/mean_abs_final_conf": 0.4195403754711151, "adv/mean_abs_reasoning": 0.39365625381469727, "adv/mean_abs_step_conf": 0.7564386129379272, "adv/ratio_final_to_reasoning": 1.065753106690392, "adv/ratio_step_to_reasoning": 1.9215714360122924, "adv/std_final_conf": 0.6908338665962219, "adv/std_reasoning": 0.6816103458404541, "adv/std_step_conf": 0.9311906695365906, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49362723046933576, "calib/avg_num_step_conf": 7.76953125, "calib/ece": 0.3032128514056225, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024516419253228783, "calib/mean_conf": 0.9899598393574297, "calib/mu_c": 0.9898830409356724, "calib/mu_w": 0.9901282051282047, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3032128514056225, "calib/std_conf": 0.002101441839670866, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9130769230769231, "calib/step_q_c_n": 1287.0, "calib/step_q_gap": 0.0029487179487178716, "calib/step_q_w": 0.9101282051282052, "calib/step_q_w_n": 702.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 794.72265625, "completions/mean_terminated_length": 810.5538330078125, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.0032, "grad_norm": 0.022143350914120674, "kl": 0.0004946589469909668, "learning_rate": 7.5e-07, "loss": -0.1178, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018371999263763428, "mask/share_reasoning": 0.8613105416297913, "mask/share_step_conf": 0.1007862240076065, "num_tokens": 928188.0, "reward": 0.8207460641860962, "reward_std": 0.19138771295547485, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.673882007598877, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6394850611686707, "step": 3 }, { "adv/mean_abs_final_conf": 0.4774017930030823, "adv/mean_abs_reasoning": 0.4621298909187317, "adv/mean_abs_step_conf": 0.7561236619949341, "adv/ratio_final_to_reasoning": 1.0330467740443914, "adv/ratio_step_to_reasoning": 1.6361712948100624, "adv/std_final_conf": 0.7400308847427368, "adv/std_reasoning": 0.7393571734428406, "adv/std_step_conf": 0.9313958287239075, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49078106852497094, "calib/avg_num_step_conf": 7.82421875, "calib/ece": 0.31812000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00018437862950060335, "calib/mean_conf": 0.9901200000000001, "calib/mu_c": 0.9900595238095237, "calib/mu_w": 0.9902439024390243, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31812000000000007, "calib/std_conf": 0.001088852607105297, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9106230529595015, "calib/step_q_c_n": 1284.0, "calib/step_q_gap": 0.0040305633906558835, "calib/step_q_w": 0.9065924895688456, "calib/step_q_w_n": 719.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2919.0, "completions/max_terminated_length": 2919.0, "completions/mean_length": 786.1640625, "completions/mean_terminated_length": 792.3543090820312, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.004266666666666667, "grad_norm": 0.020883694291114807, "kl": 0.0005683302879333496, "learning_rate": 1.0000000000000002e-06, "loss": 0.017, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018774813041090965, "mask/share_reasoning": 0.8692216873168945, "mask/share_step_conf": 0.10419103503227234, "num_tokens": 1235614.0, "reward": 0.8037224411964417, "reward_std": 0.24146485328674316, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6623257398605347, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6193378567695618, "step": 4 }, { "adv/mean_abs_final_conf": 0.4146665036678314, "adv/mean_abs_reasoning": 0.40848180651664734, "adv/mean_abs_step_conf": 0.7514338493347168, "adv/ratio_final_to_reasoning": 1.015140691831356, "adv/ratio_step_to_reasoning": 1.8395772769970178, "adv/std_final_conf": 0.7020092010498047, "adv/std_reasoning": 0.7015328407287598, "adv/std_step_conf": 0.9299687147140503, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5117074181516784, "calib/avg_num_step_conf": 7.5, "calib/ece": 0.46298340248962655, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0005958695952477111, "calib/mean_conf": 0.989954356846473, "calib/mu_c": 0.9902362204724408, "calib/mu_w": 0.989640350877193, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46298340248962655, "calib/std_conf": 0.0030761415755768723, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.913924949290061, "calib/step_q_c_n": 986.0, "calib/step_q_gap": 0.011276127020253712, "calib/step_q_w": 0.9026488222698072, "calib/step_q_w_n": 934.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2891.0, "completions/max_terminated_length": 2891.0, "completions/mean_length": 818.61328125, "completions/mean_terminated_length": 851.8901977539062, "completions/min_length": 0.0, "completions/min_terminated_length": 476.0, "epoch": 0.005333333333333333, "grad_norm": 0.024567440152168274, "kl": 0.000652611255645752, "learning_rate": 1.25e-06, "loss": -0.1111, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017622657120227814, "mask/share_reasoning": 0.8454317450523376, "mask/share_step_conf": 0.09788313508033752, "num_tokens": 1551867.0, "reward": 0.6550729274749756, "reward_std": 0.2066141963005066, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.50483238697052, "rewards/format_reward_step": 0.93359375, "rewards/step_l1_reward": 0.5193760395050049, "step": 5 }, { "adv/mean_abs_final_conf": 0.43477505445480347, "adv/mean_abs_reasoning": 0.397172749042511, "adv/mean_abs_step_conf": 0.7250959277153015, "adv/ratio_final_to_reasoning": 1.094674938053889, "adv/ratio_step_to_reasoning": 1.8256437015463305, "adv/std_final_conf": 0.7204685211181641, "adv/std_reasoning": 0.7015130519866943, "adv/std_step_conf": 0.9325665235519409, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.479936221100186, "calib/avg_num_step_conf": 8.28125, "calib/ece": 0.4176612903225808, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00040127557799640723, "calib/mean_conf": 0.9902419354838712, "calib/mu_c": 0.990070422535211, "calib/mu_w": 0.9904716981132075, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4176612903225808, "calib/std_conf": 0.0015364966841336825, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9122771403353929, "calib/step_q_c_n": 1133.0, "calib/step_q_gap": -0.003421947810503867, "calib/step_q_w": 0.9156990881458967, "calib/step_q_w_n": 987.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 726.96484375, "completions/mean_terminated_length": 747.4015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 457.0, "epoch": 0.0064, "grad_norm": 0.03217747434973717, "kl": 0.0018596649169921875, "learning_rate": 1.5e-06, "loss": -0.1455, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019399240612983704, "mask/share_reasoning": 0.8410153388977051, "mask/share_step_conf": 0.1122417226433754, "num_tokens": 1843922.0, "reward": 0.7091950178146362, "reward_std": 0.22388148307800293, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.562483549118042, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.5512189269065857, "step": 6 }, { "adv/mean_abs_final_conf": 0.44878897070884705, "adv/mean_abs_reasoning": 0.37493398785591125, "adv/mean_abs_step_conf": 0.7872533202171326, "adv/ratio_final_to_reasoning": 1.196981296028352, "adv/ratio_step_to_reasoning": 2.0997118045208465, "adv/std_final_conf": 0.7194306254386902, "adv/std_reasoning": 0.6612817049026489, "adv/std_step_conf": 0.9330607652664185, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4866102889358703, "calib/avg_num_step_conf": 7.48828125, "calib/ece": 0.33301593625498005, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024348132487661545, "calib/mean_conf": 0.9903864541832669, "calib/mu_c": 0.9903030303030301, "calib/mu_w": 0.9905465116279067, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33301593625498005, "calib/std_conf": 0.0018993749697701293, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9096, "calib/step_q_c_n": 1250.0, "calib/step_q_gap": 0.0018638680659669449, "calib/step_q_w": 0.907736131934033, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 862.09375, "completions/mean_terminated_length": 868.8818969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 0.007466666666666667, "grad_norm": 0.041077230125665665, "kl": 0.0005017518997192383, "learning_rate": 1.75e-06, "loss": -0.007, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.017408661544322968, "mask/share_reasoning": 0.8800719380378723, "mask/share_step_conf": 0.09470690786838531, "num_tokens": 2172042.0, "reward": 0.8021071553230286, "reward_std": 0.20688104629516602, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6507886648178101, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6284255981445312, "step": 7 }, { "adv/mean_abs_final_conf": 0.38437363505363464, "adv/mean_abs_reasoning": 0.3675369918346405, "adv/mean_abs_step_conf": 0.7491437196731567, "adv/ratio_final_to_reasoning": 1.0458093840702956, "adv/ratio_step_to_reasoning": 2.0382811426236134, "adv/std_final_conf": 0.6616430282592773, "adv/std_reasoning": 0.6612933278083801, "adv/std_step_conf": 0.932352602481842, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5152616829508445, "calib/avg_num_step_conf": 7.53125, "calib/ece": 0.32172131147540983, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": 0.0012951601908653965, "calib/mean_conf": 0.9897540983606558, "calib/mu_c": 0.9901840490797544, "calib/mu_w": 0.988888888888889, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32172131147540983, "calib/std_conf": 0.005862231818340563, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9095775792038994, "calib/step_q_c_n": 1231.0, "calib/step_q_gap": 0.006005125832306835, "calib/step_q_w": 0.9035724533715925, "calib/step_q_w_n": 697.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2495.0, "completions/max_terminated_length": 2495.0, "completions/mean_length": 813.83984375, "completions/mean_terminated_length": 833.3720092773438, "completions/min_length": 0.0, "completions/min_terminated_length": 434.0, "epoch": 0.008533333333333334, "grad_norm": 0.020675357431173325, "kl": 0.0009307861328125, "learning_rate": 2.0000000000000003e-06, "loss": -0.1701, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01763816736638546, "mask/share_reasoning": 0.8619741201400757, "mask/share_step_conf": 0.0969502180814743, "num_tokens": 2486897.0, "reward": 0.790767252445221, "reward_std": 0.17744673788547516, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6436171531677246, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6199485659599304, "step": 8 }, { "adv/mean_abs_final_conf": 0.4865243136882782, "adv/mean_abs_reasoning": 0.4591737985610962, "adv/mean_abs_step_conf": 0.7573858499526978, "adv/ratio_final_to_reasoning": 1.059564625013208, "adv/ratio_step_to_reasoning": 1.6494535453157448, "adv/std_final_conf": 0.7589576244354248, "adv/std_reasoning": 0.7576943039894104, "adv/std_step_conf": 0.9348429441452026, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.5019674935842601, "calib/avg_num_step_conf": 8.203125, "calib/ece": 0.2855232067510547, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.1069289991545475e-05, "calib/mean_conf": 0.9901645569620252, "calib/mu_c": 0.9901796407185626, "calib/mu_w": 0.990128571428571, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2855232067510547, "calib/std_conf": 0.001257185723335205, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9104324324324324, "calib/step_q_c_n": 1295.0, "calib/step_q_gap": 0.05269951317777399, "calib/step_q_w": 0.8577329192546584, "calib/step_q_w_n": 805.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 785.21875, "completions/mean_terminated_length": 823.8359985351562, "completions/min_length": 0.0, "completions/min_terminated_length": 449.0, "epoch": 0.0096, "grad_norm": 0.021421346813440323, "kl": 0.0006436705589294434, "learning_rate": 2.25e-06, "loss": -0.1237, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.017515994608402252, "mask/share_reasoning": 0.837012529373169, "mask/share_step_conf": 0.09859649091959, "num_tokens": 2795449.0, "reward": 0.7842223644256592, "reward_std": 0.26130688190460205, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6576511859893799, "rewards/format_reward_step": 0.92578125, "rewards/step_l1_reward": 0.5951685905456543, "step": 9 }, { "adv/mean_abs_final_conf": 0.48653459548950195, "adv/mean_abs_reasoning": 0.47618716955184937, "adv/mean_abs_step_conf": 0.786577582359314, "adv/ratio_final_to_reasoning": 1.0217297453591427, "adv/ratio_step_to_reasoning": 1.6518243931258798, "adv/std_final_conf": 0.7380922436714172, "adv/std_reasoning": 0.7393761873245239, "adv/std_step_conf": 0.9339465498924255, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.4885918253079507, "calib/avg_num_step_conf": 7.77734375, "calib/ece": 0.37236585365853647, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00017749160134328257, "calib/mean_conf": 0.9902520325203251, "calib/mu_c": 0.9901842105263159, "calib/mu_w": 0.9903617021276592, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37236585365853647, "calib/std_conf": 0.0014984734559308761, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9109103690685413, "calib/step_q_c_n": 1138.0, "calib/step_q_gap": -0.004569115104963939, "calib/step_q_w": 0.9154794841735052, "calib/step_q_w_n": 853.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2629.0, "completions/max_terminated_length": 2629.0, "completions/mean_length": 806.5625, "completions/mean_terminated_length": 825.9200439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 491.0, "epoch": 0.010666666666666666, "grad_norm": 0.016852308064699173, "kl": 0.0006428956985473633, "learning_rate": 2.5e-06, "loss": -0.1333, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01787388324737549, "mask/share_reasoning": 0.8611484169960022, "mask/share_step_conf": 0.09754019975662231, "num_tokens": 3108729.0, "reward": 0.7515081167221069, "reward_std": 0.24571159482002258, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6007345914840698, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.5913439989089966, "step": 10 }, { "adv/mean_abs_final_conf": 0.4119076430797577, "adv/mean_abs_reasoning": 0.3775365948677063, "adv/mean_abs_step_conf": 0.7417552471160889, "adv/ratio_final_to_reasoning": 1.0910403088847465, "adv/ratio_step_to_reasoning": 1.964724101450376, "adv/std_final_conf": 0.7001214623451233, "adv/std_reasoning": 0.6815639734268188, "adv/std_step_conf": 0.9311968088150024, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.49538041576258135, "calib/avg_num_step_conf": 7.19140625, "calib/ece": 0.35708870967741946, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -8.77721005110299e-05, "calib/mean_conf": 0.9901532258064517, "calib/mu_c": 0.9901210191082801, "calib/mu_w": 0.9902087912087911, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35708870967741946, "calib/std_conf": 0.0011984153168213823, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9105598650927487, "calib/step_q_c_n": 1186.0, "calib/step_q_gap": 0.0014102467721378575, "calib/step_q_w": 0.9091496183206108, "calib/step_q_w_n": 655.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 855.82421875, "completions/mean_terminated_length": 865.9723510742188, "completions/min_length": 0.0, "completions/min_terminated_length": 319.0, "epoch": 0.011733333333333333, "grad_norm": 0.017827648669481277, "kl": 0.0007291436195373535, "learning_rate": 2.7500000000000004e-06, "loss": -0.0407, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017785221338272095, "mask/share_reasoning": 0.8739660978317261, "mask/share_step_conf": 0.09652996808290482, "num_tokens": 3432300.0, "reward": 0.7854070663452148, "reward_std": 0.1878044605255127, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6201468706130981, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6342610120773315, "step": 11 }, { "adv/mean_abs_final_conf": 0.5210590362548828, "adv/mean_abs_reasoning": 0.4791218936443329, "adv/mean_abs_step_conf": 0.7472091913223267, "adv/ratio_final_to_reasoning": 1.0875291719432074, "adv/ratio_step_to_reasoning": 1.5595388172284261, "adv/std_final_conf": 0.7741323709487915, "adv/std_reasoning": 0.7577143907546997, "adv/std_step_conf": 0.9301572442054749, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.49634538152610447, "calib/avg_num_step_conf": 8.14453125, "calib/ece": 0.301286307053942, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -7.309236947772302e-05, "calib/mean_conf": 0.9900829875518673, "calib/mu_c": 0.9900602409638553, "calib/mu_w": 0.990133333333333, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.301286307053942, "calib/std_conf": 0.0009071871829491895, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9150713266761767, "calib/step_q_c_n": 1402.0, "calib/step_q_gap": 0.01657937938481513, "calib/step_q_w": 0.8984919472913616, "calib/step_q_w_n": 683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2883.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 728.11328125, "completions/mean_terminated_length": 770.2355346679688, "completions/min_length": 0.0, "completions/min_terminated_length": 379.0, "epoch": 0.0128, "grad_norm": 0.046004943549633026, "kl": 0.0010104775428771973, "learning_rate": 3e-06, "loss": -0.2455, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018598269671201706, "mask/share_reasoning": 0.8154910802841187, "mask/share_step_conf": 0.11122316122055054, "num_tokens": 3722873.0, "reward": 0.7990297675132751, "reward_std": 0.24911078810691833, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6502195000648499, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.6306524276733398, "step": 12 }, { "adv/mean_abs_final_conf": 0.3958421051502228, "adv/mean_abs_reasoning": 0.36470189690589905, "adv/mean_abs_step_conf": 0.7317424416542053, "adv/ratio_final_to_reasoning": 1.085385375037845, "adv/ratio_step_to_reasoning": 2.006412491578048, "adv/std_final_conf": 0.6820527911186218, "adv/std_reasoning": 0.6613563299179077, "adv/std_step_conf": 0.9328783750534058, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5020423892100193, "calib/avg_num_step_conf": 7.953125, "calib/ece": 0.2925, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -7.552986512504578e-05, "calib/mean_conf": 0.9900806451612904, "calib/mu_c": 0.9900578034682079, "calib/mu_w": 0.990133333333333, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2925, "calib/std_conf": 0.0023745857845702367, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9135067873303169, "calib/step_q_c_n": 1326.0, "calib/step_q_gap": 0.008746223950034993, "calib/step_q_w": 0.9047605633802819, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 771.23046875, "completions/mean_terminated_length": 786.5936279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 247.0, "epoch": 0.013866666666666666, "grad_norm": 0.025861399248242378, "kl": 0.010151028633117676, "learning_rate": 3.2500000000000002e-06, "loss": -0.1128, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01878371834754944, "mask/share_reasoning": 0.8545902967453003, "mask/share_step_conf": 0.10709473490715027, "num_tokens": 4024900.0, "reward": 0.8200558423995972, "reward_std": 0.19809523224830627, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6814616918563843, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.629743754863739, "step": 13 }, { "adv/mean_abs_final_conf": 0.4357926845550537, "adv/mean_abs_reasoning": 0.4326225519180298, "adv/mean_abs_step_conf": 0.757068932056427, "adv/ratio_final_to_reasoning": 1.007327710085776, "adv/ratio_step_to_reasoning": 1.7499525364546207, "adv/std_final_conf": 0.720037579536438, "adv/std_reasoning": 0.7205063104629517, "adv/std_step_conf": 0.933967649936676, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5018094486116594, "calib/avg_num_step_conf": 7.5078125, "calib/ece": 0.3964820717131474, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.2570075009896726e-05, "calib/mean_conf": 0.9901075697211156, "calib/mu_c": 0.9901208053691273, "calib/mu_w": 0.9900882352941174, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3964820717131474, "calib/std_conf": 0.0009780369344452016, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9137217847769029, "calib/step_q_c_n": 1143.0, "calib/step_q_gap": 0.014770565264707769, "calib/step_q_w": 0.8989512195121951, "calib/step_q_w_n": 779.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 817.81640625, "completions/mean_terminated_length": 827.5138549804688, "completions/min_length": 0.0, "completions/min_terminated_length": 438.0, "epoch": 0.014933333333333333, "grad_norm": 0.0291427131742239, "kl": 0.001428365707397461, "learning_rate": 3.5e-06, "loss": -0.0811, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018288377672433853, "mask/share_reasoning": 0.8677775859832764, "mask/share_step_conf": 0.10221526026725769, "num_tokens": 4339661.0, "reward": 0.7529997825622559, "reward_std": 0.21627689898014069, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5898327827453613, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6036667227745056, "step": 14 }, { "adv/mean_abs_final_conf": 0.43864238262176514, "adv/mean_abs_reasoning": 0.41422170400619507, "adv/mean_abs_step_conf": 0.7357077598571777, "adv/ratio_final_to_reasoning": 1.0589555747064496, "adv/ratio_step_to_reasoning": 1.7761207409985802, "adv/std_final_conf": 0.7389703989028931, "adv/std_reasoning": 0.7204132080078125, "adv/std_step_conf": 0.9317857623100281, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.48568211068211065, "calib/avg_num_step_conf": 7.5625, "calib/ece": 0.432398406374502, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00021685971685947436, "calib/mean_conf": 0.9901673306772908, "calib/mu_c": 0.9900714285714285, "calib/mu_w": 0.990288288288288, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.432398406374502, "calib/std_conf": 0.0011992230412249103, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9132129629629631, "calib/step_q_c_n": 1080.0, "calib/step_q_gap": 0.0035552526825892716, "calib/step_q_w": 0.9096577102803738, "calib/step_q_w_n": 856.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1581.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 737.5234375, "completions/mean_terminated_length": 752.2151489257812, "completions/min_length": 0.0, "completions/min_terminated_length": 421.0, "epoch": 0.016, "grad_norm": 0.025324687361717224, "kl": 0.0011706352233886719, "learning_rate": 3.7500000000000005e-06, "loss": -0.0424, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01934191584587097, "mask/share_reasoning": 0.8563178777694702, "mask/share_step_conf": 0.1048089861869812, "num_tokens": 4636347.0, "reward": 0.7159147262573242, "reward_std": 0.21545015275478363, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5552006363868713, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.571160078048706, "step": 15 }, { "adv/mean_abs_final_conf": 0.3485468029975891, "adv/mean_abs_reasoning": 0.341163694858551, "adv/mean_abs_step_conf": 0.7923579216003418, "adv/ratio_final_to_reasoning": 1.0216409549149101, "adv/ratio_step_to_reasoning": 2.322515360049841, "adv/std_final_conf": 0.6164584159851074, "adv/std_reasoning": 0.6186478734016418, "adv/std_step_conf": 0.9324037432670593, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.78515625, "calib/ece": 0.3124489795918367, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.3306690738754696e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9899999999999995, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3124489795918367, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9126959247648903, "calib/step_q_c_n": 1276.0, "calib/step_q_gap": 0.005485325043830258, "calib/step_q_w": 0.90721059972106, "calib/step_q_w_n": 717.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3046.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 886.8671875, "completions/mean_terminated_length": 900.9445190429688, "completions/min_length": 0.0, "completions/min_terminated_length": 436.0, "epoch": 0.017066666666666667, "grad_norm": 0.026192739605903625, "kl": 0.0015244483947753906, "learning_rate": 4.000000000000001e-06, "loss": -0.0456, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.016579438000917435, "mask/share_reasoning": 0.8733184337615967, "mask/share_step_conf": 0.09447716176509857, "num_tokens": 4972233.0, "reward": 0.7912975549697876, "reward_std": 0.16759978234767914, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6545136570930481, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6062064170837402, "step": 16 }, { "adv/mean_abs_final_conf": 0.5062482357025146, "adv/mean_abs_reasoning": 0.4920458197593689, "adv/mean_abs_step_conf": 0.7667862176895142, "adv/ratio_final_to_reasoning": 1.0288640109778624, "adv/ratio_step_to_reasoning": 1.5583634427877162, "adv/std_final_conf": 0.7398620247840881, "adv/std_reasoning": 0.7394651770591736, "adv/std_step_conf": 0.932716965675354, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5029411764705882, "calib/avg_num_step_conf": 7.98046875, "calib/ece": 0.29616326530612247, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.8823529411999864e-05, "calib/mean_conf": 0.9900408163265306, "calib/mu_c": 0.9900588235294118, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29616326530612247, "calib/std_conf": 0.0006375714021148296, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9090542521994135, "calib/step_q_c_n": 1364.0, "calib/step_q_gap": 0.0075225879873369594, "calib/step_q_w": 0.9015316642120765, "calib/step_q_w_n": 679.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3004.0, "completions/max_terminated_length": 3004.0, "completions/mean_length": 786.265625, "completions/mean_terminated_length": 808.3694458007812, "completions/min_length": 0.0, "completions/min_terminated_length": 350.0, "epoch": 0.018133333333333335, "grad_norm": 0.03347507119178772, "kl": 0.002835869789123535, "learning_rate": 4.25e-06, "loss": -0.1046, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018158677965402603, "mask/share_reasoning": 0.8490204811096191, "mask/share_step_conf": 0.10547702014446259, "num_tokens": 5277045.0, "reward": 0.8143203258514404, "reward_std": 0.26758062839508057, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6698265075683594, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6338140964508057, "step": 17 }, { "adv/mean_abs_final_conf": 0.4228481650352478, "adv/mean_abs_reasoning": 0.4182824492454529, "adv/mean_abs_step_conf": 0.738125205039978, "adv/ratio_final_to_reasoning": 1.0109153893452405, "adv/ratio_step_to_reasoning": 1.7646573657859543, "adv/std_final_conf": 0.6994585990905762, "adv/std_reasoning": 0.7013540863990784, "adv/std_step_conf": 0.9333730340003967, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.36328125, "calib/ece": 0.4357831325301206, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4357831325301206, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9089214758751183, "calib/step_q_c_n": 1057.0, "calib/step_q_gap": 0.00040698312149511917, "calib/step_q_w": 0.9085144927536232, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 816.140625, "completions/mean_terminated_length": 825.8182373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 434.0, "epoch": 0.0192, "grad_norm": 0.03060261346399784, "kl": 0.0027538537979125977, "learning_rate": 4.5e-06, "loss": -0.0052, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01828933134675026, "mask/share_reasoning": 0.8699996471405029, "mask/share_step_conf": 0.09999223798513412, "num_tokens": 5596697.0, "reward": 0.6926410794258118, "reward_std": 0.19804269075393677, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5476371049880981, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.5353012681007385, "step": 18 }, { "adv/mean_abs_final_conf": 0.3152657747268677, "adv/mean_abs_reasoning": 0.2920442521572113, "adv/mean_abs_step_conf": 0.7524843811988831, "adv/ratio_final_to_reasoning": 1.0795137120423652, "adv/ratio_step_to_reasoning": 2.5766108240124197, "adv/std_final_conf": 0.595735490322113, "adv/std_reasoning": 0.5726684331893921, "adv/std_step_conf": 0.9318787455558777, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4954545454545455, "calib/avg_num_step_conf": 7.546875, "calib/ece": 0.421407843137255, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -8.181818181784983e-05, "calib/mean_conf": 0.9900352941176471, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.990081818181818, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.421407843137255, "calib/std_conf": 0.0005624956747238556, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9118380952380952, "calib/step_q_c_n": 1050.0, "calib/step_q_gap": -0.001271882086167908, "calib/step_q_w": 0.9131099773242631, "calib/step_q_w_n": 882.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1819.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 774.671875, "completions/mean_terminated_length": 777.7098388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 496.0, "epoch": 0.020266666666666665, "grad_norm": 0.024774307385087013, "kl": 0.0026723146438598633, "learning_rate": 4.75e-06, "loss": 0.0014, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.018909579142928123, "mask/share_reasoning": 0.87503582239151, "mask/share_step_conf": 0.10214833915233612, "num_tokens": 5899773.0, "reward": 0.7455166578292847, "reward_std": 0.15792196989059448, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5748304128646851, "rewards/format_reward_step": 0.99609375, "rewards/step_l1_reward": 0.6037027835845947, "step": 19 }, { "adv/mean_abs_final_conf": 0.46466466784477234, "adv/mean_abs_reasoning": 0.42911815643310547, "adv/mean_abs_step_conf": 0.7354490756988525, "adv/ratio_final_to_reasoning": 1.0828361859752913, "adv/ratio_step_to_reasoning": 1.7138614730544512, "adv/std_final_conf": 0.7561859488487244, "adv/std_reasoning": 0.739301323890686, "adv/std_step_conf": 0.9323933720588684, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5048543689320388, "calib/avg_num_step_conf": 7.84765625, "calib/ece": 0.4063157894736842, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": 0.0016504854368935007, "calib/mean_conf": 0.9893117408906882, "calib/mu_c": 0.99, "calib/mu_w": 0.9883495145631065, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4063157894736842, "calib/std_conf": 0.008094331487680583, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9133758503401361, "calib/step_q_c_n": 1176.0, "calib/step_q_gap": 0.004708383353341339, "calib/step_q_w": 0.9086674669867948, "calib/step_q_w_n": 833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2800.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 775.45703125, "completions/mean_terminated_length": 790.9044189453125, "completions/min_length": 0.0, "completions/min_terminated_length": 427.0, "epoch": 0.021333333333333333, "grad_norm": 0.03557265177369118, "kl": 0.0034775733947753906, "learning_rate": 5e-06, "loss": -0.0898, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0192178413271904, "mask/share_reasoning": 0.8497927784919739, "mask/share_step_conf": 0.11145815998315811, "num_tokens": 6203162.0, "reward": 0.7248783111572266, "reward_std": 0.22109970450401306, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5716238021850586, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.5734453201293945, "step": 20 }, { "adv/mean_abs_final_conf": 0.49779677391052246, "adv/mean_abs_reasoning": 0.483091801404953, "adv/mean_abs_step_conf": 0.728674054145813, "adv/ratio_final_to_reasoning": 1.030439292206582, "adv/ratio_step_to_reasoning": 1.5083552484779181, "adv/std_final_conf": 0.7767319679260254, "adv/std_reasoning": 0.775422990322113, "adv/std_step_conf": 0.9327322840690613, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4968944099378882, "calib/avg_num_step_conf": 8.02734375, "calib/ece": 0.34333333333333327, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00012422360248454112, "calib/mean_conf": 0.9899196787148594, "calib/mu_c": 0.9898757763975156, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34333333333333327, "calib/std_conf": 0.0012649008632950713, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9148021722265323, "calib/step_q_c_n": 1289.0, "calib/step_q_gap": 0.005350475098594765, "calib/step_q_w": 0.9094516971279375, "calib/step_q_w_n": 766.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 776.6953125, "completions/mean_terminated_length": 789.0238647460938, "completions/min_length": 0.0, "completions/min_terminated_length": 357.0, "epoch": 0.0224, "grad_norm": 6.381322860717773, "kl": 0.11067986488342285, "learning_rate": 4.9722222222222224e-06, "loss": 0.0292, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019109871238470078, "mask/share_reasoning": 0.8561791181564331, "mask/share_step_conf": 0.10908600687980652, "num_tokens": 6504956.0, "reward": 0.7912448644638062, "reward_std": 0.26544326543807983, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6356808543205261, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6264961957931519, "step": 21 }, { "adv/mean_abs_final_conf": 0.2924252152442932, "adv/mean_abs_reasoning": 0.291792631149292, "adv/mean_abs_step_conf": 0.7459397315979004, "adv/ratio_final_to_reasoning": 1.002167923475345, "adv/ratio_step_to_reasoning": 2.5564035961423914, "adv/std_final_conf": 0.5933868885040283, "adv/std_reasoning": 0.5960041880607605, "adv/std_step_conf": 0.932509183883667, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.94140625, "calib/ece": 0.33262948207171317, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33262948207171317, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9163239875389407, "calib/step_q_c_n": 1284.0, "calib/step_q_gap": 0.003173119715175643, "calib/step_q_w": 0.9131508678237651, "calib/step_q_w_n": 749.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1642.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 726.9375, "completions/mean_terminated_length": 741.4183349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 384.0, "epoch": 0.023466666666666667, "grad_norm": 0.01994170807301998, "kl": 0.004857063293457031, "learning_rate": 4.944444444444445e-06, "loss": -0.128, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01959868147969246, "mask/share_reasoning": 0.8490219116210938, "mask/share_step_conf": 0.11184822022914886, "num_tokens": 6792868.0, "reward": 0.8057518005371094, "reward_std": 0.1436442732810974, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6511518955230713, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6353515982627869, "step": 22 }, { "adv/mean_abs_final_conf": 0.5219694375991821, "adv/mean_abs_reasoning": 0.5116857290267944, "adv/mean_abs_step_conf": 0.740062952041626, "adv/ratio_final_to_reasoning": 1.0200977044873754, "adv/ratio_step_to_reasoning": 1.4463232215782054, "adv/std_final_conf": 0.7941882610321045, "adv/std_reasoning": 0.7927516102790833, "adv/std_step_conf": 0.9338461756706238, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.1015625, "calib/ece": 0.382857142857143, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.382857142857143, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9151562499999999, "calib/step_q_c_n": 1216.0, "calib/step_q_gap": 0.003699373543123441, "calib/step_q_w": 0.9114568764568765, "calib/step_q_w_n": 858.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 794.86328125, "completions/mean_terminated_length": 797.98046875, "completions/min_length": 0.0, "completions/min_terminated_length": 428.0, "epoch": 0.024533333333333334, "grad_norm": 0.0262162946164608, "kl": 0.005928754806518555, "learning_rate": 4.9166666666666665e-06, "loss": -0.0118, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018801795318722725, "mask/share_reasoning": 0.8657557368278503, "mask/share_step_conf": 0.11153615266084671, "num_tokens": 7100289.0, "reward": 0.7581058740615845, "reward_std": 0.28139084577560425, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6052922010421753, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.5945133566856384, "step": 23 }, { "adv/mean_abs_final_conf": 0.5681447982788086, "adv/mean_abs_reasoning": 0.54884934425354, "adv/mean_abs_step_conf": 0.7714293003082275, "adv/ratio_final_to_reasoning": 1.0351561940034952, "adv/ratio_step_to_reasoning": 1.4055392584232862, "adv/std_final_conf": 0.7933010458946228, "adv/std_reasoning": 0.7930247187614441, "adv/std_step_conf": 0.9330933094024658, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5047169811320755, "calib/avg_num_step_conf": 7.83203125, "calib/ece": 0.42052845528455285, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": 0.0008490566037736569, "calib/mean_conf": 0.9896341463414634, "calib/mu_c": 0.99, "calib/mu_w": 0.9891509433962263, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.42052845528455285, "calib/std_conf": 0.005726515552133605, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9156255625562556, "calib/step_q_c_n": 1111.0, "calib/step_q_gap": 0.005949947343727668, "calib/step_q_w": 0.9096756152125279, "calib/step_q_w_n": 894.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 764.26171875, "completions/mean_terminated_length": 792.1093139648438, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.0256, "grad_norm": 0.025009525939822197, "kl": 0.007487773895263672, "learning_rate": 4.888888888888889e-06, "loss": -0.1196, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01863842085003853, "mask/share_reasoning": 0.8378759026527405, "mask/share_step_conf": 0.1083294153213501, "num_tokens": 7400452.0, "reward": 0.7031575441360474, "reward_std": 0.30026620626449585, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5555691123008728, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.550745964050293, "step": 24 }, { "adv/mean_abs_final_conf": 0.3911153972148895, "adv/mean_abs_reasoning": 0.389061838388443, "adv/mean_abs_step_conf": 0.7705321907997131, "adv/ratio_final_to_reasoning": 1.0052782324654423, "adv/ratio_step_to_reasoning": 1.9804877137047983, "adv/std_final_conf": 0.659353494644165, "adv/std_reasoning": 0.6613019704818726, "adv/std_step_conf": 0.9323583841323853, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.1015625, "calib/ece": 0.3645019920318725, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3645019920318725, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9160833333333332, "calib/step_q_c_n": 1200.0, "calib/step_q_gap": -0.00015236460717016342, "calib/step_q_w": 0.9162356979405034, "calib/step_q_w_n": 874.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 728.5859375, "completions/mean_terminated_length": 743.099609375, "completions/min_length": 0.0, "completions/min_terminated_length": 403.0, "epoch": 0.02666666666666667, "grad_norm": 0.0166942048817873, "kl": 0.00994873046875, "learning_rate": 4.861111111111111e-06, "loss": -0.0865, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019598091021180153, "mask/share_reasoning": 0.8502811789512634, "mask/share_step_conf": 0.11058945208787918, "num_tokens": 7690194.0, "reward": 0.7656707763671875, "reward_std": 0.18828433752059937, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6204491853713989, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.5929235219955444, "step": 25 }, { "adv/mean_abs_final_conf": 0.39370179176330566, "adv/mean_abs_reasoning": 0.391094446182251, "adv/mean_abs_step_conf": 0.7542855739593506, "adv/ratio_final_to_reasoning": 1.0066667926545796, "adv/ratio_step_to_reasoning": 1.9286532481411194, "adv/std_final_conf": 0.6602107882499695, "adv/std_reasoning": 0.6612504720687866, "adv/std_step_conf": 0.9315699934959412, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.80859375, "calib/ece": 0.3457312252964426, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3457312252964426, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9167488076311605, "calib/step_q_c_n": 1258.0, "calib/step_q_gap": 0.0005747185623345974, "calib/step_q_w": 0.9161740890688259, "calib/step_q_w_n": 741.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 758.3515625, "completions/mean_terminated_length": 761.3255615234375, "completions/min_length": 0.0, "completions/min_terminated_length": 510.0, "epoch": 0.027733333333333332, "grad_norm": 0.025107277557253838, "kl": 0.012006759643554688, "learning_rate": 4.833333333333333e-06, "loss": -0.0148, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019063986837863922, "mask/share_reasoning": 0.8701338171958923, "mask/share_step_conf": 0.10689593106508255, "num_tokens": 7989572.0, "reward": 0.7914588451385498, "reward_std": 0.19029450416564941, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6436511278152466, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6142666339874268, "step": 26 }, { "adv/mean_abs_final_conf": 0.4913161098957062, "adv/mean_abs_reasoning": 0.49099820852279663, "adv/mean_abs_step_conf": 0.7479568719863892, "adv/ratio_final_to_reasoning": 1.0006474593336419, "adv/ratio_step_to_reasoning": 1.5233393096008865, "adv/std_final_conf": 0.7396231889724731, "adv/std_reasoning": 0.7393175363540649, "adv/std_step_conf": 0.9338703751564026, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4966216216216216, "calib/avg_num_step_conf": 8.2109375, "calib/ece": 0.40716535433070866, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00027027027027037853, "calib/mean_conf": 0.9898425196850393, "calib/mu_c": 0.9897297297297296, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40716535433070866, "calib/std_conf": 0.0025048777512735247, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9176302931596092, "calib/step_q_c_n": 1228.0, "calib/step_q_gap": 0.0008911627248265885, "calib/step_q_w": 0.9167391304347826, "calib/step_q_w_n": 874.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2656.0, "completions/max_terminated_length": 2656.0, "completions/mean_length": 780.6796875, "completions/mean_terminated_length": 783.7412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 474.0, "epoch": 0.0288, "grad_norm": 0.0525457039475441, "kl": 0.013294219970703125, "learning_rate": 4.805555555555556e-06, "loss": -0.0255, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01896524243056774, "mask/share_reasoning": 0.8649869561195374, "mask/share_step_conf": 0.11214153468608856, "num_tokens": 8294642.0, "reward": 0.7358760833740234, "reward_std": 0.25070199370384216, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5862976312637329, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.5713919401168823, "step": 27 }, { "adv/mean_abs_final_conf": 0.3187648057937622, "adv/mean_abs_reasoning": 0.3049775958061218, "adv/mean_abs_step_conf": 0.7408556342124939, "adv/ratio_final_to_reasoning": 1.045207287936013, "adv/ratio_step_to_reasoning": 2.4292133074701834, "adv/std_final_conf": 0.6184374690055847, "adv/std_reasoning": 0.6186066269874573, "adv/std_step_conf": 0.930253803730011, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.18359375, "calib/ece": 0.2976923076923076, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.9899999999999999, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2976923076923076, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9170390206579956, "calib/step_q_c_n": 1307.0, "calib/step_q_gap": 0.007283381560251212, "calib/step_q_w": 0.9097556390977444, "calib/step_q_w_n": 532.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 770.26953125, "completions/mean_terminated_length": 785.6135864257812, "completions/min_length": 0.0, "completions/min_terminated_length": 404.0, "epoch": 0.029866666666666666, "grad_norm": 0.02304774895310402, "kl": 0.012969017028808594, "learning_rate": 4.777777777777778e-06, "loss": -0.1212, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018717553466558456, "mask/share_reasoning": 0.8605189323425293, "mask/share_step_conf": 0.10123226046562195, "num_tokens": 8598775.0, "reward": 0.815354585647583, "reward_std": 0.18047857284545898, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6738097667694092, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6303368806838989, "step": 28 }, { "adv/mean_abs_final_conf": 0.30036962032318115, "adv/mean_abs_reasoning": 0.2887539565563202, "adv/mean_abs_step_conf": 0.7428664565086365, "adv/ratio_final_to_reasoning": 1.0402268557819583, "adv/ratio_step_to_reasoning": 2.5726624333327313, "adv/std_final_conf": 0.5961382389068604, "adv/std_reasoning": 0.5960955619812012, "adv/std_step_conf": 0.9326738119125366, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.88671875, "calib/ece": 0.39800000000000013, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39800000000000013, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9174784110535407, "calib/step_q_c_n": 1158.0, "calib/step_q_gap": -0.0008607294807219112, "calib/step_q_w": 0.9183391405342626, "calib/step_q_w_n": 861.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 838.57421875, "completions/mean_terminated_length": 855.2789306640625, "completions/min_length": 0.0, "completions/min_terminated_length": 433.0, "epoch": 0.030933333333333334, "grad_norm": 0.024902043864130974, "kl": 0.012516975402832031, "learning_rate": 4.75e-06, "loss": -0.0868, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017539583146572113, "mask/share_reasoning": 0.864224374294281, "mask/share_step_conf": 0.09870478510856628, "num_tokens": 8920578.0, "reward": 0.7263821363449097, "reward_std": 0.16157840192317963, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5859960317611694, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.555830717086792, "step": 29 }, { "adv/mean_abs_final_conf": 0.5582062602043152, "adv/mean_abs_reasoning": 0.520081102848053, "adv/mean_abs_step_conf": 0.7749965190887451, "adv/ratio_final_to_reasoning": 1.073306176954868, "adv/ratio_step_to_reasoning": 1.4901455077770211, "adv/std_final_conf": 0.7905355095863342, "adv/std_reasoning": 0.7754203081130981, "adv/std_step_conf": 0.9350129961967468, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4897959183673469, "calib/avg_num_step_conf": 8.23046875, "calib/ece": 0.39616935483870974, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9879032258064516, "calib/gap": -0.001836734693877351, "calib/mean_conf": 0.9889112903225807, "calib/mu_c": 0.9881632653061224, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.39616935483870974, "calib/std_conf": 0.009838627048833353, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9200964630225079, "calib/step_q_c_n": 1244.0, "calib/step_q_gap": 0.0009608894419749214, "calib/step_q_w": 0.919135573580533, "calib/step_q_w_n": 863.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2959.0, "completions/max_terminated_length": 2959.0, "completions/mean_length": 820.453125, "completions/mean_terminated_length": 840.14404296875, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.032, "grad_norm": 0.052083615213632584, "kl": 0.0153656005859375, "learning_rate": 4.722222222222222e-06, "loss": -0.1069, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017525548115372658, "mask/share_reasoning": 0.8551870584487915, "mask/share_step_conf": 0.10384992510080338, "num_tokens": 9237598.0, "reward": 0.7097210884094238, "reward_std": 0.25035855174064636, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.5817409753799438, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.5306699275970459, "step": 30 }, { "adv/mean_abs_final_conf": 0.5183556079864502, "adv/mean_abs_reasoning": 0.5065758228302002, "adv/mean_abs_step_conf": 0.7664437294006348, "adv/ratio_final_to_reasoning": 1.0232537452941146, "adv/ratio_step_to_reasoning": 1.512989161461699, "adv/std_final_conf": 0.7564479112625122, "adv/std_reasoning": 0.7576651573181152, "adv/std_step_conf": 0.9329509735107422, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5004040948275862, "calib/avg_num_step_conf": 8.58984375, "calib/ece": 0.4646721311475409, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9918032786885246, "calib/gap": 7.273706896571142e-05, "calib/mean_conf": 0.9892622950819672, "calib/mu_c": 0.9892968750000002, "calib/mu_w": 0.9892241379310345, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4646721311475409, "calib/std_conf": 0.008114754098360653, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9168671454219032, "calib/step_q_c_n": 1114.0, "calib/step_q_gap": -0.004450826928327323, "calib/step_q_w": 0.9213179723502305, "calib/step_q_w_n": 1085.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2684.0, "completions/max_terminated_length": 2684.0, "completions/mean_length": 832.390625, "completions/mean_terminated_length": 859.2418823242188, "completions/min_length": 0.0, "completions/min_terminated_length": 464.0, "epoch": 0.03306666666666667, "grad_norm": 0.025735603645443916, "kl": 0.018184661865234375, "learning_rate": 4.694444444444445e-06, "loss": -0.1554, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017364859580993652, "mask/share_reasoning": 0.847489595413208, "mask/share_step_conf": 0.10389558970928192, "num_tokens": 9556602.0, "reward": 0.6444910168647766, "reward_std": 0.24960076808929443, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5095929503440857, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.48876410722732544, "step": 31 }, { "adv/mean_abs_final_conf": 0.3183901906013489, "adv/mean_abs_reasoning": 0.30197247862815857, "adv/mean_abs_step_conf": 0.7719812989234924, "adv/ratio_final_to_reasoning": 1.054368239277218, "adv/ratio_step_to_reasoning": 2.556462438002806, "adv/std_final_conf": 0.5945860147476196, "adv/std_reasoning": 0.5727756023406982, "adv/std_step_conf": 0.9287157654762268, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5103276353276354, "calib/avg_num_step_conf": 8.21875, "calib/ece": 0.41924302788844625, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": 0.0009447034447033209, "calib/mean_conf": 0.9889641434262949, "calib/mu_c": 0.9893706293706291, "calib/mu_w": 0.9884259259259258, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41924302788844625, "calib/std_conf": 0.008730282802526973, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.920017316017316, "calib/step_q_c_n": 1155.0, "calib/step_q_gap": 3.8390832911239237e-05, "calib/step_q_w": 0.9199789251844047, "calib/step_q_w_n": 949.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1755.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 782.78125, "completions/mean_terminated_length": 798.37451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 404.0, "epoch": 0.034133333333333335, "grad_norm": 0.04063679277896881, "kl": 0.023967742919921875, "learning_rate": 4.666666666666667e-06, "loss": -0.0673, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018448833376169205, "mask/share_reasoning": 0.8532680869102478, "mask/share_step_conf": 0.10875185579061508, "num_tokens": 9863698.0, "reward": 0.7091785073280334, "reward_std": 0.1623982936143875, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5681651830673218, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.5423792600631714, "step": 32 }, { "adv/mean_abs_final_conf": 0.4482853412628174, "adv/mean_abs_reasoning": 0.4236627519130707, "adv/mean_abs_step_conf": 0.7881948947906494, "adv/ratio_final_to_reasoning": 1.05811837183836, "adv/ratio_step_to_reasoning": 1.8604300029481358, "adv/std_final_conf": 0.7025040984153748, "adv/std_reasoning": 0.6816445589065552, "adv/std_step_conf": 0.9337459206581116, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5040322580645161, "calib/avg_num_step_conf": 7.80859375, "calib/ece": 0.47783464566929135, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.0007258064516131313, "calib/mean_conf": 0.9896456692913386, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.989274193548387, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.47783464566929135, "calib/std_conf": 0.005635974940365422, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.92001953125, "calib/step_q_c_n": 1024.0, "calib/step_q_gap": 0.00918876201923069, "calib/step_q_w": 0.9108307692307693, "calib/step_q_w_n": 975.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1852.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 764.6796875, "completions/mean_terminated_length": 767.678466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.0352, "grad_norm": 0.061604708433151245, "kl": 0.03219795227050781, "learning_rate": 4.638888888888889e-06, "loss": 0.0305, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01927250623703003, "mask/share_reasoning": 0.8669767379760742, "mask/share_step_conf": 0.10984449833631516, "num_tokens": 10166328.0, "reward": 0.6816341876983643, "reward_std": 0.23087355494499207, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5180652141571045, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.5452032089233398, "step": 33 }, { "adv/mean_abs_final_conf": 0.5265980362892151, "adv/mean_abs_reasoning": 0.520954966545105, "adv/mean_abs_step_conf": 0.7565268874168396, "adv/ratio_final_to_reasoning": 1.0108321642109184, "adv/ratio_step_to_reasoning": 1.4521924849551051, "adv/std_final_conf": 0.7767467498779297, "adv/std_reasoning": 0.7753825783729553, "adv/std_step_conf": 0.9345172643661499, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5070441079657669, "calib/avg_num_step_conf": 8.34765625, "calib/ece": 0.376600790513834, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.0017722185648452182, "calib/mean_conf": 0.9892490118577075, "calib/mu_c": 0.9899354838709677, "calib/mu_w": 0.9881632653061225, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.376600790513834, "calib/std_conf": 0.007991426298258135, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9212392156862744, "calib/step_q_c_n": 1275.0, "calib/step_q_gap": 0.0017728583776897011, "calib/step_q_w": 0.9194663573085847, "calib/step_q_w_n": 862.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2293.0, "completions/max_terminated_length": 2293.0, "completions/mean_length": 741.03125, "completions/mean_terminated_length": 746.8661499023438, "completions/min_length": 0.0, "completions/min_terminated_length": 445.0, "epoch": 0.03626666666666667, "grad_norm": 0.03941137716174126, "kl": 0.0266265869140625, "learning_rate": 4.611111111111112e-06, "loss": 0.0149, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019667698070406914, "mask/share_reasoning": 0.8525025844573975, "mask/share_step_conf": 0.12001723051071167, "num_tokens": 10461144.0, "reward": 0.775858998298645, "reward_std": 0.28364232182502747, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6142761707305908, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6194730997085571, "step": 34 }, { "adv/mean_abs_final_conf": 0.5497709512710571, "adv/mean_abs_reasoning": 0.4924013018608093, "adv/mean_abs_step_conf": 0.7616751194000244, "adv/ratio_final_to_reasoning": 1.1165099466501105, "adv/ratio_step_to_reasoning": 1.546858459800199, "adv/std_final_conf": 0.7888799905776978, "adv/std_reasoning": 0.7575621008872986, "adv/std_step_conf": 0.9331747889518738, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5073451910408432, "calib/avg_num_step_conf": 7.83203125, "calib/ece": 0.4517813765182185, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9595141700404858, "calib/gap": 0.00166534914360994, "calib/mean_conf": 0.9861943319838056, "calib/mu_c": 0.9869696969696969, "calib/mu_w": 0.985304347826087, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4517813765182185, "calib/std_conf": 0.01788658286023639, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9200578592092575, "calib/step_q_c_n": 1037.0, "calib/step_q_gap": 0.001845049291902101, "calib/step_q_w": 0.9182128099173554, "calib/step_q_w_n": 968.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 801.28515625, "completions/mean_terminated_length": 823.8112182617188, "completions/min_length": 0.0, "completions/min_terminated_length": 419.0, "epoch": 0.037333333333333336, "grad_norm": 0.04921026900410652, "kl": 0.024835586547851562, "learning_rate": 4.583333333333333e-06, "loss": -0.0897, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01787012442946434, "mask/share_reasoning": 0.8555195927619934, "mask/share_step_conf": 0.09926652163267136, "num_tokens": 10775529.0, "reward": 0.6645053625106812, "reward_std": 0.22534745931625366, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5283355116844177, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.5045813322067261, "step": 35 }, { "adv/mean_abs_final_conf": 0.3687620759010315, "adv/mean_abs_reasoning": 0.3481977880001068, "adv/mean_abs_step_conf": 0.796619176864624, "adv/ratio_final_to_reasoning": 1.059059214646471, "adv/ratio_step_to_reasoning": 2.287835260068854, "adv/std_final_conf": 0.6157107949256897, "adv/std_reasoning": 0.5961853265762329, "adv/std_step_conf": 0.930061399936676, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5065135895032803, "calib/avg_num_step_conf": 8.3046875, "calib/ece": 0.2101606425702811, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9919678714859438, "calib/gap": 0.0011724461105901973, "calib/mean_conf": 0.989277108433735, "calib/mu_c": 0.989536082474227, "calib/mu_w": 0.9883636363636368, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2101606425702811, "calib/std_conf": 0.008033534013575733, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9219170673076924, "calib/step_q_c_n": 1664.0, "calib/step_q_gap": 0.006700617091242256, "calib/step_q_w": 0.9152164502164502, "calib/step_q_w_n": 462.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 774.19140625, "completions/mean_terminated_length": 780.2874145507812, "completions/min_length": 0.0, "completions/min_terminated_length": 358.0, "epoch": 0.0384, "grad_norm": 0.038044821470975876, "kl": 0.027385711669921875, "learning_rate": 4.555555555555556e-06, "loss": -0.0374, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019639793783426285, "mask/share_reasoning": 0.8501439690589905, "mask/share_step_conf": 0.12240374088287354, "num_tokens": 11076434.0, "reward": 0.9108313918113708, "reward_std": 0.1632777750492096, "rewards/accuracy_reward_step": 0.7578125, "rewards/final_brier_reward_step": 0.7626378536224365, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.7129310965538025, "step": 36 }, { "adv/mean_abs_final_conf": 0.41283056139945984, "adv/mean_abs_reasoning": 0.40489670634269714, "adv/mean_abs_step_conf": 0.7636315822601318, "adv/ratio_final_to_reasoning": 1.0195947631395341, "adv/ratio_step_to_reasoning": 1.8859910942664178, "adv/std_final_conf": 0.6809049248695374, "adv/std_reasoning": 0.6816288828849792, "adv/std_step_conf": 0.9321484565734863, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5079365079365079, "calib/avg_num_step_conf": 8.23046875, "calib/ece": 0.48968000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": 0.008571428571428563, "calib/mean_conf": 0.9856800000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9814285714285713, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.48968000000000006, "calib/std_conf": 0.06272270402334389, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9196353166986564, "calib/step_q_c_n": 1042.0, "calib/step_q_gap": 0.012273814351238532, "calib/step_q_w": 0.9073615023474179, "calib/step_q_w_n": 1065.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 763.76171875, "completions/mean_terminated_length": 778.9761352539062, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 0.039466666666666664, "grad_norm": 0.022712556645274162, "kl": 0.025478363037109375, "learning_rate": 4.527777777777778e-06, "loss": -0.1116, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019015856087207794, "mask/share_reasoning": 0.8509063720703125, "mask/share_step_conf": 0.11054646223783493, "num_tokens": 11379053.0, "reward": 0.6553043723106384, "reward_std": 0.20234841108322144, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.49861401319503784, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.5198071599006653, "step": 37 }, { "adv/mean_abs_final_conf": 0.5180400609970093, "adv/mean_abs_reasoning": 0.513764500617981, "adv/mean_abs_step_conf": 0.7800165414810181, "adv/ratio_final_to_reasoning": 1.0083220237557975, "adv/ratio_step_to_reasoning": 1.5182375203868235, "adv/std_final_conf": 0.7381928563117981, "adv/std_reasoning": 0.7394514679908752, "adv/std_step_conf": 0.9326812624931335, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.505, "calib/avg_num_step_conf": 8.0859375, "calib/ece": 0.3996721311475411, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000400000000000178, "calib/mean_conf": 0.9898360655737706, "calib/mu_c": 0.99, "calib/mu_w": 0.9895999999999998, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3996721311475411, "calib/std_conf": 0.0025554847980524434, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9227439532944121, "calib/step_q_c_n": 1199.0, "calib/step_q_gap": 0.013857615751358132, "calib/step_q_w": 0.9088863375430539, "calib/step_q_w_n": 871.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2602.0, "completions/max_terminated_length": 2602.0, "completions/mean_length": 754.5, "completions/mean_terminated_length": 775.7108154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 450.0, "epoch": 0.04053333333333333, "grad_norm": 0.022115031257271767, "kl": 0.029575347900390625, "learning_rate": 4.5e-06, "loss": -0.0876, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01870632730424404, "mask/share_reasoning": 0.8444530963897705, "mask/share_step_conf": 0.10949686914682388, "num_tokens": 11679093.0, "reward": 0.710079550743103, "reward_std": 0.23920077085494995, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5704425573348999, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.5473726987838745, "step": 38 }, { "adv/mean_abs_final_conf": 0.4728773832321167, "adv/mean_abs_reasoning": 0.47525712847709656, "adv/mean_abs_step_conf": 0.7562281489372253, "adv/ratio_final_to_reasoning": 0.9949927205666426, "adv/ratio_step_to_reasoning": 1.5911979087204144, "adv/std_final_conf": 0.7385199069976807, "adv/std_reasoning": 0.7392476797103882, "adv/std_step_conf": 0.9321252107620239, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.504424778761062, "calib/avg_num_step_conf": 8.3671875, "calib/ece": 0.4398406374501994, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.0007964601769909763, "calib/mean_conf": 0.9896414342629484, "calib/mu_c": 0.99, "calib/mu_w": 0.989203539823009, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4398406374501994, "calib/std_conf": 0.005669422099903468, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.924746835443038, "calib/step_q_c_n": 1106.0, "calib/step_q_gap": 0.0039360246322271975, "calib/step_q_w": 0.9208108108108108, "calib/step_q_w_n": 1036.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 751.0, "completions/mean_terminated_length": 759.9051513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.0416, "grad_norm": 0.028168994933366776, "kl": 0.02768707275390625, "learning_rate": 4.472222222222223e-06, "loss": -0.002, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019812334328889847, "mask/share_reasoning": 0.8540918231010437, "mask/share_step_conf": 0.11437710374593735, "num_tokens": 11977437.0, "reward": 0.7053453922271729, "reward_std": 0.22924397885799408, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5484570264816284, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.5583274960517883, "step": 39 }, { "adv/mean_abs_final_conf": 0.4358151853084564, "adv/mean_abs_reasoning": 0.402193546295166, "adv/mean_abs_step_conf": 0.7792983651161194, "adv/ratio_final_to_reasoning": 1.083595670102115, "adv/ratio_step_to_reasoning": 1.9376202634146689, "adv/std_final_conf": 0.6807341575622559, "adv/std_reasoning": 0.6614307761192322, "adv/std_step_conf": 0.9320389032363892, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49264705882352944, "calib/avg_num_step_conf": 7.3125, "calib/ece": 0.4477689243027889, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": -0.0007352941176469452, "calib/mean_conf": 0.989601593625498, "calib/mu_c": 0.989264705882353, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4477689243027889, "calib/std_conf": 0.005701806298877533, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9225623130608175, "calib/step_q_c_n": 1003.0, "calib/step_q_gap": -0.0017760068471226464, "calib/step_q_w": 0.9243383199079401, "calib/step_q_w_n": 869.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2602.0, "completions/max_terminated_length": 2602.0, "completions/mean_length": 793.5703125, "completions/mean_terminated_length": 796.682373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 455.0, "epoch": 0.042666666666666665, "grad_norm": 0.02842039056122303, "kl": 0.029073715209960938, "learning_rate": 4.444444444444444e-06, "loss": 0.0205, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018899505957961082, "mask/share_reasoning": 0.874600350856781, "mask/share_step_conf": 0.10259392857551575, "num_tokens": 12287351.0, "reward": 0.6714740991592407, "reward_std": 0.22889409959316254, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5361906290054321, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.5051949620246887, "step": 40 }, { "adv/mean_abs_final_conf": 0.3478602468967438, "adv/mean_abs_reasoning": 0.34448909759521484, "adv/mean_abs_step_conf": 0.7755715847015381, "adv/ratio_final_to_reasoning": 1.0097859390182795, "adv/ratio_step_to_reasoning": 2.2513675762617553, "adv/std_final_conf": 0.619642972946167, "adv/std_reasoning": 0.6185486912727356, "adv/std_step_conf": 0.932489812374115, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5222222222222221, "calib/avg_num_step_conf": 8.3125, "calib/ece": 0.16596078431372552, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.0028888888888887188, "calib/mean_conf": 0.9894901960784314, "calib/mu_c": 0.99, "calib/mu_w": 0.9871111111111113, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16596078431372552, "calib/std_conf": 0.006146488074325677, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.925497175141243, "calib/step_q_c_n": 1770.0, "calib/step_q_gap": 0.003206672347946804, "calib/step_q_w": 0.9222905027932962, "calib/step_q_w_n": 358.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1446.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 701.359375, "completions/mean_terminated_length": 704.10986328125, "completions/min_length": 0.0, "completions/min_terminated_length": 365.0, "epoch": 0.04373333333333333, "grad_norm": 0.018630625680088997, "kl": 0.037181854248046875, "learning_rate": 4.416666666666667e-06, "loss": -0.0122, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.021096795797348022, "mask/share_reasoning": 0.848509669303894, "mask/share_step_conf": 0.12648731470108032, "num_tokens": 12574147.0, "reward": 0.9390561580657959, "reward_std": 0.19271451234817505, "rewards/accuracy_reward_step": 0.8203125, "rewards/final_brier_reward_step": 0.82079017162323, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.6948220729827881, "step": 41 }, { "adv/mean_abs_final_conf": 0.3249935507774353, "adv/mean_abs_reasoning": 0.3224637508392334, "adv/mean_abs_step_conf": 0.7264145612716675, "adv/ratio_final_to_reasoning": 1.0078452227005916, "adv/ratio_step_to_reasoning": 2.2527014567718857, "adv/std_final_conf": 0.6402071118354797, "adv/std_reasoning": 0.6401504278182983, "adv/std_step_conf": 0.93314129114151, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.91015625, "calib/ece": 0.3900000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3900000000000001, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.922301854974705, "calib/step_q_c_n": 1186.0, "calib/step_q_gap": -0.003657620591445254, "calib/step_q_w": 0.9259594755661502, "calib/step_q_w_n": 839.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 665.9765625, "completions/mean_terminated_length": 676.5476684570312, "completions/min_length": 0.0, "completions/min_terminated_length": 382.0, "epoch": 0.0448, "grad_norm": 0.023608466610312462, "kl": 0.048583984375, "learning_rate": 4.388888888888889e-06, "loss": -0.0291, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.021571829915046692, "mask/share_reasoning": 0.8393821120262146, "mask/share_step_conf": 0.1234210804104805, "num_tokens": 12849005.0, "reward": 0.7322165966033936, "reward_std": 0.18012070655822754, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.5936523079872131, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.5582807660102844, "step": 42 }, { "adv/mean_abs_final_conf": 0.4121657609939575, "adv/mean_abs_reasoning": 0.4092313051223755, "adv/mean_abs_step_conf": 0.7901467084884644, "adv/ratio_final_to_reasoning": 1.007170653454052, "adv/ratio_step_to_reasoning": 1.9308070975952851, "adv/std_final_conf": 0.661336362361908, "adv/std_reasoning": 0.6613420248031616, "adv/std_step_conf": 0.9343063235282898, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.421875, "calib/ece": 0.3062055335968379, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3062055335968379, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9209961977186313, "calib/step_q_c_n": 1315.0, "calib/step_q_gap": -0.0018585031360693893, "calib/step_q_w": 0.9228547008547007, "calib/step_q_w_n": 585.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2491.0, "completions/max_terminated_length": 2491.0, "completions/mean_length": 728.42578125, "completions/mean_terminated_length": 734.1614379882812, "completions/min_length": 0.0, "completions/min_terminated_length": 412.0, "epoch": 0.04586666666666667, "grad_norm": 0.012838178314268589, "kl": 0.032314300537109375, "learning_rate": 4.361111111111112e-06, "loss": -0.003, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02045440673828125, "mask/share_reasoning": 0.8614332675933838, "mask/share_step_conf": 0.11029988527297974, "num_tokens": 13140706.0, "reward": 0.8210978507995605, "reward_std": 0.21742019057273865, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6819324493408203, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6274506449699402, "step": 43 }, { "adv/mean_abs_final_conf": 0.42065298557281494, "adv/mean_abs_reasoning": 0.40963807702064514, "adv/mean_abs_step_conf": 0.7803975343704224, "adv/ratio_final_to_reasoning": 1.0268893669072043, "adv/ratio_step_to_reasoning": 1.9050903178883234, "adv/std_final_conf": 0.68174147605896, "adv/std_reasoning": 0.6816533803939819, "adv/std_step_conf": 0.932420551776886, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.41015625, "calib/ece": 0.4701587301587302, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4701587301587302, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.923446735395189, "calib/step_q_c_n": 970.0, "calib/step_q_gap": -0.0008898341840990476, "calib/step_q_w": 0.924336569579288, "calib/step_q_w_n": 927.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 755.4453125, "completions/mean_terminated_length": 761.3936767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 377.0, "epoch": 0.046933333333333334, "grad_norm": 0.019129678606987, "kl": 0.03778839111328125, "learning_rate": 4.333333333333334e-06, "loss": -0.0373, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019462086260318756, "mask/share_reasoning": 0.8706867098808289, "mask/share_step_conf": 0.10203869640827179, "num_tokens": 13440420.0, "reward": 0.6606093049049377, "reward_std": 0.21581587195396423, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5209956765174866, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.5010042190551758, "step": 44 }, { "adv/mean_abs_final_conf": 0.5471866130828857, "adv/mean_abs_reasoning": 0.5119894742965698, "adv/mean_abs_step_conf": 0.7427734136581421, "adv/ratio_final_to_reasoning": 1.0687458249696906, "adv/ratio_step_to_reasoning": 1.4507591482786826, "adv/std_final_conf": 0.8045254349708557, "adv/std_reasoning": 0.792806088924408, "adv/std_step_conf": 0.9351861476898193, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.496551724137931, "calib/avg_num_step_conf": 8.1328125, "calib/ece": 0.4098400000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002758620689655711, "calib/mean_conf": 0.9898400000000002, "calib/mu_c": 0.9897241379310344, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4098400000000002, "calib/std_conf": 0.002524757414089521, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.926106346483705, "calib/step_q_c_n": 1166.0, "calib/step_q_gap": 0.0009753421168928744, "calib/step_q_w": 0.9251310043668122, "calib/step_q_w_n": 916.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 745.421875, "completions/mean_terminated_length": 754.2609252929688, "completions/min_length": 0.0, "completions/min_terminated_length": 318.0, "epoch": 0.048, "grad_norm": 0.02074560336768627, "kl": 0.037052154541015625, "learning_rate": 4.305555555555556e-06, "loss": -0.0306, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01985202543437481, "mask/share_reasoning": 0.8520389795303345, "mask/share_step_conf": 0.11639025807380676, "num_tokens": 13736296.0, "reward": 0.7014477252960205, "reward_std": 0.27716386318206787, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5705187320709229, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.5253453254699707, "step": 45 }, { "adv/mean_abs_final_conf": 0.41354337334632874, "adv/mean_abs_reasoning": 0.4040815830230713, "adv/mean_abs_step_conf": 0.7503899931907654, "adv/ratio_final_to_reasoning": 1.0234155445850082, "adv/ratio_step_to_reasoning": 1.8570259688076936, "adv/std_final_conf": 0.6817355751991272, "adv/std_reasoning": 0.6816933155059814, "adv/std_step_conf": 0.9333223104476929, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.52734375, "calib/ece": 0.46346938775510205, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46346938775510205, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9234462151394424, "calib/step_q_c_n": 1004.0, "calib/step_q_gap": 0.007498219473136891, "calib/step_q_w": 0.9159479956663055, "calib/step_q_w_n": 923.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2834.0, "completions/max_terminated_length": 2834.0, "completions/mean_length": 798.78125, "completions/mean_terminated_length": 805.0708618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 375.0, "epoch": 0.04906666666666667, "grad_norm": 0.0159748587757349, "kl": 0.03411674499511719, "learning_rate": 4.277777777777778e-06, "loss": -0.0446, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019596949219703674, "mask/share_reasoning": 0.8614305257797241, "mask/share_step_conf": 0.11116001754999161, "num_tokens": 14045552.0, "reward": 0.6591576337814331, "reward_std": 0.22060437500476837, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5128730535507202, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.5132546424865723, "step": 46 }, { "adv/mean_abs_final_conf": 0.44636037945747375, "adv/mean_abs_reasoning": 0.43221962451934814, "adv/mean_abs_step_conf": 0.7530901432037354, "adv/ratio_final_to_reasoning": 1.0327165962301015, "adv/ratio_step_to_reasoning": 1.7423784124592048, "adv/std_final_conf": 0.700847864151001, "adv/std_reasoning": 0.7015045881271362, "adv/std_step_conf": 0.9329828023910522, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.26171875, "calib/ece": 0.34510204081632656, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34510204081632656, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9173498233215548, "calib/step_q_c_n": 1132.0, "calib/step_q_gap": 0.0004997545457638974, "calib/step_q_w": 0.9168500687757909, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 771.3671875, "completions/mean_terminated_length": 783.6111450195312, "completions/min_length": 0.0, "completions/min_terminated_length": 449.0, "epoch": 0.050133333333333335, "grad_norm": 0.014016284607350826, "kl": 0.03520965576171875, "learning_rate": 4.25e-06, "loss": -0.0716, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019007964059710503, "mask/share_reasoning": 0.8643789291381836, "mask/share_step_conf": 0.10098807513713837, "num_tokens": 14348998.0, "reward": 0.7666846513748169, "reward_std": 0.21894043684005737, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6238887310028076, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.5946369171142578, "step": 47 }, { "adv/mean_abs_final_conf": 0.5394688844680786, "adv/mean_abs_reasoning": 0.5332775115966797, "adv/mean_abs_step_conf": 0.7809452414512634, "adv/ratio_final_to_reasoning": 1.011610039307417, "adv/ratio_step_to_reasoning": 1.4644256029342861, "adv/std_final_conf": 0.7751846313476562, "adv/std_reasoning": 0.7754152417182922, "adv/std_step_conf": 0.9337522983551025, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5043478260869565, "calib/avg_num_step_conf": 7.7109375, "calib/ece": 0.45168674698795186, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003478260869566041, "calib/mean_conf": 0.989839357429719, "calib/mu_c": 0.99, "calib/mu_w": 0.9896521739130434, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45168674698795186, "calib/std_conf": 0.0025298017265901426, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.921573986804901, "calib/step_q_c_n": 1061.0, "calib/step_q_gap": 0.0020449616132252046, "calib/step_q_w": 0.9195290251916758, "calib/step_q_w_n": 913.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 749.64453125, "completions/mean_terminated_length": 755.5472412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 401.0, "epoch": 0.0512, "grad_norm": 0.01471987459808588, "kl": 0.03924560546875, "learning_rate": 4.222222222222223e-06, "loss": -0.0151, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020167170092463493, "mask/share_reasoning": 0.8575121164321899, "mask/share_step_conf": 0.1145082637667656, "num_tokens": 14644595.0, "reward": 0.6888903379440308, "reward_std": 0.2654000520706177, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5326277017593384, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.5459342002868652, "step": 48 }, { "adv/mean_abs_final_conf": 0.5216391086578369, "adv/mean_abs_reasoning": 0.5024067163467407, "adv/mean_abs_step_conf": 0.749990701675415, "adv/ratio_final_to_reasoning": 1.0382805238969433, "adv/ratio_step_to_reasoning": 1.492795930613718, "adv/std_final_conf": 0.7634592056274414, "adv/std_reasoning": 0.7575902938842773, "adv/std_step_conf": 0.9341700673103333, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.49693251533742333, "calib/avg_num_step_conf": 7.57421875, "calib/ece": 0.32999999999999996, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00012269938650288914, "calib/mean_conf": 0.9899190283400809, "calib/mu_c": 0.9898773006134969, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32999999999999996, "calib/std_conf": 0.0012699908616484326, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9207016191210486, "calib/step_q_c_n": 1297.0, "calib/step_q_gap": 0.003723425974631156, "calib/step_q_w": 0.9169781931464175, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 694.75390625, "completions/mean_terminated_length": 708.5936279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 414.0, "epoch": 0.05226666666666667, "grad_norm": 0.01923024281859398, "kl": 0.040454864501953125, "learning_rate": 4.194444444444445e-06, "loss": -0.0531, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.020560309290885925, "mask/share_reasoning": 0.8439966440200806, "mask/share_step_conf": 0.11591173708438873, "num_tokens": 14926988.0, "reward": 0.769041895866394, "reward_std": 0.25200900435447693, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6431816220283508, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.5745896100997925, "step": 49 }, { "adv/mean_abs_final_conf": 0.4636262357234955, "adv/mean_abs_reasoning": 0.46020743250846863, "adv/mean_abs_step_conf": 0.761410117149353, "adv/ratio_final_to_reasoning": 1.007428830943455, "adv/ratio_step_to_reasoning": 1.6544933075050712, "adv/std_final_conf": 0.7198258638381958, "adv/std_reasoning": 0.7206044793128967, "adv/std_step_conf": 0.932525634765625, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.796875, "calib/ece": 0.32467741935483885, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32467741935483885, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.92359756097561, "calib/step_q_c_n": 1312.0, "calib/step_q_gap": 0.0011121808586508353, "calib/step_q_w": 0.9224853801169591, "calib/step_q_w_n": 684.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1916.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 730.42578125, "completions/mean_terminated_length": 733.2902221679688, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.05333333333333334, "grad_norm": 0.013666275888681412, "kl": 0.03583526611328125, "learning_rate": 4.166666666666667e-06, "loss": -0.0225, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.020532866939902306, "mask/share_reasoning": 0.8563636541366577, "mask/share_step_conf": 0.11919719725847244, "num_tokens": 15219337.0, "reward": 0.7715030908584595, "reward_std": 0.22498729825019836, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6508409976959229, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.570290207862854, "step": 50 }, { "adv/mean_abs_final_conf": 0.3940299153327942, "adv/mean_abs_reasoning": 0.3988789916038513, "adv/mean_abs_step_conf": 0.7379977107048035, "adv/ratio_final_to_reasoning": 0.9878432397465721, "adv/ratio_step_to_reasoning": 1.850179443488339, "adv/std_final_conf": 0.682790994644165, "adv/std_reasoning": 0.6815266609191895, "adv/std_step_conf": 0.9336446523666382, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5053763440860215, "calib/avg_num_step_conf": 7.89453125, "calib/ece": 0.3574308300395257, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00043010752688188436, "calib/mean_conf": 0.9898418972332016, "calib/mu_c": 0.99, "calib/mu_w": 0.9895698924731181, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3574308300395257, "calib/std_conf": 0.0025098036152391397, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9203018575851395, "calib/step_q_c_n": 1292.0, "calib/step_q_gap": 0.007160568147553725, "calib/step_q_w": 0.9131412894375858, "calib/step_q_w_n": 729.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2489.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 708.29296875, "completions/mean_terminated_length": 711.0706176757812, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.0544, "grad_norm": 0.016280025243759155, "kl": 0.03824615478515625, "learning_rate": 4.138888888888889e-06, "loss": 0.0003, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.021065454930067062, "mask/share_reasoning": 0.8582796454429626, "mask/share_step_conf": 0.11674864590167999, "num_tokens": 15509956.0, "reward": 0.7576898336410522, "reward_std": 0.21025392413139343, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6285640597343445, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.5649406909942627, "step": 51 }, { "adv/mean_abs_final_conf": 0.4902188777923584, "adv/mean_abs_reasoning": 0.4728948175907135, "adv/mean_abs_step_conf": 0.7851869463920593, "adv/ratio_final_to_reasoning": 1.0366340665137903, "adv/ratio_step_to_reasoning": 1.6603839103003917, "adv/std_final_conf": 0.7273574471473694, "adv/std_reasoning": 0.7206883430480957, "adv/std_step_conf": 0.9345096349716187, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49411764705882355, "calib/avg_num_step_conf": 7.3046875, "calib/ece": 0.3124701195219124, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003529411764705559, "calib/mean_conf": 0.9897609561752988, "calib/mu_c": 0.9896470588235294, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3124701195219124, "calib/std_conf": 0.0028126474254538338, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9195335029686176, "calib/step_q_c_n": 1179.0, "calib/step_q_gap": -0.004185744498820809, "calib/step_q_w": 0.9237192474674384, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 724.32421875, "completions/mean_terminated_length": 732.9130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 404.0, "epoch": 0.055466666666666664, "grad_norm": 0.01628994569182396, "kl": 0.036128997802734375, "learning_rate": 4.111111111111111e-06, "loss": -0.0324, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02012309432029724, "mask/share_reasoning": 0.8632853031158447, "mask/share_step_conf": 0.10487289726734161, "num_tokens": 15803335.0, "reward": 0.8191436529159546, "reward_std": 0.2507874369621277, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6702799797058105, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6391010284423828, "step": 52 }, { "adv/mean_abs_final_conf": 0.36346209049224854, "adv/mean_abs_reasoning": 0.3581864535808563, "adv/mean_abs_step_conf": 0.7379522323608398, "adv/ratio_final_to_reasoning": 1.0147287449278182, "adv/ratio_step_to_reasoning": 2.0602460673300027, "adv/std_final_conf": 0.641384482383728, "adv/std_reasoning": 0.6402559280395508, "adv/std_step_conf": 0.9335207939147949, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.1640625, "calib/ece": 0.2813385826771654, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2813385826771654, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9206836248012719, "calib/step_q_c_n": 1258.0, "calib/step_q_gap": 0.0044857081346051375, "calib/step_q_w": 0.9161979166666667, "calib/step_q_w_n": 576.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 723.734375, "completions/mean_terminated_length": 726.5725708007812, "completions/min_length": 0.0, "completions/min_terminated_length": 313.0, "epoch": 0.05653333333333333, "grad_norm": 0.014305333606898785, "kl": 0.03975677490234375, "learning_rate": 4.083333333333334e-06, "loss": -0.0329, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.020240100100636482, "mask/share_reasoning": 0.8698899745941162, "mask/share_step_conf": 0.10596367716789246, "num_tokens": 16094435.0, "reward": 0.8506057262420654, "reward_std": 0.1964026689529419, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7088069915771484, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.6533418893814087, "step": 53 }, { "adv/mean_abs_final_conf": 0.3542019724845886, "adv/mean_abs_reasoning": 0.3400605022907257, "adv/mean_abs_step_conf": 0.7365595102310181, "adv/ratio_final_to_reasoning": 1.0415851594013499, "adv/ratio_step_to_reasoning": 2.165966071535459, "adv/std_final_conf": 0.6514297723770142, "adv/std_reasoning": 0.6401675939559937, "adv/std_step_conf": 0.9333468079566956, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4973544973544973, "calib/avg_num_step_conf": 8.0, "calib/ece": 0.24578740157480317, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000158730158730358, "calib/mean_conf": 0.9898818897637796, "calib/mu_c": 0.9898412698412697, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24578740157480317, "calib/std_conf": 0.0018786583134551434, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9230980392156862, "calib/step_q_c_n": 1530.0, "calib/step_q_gap": 0.011823907941554967, "calib/step_q_w": 0.9112741312741313, "calib/step_q_w_n": 518.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 687.51953125, "completions/mean_terminated_length": 690.2156982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 0.0576, "grad_norm": 0.019337039440870285, "kl": 0.040557861328125, "learning_rate": 4.055555555555556e-06, "loss": -0.0213, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.021507475525140762, "mask/share_reasoning": 0.851201593875885, "mask/share_step_conf": 0.12338472157716751, "num_tokens": 16376672.0, "reward": 0.8689508438110352, "reward_std": 0.18806323409080505, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7393484115600586, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6532406806945801, "step": 54 }, { "adv/mean_abs_final_conf": 0.38243305683135986, "adv/mean_abs_reasoning": 0.356126606464386, "adv/mean_abs_step_conf": 0.7784011363983154, "adv/ratio_final_to_reasoning": 1.0738682532825714, "adv/ratio_step_to_reasoning": 2.1857427169687154, "adv/std_final_conf": 0.6601985096931458, "adv/std_reasoning": 0.6402488350868225, "adv/std_step_conf": 0.9331528544425964, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5089285714285714, "calib/avg_num_step_conf": 7.765625, "calib/ece": 0.4297619047619048, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.010535714285714093, "calib/mean_conf": 0.9853174603174604, "calib/mu_c": 0.99, "calib/mu_w": 0.9794642857142859, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4297619047619048, "calib/std_conf": 0.06332940473951357, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9225273722627737, "calib/step_q_c_n": 1096.0, "calib/step_q_gap": 0.011989255670845411, "calib/step_q_w": 0.9105381165919283, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 702.04296875, "completions/mean_terminated_length": 704.796142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 303.0, "epoch": 0.058666666666666666, "grad_norm": 0.02334001660346985, "kl": 0.041500091552734375, "learning_rate": 4.027777777777779e-06, "loss": -0.0285, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.021185822784900665, "mask/share_reasoning": 0.8552565574645996, "mask/share_step_conf": 0.1196514219045639, "num_tokens": 16664219.0, "reward": 0.7132257223129272, "reward_std": 0.1884762942790985, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5606836080551147, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.5595178604125977, "step": 55 }, { "adv/mean_abs_final_conf": 0.43028029799461365, "adv/mean_abs_reasoning": 0.42309921979904175, "adv/mean_abs_step_conf": 0.7558045387268066, "adv/ratio_final_to_reasoning": 1.0169725630762985, "adv/ratio_step_to_reasoning": 1.7863529483363003, "adv/std_final_conf": 0.7008323669433594, "adv/std_reasoning": 0.7013556957244873, "adv/std_step_conf": 0.9343731999397278, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.3984375, "calib/ece": 0.41570281124498, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.220446049250313e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41570281124498, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9252973977695167, "calib/step_q_c_n": 1076.0, "calib/step_q_gap": 0.007473436889321117, "calib/step_q_w": 0.9178239608801956, "calib/step_q_w_n": 818.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2009.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 735.5078125, "completions/mean_terminated_length": 741.2991943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 399.0, "epoch": 0.05973333333333333, "grad_norm": 0.015703028067946434, "kl": 0.039569854736328125, "learning_rate": 4.000000000000001e-06, "loss": -0.0043, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02014715038239956, "mask/share_reasoning": 0.8610376119613647, "mask/share_step_conf": 0.11100269854068756, "num_tokens": 16959349.0, "reward": 0.7191253304481506, "reward_std": 0.2070513367652893, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5667777061462402, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.565222978591919, "step": 56 }, { "adv/mean_abs_final_conf": 0.4444199204444885, "adv/mean_abs_reasoning": 0.43316081166267395, "adv/mean_abs_step_conf": 0.7824656963348389, "adv/ratio_final_to_reasoning": 1.0259929072036706, "adv/ratio_step_to_reasoning": 1.8064092486376349, "adv/std_final_conf": 0.7026104927062988, "adv/std_reasoning": 0.7014220952987671, "adv/std_step_conf": 0.9334197640419006, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.55078125, "calib/ece": 0.3034920634920636, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.3306690738754696e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9899999999999995, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3034920634920636, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9256273199703045, "calib/step_q_c_n": 1347.0, "calib/step_q_gap": 0.005951552052215758, "calib/step_q_w": 0.9196757679180887, "calib/step_q_w_n": 586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 740.6171875, "completions/mean_terminated_length": 743.5216064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 0.0608, "grad_norm": 0.015545027330517769, "kl": 0.048763275146484375, "learning_rate": 3.972222222222223e-06, "loss": 0.0178, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02056235447525978, "mask/share_reasoning": 0.8628644943237305, "mask/share_step_conf": 0.11266690492630005, "num_tokens": 17255739.0, "reward": 0.8158880472183228, "reward_std": 0.23786352574825287, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6779488325119019, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6225773692131042, "step": 57 }, { "adv/mean_abs_final_conf": 0.5775068998336792, "adv/mean_abs_reasoning": 0.571548581123352, "adv/mean_abs_step_conf": 0.7824968099594116, "adv/ratio_final_to_reasoning": 1.0104248683438535, "adv/ratio_step_to_reasoning": 1.3690818870050394, "adv/std_final_conf": 0.7946643233299255, "adv/std_reasoning": 0.7929192781448364, "adv/std_step_conf": 0.935107946395874, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4966442953020134, "calib/avg_num_step_conf": 7.68359375, "calib/ece": 0.37930327868852465, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -6.711409395943058e-05, "calib/mean_conf": 0.9899590163934426, "calib/mu_c": 0.9899328859060402, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37930327868852465, "calib/std_conf": 0.0006388711995131108, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9162784810126582, "calib/step_q_c_n": 1185.0, "calib/step_q_gap": -0.0010488847162419823, "calib/step_q_w": 0.9173273657289002, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2762.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 777.26171875, "completions/mean_terminated_length": 792.7450561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 438.0, "epoch": 0.06186666666666667, "grad_norm": 0.015236100181937218, "kl": 0.03882598876953125, "learning_rate": 3.944444444444445e-06, "loss": -0.0967, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018720220774412155, "mask/share_reasoning": 0.8575639724731445, "mask/share_step_conf": 0.10418450832366943, "num_tokens": 17561038.0, "reward": 0.7194130420684814, "reward_std": 0.2849310040473938, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5893566012382507, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.542438268661499, "step": 58 }, { "adv/mean_abs_final_conf": 0.5258022546768188, "adv/mean_abs_reasoning": 0.5158117413520813, "adv/mean_abs_step_conf": 0.803584098815918, "adv/ratio_final_to_reasoning": 1.0193685263901704, "adv/ratio_step_to_reasoning": 1.5579019134180776, "adv/std_final_conf": 0.7406392693519592, "adv/std_reasoning": 0.7395347356796265, "adv/std_step_conf": 0.9328177571296692, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.505, "calib/avg_num_step_conf": 7.28125, "calib/ece": 0.3914457831325302, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000400000000000178, "calib/mean_conf": 0.989839357429719, "calib/mu_c": 0.99, "calib/mu_w": 0.9895999999999998, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3914457831325302, "calib/std_conf": 0.002529801726590142, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9236056838365897, "calib/step_q_c_n": 1126.0, "calib/step_q_gap": 0.0004485022647739978, "calib/step_q_w": 0.9231571815718157, "calib/step_q_w_n": 738.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 711.48046875, "completions/mean_terminated_length": 719.9170532226562, "completions/min_length": 0.0, "completions/min_terminated_length": 341.0, "epoch": 0.06293333333333333, "grad_norm": 0.013194671832025051, "kl": 0.047607421875, "learning_rate": 3.916666666666667e-06, "loss": -0.0722, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020480435341596603, "mask/share_reasoning": 0.8568286299705505, "mask/share_step_conf": 0.11097220331430435, "num_tokens": 17849425.0, "reward": 0.7413979768753052, "reward_std": 0.28497523069381714, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.5900495648384094, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.5810275077819824, "step": 59 }, { "adv/mean_abs_final_conf": 0.5531863570213318, "adv/mean_abs_reasoning": 0.5323112607002258, "adv/mean_abs_step_conf": 0.7446668148040771, "adv/ratio_final_to_reasoning": 1.0392159585233007, "adv/ratio_step_to_reasoning": 1.3989311701287503, "adv/std_final_conf": 0.7932870984077454, "adv/std_reasoning": 0.7929216027259827, "adv/std_step_conf": 0.9346678256988525, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.3828125, "calib/ece": 0.4249593495934959, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4249593495934959, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9261883408071749, "calib/step_q_c_n": 1115.0, "calib/step_q_gap": 0.00030085293132631197, "calib/step_q_w": 0.9258874878758486, "calib/step_q_w_n": 1031.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2978.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 755.8125, "completions/mean_terminated_length": 770.8685302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.064, "grad_norm": 0.012127498164772987, "kl": 0.04190826416015625, "learning_rate": 3.88888888888889e-06, "loss": -0.033, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01983150653541088, "mask/share_reasoning": 0.8472711443901062, "mask/share_step_conf": 0.11336609721183777, "num_tokens": 18151769.0, "reward": 0.6970077753067017, "reward_std": 0.28720253705978394, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5512320399284363, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.5420023202896118, "step": 60 }, { "adv/mean_abs_final_conf": 0.4006597697734833, "adv/mean_abs_reasoning": 0.3929939270019531, "adv/mean_abs_step_conf": 0.7636055946350098, "adv/ratio_final_to_reasoning": 1.0195062626794538, "adv/ratio_step_to_reasoning": 1.9430468059910115, "adv/std_final_conf": 0.6817258596420288, "adv/std_reasoning": 0.6815775632858276, "adv/std_step_conf": 0.9337095022201538, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.2109375, "calib/ece": 0.2836507936507937, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2836507936507937, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9289635854341738, "calib/step_q_c_n": 1428.0, "calib/step_q_gap": 0.002123822822897692, "calib/step_q_w": 0.9268397626112761, "calib/step_q_w_n": 674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2709.0, "completions/max_terminated_length": 2709.0, "completions/mean_length": 701.54296875, "completions/mean_terminated_length": 701.54296875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.06506666666666666, "grad_norm": 0.012162295170128345, "kl": 0.047260284423828125, "learning_rate": 3.861111111111112e-06, "loss": 0.058, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02129204198718071, "mask/share_reasoning": 0.8529036045074463, "mask/share_step_conf": 0.1258043497800827, "num_tokens": 18435428.0, "reward": 0.8267836570739746, "reward_std": 0.21470335125923157, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.697089433670044, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6213215589523315, "step": 61 }, { "adv/mean_abs_final_conf": 0.5658851265907288, "adv/mean_abs_reasoning": 0.5585103034973145, "adv/mean_abs_step_conf": 0.7698912620544434, "adv/ratio_final_to_reasoning": 1.0132044530731736, "adv/ratio_step_to_reasoning": 1.3784728002930124, "adv/std_final_conf": 0.7942206859588623, "adv/std_reasoning": 0.7928854823112488, "adv/std_step_conf": 0.9352415204048157, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.94921875, "calib/ece": 0.41570281124498, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.220446049250313e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41570281124498, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9266353982300884, "calib/step_q_c_n": 1130.0, "calib/step_q_gap": 0.0038729672908619506, "calib/step_q_w": 0.9227624309392265, "calib/step_q_w_n": 905.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2037.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 729.51171875, "completions/mean_terminated_length": 735.2559204101562, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.06613333333333334, "grad_norm": 0.01219266839325428, "kl": 0.046234130859375, "learning_rate": 3.833333333333334e-06, "loss": 0.0092, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020023802295327187, "mask/share_reasoning": 0.8546423316001892, "mask/share_step_conf": 0.11752137541770935, "num_tokens": 18729263.0, "reward": 0.7053297758102417, "reward_std": 0.29709774255752563, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5667777061462402, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.5376317501068115, "step": 62 }, { "adv/mean_abs_final_conf": 0.45947033166885376, "adv/mean_abs_reasoning": 0.4331585168838501, "adv/mean_abs_step_conf": 0.7686165571212769, "adv/ratio_final_to_reasoning": 1.0607440781132305, "adv/ratio_step_to_reasoning": 1.7744463681580538, "adv/std_final_conf": 0.7218553423881531, "adv/std_reasoning": 0.7207862734794617, "adv/std_step_conf": 0.9341201782226562, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5048076923076923, "calib/avg_num_step_conf": 7.97265625, "calib/ece": 0.41247967479674796, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006730769230766098, "calib/mean_conf": 0.9897154471544716, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9893269230769233, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41247967479674796, "calib/std_conf": 0.004453956540548361, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9278424015009382, "calib/step_q_c_n": 1066.0, "calib/step_q_gap": 0.0019449656035023244, "calib/step_q_w": 0.9258974358974359, "calib/step_q_w_n": 975.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3032.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 723.21875, "completions/mean_terminated_length": 743.5501708984375, "completions/min_length": 0.0, "completions/min_terminated_length": 382.0, "epoch": 0.0672, "grad_norm": 0.012249491177499294, "kl": 0.04229736328125, "learning_rate": 3.8055555555555556e-06, "loss": -0.0804, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019764099270105362, "mask/share_reasoning": 0.8458855152130127, "mask/share_step_conf": 0.10700662434101105, "num_tokens": 19023047.0, "reward": 0.7130417823791504, "reward_std": 0.2504788637161255, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5632386207580566, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.559719979763031, "step": 63 }, { "adv/mean_abs_final_conf": 0.4521995484828949, "adv/mean_abs_reasoning": 0.45031213760375977, "adv/mean_abs_step_conf": 0.7582074999809265, "adv/ratio_final_to_reasoning": 1.0041913391213007, "adv/ratio_step_to_reasoning": 1.683737649212758, "adv/std_final_conf": 0.7208352088928223, "adv/std_reasoning": 0.7206861972808838, "adv/std_step_conf": 0.9330006241798401, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.66015625, "calib/ece": 0.24099601593625497, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -3.3306690738754696e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9900000000000003, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24099601593625497, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9253862068965517, "calib/step_q_c_n": 1450.0, "calib/step_q_gap": 0.0016680072879411423, "calib/step_q_w": 0.9237181996086106, "calib/step_q_w_n": 511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2761.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 705.65625, "completions/mean_terminated_length": 714.0237426757812, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.06826666666666667, "grad_norm": 0.012615746818482876, "kl": 0.047271728515625, "learning_rate": 3.777777777777778e-06, "loss": -0.0807, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.02077304944396019, "mask/share_reasoning": 0.8502489328384399, "mask/share_step_conf": 0.11725927889347076, "num_tokens": 19307471.0, "reward": 0.8746538758277893, "reward_std": 0.24922211468219757, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.735292911529541, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6718271970748901, "step": 64 }, { "adv/mean_abs_final_conf": 0.3024356961250305, "adv/mean_abs_reasoning": 0.3007676899433136, "adv/mean_abs_step_conf": 0.7550225257873535, "adv/ratio_final_to_reasoning": 1.0055458290151822, "adv/ratio_step_to_reasoning": 2.510317933185092, "adv/std_final_conf": 0.5958569049835205, "adv/std_reasoning": 0.5959718823432922, "adv/std_step_conf": 0.9315097332000732, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4967741935483871, "calib/avg_num_step_conf": 8.37109375, "calib/ece": 0.3772332015810277, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00019354838709662037, "calib/mean_conf": 0.9898814229249012, "calib/mu_c": 0.9898064516129034, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3772332015810277, "calib/std_conf": 0.0018823527114293543, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9284633757961784, "calib/step_q_c_n": 1256.0, "calib/step_q_gap": 0.0004024964275199938, "calib/step_q_w": 0.9280608793686584, "calib/step_q_w_n": 887.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 656.8984375, "completions/mean_terminated_length": 664.687744140625, "completions/min_length": 0.0, "completions/min_terminated_length": 368.0, "epoch": 0.06933333333333333, "grad_norm": 0.014276616275310516, "kl": 0.04969024658203125, "learning_rate": 3.7500000000000005e-06, "loss": -0.0374, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02203986421227455, "mask/share_reasoning": 0.8352499008178711, "mask/share_step_conf": 0.13099150359630585, "num_tokens": 19580661.0, "reward": 0.744539737701416, "reward_std": 0.16033200919628143, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.609114408493042, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.5619961619377136, "step": 65 }, { "adv/mean_abs_final_conf": 0.5220428705215454, "adv/mean_abs_reasoning": 0.5041089653968811, "adv/mean_abs_step_conf": 0.7199528217315674, "adv/ratio_final_to_reasoning": 1.0355754536334125, "adv/ratio_step_to_reasoning": 1.4281690490561978, "adv/std_final_conf": 0.7923429608345032, "adv/std_reasoning": 0.792874813079834, "adv/std_step_conf": 0.9343789219856262, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.6796875, "calib/ece": 0.4776033057851241, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4776033057851241, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.92610710607621, "calib/step_q_c_n": 971.0, "calib/step_q_gap": -1.5995442574912033e-05, "calib/step_q_w": 0.926123101518785, "calib/step_q_w_n": 1251.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2566.0, "completions/max_terminated_length": 2566.0, "completions/mean_length": 768.4765625, "completions/mean_terminated_length": 799.7153930664062, "completions/min_length": 0.0, "completions/min_terminated_length": 368.0, "epoch": 0.0704, "grad_norm": 0.01726270467042923, "kl": 0.042469024658203125, "learning_rate": 3.7222222222222225e-06, "loss": -0.1283, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018511904403567314, "mask/share_reasoning": 0.836242139339447, "mask/share_step_conf": 0.1061834916472435, "num_tokens": 19883743.0, "reward": 0.6393930912017822, "reward_std": 0.273938924074173, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.49349915981292725, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.499349445104599, "step": 66 }, { "adv/mean_abs_final_conf": 0.39268285036087036, "adv/mean_abs_reasoning": 0.38221269845962524, "adv/mean_abs_step_conf": 0.7692726254463196, "adv/ratio_final_to_reasoning": 1.0273935218359866, "adv/ratio_step_to_reasoning": 2.0126820185373333, "adv/std_final_conf": 0.6624247431755066, "adv/std_reasoning": 0.661316454410553, "adv/std_step_conf": 0.9332561492919922, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.1953125, "calib/ece": 0.36944664031620555, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.220446049250313e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36944664031620555, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9247068273092371, "calib/step_q_c_n": 1245.0, "calib/step_q_gap": 0.0067466866292840155, "calib/step_q_w": 0.9179601406799531, "calib/step_q_w_n": 853.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2870.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 749.734375, "completions/mean_terminated_length": 755.6378173828125, "completions/min_length": 0.0, "completions/min_terminated_length": 393.0, "epoch": 0.07146666666666666, "grad_norm": 0.014443333260715008, "kl": 0.043788909912109375, "learning_rate": 3.694444444444445e-06, "loss": -0.0186, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01966247148811817, "mask/share_reasoning": 0.8596717715263367, "mask/share_step_conf": 0.11285325139760971, "num_tokens": 20180683.0, "reward": 0.7620497345924377, "reward_std": 0.21320092678070068, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6206823587417603, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.5831045508384705, "step": 67 }, { "adv/mean_abs_final_conf": 0.38533303141593933, "adv/mean_abs_reasoning": 0.3794650435447693, "adv/mean_abs_step_conf": 0.7499998807907104, "adv/ratio_final_to_reasoning": 1.0154638430363816, "adv/ratio_step_to_reasoning": 1.9764663268705684, "adv/std_final_conf": 0.6597793698310852, "adv/std_reasoning": 0.661293625831604, "adv/std_step_conf": 0.9329918622970581, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.22265625, "calib/ece": 0.3965040650406504, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3965040650406504, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9219471947194721, "calib/step_q_c_n": 1212.0, "calib/step_q_gap": 0.013806097295059994, "calib/step_q_w": 0.9081410974244121, "calib/step_q_w_n": 893.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 699.83984375, "completions/mean_terminated_length": 716.6360473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 335.0, "epoch": 0.07253333333333334, "grad_norm": 0.013353622518479824, "kl": 0.046169281005859375, "learning_rate": 3.6666666666666666e-06, "loss": -0.1861, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02063082344830036, "mask/share_reasoning": 0.8354395627975464, "mask/share_step_conf": 0.12049206346273422, "num_tokens": 20463930.0, "reward": 0.7265154123306274, "reward_std": 0.1846695840358734, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5780289173126221, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.5687519311904907, "step": 68 }, { "adv/mean_abs_final_conf": 0.5332236289978027, "adv/mean_abs_reasoning": 0.5274813175201416, "adv/mean_abs_step_conf": 0.7604198455810547, "adv/ratio_final_to_reasoning": 1.0108862840956294, "adv/ratio_step_to_reasoning": 1.4416052669998467, "adv/std_final_conf": 0.7735781073570251, "adv/std_reasoning": 0.7754120230674744, "adv/std_step_conf": 0.9339145421981812, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4966442953020134, "calib/avg_num_step_conf": 8.05078125, "calib/ece": 0.37930327868852465, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -6.711409395943058e-05, "calib/mean_conf": 0.9899590163934426, "calib/mu_c": 0.9899328859060402, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37930327868852465, "calib/std_conf": 0.0006388711995131111, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9190017825311944, "calib/step_q_c_n": 1122.0, "calib/step_q_gap": -0.001743691377218859, "calib/step_q_w": 0.9207454739084132, "calib/step_q_w_n": 939.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 778.88671875, "completions/mean_terminated_length": 794.4024047851562, "completions/min_length": 0.0, "completions/min_terminated_length": 456.0, "epoch": 0.0736, "grad_norm": 0.011452188715338707, "kl": 0.036396026611328125, "learning_rate": 3.638888888888889e-06, "loss": -0.0839, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018734045326709747, "mask/share_reasoning": 0.8554292917251587, "mask/share_step_conf": 0.10630541294813156, "num_tokens": 20767821.0, "reward": 0.7228295803070068, "reward_std": 0.25022977590560913, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5893566608428955, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.5492713451385498, "step": 69 }, { "adv/mean_abs_final_conf": 0.42671480774879456, "adv/mean_abs_reasoning": 0.41062837839126587, "adv/mean_abs_step_conf": 0.7731932401657104, "adv/ratio_final_to_reasoning": 1.0391751525322022, "adv/ratio_step_to_reasoning": 1.8829513030611242, "adv/std_final_conf": 0.6997833847999573, "adv/std_reasoning": 0.7014550566673279, "adv/std_step_conf": 0.9338797330856323, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5061626385732976, "calib/avg_num_step_conf": 8.01953125, "calib/ece": 0.40758196721311474, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9918032786885246, "calib/gap": 0.009264614749018851, "calib/mean_conf": 0.9854508196721311, "calib/mu_c": 0.9893617021276594, "calib/mu_w": 0.9800970873786405, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40758196721311474, "calib/std_conf": 0.06350585392650582, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9239586919104991, "calib/step_q_c_n": 1162.0, "calib/step_q_gap": 0.005395280013753778, "calib/step_q_w": 0.9185634118967453, "calib/step_q_w_n": 891.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 782.34765625, "completions/mean_terminated_length": 801.1240234375, "completions/min_length": 0.0, "completions/min_terminated_length": 321.0, "epoch": 0.07466666666666667, "grad_norm": 0.01270360592752695, "kl": 0.037288665771484375, "learning_rate": 3.6111111111111115e-06, "loss": -0.1025, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01925162971019745, "mask/share_reasoning": 0.8457670211791992, "mask/share_step_conf": 0.11154384166002274, "num_tokens": 21075094.0, "reward": 0.7064595222473145, "reward_std": 0.2108471393585205, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5627511739730835, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.5493866801261902, "step": 70 }, { "adv/mean_abs_final_conf": 0.45908522605895996, "adv/mean_abs_reasoning": 0.4423348605632782, "adv/mean_abs_step_conf": 0.7758592367172241, "adv/ratio_final_to_reasoning": 1.037868065551857, "adv/ratio_step_to_reasoning": 1.7540087971570435, "adv/std_final_conf": 0.7211512327194214, "adv/std_reasoning": 0.7207404971122742, "adv/std_step_conf": 0.932945966720581, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.234375, "calib/ece": 0.4290243902439024, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4290243902439024, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9207659208261618, "calib/step_q_c_n": 1162.0, "calib/step_q_gap": 0.0003430878451893271, "calib/step_q_w": 0.9204228329809725, "calib/step_q_w_n": 946.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2836.0, "completions/max_terminated_length": 2836.0, "completions/mean_length": 774.99609375, "completions/mean_terminated_length": 787.2976684570312, "completions/min_length": 0.0, "completions/min_terminated_length": 399.0, "epoch": 0.07573333333333333, "grad_norm": 0.011231261305510998, "kl": 0.03789520263671875, "learning_rate": 3.5833333333333335e-06, "loss": -0.0531, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01901676505804062, "mask/share_reasoning": 0.8521870374679565, "mask/share_step_conf": 0.11317116022109985, "num_tokens": 21377901.0, "reward": 0.6887651085853577, "reward_std": 0.23772430419921875, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.547403872013092, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.5301263332366943, "step": 71 }, { "adv/mean_abs_final_conf": 0.4625837206840515, "adv/mean_abs_reasoning": 0.45626822113990784, "adv/mean_abs_step_conf": 0.7668143510818481, "adv/ratio_final_to_reasoning": 1.013841637991718, "adv/ratio_step_to_reasoning": 1.6806218701054703, "adv/std_final_conf": 0.7177895307540894, "adv/std_reasoning": 0.7206159830093384, "adv/std_step_conf": 0.9346785545349121, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.640625, "calib/ece": 0.4438152610441768, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4438152610441768, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9228031634446396, "calib/step_q_c_n": 1138.0, "calib/step_q_gap": 0.011536869217451562, "calib/step_q_w": 0.9112662942271881, "calib/step_q_w_n": 1074.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3043.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 726.64453125, "completions/mean_terminated_length": 741.1195678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 427.0, "epoch": 0.0768, "grad_norm": 0.011547032743692398, "kl": 0.04325103759765625, "learning_rate": 3.555555555555556e-06, "loss": -0.0638, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019756484776735306, "mask/share_reasoning": 0.840281069278717, "mask/share_step_conf": 0.12043121457099915, "num_tokens": 21668330.0, "reward": 0.7075121402740479, "reward_std": 0.23124364018440247, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5399808883666992, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.5742621421813965, "step": 72 }, { "adv/mean_abs_final_conf": 0.48381316661834717, "adv/mean_abs_reasoning": 0.4664658010005951, "adv/mean_abs_step_conf": 0.7619782090187073, "adv/ratio_final_to_reasoning": 1.037188933423503, "adv/ratio_step_to_reasoning": 1.633513555300778, "adv/std_final_conf": 0.7406063079833984, "adv/std_reasoning": 0.7393958568572998, "adv/std_step_conf": 0.9333019852638245, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5113636363636364, "calib/avg_num_step_conf": 8.41796875, "calib/ece": 0.34019920318725105, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.001136363636363491, "calib/mean_conf": 0.989601593625498, "calib/mu_c": 0.99, "calib/mu_w": 0.9888636363636365, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34019920318725105, "calib/std_conf": 0.005701806298877532, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9204226415094339, "calib/step_q_c_n": 1325.0, "calib/step_q_gap": 0.006506978858831558, "calib/step_q_w": 0.9139156626506023, "calib/step_q_w_n": 830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 739.50390625, "completions/mean_terminated_length": 748.2727661132812, "completions/min_length": 0.0, "completions/min_terminated_length": 439.0, "epoch": 0.07786666666666667, "grad_norm": 0.011704598553478718, "kl": 0.03731536865234375, "learning_rate": 3.5277777777777784e-06, "loss": -0.018, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.019500505179166794, "mask/share_reasoning": 0.8524847626686096, "mask/share_step_conf": 0.1162959635257721, "num_tokens": 21964675.0, "reward": 0.7815718054771423, "reward_std": 0.25788426399230957, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6402535438537598, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6010150909423828, "step": 73 }, { "adv/mean_abs_final_conf": 0.43151819705963135, "adv/mean_abs_reasoning": 0.4239124059677124, "adv/mean_abs_step_conf": 0.7469241619110107, "adv/ratio_final_to_reasoning": 1.0179418931478459, "adv/ratio_step_to_reasoning": 1.7619775958335617, "adv/std_final_conf": 0.7015545964241028, "adv/std_reasoning": 0.7014573216438293, "adv/std_step_conf": 0.9340309500694275, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5102040816326531, "calib/avg_num_step_conf": 8.33203125, "calib/ece": 0.3823886639676113, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": 0.011020408163265327, "calib/mean_conf": 0.9856275303643725, "calib/mu_c": 0.99, "calib/mu_w": 0.9789795918367347, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3823886639676113, "calib/std_conf": 0.06310064362496394, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9235135135135136, "calib/step_q_c_n": 1258.0, "calib/step_q_gap": 0.012222084942085099, "calib/step_q_w": 0.9112914285714285, "calib/step_q_w_n": 875.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2662.0, "completions/max_terminated_length": 2662.0, "completions/mean_length": 758.73828125, "completions/mean_terminated_length": 773.8526000976562, "completions/min_length": 0.0, "completions/min_terminated_length": 424.0, "epoch": 0.07893333333333333, "grad_norm": 0.011972285807132721, "kl": 0.037693023681640625, "learning_rate": 3.5e-06, "loss": -0.0833, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018995292484760284, "mask/share_reasoning": 0.8444027900695801, "mask/share_step_conf": 0.11707063764333725, "num_tokens": 22262840.0, "reward": 0.7532713413238525, "reward_std": 0.23117206990718842, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5940839648246765, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.603083610534668, "step": 74 }, { "adv/mean_abs_final_conf": 0.41009172797203064, "adv/mean_abs_reasoning": 0.4013540744781494, "adv/mean_abs_step_conf": 0.7563887238502502, "adv/ratio_final_to_reasoning": 1.0217704367527405, "adv/ratio_step_to_reasoning": 1.8845921144159947, "adv/std_final_conf": 0.6615861058235168, "adv/std_reasoning": 0.6613550782203674, "adv/std_step_conf": 0.9317781329154968, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5013578522656735, "calib/avg_num_step_conf": 8.97265625, "calib/ece": 0.2767330677290837, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.7157045313308537e-05, "calib/mean_conf": 0.9898804780876495, "calib/mu_c": 0.9898882681564246, "calib/mu_w": 0.9898611111111113, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2767330677290837, "calib/std_conf": 0.001086707704939113, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9227473219911784, "calib/step_q_c_n": 1587.0, "calib/step_q_gap": 0.006986758610896593, "calib/step_q_w": 0.9157605633802818, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 703.82421875, "completions/mean_terminated_length": 717.8446655273438, "completions/min_length": 0.0, "completions/min_terminated_length": 406.0, "epoch": 0.08, "grad_norm": 0.011645482853055, "kl": 0.042751312255859375, "learning_rate": 3.4722222222222224e-06, "loss": -0.0808, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.020144283771514893, "mask/share_reasoning": 0.8334736824035645, "mask/share_step_conf": 0.12685082852840424, "num_tokens": 22547771.0, "reward": 0.8555404543876648, "reward_std": 0.20708394050598145, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7048202753067017, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6703230142593384, "step": 75 }, { "adv/mean_abs_final_conf": 0.29195424914360046, "adv/mean_abs_reasoning": 0.2810046076774597, "adv/mean_abs_step_conf": 0.7327713966369629, "adv/ratio_final_to_reasoning": 1.0389660566659067, "adv/ratio_step_to_reasoning": 2.6076846308443677, "adv/std_final_conf": 0.5702286958694458, "adv/std_reasoning": 0.5727683901786804, "adv/std_step_conf": 0.9335572123527527, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.921875, "calib/ece": 0.300204081632653, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.300204081632653, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9203335750543873, "calib/step_q_c_n": 1379.0, "calib/step_q_gap": 0.019681641352729873, "calib/step_q_w": 0.9006519337016574, "calib/step_q_w_n": 905.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 778.91015625, "completions/mean_terminated_length": 804.0362548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 437.0, "epoch": 0.08106666666666666, "grad_norm": 0.012540625408291817, "kl": 0.03820037841796875, "learning_rate": 3.444444444444445e-06, "loss": -0.1275, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018356502056121826, "mask/share_reasoning": 0.8417627811431885, "mask/share_step_conf": 0.10863067209720612, "num_tokens": 22850228.0, "reward": 0.7930084466934204, "reward_std": 0.15477627515792847, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.6581863164901733, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.6059556007385254, "step": 76 }, { "adv/mean_abs_final_conf": 0.5305771827697754, "adv/mean_abs_reasoning": 0.5090773701667786, "adv/mean_abs_step_conf": 0.7380466461181641, "adv/ratio_final_to_reasoning": 1.042232897910888, "adv/ratio_step_to_reasoning": 1.4497730391676475, "adv/std_final_conf": 0.7934849858283997, "adv/std_reasoning": 0.7929652333259583, "adv/std_step_conf": 0.93379807472229, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5125, "calib/avg_num_step_conf": 9.19140625, "calib/ece": 0.2594190871369296, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004449300699300718, "calib/mean_conf": 0.9897095435684649, "calib/mu_c": 0.9898295454545455, "calib/mu_w": 0.9893846153846154, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2594190871369296, "calib/std_conf": 0.0027927499867989804, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.922860962566845, "calib/step_q_c_n": 1496.0, "calib/step_q_gap": -0.004105198459992909, "calib/step_q_w": 0.9269661610268379, "calib/step_q_w_n": 857.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2544.0, "completions/max_terminated_length": 2544.0, "completions/mean_length": 763.61328125, "completions/mean_terminated_length": 791.437255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 390.0, "epoch": 0.08213333333333334, "grad_norm": 0.01434855256229639, "kl": 0.038990020751953125, "learning_rate": 3.416666666666667e-06, "loss": -0.1243, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018347784876823425, "mask/share_reasoning": 0.8309397101402283, "mask/share_step_conf": 0.11555621773004532, "num_tokens": 23150377.0, "reward": 0.8227552771568298, "reward_std": 0.2816285789012909, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.6927835941314697, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.6269456148147583, "step": 77 }, { "adv/mean_abs_final_conf": 0.5190737843513489, "adv/mean_abs_reasoning": 0.5054394602775574, "adv/mean_abs_step_conf": 0.7535461783409119, "adv/ratio_final_to_reasoning": 1.026975187228762, "adv/ratio_step_to_reasoning": 1.4908732648754985, "adv/std_final_conf": 0.7569723129272461, "adv/std_reasoning": 0.7576473951339722, "adv/std_step_conf": 0.9331627488136292, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.32421875, "calib/ece": 0.38516129032258073, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.220446049250313e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38516129032258073, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9183696529459242, "calib/step_q_c_n": 1239.0, "calib/step_q_gap": 0.0008023883719331382, "calib/step_q_w": 0.9175672645739911, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 810.0390625, "completions/mean_terminated_length": 826.17529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 456.0, "epoch": 0.0832, "grad_norm": 0.014095221646130085, "kl": 0.034351348876953125, "learning_rate": 3.3888888888888893e-06, "loss": -0.0805, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017470698803663254, "mask/share_reasoning": 0.8594420552253723, "mask/share_step_conf": 0.10355598479509354, "num_tokens": 23465771.0, "reward": 0.7383511066436768, "reward_std": 0.2517282962799072, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.5934968590736389, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.5722677707672119, "step": 78 }, { "adv/mean_abs_final_conf": 0.5218999981880188, "adv/mean_abs_reasoning": 0.4994032084941864, "adv/mean_abs_step_conf": 0.7597758769989014, "adv/ratio_final_to_reasoning": 1.0450473471359252, "adv/ratio_step_to_reasoning": 1.521367632558464, "adv/std_final_conf": 0.7779203653335571, "adv/std_reasoning": 0.7754961848258972, "adv/std_step_conf": 0.935109555721283, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.49411764705882355, "calib/avg_num_step_conf": 8.578125, "calib/ece": 0.2843983402489627, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000294117647058445, "calib/mean_conf": 0.989792531120332, "calib/mu_c": 0.9897058823529412, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2843983402489627, "calib/std_conf": 0.002647810146646279, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9181918008784773, "calib/step_q_c_n": 1366.0, "calib/step_q_gap": -0.0019286810492336048, "calib/step_q_w": 0.9201204819277109, "calib/step_q_w_n": 830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 777.83984375, "completions/mean_terminated_length": 809.4592895507812, "completions/min_length": 0.0, "completions/min_terminated_length": 477.0, "epoch": 0.08426666666666667, "grad_norm": 0.014404214918613434, "kl": 0.033031463623046875, "learning_rate": 3.3611111111111117e-06, "loss": -0.1326, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01771673932671547, "mask/share_reasoning": 0.8376704454421997, "mask/share_step_conf": 0.10555030405521393, "num_tokens": 23771274.0, "reward": 0.8042483925819397, "reward_std": 0.27935120463371277, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6695046424865723, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.6178983449935913, "step": 79 }, { "adv/mean_abs_final_conf": 0.42294803261756897, "adv/mean_abs_reasoning": 0.4056248068809509, "adv/mean_abs_step_conf": 0.75342857837677, "adv/ratio_final_to_reasoning": 1.0427075105929169, "adv/ratio_step_to_reasoning": 1.857451925019709, "adv/std_final_conf": 0.7015489935874939, "adv/std_reasoning": 0.7013869285583496, "adv/std_step_conf": 0.9324942231178284, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5080645161290323, "calib/avg_num_step_conf": 9.09375, "calib/ece": 0.2388755020080322, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00048387096774160643, "calib/mean_conf": 0.9898795180722892, "calib/mu_c": 0.99, "calib/mu_w": 0.9895161290322584, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2388755020080322, "calib/std_conf": 0.001897351294942607, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9194984709480123, "calib/step_q_c_n": 1635.0, "calib/step_q_gap": -0.00651884506930378, "calib/step_q_w": 0.9260173160173161, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 723.2734375, "completions/mean_terminated_length": 737.6812744140625, "completions/min_length": 0.0, "completions/min_terminated_length": 453.0, "epoch": 0.08533333333333333, "grad_norm": 0.010196685791015625, "kl": 0.036296844482421875, "learning_rate": 3.3333333333333333e-06, "loss": -0.0607, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019432738423347473, "mask/share_reasoning": 0.837583065032959, "mask/share_step_conf": 0.12345296889543533, "num_tokens": 24058592.0, "reward": 0.8649125695228577, "reward_std": 0.2212262749671936, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7354437112808228, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6537563800811768, "step": 80 }, { "adv/mean_abs_final_conf": 0.4704670011997223, "adv/mean_abs_reasoning": 0.4546546936035156, "adv/mean_abs_step_conf": 0.7636871337890625, "adv/ratio_final_to_reasoning": 1.0347787184838697, "adv/ratio_step_to_reasoning": 1.6797080169484415, "adv/std_final_conf": 0.7218642830848694, "adv/std_reasoning": 0.7208608388900757, "adv/std_step_conf": 0.9334217309951782, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5072463768115942, "calib/avg_num_step_conf": 9.02734375, "calib/ece": 0.2712653061224489, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": 0.0013043478260872376, "calib/mean_conf": 0.9896326530612244, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9886956521739129, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2712653061224489, "calib/std_conf": 0.00573814261903346, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9083748271092671, "calib/step_q_c_n": 1446.0, "calib/step_q_gap": 0.01323031843874689, "calib/step_q_w": 0.8951445086705202, "calib/step_q_w_n": 865.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2704.0, "completions/max_terminated_length": 2704.0, "completions/mean_length": 750.98046875, "completions/mean_terminated_length": 778.3441772460938, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 0.0864, "grad_norm": 0.020107468590140343, "kl": 0.06332015991210938, "learning_rate": 3.3055555555555558e-06, "loss": -0.1792, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018680231645703316, "mask/share_reasoning": 0.8345333337783813, "mask/share_step_conf": 0.11163021624088287, "num_tokens": 24357091.0, "reward": 0.8299020528793335, "reward_std": 0.2704859673976898, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.6934593915939331, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6374385356903076, "step": 81 }, { "adv/mean_abs_final_conf": 0.5415908694267273, "adv/mean_abs_reasoning": 0.5319398641586304, "adv/mean_abs_step_conf": 0.7804492712020874, "adv/ratio_final_to_reasoning": 1.018143038186021, "adv/ratio_step_to_reasoning": 1.4671757538542904, "adv/std_final_conf": 0.7750914692878723, "adv/std_reasoning": 0.7754891514778137, "adv/std_step_conf": 0.9343975782394409, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5054945054945055, "calib/avg_num_step_conf": 9.28515625, "calib/ece": 0.35542168674698804, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00010989010988993186, "calib/mean_conf": 0.9899598393574298, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9898901098901098, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35542168674698804, "calib/std_conf": 0.0006324504316475354, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9176951130561636, "calib/step_q_c_n": 1371.0, "calib/step_q_gap": -0.006678644399104927, "calib/step_q_w": 0.9243737574552685, "calib/step_q_w_n": 1006.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2241.0, "completions/max_terminated_length": 2241.0, "completions/mean_length": 732.8984375, "completions/mean_terminated_length": 744.5317993164062, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.08746666666666666, "grad_norm": 0.010728063993155956, "kl": 0.034942626953125, "learning_rate": 3.277777777777778e-06, "loss": -0.0595, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019511591643095016, "mask/share_reasoning": 0.8416132926940918, "mask/share_step_conf": 0.1232500821352005, "num_tokens": 24650265.0, "reward": 0.768690288066864, "reward_std": 0.2716025710105896, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6242765188217163, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.5951353311538696, "step": 82 }, { "adv/mean_abs_final_conf": 0.3733702301979065, "adv/mean_abs_reasoning": 0.35768407583236694, "adv/mean_abs_step_conf": 0.758823812007904, "adv/ratio_final_to_reasoning": 1.0438547741579949, "adv/ratio_step_to_reasoning": 2.121491738881706, "adv/std_final_conf": 0.659427285194397, "adv/std_reasoning": 0.6613901257514954, "adv/std_step_conf": 0.9329133629798889, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.4963768115942029, "calib/avg_num_step_conf": 9.39453125, "calib/ece": 0.42205761316872437, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -7.24637681158713e-05, "calib/mean_conf": 0.9899588477366256, "calib/mu_c": 0.9899275362318841, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42205761316872437, "calib/std_conf": 0.000640178978852019, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9205405405405407, "calib/step_q_c_n": 1147.0, "calib/step_q_gap": 0.000993640699523013, "calib/step_q_w": 0.9195468998410177, "calib/step_q_w_n": 1258.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 800.12890625, "completions/mean_terminated_length": 829.283447265625, "completions/min_length": 0.0, "completions/min_terminated_length": 468.0, "epoch": 0.08853333333333334, "grad_norm": 0.013022164814174175, "kl": 0.031452178955078125, "learning_rate": 3.2500000000000002e-06, "loss": -0.2175, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017471952363848686, "mask/share_reasoning": 0.8387032151222229, "mask/share_step_conf": 0.10866856575012207, "num_tokens": 24962362.0, "reward": 0.6908458471298218, "reward_std": 0.18251261115074158, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.54716956615448, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.5368659496307373, "step": 83 }, { "adv/mean_abs_final_conf": 0.5106885433197021, "adv/mean_abs_reasoning": 0.5043027997016907, "adv/mean_abs_step_conf": 0.776948869228363, "adv/ratio_final_to_reasoning": 1.0126625186728861, "adv/ratio_step_to_reasoning": 1.5406396111382887, "adv/std_final_conf": 0.7742244601249695, "adv/std_reasoning": 0.7754048705101013, "adv/std_step_conf": 0.9331316351890564, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5164835164835164, "calib/avg_num_step_conf": 9.15625, "calib/ece": 0.35330612244897963, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9918367346938776, "calib/gap": 0.021868131868131968, "calib/mean_conf": 0.9818775510204082, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9681318681318679, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35330612244897963, "calib/std_conf": 0.08908002195986395, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9211308203991131, "calib/step_q_c_n": 1353.0, "calib/step_q_gap": 0.0016555429016358314, "calib/step_q_w": 0.9194752774974773, "calib/step_q_w_n": 991.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 772.91796875, "completions/mean_terminated_length": 794.6465454101562, "completions/min_length": 0.0, "completions/min_terminated_length": 394.0, "epoch": 0.0896, "grad_norm": 0.011630590073764324, "kl": 0.033512115478515625, "learning_rate": 3.2222222222222227e-06, "loss": -0.1131, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01856730505824089, "mask/share_reasoning": 0.8370269536972046, "mask/share_step_conf": 0.11706199496984482, "num_tokens": 25266149.0, "reward": 0.7697278261184692, "reward_std": 0.2595728933811188, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6163101196289062, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6114266514778137, "step": 84 }, { "adv/mean_abs_final_conf": 0.5432265996932983, "adv/mean_abs_reasoning": 0.5405408143997192, "adv/mean_abs_step_conf": 0.7449895143508911, "adv/ratio_final_to_reasoning": 1.0049687002757817, "adv/ratio_step_to_reasoning": 1.3782299032834662, "adv/std_final_conf": 0.792640745639801, "adv/std_reasoning": 0.7929560542106628, "adv/std_step_conf": 0.9342106580734253, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.5059523809523809, "calib/avg_num_step_conf": 9.296875, "calib/ece": 0.35034334763948494, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00047619047619074095, "calib/mean_conf": 0.9898283261802575, "calib/mu_c": 0.99, "calib/mu_w": 0.9895238095238093, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35034334763948494, "calib/std_conf": 0.002614857718751559, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.918590240123935, "calib/step_q_c_n": 1291.0, "calib/step_q_gap": -0.008581477047782116, "calib/step_q_w": 0.9271717171717171, "calib/step_q_w_n": 1089.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 790.203125, "completions/mean_terminated_length": 842.8833618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.09066666666666667, "grad_norm": 0.014631648547947407, "kl": 0.031978607177734375, "learning_rate": 3.1944444444444443e-06, "loss": -0.2057, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.017018930986523628, "mask/share_reasoning": 0.8115172386169434, "mask/share_step_conf": 0.10896383225917816, "num_tokens": 25576265.0, "reward": 0.7271938323974609, "reward_std": 0.26602375507354736, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.588728129863739, "rewards/format_reward_step": 0.90625, "rewards/step_l1_reward": 0.5680031776428223, "step": 85 }, { "adv/mean_abs_final_conf": 0.5152809619903564, "adv/mean_abs_reasoning": 0.5122286677360535, "adv/mean_abs_step_conf": 0.7808734774589539, "adv/ratio_final_to_reasoning": 1.0059588508932806, "adv/ratio_step_to_reasoning": 1.5244626602221538, "adv/std_final_conf": 0.7563278675079346, "adv/std_reasoning": 0.7577018141746521, "adv/std_step_conf": 0.9337679147720337, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5006368999731832, "calib/avg_num_step_conf": 9.125, "calib/ece": 0.4511428571428572, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.2737999463663385e-05, "calib/mean_conf": 0.9899183673469388, "calib/mu_c": 0.9899242424242424, "calib/mu_w": 0.9899115044247787, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4511428571428572, "calib/std_conf": 0.0008998125585734132, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.917151515151515, "calib/step_q_c_n": 1155.0, "calib/step_q_gap": -0.00500767197803631, "calib/step_q_w": 0.9221591871295514, "calib/step_q_w_n": 1181.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2438.0, "completions/max_terminated_length": 2438.0, "completions/mean_length": 782.5546875, "completions/mean_terminated_length": 811.06884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 419.0, "epoch": 0.09173333333333333, "grad_norm": 0.012190482579171658, "kl": 0.03171539306640625, "learning_rate": 3.1666666666666667e-06, "loss": -0.119, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018094191327691078, "mask/share_reasoning": 0.8291984796524048, "mask/share_step_conf": 0.11755112558603287, "num_tokens": 25882111.0, "reward": 0.6687038540840149, "reward_std": 0.2648327946662903, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5242776870727539, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.5201611518859863, "step": 86 }, { "adv/mean_abs_final_conf": 0.4775123596191406, "adv/mean_abs_reasoning": 0.45811158418655396, "adv/mean_abs_step_conf": 0.770564079284668, "adv/ratio_final_to_reasoning": 1.0423494539371574, "adv/ratio_step_to_reasoning": 1.682044519028962, "adv/std_final_conf": 0.7239235639572144, "adv/std_reasoning": 0.720928430557251, "adv/std_step_conf": 0.9331386685371399, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.5131552317344668, "calib/avg_num_step_conf": 9.30078125, "calib/ece": 0.21755274261603375, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0008166363084392003, "calib/mean_conf": 0.989704641350211, "calib/mu_c": 0.9898907103825135, "calib/mu_w": 0.9890740740740743, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21755274261603375, "calib/std_conf": 0.0028159618081795104, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9205956112852663, "calib/step_q_c_n": 1595.0, "calib/step_q_gap": 0.010748283040991402, "calib/step_q_w": 0.9098473282442749, "calib/step_q_w_n": 786.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 731.484375, "completions/mean_terminated_length": 786.8067626953125, "completions/min_length": 0.0, "completions/min_terminated_length": 447.0, "epoch": 0.0928, "grad_norm": 0.013589132577180862, "kl": 0.032924652099609375, "learning_rate": 3.138888888888889e-06, "loss": -0.2468, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01762017048895359, "mask/share_reasoning": 0.8009259700775146, "mask/share_step_conf": 0.11114132404327393, "num_tokens": 26174867.0, "reward": 0.8554751873016357, "reward_std": 0.2645605504512787, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7154418230056763, "rewards/format_reward_step": 0.921875, "rewards/step_l1_reward": 0.668164849281311, "step": 87 }, { "adv/mean_abs_final_conf": 0.4434081017971039, "adv/mean_abs_reasoning": 0.4059925675392151, "adv/mean_abs_step_conf": 0.7615506649017334, "adv/ratio_final_to_reasoning": 1.0921581754185063, "adv/ratio_step_to_reasoning": 1.8757748929188827, "adv/std_final_conf": 0.7220824956893921, "adv/std_reasoning": 0.7014883756637573, "adv/std_step_conf": 0.9331439137458801, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5073099415204678, "calib/avg_num_step_conf": 8.7734375, "calib/ece": 0.2975303643724697, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001461988304094497, "calib/mean_conf": 0.989838056680162, "calib/mu_c": 0.9898830409356724, "calib/mu_w": 0.9897368421052629, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2975303643724697, "calib/std_conf": 0.001262223260576511, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9183161764705882, "calib/step_q_c_n": 1360.0, "calib/step_q_gap": -0.009471633913158972, "calib/step_q_w": 0.9277878103837471, "calib/step_q_w_n": 886.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 809.953125, "completions/mean_terminated_length": 826.087646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.09386666666666667, "grad_norm": 0.013744988478720188, "kl": 0.029903411865234375, "learning_rate": 3.1111111111111116e-06, "loss": -0.0728, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017987487837672234, "mask/share_reasoning": 0.8538491725921631, "mask/share_step_conf": 0.10863205790519714, "num_tokens": 26492063.0, "reward": 0.8196660280227661, "reward_std": 0.23798441886901855, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6738835573196411, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6396672129631042, "step": 88 }, { "adv/mean_abs_final_conf": 0.4719206392765045, "adv/mean_abs_reasoning": 0.4592221975326538, "adv/mean_abs_step_conf": 0.7370298504829407, "adv/ratio_final_to_reasoning": 1.027652064321102, "adv/ratio_step_to_reasoning": 1.6049525794765895, "adv/std_final_conf": 0.7544752955436707, "adv/std_reasoning": 0.7576509118080139, "adv/std_step_conf": 0.9334758520126343, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.49603174603174605, "calib/avg_num_step_conf": 9.38671875, "calib/ece": 0.4604621848739495, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002380952380954815, "calib/mean_conf": 0.9898739495798319, "calib/mu_c": 0.9897619047619046, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4604621848739495, "calib/std_conf": 0.0019405215527320172, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9215805168986083, "calib/step_q_c_n": 1006.0, "calib/step_q_gap": -0.003208316315421844, "calib/step_q_w": 0.9247888332140302, "calib/step_q_w_n": 1397.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 831.82421875, "completions/mean_terminated_length": 872.7335815429688, "completions/min_length": 0.0, "completions/min_terminated_length": 464.0, "epoch": 0.09493333333333333, "grad_norm": 0.013959411531686783, "kl": 0.032283782958984375, "learning_rate": 3.0833333333333336e-06, "loss": -0.2316, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.016758153215050697, "mask/share_reasoning": 0.8345001935958862, "mask/share_step_conf": 0.10186664760112762, "num_tokens": 26813898.0, "reward": 0.6590661406517029, "reward_std": 0.21499761939048767, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5008386373519897, "rewards/format_reward_step": 0.9296875, "rewards/step_l1_reward": 0.5329186320304871, "step": 89 }, { "adv/mean_abs_final_conf": 0.4721667170524597, "adv/mean_abs_reasoning": 0.4331371486186981, "adv/mean_abs_step_conf": 0.7640393376350403, "adv/ratio_final_to_reasoning": 1.0901090302649619, "adv/ratio_step_to_reasoning": 1.7639663096818416, "adv/std_final_conf": 0.7346081137657166, "adv/std_reasoning": 0.7207502126693726, "adv/std_step_conf": 0.9328594207763672, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.49680641991483787, "calib/avg_num_step_conf": 9.9609375, "calib/ece": 0.281687242798354, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 9.826400262069601e-05, "calib/mean_conf": 0.9895061728395063, "calib/mu_c": 0.9895348837209301, "calib/mu_w": 0.9894366197183094, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.281687242798354, "calib/std_conf": 0.003103655577771987, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9143071161048688, "calib/step_q_c_n": 1602.0, "calib/step_q_gap": 0.005562390366472214, "calib/step_q_w": 0.9087447257383966, "calib/step_q_w_n": 948.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2874.0, "completions/max_terminated_length": 2874.0, "completions/mean_length": 768.1328125, "completions/mean_terminated_length": 802.620361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.096, "grad_norm": 0.013207972049713135, "kl": 0.0336456298828125, "learning_rate": 3.055555555555556e-06, "loss": -0.2008, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01795533299446106, "mask/share_reasoning": 0.8207667469978333, "mask/share_step_conf": 0.11830917745828629, "num_tokens": 27113860.0, "reward": 0.8244093656539917, "reward_std": 0.23067374527454376, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6776206493377686, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.6469793319702148, "step": 90 }, { "adv/mean_abs_final_conf": 0.5017357468605042, "adv/mean_abs_reasoning": 0.4565935730934143, "adv/mean_abs_step_conf": 0.7459276914596558, "adv/ratio_final_to_reasoning": 1.0988672999956008, "adv/ratio_step_to_reasoning": 1.633679787488044, "adv/std_final_conf": 0.7723668217658997, "adv/std_reasoning": 0.7395398616790771, "adv/std_step_conf": 0.9347891211509705, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5230179028132993, "calib/avg_num_step_conf": 9.640625, "calib/ece": 0.27794979079497917, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.99581589958159, "calib/gap": 0.0005711849957374948, "calib/mean_conf": 0.9892468619246862, "calib/mu_c": 0.9894117647058823, "calib/mu_w": 0.9888405797101448, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27794979079497917, "calib/std_conf": 0.006553428674440355, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9171748251748252, "calib/step_q_c_n": 1430.0, "calib/step_q_gap": -0.008461012975463866, "calib/step_q_w": 0.925635838150289, "calib/step_q_w_n": 1038.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 776.66015625, "completions/mean_terminated_length": 821.5908813476562, "completions/min_length": 0.0, "completions/min_terminated_length": 441.0, "epoch": 0.09706666666666666, "grad_norm": 0.013960733078420162, "kl": 0.034755706787109375, "learning_rate": 3.0277777777777776e-06, "loss": -0.276, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.017011146992444992, "mask/share_reasoning": 0.8231663107872009, "mask/share_step_conf": 0.10513506829738617, "num_tokens": 27420397.0, "reward": 0.8049872517585754, "reward_std": 0.25266385078430176, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6699300408363342, "rewards/format_reward_step": 0.93359375, "rewards/step_l1_reward": 0.6205131411552429, "step": 91 }, { "adv/mean_abs_final_conf": 0.4236437678337097, "adv/mean_abs_reasoning": 0.3485650420188904, "adv/mean_abs_step_conf": 0.7679628133773804, "adv/ratio_final_to_reasoning": 1.215393733634225, "adv/ratio_step_to_reasoning": 2.2032123730174895, "adv/std_final_conf": 0.7064496278762817, "adv/std_reasoning": 0.6401773691177368, "adv/std_step_conf": 0.9319038391113281, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5535289452815226, "calib/avg_num_step_conf": 9.47265625, "calib/ece": 0.37217391304347824, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.002861485593444124, "calib/mean_conf": 0.9887747035573122, "calib/mu_c": 0.9898717948717948, "calib/mu_w": 0.9870103092783507, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37217391304347824, "calib/std_conf": 0.0066890737711640105, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9190198863636364, "calib/step_q_c_n": 1408.0, "calib/step_q_gap": 0.00184191192902472, "calib/step_q_w": 0.9171779744346117, "calib/step_q_w_n": 1017.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1900.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 768.44921875, "completions/mean_terminated_length": 777.561279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 443.0, "epoch": 0.09813333333333334, "grad_norm": 0.01794438250362873, "kl": 0.036670684814453125, "learning_rate": 3e-06, "loss": -0.0148, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019045470282435417, "mask/share_reasoning": 0.8425126075744629, "mask/share_step_conf": 0.12672311067581177, "num_tokens": 27723840.0, "reward": 0.7808723449707031, "reward_std": 0.17103058099746704, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6190499663352966, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6231634020805359, "step": 92 }, { "adv/mean_abs_final_conf": 0.4681822657585144, "adv/mean_abs_reasoning": 0.4340810775756836, "adv/mean_abs_step_conf": 0.7634403705596924, "adv/ratio_final_to_reasoning": 1.078559490253028, "adv/ratio_step_to_reasoning": 1.7587506343825452, "adv/std_final_conf": 0.721306324005127, "adv/std_reasoning": 0.7015017867088318, "adv/std_step_conf": 0.9331609606742859, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5084011452682339, "calib/avg_num_step_conf": 10.00390625, "calib/ece": 0.3366942148760331, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004611211573238849, "calib/mean_conf": 0.9895867768595042, "calib/mu_c": 0.989746835443038, "calib/mu_w": 0.9892857142857141, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3366942148760331, "calib/std_conf": 0.003121954837607293, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9171076706544687, "calib/step_q_c_n": 1421.0, "calib/step_q_gap": -0.007032680222724186, "calib/step_q_w": 0.9241403508771929, "calib/step_q_w_n": 1140.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2821.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 762.84765625, "completions/mean_terminated_length": 803.6583862304688, "completions/min_length": 0.0, "completions/min_terminated_length": 416.0, "epoch": 0.0992, "grad_norm": 3.9375298023223877, "kl": 1.6125869750976562, "learning_rate": 2.9722222222222225e-06, "loss": -0.1713, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.017697613686323166, "mask/share_reasoning": 0.8151333928108215, "mask/share_step_conf": 0.1163877323269844, "num_tokens": 28024905.0, "reward": 0.7663431167602539, "reward_std": 0.20701514184474945, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6241070032119751, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.5960791110992432, "step": 93 }, { "adv/mean_abs_final_conf": 0.521461009979248, "adv/mean_abs_reasoning": 0.4299072027206421, "adv/mean_abs_step_conf": 0.7327769994735718, "adv/ratio_final_to_reasoning": 1.2129617896123004, "adv/ratio_step_to_reasoning": 1.704500401101066, "adv/std_final_conf": 0.7765588164329529, "adv/std_reasoning": 0.7206876277923584, "adv/std_step_conf": 0.9341228008270264, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.4877741960845978, "calib/avg_num_step_conf": 9.109375, "calib/ece": 0.3171250000000002, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.9958333333333333, "calib/gap": -0.000401761144743773, "calib/mean_conf": 0.9879583333333335, "calib/mu_c": 0.9878260869565217, "calib/mu_w": 0.9882278481012655, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3171250000000002, "calib/std_conf": 0.009464755528920028, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9079537237888646, "calib/step_q_c_n": 1383.0, "calib/step_q_gap": -0.009917719836003758, "calib/step_q_w": 0.9178714436248684, "calib/step_q_w_n": 949.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 773.4140625, "completions/mean_terminated_length": 814.7901000976562, "completions/min_length": 0.0, "completions/min_terminated_length": 442.0, "epoch": 0.10026666666666667, "grad_norm": 0.016173789277672768, "kl": 0.0373992919921875, "learning_rate": 2.944444444444445e-06, "loss": -0.167, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.01777365803718567, "mask/share_reasoning": 0.8225142955780029, "mask/share_step_conf": 0.108930803835392, "num_tokens": 28331579.0, "reward": 0.7784872651100159, "reward_std": 0.23750649392604828, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6359518766403198, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.6077413558959961, "step": 94 }, { "adv/mean_abs_final_conf": 0.5023649334907532, "adv/mean_abs_reasoning": 0.4079497456550598, "adv/mean_abs_step_conf": 0.7471914291381836, "adv/ratio_final_to_reasoning": 1.2314382809188604, "adv/ratio_step_to_reasoning": 1.8315771417834592, "adv/std_final_conf": 0.7587032914161682, "adv/std_reasoning": 0.7014654874801636, "adv/std_step_conf": 0.933836042881012, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5303065134099616, "calib/avg_num_step_conf": 8.8125, "calib/ece": 0.288714859437751, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.001397701149425279, "calib/mean_conf": 0.9875100401606426, "calib/mu_c": 0.9879310344827585, "calib/mu_w": 0.9865333333333333, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.288714859437751, "calib/std_conf": 0.008372892026171202, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9147102425876011, "calib/step_q_c_n": 1484.0, "calib/step_q_gap": 0.016135113053922345, "calib/step_q_w": 0.8985751295336788, "calib/step_q_w_n": 772.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 765.5234375, "completions/mean_terminated_length": 774.600830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 305.0, "epoch": 0.10133333333333333, "grad_norm": 0.021871555596590042, "kl": 0.04198455810546875, "learning_rate": 2.916666666666667e-06, "loss": -0.0602, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018963484093546867, "mask/share_reasoning": 0.8512589931488037, "mask/share_step_conf": 0.11805877089500427, "num_tokens": 28633681.0, "reward": 0.8162245750427246, "reward_std": 0.23270940780639648, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6833745837211609, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6201682090759277, "step": 95 }, { "adv/mean_abs_final_conf": 0.5089430212974548, "adv/mean_abs_reasoning": 0.45139145851135254, "adv/mean_abs_step_conf": 0.7414897680282593, "adv/ratio_final_to_reasoning": 1.127498120978855, "adv/ratio_step_to_reasoning": 1.6426756732917018, "adv/std_final_conf": 0.7677562236785889, "adv/std_reasoning": 0.7394993305206299, "adv/std_step_conf": 0.9327307343482971, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49949494949494955, "calib/avg_num_step_conf": 9.4921875, "calib/ece": 0.25711382113821135, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002525252525252819, "calib/mean_conf": 0.9888211382113821, "calib/mu_c": 0.9888888888888889, "calib/mu_w": 0.9886363636363636, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.25711382113821135, "calib/std_conf": 0.004667013669430779, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.918825857519789, "calib/step_q_c_n": 1516.0, "calib/step_q_gap": -0.008187271583055655, "calib/step_q_w": 0.9270131291028446, "calib/step_q_w_n": 914.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2116.0, "completions/max_terminated_length": 2116.0, "completions/mean_length": 711.53515625, "completions/mean_terminated_length": 737.4615478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 76.0, "epoch": 0.1024, "grad_norm": 0.017186082899570465, "kl": 0.0482330322265625, "learning_rate": 2.888888888888889e-06, "loss": -0.258, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019098231568932533, "mask/share_reasoning": 0.830284595489502, "mask/share_step_conf": 0.11546093225479126, "num_tokens": 28921650.0, "reward": 0.8585371375083923, "reward_std": 0.24619776010513306, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7048609256744385, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6809632778167725, "step": 96 }, { "adv/mean_abs_final_conf": 0.5731841325759888, "adv/mean_abs_reasoning": 0.48445412516593933, "adv/mean_abs_step_conf": 0.7737342715263367, "adv/ratio_final_to_reasoning": 1.1831546121723227, "adv/ratio_step_to_reasoning": 1.5971259843464243, "adv/std_final_conf": 0.7974430322647095, "adv/std_reasoning": 0.7393472790718079, "adv/std_step_conf": 0.9335328936576843, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5352779269602578, "calib/avg_num_step_conf": 8.61328125, "calib/ece": 0.38015999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0013372717508056686, "calib/mean_conf": 0.9881599999999999, "calib/mu_c": 0.9886842105263158, "calib/mu_w": 0.9873469387755102, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38015999999999994, "calib/std_conf": 0.006372942805329425, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9205419847328243, "calib/step_q_c_n": 1310.0, "calib/step_q_gap": 0.011938632777517055, "calib/step_q_w": 0.9086033519553073, "calib/step_q_w_n": 895.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 775.078125, "completions/mean_terminated_length": 781.1810913085938, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.10346666666666667, "grad_norm": 0.020701313391327858, "kl": 0.0483245849609375, "learning_rate": 2.861111111111111e-06, "loss": -0.0614, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018780209124088287, "mask/share_reasoning": 0.8552939891815186, "mask/share_step_conf": 0.11811327934265137, "num_tokens": 29225142.0, "reward": 0.7515037059783936, "reward_std": 0.26109981536865234, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6032609343528748, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.5849027633666992, "step": 97 }, { "adv/mean_abs_final_conf": 0.4930378794670105, "adv/mean_abs_reasoning": 0.4306323528289795, "adv/mean_abs_step_conf": 0.7536168694496155, "adv/ratio_final_to_reasoning": 1.144916019960104, "adv/ratio_step_to_reasoning": 1.750023806847846, "adv/std_final_conf": 0.7348681092262268, "adv/std_reasoning": 0.7015235424041748, "adv/std_step_conf": 0.9335595965385437, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.4756439393939394, "calib/avg_num_step_conf": 9.06640625, "calib/ece": 0.358361344537815, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.001118181818181907, "calib/mean_conf": 0.9886134453781511, "calib/mu_c": 0.9882, "calib/mu_w": 0.9893181818181819, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.358361344537815, "calib/std_conf": 0.005666663897392195, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9185202492211839, "calib/step_q_c_n": 1284.0, "calib/step_q_gap": 0.005415138324366131, "calib/step_q_w": 0.9131051108968178, "calib/step_q_w_n": 1037.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2945.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 781.96484375, "completions/mean_terminated_length": 820.4220581054688, "completions/min_length": 0.0, "completions/min_terminated_length": 384.0, "epoch": 0.10453333333333334, "grad_norm": 0.01520277839154005, "kl": 0.049041748046875, "learning_rate": 2.8333333333333335e-06, "loss": -0.2201, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.017720902338624, "mask/share_reasoning": 0.8247094750404358, "mask/share_step_conf": 0.11069461703300476, "num_tokens": 29531509.0, "reward": 0.735116183757782, "reward_std": 0.22515791654586792, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.5931308269500732, "rewards/format_reward_step": 0.9296875, "rewards/step_l1_reward": 0.5739765167236328, "step": 98 }, { "adv/mean_abs_final_conf": 0.6646596193313599, "adv/mean_abs_reasoning": 0.5712630748748779, "adv/mean_abs_step_conf": 0.7645260691642761, "adv/ratio_final_to_reasoning": 1.1634913029814473, "adv/ratio_step_to_reasoning": 1.3383082204844554, "adv/std_final_conf": 0.8679842948913574, "adv/std_reasoning": 0.8267807364463806, "adv/std_step_conf": 0.934746265411377, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.4874451399381251, "calib/avg_num_step_conf": 9.3359375, "calib/ece": 0.5084745762711865, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.9957627118644068, "calib/gap": 0.0009583423267860436, "calib/mean_conf": 0.9872881355932204, "calib/mu_c": 0.9877876106194691, "calib/mu_w": 0.986829268292683, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5084745762711865, "calib/std_conf": 0.013757033144205443, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9159288537549407, "calib/step_q_c_n": 1012.0, "calib/step_q_gap": 0.004034804408061032, "calib/step_q_w": 0.9118940493468797, "calib/step_q_w_n": 1378.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2584.0, "completions/max_terminated_length": 2584.0, "completions/mean_length": 776.68359375, "completions/mean_terminated_length": 831.9288330078125, "completions/min_length": 0.0, "completions/min_terminated_length": 412.0, "epoch": 0.1056, "grad_norm": 0.01690530963242054, "kl": 0.0462646484375, "learning_rate": 2.805555555555556e-06, "loss": -0.2449, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.016956226900219917, "mask/share_reasoning": 0.8120033740997314, "mask/share_step_conf": 0.10463409125804901, "num_tokens": 29836140.0, "reward": 0.5938348770141602, "reward_std": 0.29406648874282837, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.4537390470504761, "rewards/format_reward_step": 0.921875, "rewards/step_l1_reward": 0.46127450466156006, "step": 99 }, { "adv/mean_abs_final_conf": 0.5383015871047974, "adv/mean_abs_reasoning": 0.42496341466903687, "adv/mean_abs_step_conf": 0.7425418496131897, "adv/ratio_final_to_reasoning": 1.2667010112482946, "adv/ratio_step_to_reasoning": 1.7473077069269225, "adv/std_final_conf": 0.7861893177032471, "adv/std_reasoning": 0.7206922769546509, "adv/std_step_conf": 0.9341355562210083, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5274305555555555, "calib/avg_num_step_conf": 9.19921875, "calib/ece": 0.31941908713692957, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.995850622406639, "calib/gap": 0.013182098765432304, "calib/mean_conf": 0.9833195020746889, "calib/mu_c": 0.9877500000000001, "calib/mu_w": 0.9745679012345678, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31941908713692957, "calib/std_conf": 0.06407154968151031, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9168573551263003, "calib/step_q_c_n": 1346.0, "calib/step_q_gap": 0.0012478407556363313, "calib/step_q_w": 0.915609514370664, "calib/step_q_w_n": 1009.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 759.0546875, "completions/mean_terminated_length": 796.3851928710938, "completions/min_length": 0.0, "completions/min_terminated_length": 447.0, "epoch": 0.10666666666666667, "grad_norm": 0.0173397958278656, "kl": 0.05344390869140625, "learning_rate": 2.7777777777777783e-06, "loss": -0.142, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018021564930677414, "mask/share_reasoning": 0.8257356882095337, "mask/share_step_conf": 0.10936775803565979, "num_tokens": 30137866.0, "reward": 0.7770144939422607, "reward_std": 0.22662517428398132, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6368894577026367, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.6046394109725952, "step": 100 }, { "adv/mean_abs_final_conf": 0.611314058303833, "adv/mean_abs_reasoning": 0.5150396227836609, "adv/mean_abs_step_conf": 0.7677444219589233, "adv/ratio_final_to_reasoning": 1.1869262698660596, "adv/ratio_step_to_reasoning": 1.490651180989641, "adv/std_final_conf": 0.8419346213340759, "adv/std_reasoning": 0.7928768992424011, "adv/std_step_conf": 0.9352395534515381, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5257377049180328, "calib/avg_num_step_conf": 8.94921875, "calib/ece": 0.4697975708502024, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9838056680161943, "calib/gap": 0.024233442622950796, "calib/mean_conf": 0.9758704453441294, "calib/mu_c": 0.9878399999999999, "calib/mu_w": 0.9636065573770491, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4697975708502024, "calib/std_conf": 0.1086163969073252, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.913648393194707, "calib/step_q_c_n": 1058.0, "calib/step_q_gap": 0.016827630826499296, "calib/step_q_w": 0.8968207623682077, "calib/step_q_w_n": 1233.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 817.125, "completions/mean_terminated_length": 846.8988037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 439.0, "epoch": 0.10773333333333333, "grad_norm": 0.018264202401041985, "kl": 0.05799102783203125, "learning_rate": 2.7500000000000004e-06, "loss": -0.109, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017344597727060318, "mask/share_reasoning": 0.8403701782226562, "mask/share_step_conf": 0.10712893307209015, "num_tokens": 30454042.0, "reward": 0.6558018922805786, "reward_std": 0.2865288555622101, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5110242366790771, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.5099546909332275, "step": 101 }, { "adv/mean_abs_final_conf": 0.5574395656585693, "adv/mean_abs_reasoning": 0.496779203414917, "adv/mean_abs_step_conf": 0.7435898184776306, "adv/ratio_final_to_reasoning": 1.1221072899724185, "adv/ratio_step_to_reasoning": 1.4968215524444446, "adv/std_final_conf": 0.7952538132667542, "adv/std_reasoning": 0.7576413154602051, "adv/std_step_conf": 0.9338082075119019, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.4978512267541803, "calib/avg_num_step_conf": 9.4609375, "calib/ece": 0.31547717842323664, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002555086732297651, "calib/mean_conf": 0.987676348547718, "calib/mu_c": 0.9875925925925927, "calib/mu_w": 0.9878481012658225, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31547717842323664, "calib/std_conf": 0.007594154551192266, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9166166439290586, "calib/step_q_c_n": 1466.0, "calib/step_q_gap": -0.004293397911945607, "calib/step_q_w": 0.9209100418410042, "calib/step_q_w_n": 956.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2329.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 729.296875, "completions/mean_terminated_length": 755.8704833984375, "completions/min_length": 0.0, "completions/min_terminated_length": 349.0, "epoch": 0.1088, "grad_norm": 0.023663917556405067, "kl": 0.060455322265625, "learning_rate": 2.7222222222222224e-06, "loss": -0.1672, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018974870443344116, "mask/share_reasoning": 0.8205289840698242, "mask/share_step_conf": 0.12533991038799286, "num_tokens": 30747438.0, "reward": 0.7782399654388428, "reward_std": 0.23757195472717285, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6362093687057495, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.606208086013794, "step": 102 }, { "adv/mean_abs_final_conf": 0.470939040184021, "adv/mean_abs_reasoning": 0.43253329396247864, "adv/mean_abs_step_conf": 0.7381531000137329, "adv/ratio_final_to_reasoning": 1.0887925779532568, "adv/ratio_step_to_reasoning": 1.7065809969249817, "adv/std_final_conf": 0.774346113204956, "adv/std_reasoning": 0.73920077085495, "adv/std_step_conf": 0.9338717460632324, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5595996038483305, "calib/avg_num_step_conf": 9.515625, "calib/ece": 0.3599591836734693, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9877551020408163, "calib/gap": 0.023989813242784708, "calib/mean_conf": 0.9803673469387755, "calib/mu_c": 0.9894736842105265, "calib/mu_w": 0.9654838709677418, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3599591836734693, "calib/std_conf": 0.08927067161911918, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9253194959229057, "calib/step_q_c_n": 1349.0, "calib/step_q_gap": 0.011740839069179887, "calib/step_q_w": 0.9135786568537259, "calib/step_q_w_n": 1087.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2605.0, "completions/max_terminated_length": 2605.0, "completions/mean_length": 816.39453125, "completions/mean_terminated_length": 839.3453369140625, "completions/min_length": 0.0, "completions/min_terminated_length": 343.0, "epoch": 0.10986666666666667, "grad_norm": 0.024849504232406616, "kl": 0.05157470703125, "learning_rate": 2.6944444444444444e-06, "loss": -0.0615, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017709825187921524, "mask/share_reasoning": 0.8464046120643616, "mask/share_step_conf": 0.1085418090224266, "num_tokens": 31060987.0, "reward": 0.7577913403511047, "reward_std": 0.19486621022224426, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6108323335647583, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.5938127040863037, "step": 103 }, { "adv/mean_abs_final_conf": 0.5395331382751465, "adv/mean_abs_reasoning": 0.4769881069660187, "adv/mean_abs_step_conf": 0.7343798875808716, "adv/ratio_final_to_reasoning": 1.131124928264896, "adv/ratio_step_to_reasoning": 1.5396188644032374, "adv/std_final_conf": 0.7932192087173462, "adv/std_reasoning": 0.7576055526733398, "adv/std_step_conf": 0.9342693090438843, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5226561980171668, "calib/avg_num_step_conf": 9.2890625, "calib/ece": 0.448170731707317, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0009295362299551746, "calib/mean_conf": 0.9888211382113821, "calib/mu_c": 0.9892481203007518, "calib/mu_w": 0.9883185840707966, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.448170731707317, "calib/std_conf": 0.004921386083299872, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9211541701769167, "calib/step_q_c_n": 1187.0, "calib/step_q_gap": 0.007031584114783995, "calib/step_q_w": 0.9141225860621327, "calib/step_q_w_n": 1191.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2850.0, "completions/max_terminated_length": 2850.0, "completions/mean_length": 752.5, "completions/mean_terminated_length": 783.0894165039062, "completions/min_length": 0.0, "completions/min_terminated_length": 401.0, "epoch": 0.11093333333333333, "grad_norm": 0.014635776169598103, "kl": 0.0593109130859375, "learning_rate": 2.666666666666667e-06, "loss": -0.192, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018321998417377472, "mask/share_reasoning": 0.8238799571990967, "mask/share_step_conf": 0.11873549222946167, "num_tokens": 31360307.0, "reward": 0.6815319061279297, "reward_std": 0.2409968078136444, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5297003984451294, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.5372697114944458, "step": 104 }, { "adv/mean_abs_final_conf": 0.5429937839508057, "adv/mean_abs_reasoning": 0.522207498550415, "adv/mean_abs_step_conf": 0.7398031949996948, "adv/ratio_final_to_reasoning": 1.0398046474975768, "adv/ratio_step_to_reasoning": 1.4166843583313131, "adv/std_final_conf": 0.7971596717834473, "adv/std_reasoning": 0.7929220795631409, "adv/std_step_conf": 0.9352218508720398, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5242816091954023, "calib/avg_num_step_conf": 9.54296875, "calib/ece": 0.3780497925311204, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.983402489626556, "calib/gap": 0.02392887931034471, "calib/mean_conf": 0.9797095435684647, "calib/mu_c": 0.9892413793103448, "calib/mu_w": 0.9653125000000001, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3780497925311204, "calib/std_conf": 0.09072505699646372, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.919859375, "calib/step_q_c_n": 1280.0, "calib/step_q_gap": 0.013978033641444565, "calib/step_q_w": 0.9058813413585555, "calib/step_q_w_n": 1163.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2936.0, "completions/max_terminated_length": 2936.0, "completions/mean_length": 792.6015625, "completions/mean_terminated_length": 828.187744140625, "completions/min_length": 0.0, "completions/min_terminated_length": 409.0, "epoch": 0.112, "grad_norm": 0.013355753384530544, "kl": 0.0567626953125, "learning_rate": 2.6388888888888893e-06, "loss": -0.1744, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017631947994232178, "mask/share_reasoning": 0.8259000778198242, "mask/share_step_conf": 0.11349925398826599, "num_tokens": 31668973.0, "reward": 0.7236430644989014, "reward_std": 0.2772770822048187, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5842854976654053, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.5614380836486816, "step": 105 }, { "adv/mean_abs_final_conf": 0.5538749098777771, "adv/mean_abs_reasoning": 0.4785403609275818, "adv/mean_abs_step_conf": 0.7471832036972046, "adv/ratio_final_to_reasoning": 1.1574256950953314, "adv/ratio_step_to_reasoning": 1.5613796968951525, "adv/std_final_conf": 0.7899455428123474, "adv/std_reasoning": 0.7393032312393188, "adv/std_step_conf": 0.934104323387146, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5122705932565087, "calib/avg_num_step_conf": 9.19140625, "calib/ece": 0.3994190871369294, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.995850622406639, "calib/gap": 0.0009332764262341975, "calib/mean_conf": 0.9886307053941908, "calib/mu_c": 0.9890140845070422, "calib/mu_w": 0.988080808080808, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3994190871369294, "calib/std_conf": 0.007301489866746601, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9207292539815591, "calib/step_q_c_n": 1193.0, "calib/step_q_gap": -0.0048052287770616875, "calib/step_q_w": 0.9255344827586208, "calib/step_q_w_n": 1160.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2674.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 717.5546875, "completions/mean_terminated_length": 759.0661010742188, "completions/min_length": 0.0, "completions/min_terminated_length": 436.0, "epoch": 0.11306666666666666, "grad_norm": 0.019834445789456367, "kl": 0.06396484375, "learning_rate": 2.6111111111111113e-06, "loss": -0.1487, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018569810315966606, "mask/share_reasoning": 0.8146433234214783, "mask/share_step_conf": 0.11209937185049057, "num_tokens": 31957251.0, "reward": 0.7071048021316528, "reward_std": 0.21697908639907837, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5637343525886536, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.5512564182281494, "step": 106 }, { "adv/mean_abs_final_conf": 0.5383345484733582, "adv/mean_abs_reasoning": 0.47710686922073364, "adv/mean_abs_step_conf": 0.7578780651092529, "adv/ratio_final_to_reasoning": 1.1283311626861057, "adv/ratio_step_to_reasoning": 1.5884870120338184, "adv/std_final_conf": 0.7956221699714661, "adv/std_reasoning": 0.7576800584793091, "adv/std_step_conf": 0.9345592260360718, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.518074912891986, "calib/avg_num_step_conf": 9.8984375, "calib/ece": 0.3277822580645162, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0012979094076654984, "calib/mean_conf": 0.9890725806451613, "calib/mu_c": 0.9895121951219511, "calib/mu_w": 0.9882142857142856, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3277822580645162, "calib/std_conf": 0.005034835723201447, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.91822266934943, "calib/step_q_c_n": 1491.0, "calib/step_q_gap": -0.004576947141461729, "calib/step_q_w": 0.9227996164908917, "calib/step_q_w_n": 1043.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 741.94140625, "completions/mean_terminated_length": 756.7211303710938, "completions/min_length": 0.0, "completions/min_terminated_length": 360.0, "epoch": 0.11413333333333334, "grad_norm": 0.014583291485905647, "kl": 0.06496429443359375, "learning_rate": 2.5833333333333337e-06, "loss": -0.1273, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01896454207599163, "mask/share_reasoning": 0.8346059322357178, "mask/share_step_conf": 0.12689821422100067, "num_tokens": 32251804.0, "reward": 0.7869763970375061, "reward_std": 0.2739853858947754, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6443132162094116, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.60854572057724, "step": 107 }, { "adv/mean_abs_final_conf": 0.5826718807220459, "adv/mean_abs_reasoning": 0.5499783754348755, "adv/mean_abs_step_conf": 0.7166726589202881, "adv/ratio_final_to_reasoning": 1.0594450741109942, "adv/ratio_step_to_reasoning": 1.3030924322317312, "adv/std_final_conf": 0.8173876404762268, "adv/std_reasoning": 0.8100540041923523, "adv/std_step_conf": 0.9353122711181641, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5079188663517643, "calib/avg_num_step_conf": 9.76171875, "calib/ece": 0.24537815126050422, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007409465592294051, "calib/mean_conf": 0.9890756302521009, "calib/mu_c": 0.989265536723164, "calib/mu_w": 0.9885245901639346, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24537815126050422, "calib/std_conf": 0.004581951556374991, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9210409670920081, "calib/step_q_c_n": 1489.0, "calib/step_q_gap": -0.00631546855155618, "calib/step_q_w": 0.9273564356435643, "calib/step_q_w_n": 1010.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 764.2421875, "completions/mean_terminated_length": 801.8278198242188, "completions/min_length": 0.0, "completions/min_terminated_length": 405.0, "epoch": 0.1152, "grad_norm": 0.013571220450103283, "kl": 0.0575714111328125, "learning_rate": 2.5555555555555557e-06, "loss": -0.1597, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.017903491854667664, "mask/share_reasoning": 0.8209375143051147, "mask/share_step_conf": 0.1142839789390564, "num_tokens": 32550682.0, "reward": 0.8342296481132507, "reward_std": 0.30271947383880615, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.6967445015907288, "rewards/format_reward_step": 0.9296875, "rewards/step_l1_reward": 0.6474959850311279, "step": 108 }, { "adv/mean_abs_final_conf": 0.44212257862091064, "adv/mean_abs_reasoning": 0.3431011438369751, "adv/mean_abs_step_conf": 0.7551099061965942, "adv/ratio_final_to_reasoning": 1.2886071252242333, "adv/ratio_step_to_reasoning": 2.2008376240079968, "adv/std_final_conf": 0.7278553247451782, "adv/std_reasoning": 0.6403741240501404, "adv/std_step_conf": 0.933077871799469, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5175289312457454, "calib/avg_num_step_conf": 9.77734375, "calib/ece": 0.4522633744855968, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9958847736625515, "calib/gap": 0.004605854322668468, "calib/mean_conf": 0.9872427983539095, "calib/mu_c": 0.9893846153846154, "calib/mu_w": 0.984778761061947, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4522633744855968, "calib/std_conf": 0.02564819560608947, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9250763052208836, "calib/step_q_c_n": 1245.0, "calib/step_q_gap": 0.018216209831376506, "calib/step_q_w": 0.9068600953895071, "calib/step_q_w_n": 1258.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 789.09765625, "completions/mean_terminated_length": 817.8502197265625, "completions/min_length": 0.0, "completions/min_terminated_length": 458.0, "epoch": 0.11626666666666667, "grad_norm": 0.019535109400749207, "kl": 0.06496429443359375, "learning_rate": 2.5277777777777778e-06, "loss": -0.12, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01754792034626007, "mask/share_reasoning": 0.8291429281234741, "mask/share_step_conf": 0.1181529089808464, "num_tokens": 32857291.0, "reward": 0.6686311960220337, "reward_std": 0.1786944568157196, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5204710960388184, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.5253850221633911, "step": 109 }, { "adv/mean_abs_final_conf": 0.5592010021209717, "adv/mean_abs_reasoning": 0.5262867212295532, "adv/mean_abs_step_conf": 0.7592321634292603, "adv/ratio_final_to_reasoning": 1.0625405877893357, "adv/ratio_step_to_reasoning": 1.4426207859766653, "adv/std_final_conf": 0.8077802062034607, "adv/std_reasoning": 0.7928251028060913, "adv/std_step_conf": 0.9346528053283691, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5193466648404328, "calib/avg_num_step_conf": 9.7578125, "calib/ece": 0.3857085020242914, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.001299821942199686, "calib/mean_conf": 0.9889473684210526, "calib/mu_c": 0.9894630872483222, "calib/mu_w": 0.9881632653061225, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3857085020242914, "calib/std_conf": 0.0056648342716980205, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.92643906020558, "calib/step_q_c_n": 1362.0, "calib/step_q_gap": 0.0037453982337489755, "calib/step_q_w": 0.922693661971831, "calib/step_q_w_n": 1136.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 720.6015625, "completions/mean_terminated_length": 737.8960571289062, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.11733333333333333, "grad_norm": 0.016404099762439728, "kl": 0.06627655029296875, "learning_rate": 2.5e-06, "loss": -0.1404, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019600635394454002, "mask/share_reasoning": 0.8245271444320679, "mask/share_step_conf": 0.1324346661567688, "num_tokens": 33146685.0, "reward": 0.7375810146331787, "reward_std": 0.2660372257232666, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5909448862075806, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.5748422145843506, "step": 110 }, { "adv/mean_abs_final_conf": 0.534350574016571, "adv/mean_abs_reasoning": 0.46892502903938293, "adv/mean_abs_step_conf": 0.7618857622146606, "adv/ratio_final_to_reasoning": 1.1395223989455536, "adv/ratio_step_to_reasoning": 1.6247496188792117, "adv/std_final_conf": 0.783694863319397, "adv/std_reasoning": 0.7576603293418884, "adv/std_step_conf": 0.9344452619552612, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5555994824611845, "calib/avg_num_step_conf": 9.7890625, "calib/ece": 0.3735674931129478, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.987603305785124, "calib/gap": 0.010952175579835521, "calib/mean_conf": 0.985137741046832, "calib/mu_c": 0.9893918918918919, "calib/mu_w": 0.9784397163120564, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3735674931129478, "calib/std_conf": 0.04316458283197865, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9178399378399378, "calib/step_q_c_n": 1287.0, "calib/step_q_gap": -0.0005193730706446464, "calib/step_q_w": 0.9183593109105824, "calib/step_q_w_n": 1219.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2763.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 753.44140625, "completions/mean_terminated_length": 793.7489624023438, "completions/min_length": 0.0, "completions/min_terminated_length": 398.0, "epoch": 0.1184, "grad_norm": 0.016747089102864265, "kl": 0.06616973876953125, "learning_rate": 2.4722222222222226e-06, "loss": -0.2023, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018317745998501778, "mask/share_reasoning": 0.8140534162521362, "mask/share_step_conf": 0.11684764921665192, "num_tokens": 33446974.0, "reward": 0.7371137142181396, "reward_std": 0.25048425793647766, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.591988205909729, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.5775517225265503, "step": 111 }, { "adv/mean_abs_final_conf": 0.5227197408676147, "adv/mean_abs_reasoning": 0.4496156573295593, "adv/mean_abs_step_conf": 0.7582208514213562, "adv/ratio_final_to_reasoning": 1.1625923882905875, "adv/ratio_step_to_reasoning": 1.686375550008027, "adv/std_final_conf": 0.7749086022377014, "adv/std_reasoning": 0.7393931746482849, "adv/std_step_conf": 0.9344673752784729, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5401295409743734, "calib/avg_num_step_conf": 9.83203125, "calib/ece": 0.42933333333333346, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.9958333333333333, "calib/gap": 0.002410588566600813, "calib/mean_conf": 0.9876666666666668, "calib/mu_c": 0.9887313432835819, "calib/mu_w": 0.9863207547169811, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.42933333333333346, "calib/std_conf": 0.009285592184789415, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9196293176074136, "calib/step_q_c_n": 1187.0, "calib/step_q_gap": 0.012482701066060198, "calib/step_q_w": 0.9071466165413534, "calib/step_q_w_n": 1330.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2106.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 775.73828125, "completions/mean_terminated_length": 820.6156616210938, "completions/min_length": 0.0, "completions/min_terminated_length": 472.0, "epoch": 0.11946666666666667, "grad_norm": 0.02123652584850788, "kl": 0.06374359130859375, "learning_rate": 2.4444444444444447e-06, "loss": -0.2496, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017327800393104553, "mask/share_reasoning": 0.8179709911346436, "mask/share_step_conf": 0.11001374572515488, "num_tokens": 33753483.0, "reward": 0.678582489490509, "reward_std": 0.20910188555717468, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5304815769195557, "rewards/format_reward_step": 0.9296875, "rewards/step_l1_reward": 0.536058247089386, "step": 112 }, { "adv/mean_abs_final_conf": 0.6130001544952393, "adv/mean_abs_reasoning": 0.5337440967559814, "adv/mean_abs_step_conf": 0.7346939444541931, "adv/ratio_final_to_reasoning": 1.1484907434498377, "adv/ratio_step_to_reasoning": 1.3764909980635953, "adv/std_final_conf": 0.8501105308532715, "adv/std_reasoning": 0.8098631501197815, "adv/std_step_conf": 0.934903621673584, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.513346747149564, "calib/avg_num_step_conf": 10.6875, "calib/ece": 0.41344129554655873, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0005915492957746515, "calib/mean_conf": 0.9883400809716599, "calib/mu_c": 0.9885915492957746, "calib/mu_w": 0.988, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41344129554655873, "calib/std_conf": 0.00742559126679215, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9231344410876133, "calib/step_q_c_n": 1324.0, "calib/step_q_gap": -0.005417258629100585, "calib/step_q_w": 0.9285516997167139, "calib/step_q_w_n": 1412.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2053.0, "completions/max_terminated_length": 2053.0, "completions/mean_length": 708.453125, "completions/mean_terminated_length": 734.2672119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 365.0, "epoch": 0.12053333333333334, "grad_norm": 0.016310779377818108, "kl": 0.07384490966796875, "learning_rate": 2.4166666666666667e-06, "loss": -0.1681, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019763082265853882, "mask/share_reasoning": 0.813399076461792, "mask/share_step_conf": 0.1316816210746765, "num_tokens": 34040047.0, "reward": 0.7140007019042969, "reward_std": 0.2736779451370239, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5643468499183655, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.5597482919692993, "step": 113 }, { "adv/mean_abs_final_conf": 0.5179460048675537, "adv/mean_abs_reasoning": 0.40910542011260986, "adv/mean_abs_step_conf": 0.7549639940261841, "adv/ratio_final_to_reasoning": 1.2660453257377635, "adv/ratio_step_to_reasoning": 1.8454020819826187, "adv/std_final_conf": 0.7732772827148438, "adv/std_reasoning": 0.6816585659980774, "adv/std_step_conf": 0.933284342288971, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5517870439314967, "calib/avg_num_step_conf": 10.19921875, "calib/ece": 0.33880658436214, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9917695473251029, "calib/gap": 0.0025532390171258967, "calib/mean_conf": 0.9877777777777779, "calib/mu_c": 0.9886708860759492, "calib/mu_w": 0.9861176470588233, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33818930041152273, "calib/std_conf": 0.012406025351440173, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9254854368932038, "calib/step_q_c_n": 1442.0, "calib/step_q_gap": 0.012525642196882236, "calib/step_q_w": 0.9129597946963216, "calib/step_q_w_n": 1169.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2101.0, "completions/max_terminated_length": 2101.0, "completions/mean_length": 712.6328125, "completions/mean_terminated_length": 738.5991821289062, "completions/min_length": 0.0, "completions/min_terminated_length": 378.0, "epoch": 0.1216, "grad_norm": 0.023121921345591545, "kl": 0.0703887939453125, "learning_rate": 2.388888888888889e-06, "loss": -0.1215, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.019264452159404755, "mask/share_reasoning": 0.817579448223114, "mask/share_step_conf": 0.12799984216690063, "num_tokens": 34327505.0, "reward": 0.7811685800552368, "reward_std": 0.19103579223155975, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6261183023452759, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.6221563220024109, "step": 114 }, { "adv/mean_abs_final_conf": 0.5585366487503052, "adv/mean_abs_reasoning": 0.49713629484176636, "adv/mean_abs_step_conf": 0.7559834718704224, "adv/ratio_final_to_reasoning": 1.1235080893220277, "adv/ratio_step_to_reasoning": 1.520676481911353, "adv/std_final_conf": 0.8085088729858398, "adv/std_reasoning": 0.7576323747634888, "adv/std_step_conf": 0.934259831905365, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.4924181427804616, "calib/avg_num_step_conf": 9.1875, "calib/ece": 0.43219512195121956, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.991869918699187, "calib/gap": -0.008107890499194692, "calib/mean_conf": 0.9834146341463416, "calib/mu_c": 0.9798550724637681, "calib/mu_w": 0.9879629629629628, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42731707317073175, "calib/std_conf": 0.06447825962837048, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9207207953603976, "calib/step_q_c_n": 1207.0, "calib/step_q_gap": 0.01630158138659854, "calib/step_q_w": 0.9044192139737991, "calib/step_q_w_n": 1145.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 769.578125, "completions/mean_terminated_length": 794.4031982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 408.0, "epoch": 0.12266666666666666, "grad_norm": 0.019223693758249283, "kl": 0.0823211669921875, "learning_rate": 2.361111111111111e-06, "loss": -0.1441, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.0183610450476408, "mask/share_reasoning": 0.829988420009613, "mask/share_step_conf": 0.12040051817893982, "num_tokens": 34629781.0, "reward": 0.689035952091217, "reward_std": 0.2614095211029053, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5410534739494324, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.537799596786499, "step": 115 }, { "adv/mean_abs_final_conf": 0.6313544511795044, "adv/mean_abs_reasoning": 0.6060963869094849, "adv/mean_abs_step_conf": 0.7450257539749146, "adv/ratio_final_to_reasoning": 1.0416733457178513, "adv/ratio_step_to_reasoning": 1.2292199228803151, "adv/std_final_conf": 0.859879195690155, "adv/std_reasoning": 0.8429909348487854, "adv/std_step_conf": 0.9344965219497681, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5225281602002503, "calib/avg_num_step_conf": 10.37890625, "calib/ece": 0.40823045267489727, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9917695473251029, "calib/gap": 0.003289528577388734, "calib/mean_conf": 0.9884773662551442, "calib/mu_c": 0.9898581560283688, "calib/mu_w": 0.9865686274509801, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40823045267489727, "calib/std_conf": 0.013719204273677617, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9222453703703704, "calib/step_q_c_n": 1296.0, "calib/step_q_gap": 0.006110175660598016, "calib/step_q_w": 0.9161351947097723, "calib/step_q_w_n": 1361.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 792.05078125, "completions/mean_terminated_length": 824.2479248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.12373333333333333, "grad_norm": 0.01399253774434328, "kl": 0.0571746826171875, "learning_rate": 2.3333333333333336e-06, "loss": -0.116, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017849216237664223, "mask/share_reasoning": 0.821422278881073, "mask/share_step_conf": 0.12166602164506912, "num_tokens": 34937066.0, "reward": 0.7141497135162354, "reward_std": 0.3195981979370117, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5611796379089355, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.5671197175979614, "step": 116 }, { "adv/mean_abs_final_conf": 0.64402174949646, "adv/mean_abs_reasoning": 0.5441372394561768, "adv/mean_abs_step_conf": 0.7752643823623657, "adv/ratio_final_to_reasoning": 1.183564922224603, "adv/ratio_step_to_reasoning": 1.4247589140143813, "adv/std_final_conf": 0.8416389226913452, "adv/std_reasoning": 0.7930080890655518, "adv/std_step_conf": 0.9348081946372986, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5257218718884833, "calib/avg_num_step_conf": 9.71484375, "calib/ece": 0.44333333333333347, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9878048780487805, "calib/gap": 0.02748489877198812, "calib/mean_conf": 0.9758536585365853, "calib/mu_c": 0.9887022900763359, "calib/mu_w": 0.9612173913043478, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.44333333333333347, "calib/std_conf": 0.1087731048571918, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9253018372703411, "calib/step_q_c_n": 1143.0, "calib/step_q_gap": 0.02639558727034097, "calib/step_q_w": 0.8989062500000001, "calib/step_q_w_n": 1344.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2672.0, "completions/max_terminated_length": 2672.0, "completions/mean_length": 747.90234375, "completions/mean_terminated_length": 772.0281982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 305.0, "epoch": 0.1248, "grad_norm": 0.015917358919978142, "kl": 0.0683746337890625, "learning_rate": 2.305555555555556e-06, "loss": -0.088, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018997523933649063, "mask/share_reasoning": 0.8268004655838013, "mask/share_step_conf": 0.12295202910900116, "num_tokens": 35235129.0, "reward": 0.6883047819137573, "reward_std": 0.29987388849258423, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5306491851806641, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.5529916286468506, "step": 117 }, { "adv/mean_abs_final_conf": 0.538367509841919, "adv/mean_abs_reasoning": 0.4627258777618408, "adv/mean_abs_step_conf": 0.7386770248413086, "adv/ratio_final_to_reasoning": 1.1634696387544807, "adv/ratio_step_to_reasoning": 1.5963598759901134, "adv/std_final_conf": 0.8059641718864441, "adv/std_reasoning": 0.7574905753135681, "adv/std_step_conf": 0.9340482950210571, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5306023763470572, "calib/avg_num_step_conf": 9.60546875, "calib/ece": 0.3664112903225807, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": 0.002460624481900986, "calib/mean_conf": 0.9873790322580646, "calib/mu_c": 0.9883116883116883, "calib/mu_w": 0.9858510638297873, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3664112903225807, "calib/std_conf": 0.009587536215127371, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9131305536568695, "calib/step_q_c_n": 1463.0, "calib/step_q_gap": 0.008920379627418318, "calib/step_q_w": 0.9042101740294511, "calib/step_q_w_n": 996.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 757.5, "completions/mean_terminated_length": 775.6800537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 424.0, "epoch": 0.12586666666666665, "grad_norm": 0.015437428839504719, "kl": 0.06353759765625, "learning_rate": 2.277777777777778e-06, "loss": -0.1185, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018956642597913742, "mask/share_reasoning": 0.831961989402771, "mask/share_step_conf": 0.12564381957054138, "num_tokens": 35533057.0, "reward": 0.7535146474838257, "reward_std": 0.23272234201431274, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6116320490837097, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.58133465051651, "step": 118 }, { "adv/mean_abs_final_conf": 0.538609504699707, "adv/mean_abs_reasoning": 0.47553685307502747, "adv/mean_abs_step_conf": 0.7550853490829468, "adv/ratio_final_to_reasoning": 1.132634623829519, "adv/ratio_step_to_reasoning": 1.5878587415470273, "adv/std_final_conf": 0.7883803844451904, "adv/std_reasoning": 0.7576671242713928, "adv/std_step_conf": 0.9337403774261475, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5490570112294846, "calib/avg_num_step_conf": 8.828125, "calib/ece": 0.3654732510288067, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9917695473251029, "calib/gap": 0.0033630866685860905, "calib/mean_conf": 0.9868724279835391, "calib/mu_c": 0.9881456953642382, "calib/mu_w": 0.9847826086956522, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3654732510288067, "calib/std_conf": 0.012107720433875898, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9222435897435898, "calib/step_q_c_n": 1248.0, "calib/step_q_gap": 0.002688253775210425, "calib/step_q_w": 0.9195553359683794, "calib/step_q_w_n": 1012.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 800.87890625, "completions/mean_terminated_length": 830.0607299804688, "completions/min_length": 0.0, "completions/min_terminated_length": 348.0, "epoch": 0.12693333333333334, "grad_norm": 0.02127382531762123, "kl": 0.0583038330078125, "learning_rate": 2.25e-06, "loss": -0.1025, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018054481595754623, "mask/share_reasoning": 0.8387048244476318, "mask/share_step_conf": 0.10808445513248444, "num_tokens": 35843146.0, "reward": 0.7453292608261108, "reward_std": 0.24058867990970612, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6004007458686829, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.5832265615463257, "step": 119 }, { "adv/mean_abs_final_conf": 0.5529657602310181, "adv/mean_abs_reasoning": 0.5008734464645386, "adv/mean_abs_step_conf": 0.7641829252243042, "adv/ratio_final_to_reasoning": 1.104002945522822, "adv/ratio_step_to_reasoning": 1.5257006148326686, "adv/std_final_conf": 0.8203580975532532, "adv/std_reasoning": 0.7754554748535156, "adv/std_step_conf": 0.9351306557655334, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.551783659378596, "calib/avg_num_step_conf": 10.9765625, "calib/ece": 0.326137339055794, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.9828326180257511, "calib/gap": 0.001138418543481734, "calib/mean_conf": 0.9870815450643777, "calib/mu_c": 0.9874675324675324, "calib/mu_w": 0.9863291139240506, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.326137339055794, "calib/std_conf": 0.012604149913328892, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9201223902087833, "calib/step_q_c_n": 1389.0, "calib/step_q_gap": -0.008723492972075308, "calib/step_q_w": 0.9288458831808586, "calib/step_q_w_n": 1421.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 715.015625, "completions/mean_terminated_length": 778.9105834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 421.0, "epoch": 0.128, "grad_norm": 0.01677713543176651, "kl": 0.064300537109375, "learning_rate": 2.222222222222222e-06, "loss": -0.2494, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.017968494445085526, "mask/share_reasoning": 0.7828407883644104, "mask/share_step_conf": 0.11715947091579437, "num_tokens": 36132878.0, "reward": 0.7485262751579285, "reward_std": 0.22463907301425934, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6096257567405701, "rewards/format_reward_step": 0.90625, "rewards/step_l1_reward": 0.585864245891571, "step": 120 }, { "adv/mean_abs_final_conf": 0.6709598302841187, "adv/mean_abs_reasoning": 0.5795350074768066, "adv/mean_abs_step_conf": 0.7495955228805542, "adv/ratio_final_to_reasoning": 1.1577554791821112, "adv/ratio_step_to_reasoning": 1.293443041765779, "adv/std_final_conf": 0.868781328201294, "adv/std_reasoning": 0.8101203441619873, "adv/std_step_conf": 0.9349991083145142, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5046420083184788, "calib/avg_num_step_conf": 9.1796875, "calib/ece": 0.35302904564315374, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.995850622406639, "calib/gap": -0.0010063874034462739, "calib/mean_conf": 0.9878838174273861, "calib/mu_c": 0.9875163398692811, "calib/mu_w": 0.9885227272727274, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35302904564315374, "calib/std_conf": 0.009022964451993423, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9181722054380665, "calib/step_q_c_n": 1324.0, "calib/step_q_gap": 0.012295012455610377, "calib/step_q_w": 0.9058771929824562, "calib/step_q_w_n": 1026.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2754.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 780.52734375, "completions/mean_terminated_length": 818.9138793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 413.0, "epoch": 0.12906666666666666, "grad_norm": 0.012640361674129963, "kl": 0.059722900390625, "learning_rate": 2.1944444444444445e-06, "loss": -0.2439, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01739826798439026, "mask/share_reasoning": 0.8256901502609253, "mask/share_step_conf": 0.11003653705120087, "num_tokens": 36437749.0, "reward": 0.7456340789794922, "reward_std": 0.30374008417129517, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6053320169448853, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.5781236886978149, "step": 121 }, { "adv/mean_abs_final_conf": 0.5027682185173035, "adv/mean_abs_reasoning": 0.44970449805259705, "adv/mean_abs_step_conf": 0.7147163152694702, "adv/ratio_final_to_reasoning": 1.1179968639284104, "adv/ratio_step_to_reasoning": 1.5893021269844574, "adv/std_final_conf": 0.7782015800476074, "adv/std_reasoning": 0.7575188875198364, "adv/std_step_conf": 0.9345012903213501, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.49660633484162897, "calib/avg_num_step_conf": 9.2578125, "calib/ece": 0.33995850622406654, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.9875518672199171, "calib/gap": 0.0012217194570136591, "calib/mean_conf": 0.9872614107883818, "calib/mu_c": 0.9876923076923075, "calib/mu_w": 0.9864705882352939, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.33995850622406654, "calib/std_conf": 0.015351014823162748, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9221030042918454, "calib/step_q_c_n": 1398.0, "calib/step_q_gap": 0.014726461081968845, "calib/step_q_w": 0.9073765432098766, "calib/step_q_w_n": 972.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2789.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 767.0546875, "completions/mean_terminated_length": 801.4938354492188, "completions/min_length": 0.0, "completions/min_terminated_length": 432.0, "epoch": 0.13013333333333332, "grad_norm": 0.015725623816251755, "kl": 0.06002044677734375, "learning_rate": 2.166666666666667e-06, "loss": -0.1633, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018181851133704185, "mask/share_reasoning": 0.8258481025695801, "mask/share_step_conf": 0.1130012795329094, "num_tokens": 36741459.0, "reward": 0.7598967552185059, "reward_std": 0.19657093286514282, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6178292632102966, "rewards/format_reward_step": 0.93359375, "rewards/step_l1_reward": 0.5933704376220703, "step": 122 }, { "adv/mean_abs_final_conf": 0.624484658241272, "adv/mean_abs_reasoning": 0.5451442003250122, "adv/mean_abs_step_conf": 0.7624143362045288, "adv/ratio_final_to_reasoning": 1.145540313680229, "adv/ratio_step_to_reasoning": 1.3985553469155156, "adv/std_final_conf": 0.8395637273788452, "adv/std_reasoning": 0.7929816246032715, "adv/std_step_conf": 0.934124767780304, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5196058374249864, "calib/avg_num_step_conf": 9.15625, "calib/ece": 0.40697959183673466, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": 0.009171440261865671, "calib/mean_conf": 0.9824897959183673, "calib/mu_c": 0.9863829787234042, "calib/mu_w": 0.9772115384615385, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40697959183673466, "calib/std_conf": 0.06380394794546601, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9132793522267207, "calib/step_q_c_n": 1235.0, "calib/step_q_gap": -0.001919024689419846, "calib/step_q_w": 0.9151983769161406, "calib/step_q_w_n": 1109.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2921.0, "completions/max_terminated_length": 2921.0, "completions/mean_length": 820.36328125, "completions/mean_terminated_length": 846.8265991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.1312, "grad_norm": 0.05354432389140129, "kl": 0.08370208740234375, "learning_rate": 2.138888888888889e-06, "loss": -0.1112, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.0171657707542181, "mask/share_reasoning": 0.839978039264679, "mask/share_step_conf": 0.11160621047019958, "num_tokens": 37056760.0, "reward": 0.7095881700515747, "reward_std": 0.29550105333328247, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5651074051856995, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.5525063276290894, "step": 123 }, { "adv/mean_abs_final_conf": 0.5212051868438721, "adv/mean_abs_reasoning": 0.4511791467666626, "adv/mean_abs_step_conf": 0.7597951889038086, "adv/ratio_final_to_reasoning": 1.1552067301404447, "adv/ratio_step_to_reasoning": 1.6840210686792971, "adv/std_final_conf": 0.7759954929351807, "adv/std_reasoning": 0.7207808494567871, "adv/std_step_conf": 0.9350464344024658, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5416800385109115, "calib/avg_num_step_conf": 10.3046875, "calib/ece": 0.31208333333333327, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.9791666666666666, "calib/gap": -0.008183568677792019, "calib/mean_conf": 0.9787500000000001, "calib/mu_c": 0.9761585365853657, "calib/mu_w": 0.9843421052631577, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3037499999999999, "calib/std_conf": 0.09053141719867197, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9168852459016393, "calib/step_q_c_n": 1464.0, "calib/step_q_gap": 0.006482349819867506, "calib/step_q_w": 0.9104028960817718, "calib/step_q_w_n": 1174.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 804.55859375, "completions/mean_terminated_length": 844.1270141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.13226666666666667, "grad_norm": 0.03308064118027687, "kl": 0.058441162109375, "learning_rate": 2.1111111111111114e-06, "loss": -0.1693, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017353367060422897, "mask/share_reasoning": 0.8213303089141846, "mask/share_step_conf": 0.11444129049777985, "num_tokens": 37369543.0, "reward": 0.7908234596252441, "reward_std": 0.24148871004581451, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6418148279190063, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.6242070198059082, "step": 124 }, { "adv/mean_abs_final_conf": 0.6522349119186401, "adv/mean_abs_reasoning": 0.5776388645172119, "adv/mean_abs_step_conf": 0.7451116442680359, "adv/ratio_final_to_reasoning": 1.1291395921979301, "adv/ratio_step_to_reasoning": 1.2899264402695567, "adv/std_final_conf": 0.8551003932952881, "adv/std_reasoning": 0.8099701404571533, "adv/std_step_conf": 0.9350612759590149, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5327742837176799, "calib/avg_num_step_conf": 9.19921875, "calib/ece": 0.4260995850622408, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.9875518672199171, "calib/gap": 0.0036065688329837142, "calib/mean_conf": 0.9862655601659752, "calib/mu_c": 0.9878518518518518, "calib/mu_w": 0.984245283018868, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4260995850622408, "calib/std_conf": 0.01645300161157873, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9197979797979798, "calib/step_q_c_n": 1188.0, "calib/step_q_gap": 0.0045280569188023145, "calib/step_q_w": 0.9152699228791775, "calib/step_q_w_n": 1167.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 856.3984375, "completions/mean_terminated_length": 880.473876953125, "completions/min_length": 0.0, "completions/min_terminated_length": 326.0, "epoch": 0.13333333333333333, "grad_norm": 0.024113211780786514, "kl": 0.0589752197265625, "learning_rate": 2.0833333333333334e-06, "loss": -0.053, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017093347385525703, "mask/share_reasoning": 0.844731330871582, "mask/share_step_conf": 0.11083149909973145, "num_tokens": 37693589.0, "reward": 0.6838798522949219, "reward_std": 0.27375972270965576, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5360523462295532, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.5387386083602905, "step": 125 }, { "adv/mean_abs_final_conf": 0.519602358341217, "adv/mean_abs_reasoning": 0.4709121584892273, "adv/mean_abs_step_conf": 0.7496457695960999, "adv/ratio_final_to_reasoning": 1.1033955037563627, "adv/ratio_step_to_reasoning": 1.5919014960265652, "adv/std_final_conf": 0.7761543989181519, "adv/std_reasoning": 0.7577767968177795, "adv/std_step_conf": 0.9334598183631897, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.531223083548665, "calib/avg_num_step_conf": 9.9765625, "calib/ece": 0.43189873417721514, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.9662447257383966, "calib/gap": 0.021431955211025033, "calib/mean_conf": 0.9762025316455696, "calib/mu_c": 0.985968992248062, "calib/mu_w": 0.964537037037037, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43189873417721514, "calib/std_conf": 0.09223393205018894, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9176877133105804, "calib/step_q_c_n": 1172.0, "calib/step_q_gap": 0.01070507944661514, "calib/step_q_w": 0.9069826338639653, "calib/step_q_w_n": 1382.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3008.0, "completions/max_terminated_length": 3008.0, "completions/mean_length": 799.23046875, "completions/mean_terminated_length": 838.536865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 463.0, "epoch": 0.1344, "grad_norm": 0.025906076654791832, "kl": 0.06085205078125, "learning_rate": 2.0555555555555555e-06, "loss": -0.1235, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01753048598766327, "mask/share_reasoning": 0.8175128102302551, "mask/share_step_conf": 0.11808168888092041, "num_tokens": 38003656.0, "reward": 0.6730961799621582, "reward_std": 0.24437865614891052, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5240218639373779, "rewards/format_reward_step": 0.921875, "rewards/step_l1_reward": 0.5370141267776489, "step": 126 }, { "adv/mean_abs_final_conf": 0.5989837646484375, "adv/mean_abs_reasoning": 0.4840828776359558, "adv/mean_abs_step_conf": 0.7649731636047363, "adv/ratio_final_to_reasoning": 1.2373578829592284, "adv/ratio_step_to_reasoning": 1.5802524711068546, "adv/std_final_conf": 0.8067528605461121, "adv/std_reasoning": 0.739617645740509, "adv/std_step_conf": 0.9335212707519531, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5324467698568418, "calib/avg_num_step_conf": 9.9765625, "calib/ece": 0.39130252100840346, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.9831932773109243, "calib/gap": 0.024702419882276172, "calib/mean_conf": 0.9753361344537815, "calib/mu_c": 0.9856115107913669, "calib/mu_w": 0.9609090909090907, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39130252100840346, "calib/std_conf": 0.09605939840553002, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9170251177394034, "calib/step_q_c_n": 1274.0, "calib/step_q_gap": 0.010358451072736807, "calib/step_q_w": 0.9066666666666666, "calib/step_q_w_n": 1280.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2848.0, "completions/max_terminated_length": 2848.0, "completions/mean_length": 732.9296875, "completions/mean_terminated_length": 775.33056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 403.0, "epoch": 0.13546666666666668, "grad_norm": 0.02946789562702179, "kl": 0.079742431640625, "learning_rate": 2.027777777777778e-06, "loss": -0.1981, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.01851074770092964, "mask/share_reasoning": 0.8075978755950928, "mask/share_step_conf": 0.1192038431763649, "num_tokens": 38294958.0, "reward": 0.7024646401405334, "reward_std": 0.26939183473587036, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5640590190887451, "rewards/format_reward_step": 0.9296875, "rewards/step_l1_reward": 0.5463390350341797, "step": 127 }, { "adv/mean_abs_final_conf": 0.6941781044006348, "adv/mean_abs_reasoning": 0.5947922468185425, "adv/mean_abs_step_conf": 0.7895233631134033, "adv/ratio_final_to_reasoning": 1.1670933979279199, "adv/ratio_step_to_reasoning": 1.3273935014056577, "adv/std_final_conf": 0.8686315417289734, "adv/std_reasoning": 0.810073971748352, "adv/std_step_conf": 0.9350660443305969, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.5875923413236847, "calib/avg_num_step_conf": 10.046875, "calib/ece": 0.39669527896995715, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.9613733905579399, "calib/gap": 0.03229006482737817, "calib/mean_conf": 0.9718025751072962, "calib/mu_c": 0.9855223880597013, "calib/mu_w": 0.9532323232323231, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39669527896995715, "calib/std_conf": 0.09675070656505114, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9161878009630819, "calib/step_q_c_n": 1246.0, "calib/step_q_gap": 0.009815251943474212, "calib/step_q_w": 0.9063725490196077, "calib/step_q_w_n": 1326.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2936.0, "completions/max_terminated_length": 2936.0, "completions/mean_length": 829.671875, "completions/mean_terminated_length": 870.475341796875, "completions/min_length": 0.0, "completions/min_terminated_length": 391.0, "epoch": 0.13653333333333334, "grad_norm": 0.02151694893836975, "kl": 0.06743621826171875, "learning_rate": 2.0000000000000003e-06, "loss": -0.1478, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.01694222167134285, "mask/share_reasoning": 0.8211142420768738, "mask/share_step_conf": 0.11506853997707367, "num_tokens": 38614018.0, "reward": 0.6881914138793945, "reward_std": 0.3057379722595215, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5496238470077515, "rewards/format_reward_step": 0.90625, "rewards/step_l1_reward": 0.5408215522766113, "step": 128 }, { "adv/mean_abs_final_conf": 0.5974356532096863, "adv/mean_abs_reasoning": 0.42029517889022827, "adv/mean_abs_step_conf": 0.7427824139595032, "adv/ratio_final_to_reasoning": 1.4214668243095008, "adv/ratio_step_to_reasoning": 1.7672874952332045, "adv/std_final_conf": 0.8128259181976318, "adv/std_reasoning": 0.7016498446464539, "adv/std_step_conf": 0.9345721006393433, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5926601488399241, "calib/avg_num_step_conf": 11.4609375, "calib/ece": 0.3499176954732511, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9794238683127572, "calib/gap": 0.009147818473660974, "calib/mean_conf": 0.9836625514403293, "calib/mu_c": 0.987012987012987, "calib/mu_w": 0.977865168539326, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3499176954732511, "calib/std_conf": 0.019461931736232917, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9127908496732027, "calib/step_q_c_n": 1530.0, "calib/step_q_gap": -0.018776101893748898, "calib/step_q_w": 0.9315669515669516, "calib/step_q_w_n": 1404.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1959.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 739.125, "completions/mean_terminated_length": 775.475341796875, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.1376, "grad_norm": 0.021641433238983154, "kl": 0.0746612548828125, "learning_rate": 1.9722222222222224e-06, "loss": -0.2126, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018588412553071976, "mask/share_reasoning": 0.8041425943374634, "mask/share_step_conf": 0.1303940713405609, "num_tokens": 38905618.0, "reward": 0.7644376754760742, "reward_std": 0.22583582997322083, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6163402199745178, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.6023787260055542, "step": 129 }, { "adv/mean_abs_final_conf": 0.6023110151290894, "adv/mean_abs_reasoning": 0.2895757555961609, "adv/mean_abs_step_conf": 0.7641828060150146, "adv/ratio_final_to_reasoning": 2.0799773582187093, "adv/ratio_step_to_reasoning": 2.638973709804406, "adv/std_final_conf": 0.825607419013977, "adv/std_reasoning": 0.5727540850639343, "adv/std_step_conf": 0.9316400289535522, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5526162790697675, "calib/avg_num_step_conf": 10.53125, "calib/ece": 0.34788617886178874, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.943089430894309, "calib/gap": -0.01525726744186029, "calib/mean_conf": 0.9641463414634147, "calib/mu_c": 0.9588125, "calib/mu_w": 0.9740697674418602, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3308130081300814, "calib/std_conf": 0.12827768931519623, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9229144827586208, "calib/step_q_c_n": 1450.0, "calib/step_q_gap": 0.0019514008966624585, "calib/step_q_w": 0.9209630818619583, "calib/step_q_w_n": 1246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2009.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 720.69921875, "completions/mean_terminated_length": 746.9595336914062, "completions/min_length": 0.0, "completions/min_terminated_length": 394.0, "epoch": 0.13866666666666666, "grad_norm": 0.03302701190114021, "kl": 0.07592010498046875, "learning_rate": 1.944444444444445e-06, "loss": -0.1114, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019100401550531387, "mask/share_reasoning": 0.818557858467102, "mask/share_step_conf": 0.12718549370765686, "num_tokens": 39195405.0, "reward": 0.7844294309616089, "reward_std": 0.17043429613113403, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6253741979598999, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6262969970703125, "step": 130 }, { "adv/mean_abs_final_conf": 0.6709331274032593, "adv/mean_abs_reasoning": 0.4085502624511719, "adv/mean_abs_step_conf": 0.7450651526451111, "adv/ratio_final_to_reasoning": 1.6422290941091886, "adv/ratio_step_to_reasoning": 1.8236805140570873, "adv/std_final_conf": 0.853084921836853, "adv/std_reasoning": 0.7014339566230774, "adv/std_step_conf": 0.9335004687309265, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.5526536846124475, "calib/avg_num_step_conf": 11.375, "calib/ece": 0.5476724137931034, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.9439655172413793, "calib/gap": 0.02887361588392534, "calib/mean_conf": 0.9657758620689655, "calib/mu_c": 0.9825773195876291, "calib/mu_w": 0.9537037037037037, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.5476724137931034, "calib/std_conf": 0.1129819245695047, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9186820809248556, "calib/step_q_c_n": 865.0, "calib/step_q_gap": 0.010453779345308223, "calib/step_q_w": 0.9082283015795474, "calib/step_q_w_n": 2047.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2206.0, "completions/max_terminated_length": 2206.0, "completions/mean_length": 726.26953125, "completions/mean_terminated_length": 784.49365234375, "completions/min_length": 0.0, "completions/min_terminated_length": 441.0, "epoch": 0.13973333333333332, "grad_norm": 0.03065774217247963, "kl": 0.0743255615234375, "learning_rate": 1.916666666666667e-06, "loss": -0.2806, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.017435047775506973, "mask/share_reasoning": 0.7895956635475159, "mask/share_step_conf": 0.11875058710575104, "num_tokens": 39487538.0, "reward": 0.5388969779014587, "reward_std": 0.1687847077846527, "rewards/accuracy_reward_step": 0.37890625, "rewards/final_brier_reward_step": 0.4151046872138977, "rewards/format_reward_step": 0.90625, "rewards/step_l1_reward": 0.40565797686576843, "step": 131 }, { "adv/mean_abs_final_conf": 0.6716172695159912, "adv/mean_abs_reasoning": 0.5227418541908264, "adv/mean_abs_step_conf": 0.774185061454773, "adv/ratio_final_to_reasoning": 1.2847971979508226, "adv/ratio_step_to_reasoning": 1.4810083700934293, "adv/std_final_conf": 0.8198915719985962, "adv/std_reasoning": 0.7578502297401428, "adv/std_step_conf": 0.9345174431800842, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.650655864197531, "calib/avg_num_step_conf": 10.7421875, "calib/ece": 0.30702066115702487, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9380165289256198, "calib/gap": 0.03149861111111096, "calib/mean_conf": 0.9656983471074382, "calib/mu_c": 0.976111111111111, "calib/mu_w": 0.9446125000000001, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3016487603305786, "calib/std_conf": 0.11223983653925576, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9218165304268847, "calib/step_q_c_n": 1468.0, "calib/step_q_gap": -0.000662408730681574, "calib/step_q_w": 0.9224789391575663, "calib/step_q_w_n": 1282.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2304.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 726.109375, "completions/mean_terminated_length": 764.9547119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.1408, "grad_norm": 0.033202171325683594, "kl": 0.080047607421875, "learning_rate": 1.888888888888889e-06, "loss": -0.2414, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018654868006706238, "mask/share_reasoning": 0.8074687123298645, "mask/share_step_conf": 0.12309515476226807, "num_tokens": 39779014.0, "reward": 0.7947405576705933, "reward_std": 0.27415207028388977, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6544085741043091, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.6194474697113037, "step": 132 }, { "adv/mean_abs_final_conf": 0.7088272571563721, "adv/mean_abs_reasoning": 0.5792692303657532, "adv/mean_abs_step_conf": 0.7640947103500366, "adv/ratio_final_to_reasoning": 1.2236577052587712, "adv/ratio_step_to_reasoning": 1.3190666279090706, "adv/std_final_conf": 0.8966022729873657, "adv/std_reasoning": 0.8267735838890076, "adv/std_step_conf": 0.935346245765686, "calib/answer_extract_rate": 0.890625, "calib/auroc": 0.5655533980582524, "calib/avg_num_step_conf": 11.609375, "calib/ece": 0.5164912280701756, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.9473684210526315, "calib/gap": 0.02622135922330071, "calib/mean_conf": 0.9682456140350878, "calib/mu_c": 0.982621359223301, "calib/mu_w": 0.9564000000000002, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5164912280701756, "calib/std_conf": 0.09939064884210455, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9174496644295302, "calib/step_q_c_n": 894.0, "calib/step_q_gap": 0.031120662183780956, "calib/step_q_w": 0.8863290022457493, "calib/step_q_w_n": 2078.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 2652.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 756.0390625, "completions/mean_terminated_length": 841.5043334960938, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.14186666666666667, "grad_norm": 0.03641675040125847, "kl": 0.06818389892578125, "learning_rate": 1.8611111111111113e-06, "loss": -0.2985, "mask/has_final_conf_rate": 0.890625, "mask/share_final_conf": 0.016083287075161934, "mask/share_reasoning": 0.7779492139816284, "mask/share_step_conf": 0.104404978454113, "num_tokens": 40078904.0, "reward": 0.5671447515487671, "reward_std": 0.29637211561203003, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.43522578477859497, "rewards/format_reward_step": 0.890625, "rewards/step_l1_reward": 0.44047001004219055, "step": 133 }, { "adv/mean_abs_final_conf": 0.6852869987487793, "adv/mean_abs_reasoning": 0.5316742658615112, "adv/mean_abs_step_conf": 0.7545932531356812, "adv/ratio_final_to_reasoning": 1.2889226407043757, "adv/ratio_step_to_reasoning": 1.419277368847179, "adv/std_final_conf": 0.8771677017211914, "adv/std_reasoning": 0.7929584980010986, "adv/std_step_conf": 0.9351169466972351, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.635230179028133, "calib/avg_num_step_conf": 10.265625, "calib/ece": 0.38364583333333335, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.8666666666666667, "calib/gap": 0.0455562659846549, "calib/mean_conf": 0.9553125000000001, "calib/mu_c": 0.9746739130434783, "calib/mu_w": 0.9291176470588234, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3819791666666667, "calib/std_conf": 0.11596967208606739, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9199074852817494, "calib/step_q_c_n": 1189.0, "calib/step_q_gap": 0.008364747269240635, "calib/step_q_w": 0.9115427380125087, "calib/step_q_w_n": 1439.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 821.1953125, "completions/mean_terminated_length": 865.1275634765625, "completions/min_length": 0.0, "completions/min_terminated_length": 424.0, "epoch": 0.14293333333333333, "grad_norm": 0.03140419349074364, "kl": 0.06982421875, "learning_rate": 1.8333333333333333e-06, "loss": -0.1888, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.016364730894565582, "mask/share_reasoning": 0.8280090689659119, "mask/share_step_conf": 0.10484500229358673, "num_tokens": 40398082.0, "reward": 0.7085082530975342, "reward_std": 0.2661244571208954, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5810662508010864, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.5406375527381897, "step": 134 }, { "adv/mean_abs_final_conf": 0.6562824249267578, "adv/mean_abs_reasoning": 0.5146161317825317, "adv/mean_abs_step_conf": 0.7555180788040161, "adv/ratio_final_to_reasoning": 1.2752853717459676, "adv/ratio_step_to_reasoning": 1.4681196957177503, "adv/std_final_conf": 0.8674776554107666, "adv/std_reasoning": 0.775693416595459, "adv/std_step_conf": 0.9348829388618469, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5915721844293272, "calib/avg_num_step_conf": 11.0390625, "calib/ece": 0.3475949367088608, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.869198312236287, "calib/gap": 0.015238095238095273, "calib/mean_conf": 0.9627848101265823, "calib/mu_c": 0.9685714285714285, "calib/mu_w": 0.9533333333333333, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34506329113924056, "calib/std_conf": 0.06449289449353177, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9222406908927772, "calib/step_q_c_n": 1486.0, "calib/step_q_gap": 0.02305412372859794, "calib/step_q_w": 0.8991865671641792, "calib/step_q_w_n": 1340.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2762.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 799.9375, "completions/mean_terminated_length": 846.21484375, "completions/min_length": 0.0, "completions/min_terminated_length": 450.0, "epoch": 0.144, "grad_norm": 0.03021004982292652, "kl": 0.0778350830078125, "learning_rate": 1.8055555555555557e-06, "loss": -0.2959, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.017042845487594604, "mask/share_reasoning": 0.8079831004142761, "mask/share_step_conf": 0.12028656899929047, "num_tokens": 40708746.0, "reward": 0.730875551700592, "reward_std": 0.2815753221511841, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6018984317779541, "rewards/format_reward_step": 0.92578125, "rewards/step_l1_reward": 0.55907142162323, "step": 135 }, { "adv/mean_abs_final_conf": 0.6929965019226074, "adv/mean_abs_reasoning": 0.5509441494941711, "adv/mean_abs_step_conf": 0.749183177947998, "adv/ratio_final_to_reasoning": 1.2578343967512067, "adv/ratio_step_to_reasoning": 1.3598169227059271, "adv/std_final_conf": 0.8918748497962952, "adv/std_reasoning": 0.8101029396057129, "adv/std_step_conf": 0.934345006942749, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.632510885341074, "calib/avg_num_step_conf": 12.73828125, "calib/ece": 0.39983050847457635, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.8432203389830508, "calib/gap": 0.05777648766328025, "calib/mean_conf": 0.9422033898305086, "calib/mu_c": 0.9681538461538463, "calib/mu_w": 0.910377358490566, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3955932203389831, "calib/std_conf": 0.15291038470692864, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9165577507598784, "calib/step_q_c_n": 1316.0, "calib/step_q_gap": 0.005629730194325666, "calib/step_q_w": 0.9109280205655528, "calib/step_q_w_n": 1945.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 743.79296875, "completions/mean_terminated_length": 800.0462646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 391.0, "epoch": 0.14506666666666668, "grad_norm": 0.025395670905709267, "kl": 0.084625244140625, "learning_rate": 1.777777777777778e-06, "loss": -0.3088, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.01756712794303894, "mask/share_reasoning": 0.7889890670776367, "mask/share_step_conf": 0.12313126027584076, "num_tokens": 41007645.0, "reward": 0.6623439788818359, "reward_std": 0.2670624256134033, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5570906400680542, "rewards/format_reward_step": 0.91796875, "rewards/step_l1_reward": 0.48244112730026245, "step": 136 }, { "adv/mean_abs_final_conf": 0.6910654902458191, "adv/mean_abs_reasoning": 0.492321252822876, "adv/mean_abs_step_conf": 0.7506650686264038, "adv/ratio_final_to_reasoning": 1.4036881127584513, "adv/ratio_step_to_reasoning": 1.5247464218175302, "adv/std_final_conf": 0.8978220820426941, "adv/std_reasoning": 0.792906641960144, "adv/std_step_conf": 0.9349708557128906, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5634290294374817, "calib/avg_num_step_conf": 10.28125, "calib/ece": 0.3368708333333335, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.7708333333333334, "calib/gap": 0.04376318857475936, "calib/mean_conf": 0.9385375000000001, "calib/mu_c": 0.955678082191781, "calib/mu_w": 0.9119148936170216, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3335375000000002, "calib/std_conf": 0.1254835258526659, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9210515765765765, "calib/step_q_c_n": 1332.0, "calib/step_q_gap": 0.038313115038115075, "calib/step_q_w": 0.8827384615384615, "calib/step_q_w_n": 1300.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3046.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 769.59375, "completions/mean_terminated_length": 804.1469116210938, "completions/min_length": 0.0, "completions/min_terminated_length": 462.0, "epoch": 0.14613333333333334, "grad_norm": 0.0315549410879612, "kl": 0.08020782470703125, "learning_rate": 1.75e-06, "loss": -0.1211, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017642011865973473, "mask/share_reasoning": 0.8182367086410522, "mask/share_step_conf": 0.12115253508090973, "num_tokens": 41311645.0, "reward": 0.7388617992401123, "reward_std": 0.27276355028152466, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6155734062194824, "rewards/format_reward_step": 0.9296875, "rewards/step_l1_reward": 0.5621501207351685, "step": 137 }, { "adv/mean_abs_final_conf": 0.7305846214294434, "adv/mean_abs_reasoning": 0.46121594309806824, "adv/mean_abs_step_conf": 0.7638969421386719, "adv/ratio_final_to_reasoning": 1.5840402578496713, "adv/ratio_step_to_reasoning": 1.6562674243380278, "adv/std_final_conf": 0.8997833728790283, "adv/std_reasoning": 0.7208998203277588, "adv/std_step_conf": 0.9345141053199768, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6519679895476074, "calib/avg_num_step_conf": 10.9375, "calib/ece": 0.2590212765957448, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.7574468085106383, "calib/gap": 0.11035113506451089, "calib/mean_conf": 0.9185957446808513, "calib/mu_c": 0.9552229299363059, "calib/mu_w": 0.844871794871795, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25476595744680863, "calib/std_conf": 0.17728732054054358, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9116248303934872, "calib/step_q_c_n": 1474.0, "calib/step_q_gap": 0.0005124623693543162, "calib/step_q_w": 0.9111123680241329, "calib/step_q_w_n": 1326.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3016.0, "completions/max_terminated_length": 3016.0, "completions/mean_length": 795.16796875, "completions/mean_terminated_length": 844.6597900390625, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 0.1472, "grad_norm": 0.03231998533010483, "kl": 0.08074188232421875, "learning_rate": 1.7222222222222224e-06, "loss": -0.1688, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.01688062585890293, "mask/share_reasoning": 0.8103748559951782, "mask/share_step_conf": 0.11415077745914459, "num_tokens": 41619544.0, "reward": 0.7909604907035828, "reward_std": 0.2596752345561981, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6728769540786743, "rewards/format_reward_step": 0.91796875, "rewards/step_l1_reward": 0.6027940511703491, "step": 138 }, { "adv/mean_abs_final_conf": 0.7011387944221497, "adv/mean_abs_reasoning": 0.37920230627059937, "adv/mean_abs_step_conf": 0.7306464910507202, "adv/ratio_final_to_reasoning": 1.84898346562749, "adv/ratio_step_to_reasoning": 1.9267986480264963, "adv/std_final_conf": 0.9124810099601746, "adv/std_reasoning": 0.6817423105239868, "adv/std_step_conf": 0.9337948560714722, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6916055663624121, "calib/avg_num_step_conf": 11.04296875, "calib/ece": 0.2850204081632655, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.726530612244898, "calib/gap": 0.05144096962442002, "calib/mean_conf": 0.9291020408163266, "calib/mu_c": 0.946319018404908, "calib/mu_w": 0.8948780487804879, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2744081632653063, "calib/std_conf": 0.14888706100467758, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9204639175257732, "calib/step_q_c_n": 1552.0, "calib/step_q_gap": 0.0009188194865574761, "calib/step_q_w": 0.9195450980392157, "calib/step_q_w_n": 1275.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2544.0, "completions/max_terminated_length": 2544.0, "completions/mean_length": 732.1640625, "completions/mean_terminated_length": 765.0366821289062, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.14826666666666666, "grad_norm": 0.06230498477816582, "kl": 0.0870208740234375, "learning_rate": 1.6944444444444446e-06, "loss": -0.2228, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018324337899684906, "mask/share_reasoning": 0.8134405612945557, "mask/share_step_conf": 0.12526635825634003, "num_tokens": 41910074.0, "reward": 0.8124518394470215, "reward_std": 0.21989166736602783, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.678037166595459, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6281166076660156, "step": 139 }, { "adv/mean_abs_final_conf": 0.744654655456543, "adv/mean_abs_reasoning": 0.5463522672653198, "adv/mean_abs_step_conf": 0.7565972805023193, "adv/ratio_final_to_reasoning": 1.362957015230841, "adv/ratio_step_to_reasoning": 1.384815852031415, "adv/std_final_conf": 0.9136979579925537, "adv/std_reasoning": 0.7755256295204163, "adv/std_step_conf": 0.9340972304344177, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6650914634146343, "calib/avg_num_step_conf": 10.73046875, "calib/ece": 0.25704918032786894, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7213114754098361, "calib/gap": 0.10781402439024401, "calib/mean_conf": 0.9045901639344264, "calib/mu_c": 0.9399390243902441, "calib/mu_w": 0.8321250000000001, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24475409836065584, "calib/std_conf": 0.2062850468623199, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9193794871794871, "calib/step_q_c_n": 1560.0, "calib/step_q_gap": 0.014998695267102957, "calib/step_q_w": 0.9043807919123842, "calib/step_q_w_n": 1187.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 767.859375, "completions/mean_terminated_length": 802.3346557617188, "completions/min_length": 0.0, "completions/min_terminated_length": 434.0, "epoch": 0.14933333333333335, "grad_norm": 0.043366990983486176, "kl": 0.09185791015625, "learning_rate": 1.6666666666666667e-06, "loss": -0.1546, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017610831186175346, "mask/share_reasoning": 0.817704439163208, "mask/share_step_conf": 0.1217159777879715, "num_tokens": 42211662.0, "reward": 0.8117408752441406, "reward_std": 0.25670474767684937, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6961570382118225, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.6093559861183167, "step": 140 }, { "adv/mean_abs_final_conf": 0.6669213771820068, "adv/mean_abs_reasoning": 0.4145224988460541, "adv/mean_abs_step_conf": 0.7681502103805542, "adv/ratio_final_to_reasoning": 1.6088906610342735, "adv/ratio_step_to_reasoning": 1.8530965448653025, "adv/std_final_conf": 0.8626038432121277, "adv/std_reasoning": 0.7015207409858704, "adv/std_step_conf": 0.9342188239097595, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7096652022420844, "calib/avg_num_step_conf": 10.5859375, "calib/ece": 0.27102880658436224, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7654320987654321, "calib/gap": 0.09559839418269955, "calib/mean_conf": 0.917119341563786, "calib/mu_c": 0.9493788819875778, "calib/mu_w": 0.8537804878048783, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2627983539094651, "calib/std_conf": 0.16448927741073446, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9215673141326189, "calib/step_q_c_n": 1493.0, "calib/step_q_gap": 0.013884487509775778, "calib/step_q_w": 0.9076828266228432, "calib/step_q_w_n": 1217.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2838.0, "completions/max_terminated_length": 2838.0, "completions/mean_length": 806.23046875, "completions/mean_terminated_length": 835.6072998046875, "completions/min_length": 0.0, "completions/min_terminated_length": 399.0, "epoch": 0.1504, "grad_norm": 0.04461470618844032, "kl": 0.07849884033203125, "learning_rate": 1.638888888888889e-06, "loss": -0.1327, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01757936179637909, "mask/share_reasoning": 0.8285161256790161, "mask/share_step_conf": 0.11874829232692719, "num_tokens": 42525153.0, "reward": 0.8140060901641846, "reward_std": 0.2348444014787674, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6903749704360962, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.6220120787620544, "step": 141 }, { "adv/mean_abs_final_conf": 0.667151689529419, "adv/mean_abs_reasoning": 0.44450634717941284, "adv/mean_abs_step_conf": 0.7593700885772705, "adv/ratio_final_to_reasoning": 1.5008822568289253, "adv/ratio_step_to_reasoning": 1.708344759069934, "adv/std_final_conf": 0.8530212044715881, "adv/std_reasoning": 0.7206939458847046, "adv/std_step_conf": 0.9329362511634827, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7053686714237173, "calib/avg_num_step_conf": 9.57421875, "calib/ece": 0.3773770491803279, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.7172131147540983, "calib/gap": 0.09912062521236831, "calib/mean_conf": 0.8978688524590165, "calib/mu_c": 0.9421481481481483, "calib/mu_w": 0.84302752293578, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3609836065573771, "calib/std_conf": 0.22754473351660362, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9254538021259198, "calib/step_q_c_n": 1223.0, "calib/step_q_gap": 0.007803150660121716, "calib/step_q_w": 0.917650651465798, "calib/step_q_w_n": 1228.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 816.53125, "completions/mean_terminated_length": 836.1280517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 422.0, "epoch": 0.15146666666666667, "grad_norm": 0.041575007140636444, "kl": 0.075927734375, "learning_rate": 1.6111111111111113e-06, "loss": -0.092, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017675727605819702, "mask/share_reasoning": 0.8376240730285645, "mask/share_step_conf": 0.12126270681619644, "num_tokens": 42839345.0, "reward": 0.7154664993286133, "reward_std": 0.21693003177642822, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5958648324012756, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.5405368804931641, "step": 142 }, { "adv/mean_abs_final_conf": 0.7077853679656982, "adv/mean_abs_reasoning": 0.5404249429702759, "adv/mean_abs_step_conf": 0.7752654552459717, "adv/ratio_final_to_reasoning": 1.309683013658804, "adv/ratio_step_to_reasoning": 1.4345478781659646, "adv/std_final_conf": 0.8969067335128784, "adv/std_reasoning": 0.7929291129112244, "adv/std_step_conf": 0.9345833659172058, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6854413702239789, "calib/avg_num_step_conf": 12.53515625, "calib/ece": 0.3470886075949368, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.679324894514768, "calib/gap": 0.0835924462011417, "calib/mean_conf": 0.908270042194093, "calib/mu_c": 0.9431884057971014, "calib/mu_w": 0.8595959595959597, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3365400843881858, "calib/std_conf": 0.18677069943066818, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9213190406976746, "calib/step_q_c_n": 1376.0, "calib/step_q_gap": 0.020118276922442613, "calib/step_q_w": 0.901200763775232, "calib/step_q_w_n": 1833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2813.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 789.1328125, "completions/mean_terminated_length": 841.74169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 405.0, "epoch": 0.15253333333333333, "grad_norm": 0.04036405682563782, "kl": 0.0802764892578125, "learning_rate": 1.5833333333333333e-06, "loss": -0.2751, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.017021197825670242, "mask/share_reasoning": 0.8005629777908325, "mask/share_step_conf": 0.11991582810878754, "num_tokens": 43148699.0, "reward": 0.7185168266296387, "reward_std": 0.2503293752670288, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6061657667160034, "rewards/format_reward_step": 0.921875, "rewards/step_l1_reward": 0.5386803150177002, "step": 143 }, { "adv/mean_abs_final_conf": 0.6762067675590515, "adv/mean_abs_reasoning": 0.5450261831283569, "adv/mean_abs_step_conf": 0.741301953792572, "adv/ratio_final_to_reasoning": 1.2406867568778814, "adv/ratio_step_to_reasoning": 1.3601217276161408, "adv/std_final_conf": 0.8893658518791199, "adv/std_reasoning": 0.7930487990379333, "adv/std_step_conf": 0.9350625872612, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6217501585288522, "calib/avg_num_step_conf": 10.33984375, "calib/ece": 0.23797520661157034, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.7066115702479339, "calib/gap": 0.0992549143944198, "calib/mean_conf": 0.8933471074380165, "calib/mu_c": 0.9245180722891566, "calib/mu_w": 0.8252631578947368, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22268595041322323, "calib/std_conf": 0.21259308941070215, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9215514905149051, "calib/step_q_c_n": 1476.0, "calib/step_q_gap": 0.015168484537108506, "calib/step_q_w": 0.9063830059777966, "calib/step_q_w_n": 1171.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 789.56640625, "completions/mean_terminated_length": 821.66259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.1536, "grad_norm": 0.03410850092768669, "kl": 0.07788848876953125, "learning_rate": 1.5555555555555558e-06, "loss": -0.205, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.017329085618257523, "mask/share_reasoning": 0.8310571908950806, "mask/share_step_conf": 0.1125512421131134, "num_tokens": 43454956.0, "reward": 0.8180810809135437, "reward_std": 0.2903987169265747, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6948046684265137, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.6226074695587158, "step": 144 }, { "adv/mean_abs_final_conf": 0.7054914236068726, "adv/mean_abs_reasoning": 0.6051229238510132, "adv/mean_abs_step_conf": 0.769420862197876, "adv/ratio_final_to_reasoning": 1.1658646463384867, "adv/ratio_step_to_reasoning": 1.271511674522703, "adv/std_final_conf": 0.8777279853820801, "adv/std_reasoning": 0.8268728852272034, "adv/std_step_conf": 0.9352013468742371, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.6287099983419002, "calib/avg_num_step_conf": 11.625, "calib/ece": 0.26510548523206773, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.6835443037974683, "calib/gap": 0.0986395290996519, "calib/mean_conf": 0.8466244725738398, "calib/mu_c": 0.8774233128834358, "calib/mu_w": 0.7787837837837839, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2119831223628694, "calib/std_conf": 0.28987629820641186, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9135570804741111, "calib/step_q_c_n": 1603.0, "calib/step_q_gap": 0.12349153058336093, "calib/step_q_w": 0.7900655498907502, "calib/step_q_w_n": 1373.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 744.83984375, "completions/mean_terminated_length": 791.19921875, "completions/min_length": 0.0, "completions/min_terminated_length": 401.0, "epoch": 0.15466666666666667, "grad_norm": 0.06647800654172897, "kl": 0.07993316650390625, "learning_rate": 1.527777777777778e-06, "loss": -0.2408, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.017830245196819305, "mask/share_reasoning": 0.7956852912902832, "mask/share_step_conf": 0.1278906762599945, "num_tokens": 43748339.0, "reward": 0.7790793180465698, "reward_std": 0.3233683109283447, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6591414213180542, "rewards/format_reward_step": 0.91796875, "rewards/step_l1_reward": 0.5880796909332275, "step": 145 }, { "adv/mean_abs_final_conf": 0.7313030958175659, "adv/mean_abs_reasoning": 0.5380452871322632, "adv/mean_abs_step_conf": 0.7613229751586914, "adv/ratio_final_to_reasoning": 1.3591850227242968, "adv/ratio_step_to_reasoning": 1.4149793583668018, "adv/std_final_conf": 0.8902589082717896, "adv/std_reasoning": 0.7931119799613953, "adv/std_step_conf": 0.9345507621765137, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.7030024247610897, "calib/avg_num_step_conf": 11.421875, "calib/ece": 0.3994810126582279, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.6582278481012658, "calib/gap": 0.13828947368421063, "calib/mean_conf": 0.8491856540084388, "calib/mu_c": 0.9209561403508774, "calib/mu_w": 0.7826666666666667, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3838270042194093, "calib/std_conf": 0.2700605326051657, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8985568181818182, "calib/step_q_c_n": 1144.0, "calib/step_q_gap": -0.0048785750766087155, "calib/step_q_w": 0.9034353932584269, "calib/step_q_w_n": 1780.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 812.39453125, "completions/mean_terminated_length": 862.9585571289062, "completions/min_length": 0.0, "completions/min_terminated_length": 459.0, "epoch": 0.15573333333333333, "grad_norm": 0.0607595257461071, "kl": 0.07318878173828125, "learning_rate": 1.5e-06, "loss": -0.2455, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01656745746731758, "mask/share_reasoning": 0.8082844614982605, "mask/share_step_conf": 0.1165543720126152, "num_tokens": 44063528.0, "reward": 0.6538082957267761, "reward_std": 0.3023184537887573, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.5655796527862549, "rewards/format_reward_step": 0.92578125, "rewards/step_l1_reward": 0.46781808137893677, "step": 146 }, { "adv/mean_abs_final_conf": 0.6403455138206482, "adv/mean_abs_reasoning": 0.39228206872940063, "adv/mean_abs_step_conf": 0.7484415769577026, "adv/ratio_final_to_reasoning": 1.6323598881150077, "adv/ratio_step_to_reasoning": 1.9079168705872807, "adv/std_final_conf": 0.8574442863464355, "adv/std_reasoning": 0.6816247701644897, "adv/std_step_conf": 0.9328452944755554, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6356864383180172, "calib/avg_num_step_conf": 10.14453125, "calib/ece": 0.3536250000000001, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6625, "calib/gap": 0.1537301587301586, "calib/mean_conf": 0.8040416666666667, "calib/mu_c": 0.877063492063492, "calib/mu_w": 0.7233333333333334, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31633333333333336, "calib/std_conf": 0.33256064348810466, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9106250000000001, "calib/step_q_c_n": 1168.0, "calib/step_q_gap": 0.00865089223233051, "calib/step_q_w": 0.9019741077676696, "calib/step_q_w_n": 1429.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3039.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 802.4921875, "completions/mean_terminated_length": 845.423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 385.0, "epoch": 0.1568, "grad_norm": 0.04731550067663193, "kl": 0.0695037841796875, "learning_rate": 1.4722222222222225e-06, "loss": -0.1847, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.016832541674375534, "mask/share_reasoning": 0.8159008026123047, "mask/share_step_conf": 0.11648540198802948, "num_tokens": 44372646.0, "reward": 0.6841420531272888, "reward_std": 0.18067112565040588, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5989097356796265, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.48343682289123535, "step": 147 }, { "adv/mean_abs_final_conf": 0.6124477982521057, "adv/mean_abs_reasoning": 0.44582149386405945, "adv/mean_abs_step_conf": 0.7373963594436646, "adv/ratio_final_to_reasoning": 1.3737511687555697, "adv/ratio_step_to_reasoning": 1.6540170664551057, "adv/std_final_conf": 0.8261498212814331, "adv/std_reasoning": 0.7206892967224121, "adv/std_step_conf": 0.9336621165275574, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6929432957393484, "calib/avg_num_step_conf": 10.41015625, "calib/ece": 0.2697540983606558, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6557377049180327, "calib/gap": 0.18275689223057645, "calib/mean_conf": 0.7918852459016394, "calib/mu_c": 0.8488095238095239, "calib/mu_w": 0.6660526315789475, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18655737704918038, "calib/std_conf": 0.35254538591020995, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.894113427345187, "calib/step_q_c_n": 1631.0, "calib/step_q_gap": -0.015572259308584857, "calib/step_q_w": 0.9096856866537718, "calib/step_q_w_n": 1034.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2299.0, "completions/max_terminated_length": 2299.0, "completions/mean_length": 743.19921875, "completions/mean_terminated_length": 776.5673217773438, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.15786666666666666, "grad_norm": 0.042488742619752884, "kl": 0.0799407958984375, "learning_rate": 1.4444444444444445e-06, "loss": -0.1819, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01842150092124939, "mask/share_reasoning": 0.8100440502166748, "mask/share_step_conf": 0.1285656988620758, "num_tokens": 44668017.0, "reward": 0.8046440482139587, "reward_std": 0.24246975779533386, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6908816695213318, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.5973125696182251, "step": 148 }, { "adv/mean_abs_final_conf": 0.6814405918121338, "adv/mean_abs_reasoning": 0.5871527194976807, "adv/mean_abs_step_conf": 0.7749460339546204, "adv/ratio_final_to_reasoning": 1.1605849197038856, "adv/ratio_step_to_reasoning": 1.3198372556591411, "adv/std_final_conf": 0.8630624413490295, "adv/std_reasoning": 0.8266708850860596, "adv/std_step_conf": 0.9342211484909058, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7871368673255466, "calib/avg_num_step_conf": 10.1875, "calib/ece": 0.2008641975308642, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5843621399176955, "calib/gap": 0.35755390835579504, "calib/mean_conf": 0.7440740740740741, "calib/mu_c": 0.8676729559748426, "calib/mu_w": 0.5101190476190476, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14530864197530863, "calib/std_conf": 0.37642495613262844, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9176170510132775, "calib/step_q_c_n": 1431.0, "calib/step_q_gap": 0.04122792611947956, "calib/step_q_w": 0.8763891248937979, "calib/step_q_w_n": 1177.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 805.97265625, "completions/mean_terminated_length": 838.7357177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 404.0, "epoch": 0.15893333333333334, "grad_norm": 0.051406074315309525, "kl": 0.06976318359375, "learning_rate": 1.4166666666666667e-06, "loss": -0.0968, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017330875620245934, "mask/share_reasoning": 0.826300323009491, "mask/share_step_conf": 0.11730631440877914, "num_tokens": 44978802.0, "reward": 0.8375177383422852, "reward_std": 0.26769959926605225, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.745905876159668, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.6150670647621155, "step": 149 }, { "adv/mean_abs_final_conf": 0.6216943860054016, "adv/mean_abs_reasoning": 0.5660361647605896, "adv/mean_abs_step_conf": 0.7330152988433838, "adv/ratio_final_to_reasoning": 1.0983297971223327, "adv/ratio_step_to_reasoning": 1.294997289004351, "adv/std_final_conf": 0.8433254361152649, "adv/std_reasoning": 0.8099560737609863, "adv/std_step_conf": 0.9347100257873535, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6438846371882085, "calib/avg_num_step_conf": 10.71484375, "calib/ece": 0.28720164609053506, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6707818930041153, "calib/gap": 0.20551020408163279, "calib/mean_conf": 0.7843209876543209, "calib/mu_c": 0.8655102040816328, "calib/mu_w": 0.66, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23329218106995891, "calib/std_conf": 0.35839010726216575, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9128195488721805, "calib/step_q_c_n": 1463.0, "calib/step_q_gap": 0.035010955122180665, "calib/step_q_w": 0.8778085937499999, "calib/step_q_w_n": 1280.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2877.0, "completions/max_terminated_length": 2877.0, "completions/mean_length": 746.6171875, "completions/mean_terminated_length": 780.1387329101562, "completions/min_length": 0.0, "completions/min_terminated_length": 415.0, "epoch": 0.16, "grad_norm": 0.036864928901195526, "kl": 0.07648468017578125, "learning_rate": 1.3888888888888892e-06, "loss": -0.2174, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01857100799679756, "mask/share_reasoning": 0.8006365299224854, "mask/share_step_conf": 0.1378237009048462, "num_tokens": 45274896.0, "reward": 0.7855761051177979, "reward_std": 0.25697511434555054, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.65923672914505, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.6080092787742615, "step": 150 }, { "adv/mean_abs_final_conf": 0.6929680705070496, "adv/mean_abs_reasoning": 0.5574659705162048, "adv/mean_abs_step_conf": 0.7786697149276733, "adv/ratio_final_to_reasoning": 1.2430679308826185, "adv/ratio_step_to_reasoning": 1.396802237465073, "adv/std_final_conf": 0.8634377717971802, "adv/std_reasoning": 0.7931473255157471, "adv/std_step_conf": 0.9336228370666504, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.6892233370494241, "calib/avg_num_step_conf": 11.02734375, "calib/ece": 0.31870689655172413, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.5172413793103449, "calib/gap": 0.2716283909327387, "calib/mean_conf": 0.6432758620689656, "calib/mu_c": 0.7802608695652175, "calib/mu_w": 0.5086324786324787, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23314655172413795, "calib/std_conf": 0.4318300588914439, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8998490393412627, "calib/step_q_c_n": 1093.0, "calib/step_q_gap": 0.00872476188461535, "calib/step_q_w": 0.8911242774566474, "calib/step_q_w_n": 1730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3000.0, "completions/max_terminated_length": 3000.0, "completions/mean_length": 817.57421875, "completions/mean_terminated_length": 886.8601684570312, "completions/min_length": 0.0, "completions/min_terminated_length": 440.0, "epoch": 0.16106666666666666, "grad_norm": 0.0430113859474659, "kl": 0.0694580078125, "learning_rate": 1.3611111111111112e-06, "loss": -0.2838, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.015757262706756592, "mask/share_reasoning": 0.7966047525405884, "mask/share_step_conf": 0.10951297730207443, "num_tokens": 45591219.0, "reward": 0.6842317581176758, "reward_std": 0.28172385692596436, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6139644384384155, "rewards/format_reward_step": 0.90234375, "rewards/step_l1_reward": 0.48418641090393066, "step": 151 }, { "adv/mean_abs_final_conf": 0.7391480207443237, "adv/mean_abs_reasoning": 0.607087254524231, "adv/mean_abs_step_conf": 0.7786715626716614, "adv/ratio_final_to_reasoning": 1.2175317719749983, "adv/ratio_step_to_reasoning": 1.2826353326786601, "adv/std_final_conf": 0.8781725168228149, "adv/std_reasoning": 0.8266691565513611, "adv/std_step_conf": 0.9354438781738281, "calib/answer_extract_rate": 0.90234375, "calib/auroc": 0.626782463592233, "calib/avg_num_step_conf": 11.671875, "calib/ece": 0.3554112554112554, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.5151515151515151, "calib/gap": 0.18460254854368935, "calib/mean_conf": 0.6011255411255412, "calib/mu_c": 0.6834375, "calib/mu_w": 0.4988349514563107, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2012121212121212, "calib/std_conf": 0.45127768281844893, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8895158665581773, "calib/step_q_c_n": 1229.0, "calib/step_q_gap": 0.13263752659228756, "calib/step_q_w": 0.7568783399658897, "calib/step_q_w_n": 1759.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2844.0, "completions/max_terminated_length": 2844.0, "completions/mean_length": 779.5859375, "completions/mean_terminated_length": 835.0376586914062, "completions/min_length": 0.0, "completions/min_terminated_length": 497.0, "epoch": 0.16213333333333332, "grad_norm": 0.05237310379743576, "kl": 0.0787506103515625, "learning_rate": 1.3333333333333334e-06, "loss": -0.2079, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.016670234501361847, "mask/share_reasoning": 0.7984967231750488, "mask/share_step_conf": 0.11842679232358932, "num_tokens": 45896185.0, "reward": 0.6975486278533936, "reward_std": 0.2750810980796814, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5759539008140564, "rewards/format_reward_step": 0.90234375, "rewards/step_l1_reward": 0.5386744737625122, "step": 152 }, { "adv/mean_abs_final_conf": 0.7310658097267151, "adv/mean_abs_reasoning": 0.5798449516296387, "adv/mean_abs_step_conf": 0.7634512186050415, "adv/ratio_final_to_reasoning": 1.260795334463246, "adv/ratio_step_to_reasoning": 1.3166471769037262, "adv/std_final_conf": 0.8923981785774231, "adv/std_reasoning": 0.8266776204109192, "adv/std_step_conf": 0.935492753982544, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5558449289381651, "calib/avg_num_step_conf": 10.109375, "calib/ece": 0.3710833333333334, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.48333333333333334, "calib/gap": 0.09073145323312748, "calib/mean_conf": 0.6119166666666668, "calib/mu_c": 0.6455629139072848, "calib/mu_w": 0.5548314606741573, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1769166666666667, "calib/std_conf": 0.43642925320784304, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9105449775112445, "calib/step_q_c_n": 1334.0, "calib/step_q_gap": 0.029432537319856955, "calib/step_q_w": 0.8811124401913876, "calib/step_q_w_n": 1254.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 757.01953125, "completions/mean_terminated_length": 797.5184936523438, "completions/min_length": 0.0, "completions/min_terminated_length": 420.0, "epoch": 0.1632, "grad_norm": 0.1365126371383667, "kl": 0.0780487060546875, "learning_rate": 1.3055555555555556e-06, "loss": -0.1451, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017622144892811775, "mask/share_reasoning": 0.8127738237380981, "mask/share_step_conf": 0.11882279813289642, "num_tokens": 46197302.0, "reward": 0.7272552847862244, "reward_std": 0.2770336866378784, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.5796132683753967, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.5694285035133362, "step": 153 }, { "adv/mean_abs_final_conf": 0.6581071615219116, "adv/mean_abs_reasoning": 0.43368637561798096, "adv/mean_abs_step_conf": 0.7545804977416992, "adv/ratio_final_to_reasoning": 1.5174725297379759, "adv/ratio_step_to_reasoning": 1.739922073102851, "adv/std_final_conf": 0.8604373335838318, "adv/std_reasoning": 0.7394781112670898, "adv/std_step_conf": 0.9343244433403015, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6834290516879115, "calib/avg_num_step_conf": 9.9453125, "calib/ece": 0.33485355648535564, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5941422594142259, "calib/gap": 0.2627153662978008, "calib/mean_conf": 0.6868200836820084, "calib/mu_c": 0.8165289256198348, "calib/mu_w": 0.553813559322034, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2576987447698745, "calib/std_conf": 0.42267915794250943, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.905844465648855, "calib/step_q_c_n": 1048.0, "calib/step_q_gap": 0.007313090481965823, "calib/step_q_w": 0.8985313751668892, "calib/step_q_w_n": 1498.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 738.48828125, "completions/mean_terminated_length": 787.7208862304688, "completions/min_length": 0.0, "completions/min_terminated_length": 389.0, "epoch": 0.16426666666666667, "grad_norm": 0.056357041001319885, "kl": 0.07154083251953125, "learning_rate": 1.2777777777777779e-06, "loss": -0.2656, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.01771831139922142, "mask/share_reasoning": 0.8029538989067078, "mask/share_step_conf": 0.11682778596878052, "num_tokens": 46490795.0, "reward": 0.7125537395477295, "reward_std": 0.2651887536048889, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.6242159605026245, "rewards/format_reward_step": 0.9296875, "rewards/step_l1_reward": 0.5204226970672607, "step": 154 }, { "adv/mean_abs_final_conf": 0.6670347452163696, "adv/mean_abs_reasoning": 0.48000073432922363, "adv/mean_abs_step_conf": 0.7483382225036621, "adv/ratio_final_to_reasoning": 1.389653593235678, "adv/ratio_step_to_reasoning": 1.559035578454742, "adv/std_final_conf": 0.8591130375862122, "adv/std_reasoning": 0.7395284175872803, "adv/std_step_conf": 0.9336428642272949, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5808728215468818, "calib/avg_num_step_conf": 10.51953125, "calib/ece": 0.359746835443038, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.5611814345991561, "calib/gap": 0.12453406308512183, "calib/mean_conf": 0.6914767932489452, "calib/mu_c": 0.7471755725190841, "calib/mu_w": 0.6226415094339622, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2492405063291139, "calib/std_conf": 0.4124844102532277, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9006412337662338, "calib/step_q_c_n": 1232.0, "calib/step_q_gap": 0.031168270042756796, "calib/step_q_w": 0.869472963723477, "calib/step_q_w_n": 1461.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2565.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 732.3046875, "completions/mean_terminated_length": 781.1250610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 421.0, "epoch": 0.16533333333333333, "grad_norm": 0.06447619944810867, "kl": 0.07836151123046875, "learning_rate": 1.25e-06, "loss": -0.248, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.017479702830314636, "mask/share_reasoning": 0.7976410984992981, "mask/share_step_conf": 0.12237919867038727, "num_tokens": 46785481.0, "reward": 0.689497709274292, "reward_std": 0.24059349298477173, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5785812139511108, "rewards/format_reward_step": 0.92578125, "rewards/step_l1_reward": 0.5129141807556152, "step": 155 }, { "adv/mean_abs_final_conf": 0.6695674657821655, "adv/mean_abs_reasoning": 0.4885939955711365, "adv/mean_abs_step_conf": 0.783113956451416, "adv/ratio_final_to_reasoning": 1.370396426995551, "adv/ratio_step_to_reasoning": 1.6027907906154346, "adv/std_final_conf": 0.8554741740226746, "adv/std_reasoning": 0.739392876625061, "adv/std_step_conf": 0.9342993497848511, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.661429347088367, "calib/avg_num_step_conf": 9.6796875, "calib/ece": 0.31632653061224497, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5183673469387755, "calib/gap": 0.21964775349531696, "calib/mean_conf": 0.6479183673469389, "calib/mu_c": 0.7429496402877699, "calib/mu_w": 0.5233018867924529, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19844897959183683, "calib/std_conf": 0.4164830318093278, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9048530579825258, "calib/step_q_c_n": 1259.0, "calib/step_q_gap": 0.06672809216354847, "calib/step_q_w": 0.8381249658189773, "calib/step_q_w_n": 1219.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 799.234375, "completions/mean_terminated_length": 821.7027587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 398.0, "epoch": 0.1664, "grad_norm": 0.061981022357940674, "kl": 0.06970977783203125, "learning_rate": 1.2222222222222223e-06, "loss": -0.1421, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01749337837100029, "mask/share_reasoning": 0.8347588181495667, "mask/share_step_conf": 0.12040403485298157, "num_tokens": 47094845.0, "reward": 0.7639741897583008, "reward_std": 0.22314155101776123, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6530945301055908, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.5748538374900818, "step": 156 }, { "adv/mean_abs_final_conf": 0.6397287845611572, "adv/mean_abs_reasoning": 0.3884810209274292, "adv/mean_abs_step_conf": 0.7641727924346924, "adv/ratio_final_to_reasoning": 1.64674398516025, "adv/ratio_step_to_reasoning": 1.967078830801994, "adv/std_final_conf": 0.8433766961097717, "adv/std_reasoning": 0.6818484663963318, "adv/std_step_conf": 0.9321699142456055, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.6683219178082193, "calib/avg_num_step_conf": 10.78125, "calib/ece": 0.22656652360515023, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.6094420600858369, "calib/gap": 0.24674058219178074, "calib/mean_conf": 0.7502575107296138, "calib/mu_c": 0.8275625, "calib/mu_w": 0.5808219178082192, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14506437768240343, "calib/std_conf": 0.3626203192320913, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8907831715210355, "calib/step_q_c_n": 1545.0, "calib/step_q_gap": 0.07269263654161162, "calib/step_q_w": 0.8180905349794239, "calib/step_q_w_n": 1215.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2633.0, "completions/max_terminated_length": 2633.0, "completions/mean_length": 786.59765625, "completions/mean_terminated_length": 842.548095703125, "completions/min_length": 0.0, "completions/min_terminated_length": 321.0, "epoch": 0.16746666666666668, "grad_norm": 0.07800068706274033, "kl": 0.07373809814453125, "learning_rate": 1.1944444444444446e-06, "loss": -0.1742, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.016521422192454338, "mask/share_reasoning": 0.799405038356781, "mask/share_step_conf": 0.11766725033521652, "num_tokens": 47399942.0, "reward": 0.8016955852508545, "reward_std": 0.24124254286289215, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6876152157783508, "rewards/format_reward_step": 0.91015625, "rewards/step_l1_reward": 0.608744740486145, "step": 157 }, { "adv/mean_abs_final_conf": 0.5802692174911499, "adv/mean_abs_reasoning": 0.46292001008987427, "adv/mean_abs_step_conf": 0.7462335824966431, "adv/ratio_final_to_reasoning": 1.2534978070584866, "adv/ratio_step_to_reasoning": 1.6120140979685982, "adv/std_final_conf": 0.8109095096588135, "adv/std_reasoning": 0.7393543720245361, "adv/std_step_conf": 0.9349581599235535, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6089843749999999, "calib/avg_num_step_conf": 9.59765625, "calib/ece": 0.28076612903225817, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7580645161290323, "calib/gap": 0.11602272727272733, "calib/mean_conf": 0.8763306451612904, "calib/mu_c": 0.9175000000000001, "calib/mu_w": 0.8014772727272728, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25596774193548394, "calib/std_conf": 0.259120417323254, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9124654377880184, "calib/step_q_c_n": 1519.0, "calib/step_q_gap": 0.022699979365843448, "calib/step_q_w": 0.8897654584221749, "calib/step_q_w_n": 938.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2413.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 755.12890625, "completions/mean_terminated_length": 776.357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 428.0, "epoch": 0.16853333333333334, "grad_norm": 0.044181808829307556, "kl": 0.0697174072265625, "learning_rate": 1.1666666666666668e-06, "loss": -0.116, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018855862319469452, "mask/share_reasoning": 0.8255026340484619, "mask/share_step_conf": 0.12829776108264923, "num_tokens": 47698495.0, "reward": 0.800797700881958, "reward_std": 0.24212971329689026, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6816230416297913, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6012223958969116, "step": 158 }, { "adv/mean_abs_final_conf": 0.6155053377151489, "adv/mean_abs_reasoning": 0.5257014036178589, "adv/mean_abs_step_conf": 0.7635072469711304, "adv/ratio_final_to_reasoning": 1.1708268866684823, "adv/ratio_step_to_reasoning": 1.4523591561991274, "adv/std_final_conf": 0.8442915081977844, "adv/std_reasoning": 0.8098849058151245, "adv/std_step_conf": 0.9338783621788025, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6846465390279823, "calib/avg_num_step_conf": 11.21875, "calib/ece": 0.30071729957805904, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.6877637130801688, "calib/gap": 0.1699845360824742, "calib/mean_conf": 0.8309282700421942, "calib/mu_c": 0.9005, "calib/mu_w": 0.7305154639175258, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.270464135021097, "calib/std_conf": 0.30175433451587225, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9182377049180328, "calib/step_q_c_n": 1342.0, "calib/step_q_gap": 0.07119522125790201, "calib/step_q_w": 0.8470424836601308, "calib/step_q_w_n": 1530.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 739.734375, "completions/mean_terminated_length": 789.050048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 403.0, "epoch": 0.1696, "grad_norm": 0.04411659762263298, "kl": 0.07027435302734375, "learning_rate": 1.138888888888889e-06, "loss": -0.2479, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.017607979476451874, "mask/share_reasoning": 0.7957726716995239, "mask/share_step_conf": 0.12411928176879883, "num_tokens": 47992651.0, "reward": 0.7762739658355713, "reward_std": 0.26523715257644653, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6403324007987976, "rewards/format_reward_step": 0.92578125, "rewards/step_l1_reward": 0.6169029474258423, "step": 159 }, { "adv/mean_abs_final_conf": 0.6596049070358276, "adv/mean_abs_reasoning": 0.5177563428878784, "adv/mean_abs_step_conf": 0.7162469625473022, "adv/ratio_final_to_reasoning": 1.2739677960423692, "adv/ratio_step_to_reasoning": 1.3833668527406289, "adv/std_final_conf": 0.8876446485519409, "adv/std_reasoning": 0.8100550770759583, "adv/std_step_conf": 0.934569239616394, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.6421108823305117, "calib/avg_num_step_conf": 11.1015625, "calib/ece": 0.3119230769230767, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.6324786324786325, "calib/gap": 0.11231525966598055, "calib/mean_conf": 0.810042735042735, "calib/mu_c": 0.8546808510638297, "calib/mu_w": 0.7423655913978492, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25970085470085447, "calib/std_conf": 0.2998054894544952, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9068384615384616, "calib/step_q_c_n": 1300.0, "calib/step_q_gap": 0.025583597725232, "calib/step_q_w": 0.8812548638132296, "calib/step_q_w_n": 1542.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2666.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 776.13671875, "completions/mean_terminated_length": 838.3585815429688, "completions/min_length": 0.0, "completions/min_terminated_length": 462.0, "epoch": 0.17066666666666666, "grad_norm": 0.058579009026288986, "kl": 0.067352294921875, "learning_rate": 1.111111111111111e-06, "loss": -0.3345, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.016484688967466354, "mask/share_reasoning": 0.7973984479904175, "mask/share_step_conf": 0.11189806461334229, "num_tokens": 48296182.0, "reward": 0.7409486770629883, "reward_std": 0.27730506658554077, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6198972463607788, "rewards/format_reward_step": 0.91015625, "rewards/step_l1_reward": 0.5698126554489136, "step": 160 }, { "adv/mean_abs_final_conf": 0.5626407265663147, "adv/mean_abs_reasoning": 0.3099568486213684, "adv/mean_abs_step_conf": 0.744856595993042, "adv/ratio_final_to_reasoning": 1.8152227610676717, "adv/ratio_step_to_reasoning": 2.403097719266499, "adv/std_final_conf": 0.8191770315170288, "adv/std_reasoning": 0.5961928367614746, "adv/std_step_conf": 0.9331373572349548, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6836936936936937, "calib/avg_num_step_conf": 10.14453125, "calib/ece": 0.17024489795918363, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6816326530612244, "calib/gap": 0.23347747747747738, "calib/mean_conf": 0.8276326530612245, "calib/mu_c": 0.8848108108108108, "calib/mu_w": 0.6513333333333334, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12138775510204079, "calib/std_conf": 0.30573048647844897, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8879506172839505, "calib/step_q_c_n": 1755.0, "calib/step_q_gap": 0.03697558755743435, "calib/step_q_w": 0.8509750297265162, "calib/step_q_w_n": 841.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 735.34375, "completions/mean_terminated_length": 765.2357177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 337.0, "epoch": 0.17173333333333332, "grad_norm": 0.09371700137853622, "kl": 0.068603515625, "learning_rate": 1.0833333333333335e-06, "loss": -0.1205, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.0185359176248312, "mask/share_reasoning": 0.8156201243400574, "mask/share_step_conf": 0.12678146362304688, "num_tokens": 48588350.0, "reward": 0.8863547444343567, "reward_std": 0.1917186677455902, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.764299213886261, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6732540130615234, "step": 161 }, { "adv/mean_abs_final_conf": 0.5152875781059265, "adv/mean_abs_reasoning": 0.4537314474582672, "adv/mean_abs_step_conf": 0.7489913105964661, "adv/ratio_final_to_reasoning": 1.1356664410026838, "adv/ratio_step_to_reasoning": 1.6507370489574804, "adv/std_final_conf": 0.7631077766418457, "adv/std_reasoning": 0.7206796407699585, "adv/std_step_conf": 0.93306964635849, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7224846480869155, "calib/avg_num_step_conf": 9.3828125, "calib/ece": 0.1806072874493927, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6923076923076923, "calib/gap": 0.2122295701464335, "calib/mean_conf": 0.8627935222672064, "calib/mu_c": 0.9255172413793101, "calib/mu_w": 0.7132876712328766, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16947368421052628, "calib/std_conf": 0.24479002379082498, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9073752869964518, "calib/step_q_c_n": 1597.0, "calib/step_q_gap": 0.030490814946762423, "calib/step_q_w": 0.8768844720496893, "calib/step_q_w_n": 805.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 753.15234375, "completions/mean_terminated_length": 777.4475708007812, "completions/min_length": 0.0, "completions/min_terminated_length": 405.0, "epoch": 0.1728, "grad_norm": 0.044426869601011276, "kl": 0.06839752197265625, "learning_rate": 1.0555555555555557e-06, "loss": -0.1427, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01875723898410797, "mask/share_reasoning": 0.8271980881690979, "mask/share_step_conf": 0.12279466539621353, "num_tokens": 48885301.0, "reward": 0.8925344944000244, "reward_std": 0.21106109023094177, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7672237753868103, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6889388561248779, "step": 162 }, { "adv/mean_abs_final_conf": 0.5857610702514648, "adv/mean_abs_reasoning": 0.44100522994995117, "adv/mean_abs_step_conf": 0.7768477201461792, "adv/ratio_final_to_reasoning": 1.3282406431278417, "adv/ratio_step_to_reasoning": 1.761538565504863, "adv/std_final_conf": 0.7958298325538635, "adv/std_reasoning": 0.7014980316162109, "adv/std_step_conf": 0.9332451224327087, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7322478991596638, "calib/avg_num_step_conf": 9.52734375, "calib/ece": 0.283734439834025, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6556016597510373, "calib/gap": 0.22323109243697492, "calib/mean_conf": 0.8408298755186724, "calib/mu_c": 0.9380882352941178, "calib/mu_w": 0.7148571428571429, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.280124481327801, "calib/std_conf": 0.260970372596227, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9138898233809927, "calib/step_q_c_n": 1189.0, "calib/step_q_gap": 0.05811382338099247, "calib/step_q_w": 0.8557760000000002, "calib/step_q_w_n": 1250.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2539.0, "completions/max_terminated_length": 2539.0, "completions/mean_length": 778.171875, "completions/mean_terminated_length": 819.8024291992188, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.17386666666666667, "grad_norm": 0.046456970274448395, "kl": 0.062042236328125, "learning_rate": 1.0277777777777777e-06, "loss": -0.1249, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018092192709445953, "mask/share_reasoning": 0.8152344822883606, "mask/share_step_conf": 0.11589207500219345, "num_tokens": 49189345.0, "reward": 0.7753407955169678, "reward_std": 0.22338135540485382, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6771906018257141, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.5789597034454346, "step": 163 }, { "adv/mean_abs_final_conf": 0.6575444936752319, "adv/mean_abs_reasoning": 0.5587068200111389, "adv/mean_abs_step_conf": 0.7896183729171753, "adv/ratio_final_to_reasoning": 1.1769043622236839, "adv/ratio_step_to_reasoning": 1.4132964636111525, "adv/std_final_conf": 0.8603196740150452, "adv/std_reasoning": 0.826644241809845, "adv/std_step_conf": 0.9346628189086914, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6551515151515153, "calib/avg_num_step_conf": 10.12109375, "calib/ece": 0.2635294117647058, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.7142857142857143, "calib/gap": 0.12994545454545448, "calib/mean_conf": 0.8623529411764707, "calib/mu_c": 0.9103999999999999, "calib/mu_w": 0.7804545454545454, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2478151260504201, "calib/std_conf": 0.24918929351790883, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.887434456928839, "calib/step_q_c_n": 1335.0, "calib/step_q_gap": 0.04784050788425287, "calib/step_q_w": 0.8395939490445862, "calib/step_q_w_n": 1256.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 788.15625, "completions/mean_terminated_length": 840.7000732421875, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 0.17493333333333333, "grad_norm": 0.049657903611660004, "kl": 0.06340789794921875, "learning_rate": 1.0000000000000002e-06, "loss": -0.2315, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.01661180891096592, "mask/share_reasoning": 0.8164951801300049, "mask/share_step_conf": 0.10439302027225494, "num_tokens": 49497249.0, "reward": 0.7687405347824097, "reward_std": 0.2964133322238922, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6576253771781921, "rewards/format_reward_step": 0.92578125, "rewards/step_l1_reward": 0.5775119662284851, "step": 164 }, { "adv/mean_abs_final_conf": 0.5841305255889893, "adv/mean_abs_reasoning": 0.4875459671020508, "adv/mean_abs_step_conf": 0.7689133882522583, "adv/ratio_final_to_reasoning": 1.1981034917815696, "adv/ratio_step_to_reasoning": 1.5771095243031987, "adv/std_final_conf": 0.7935722470283508, "adv/std_reasoning": 0.7394719123840332, "adv/std_step_conf": 0.9343661069869995, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6993981737686773, "calib/avg_num_step_conf": 9.42578125, "calib/ece": 0.32152263374485596, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6790123456790124, "calib/gap": 0.15990453790813497, "calib/mean_conf": 0.841275720164609, "calib/mu_c": 0.9097122302158274, "calib/mu_w": 0.7498076923076924, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2953909465020576, "calib/std_conf": 0.27270859443063616, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9023426517571884, "calib/step_q_c_n": 1252.0, "calib/step_q_gap": 0.02143825899233054, "calib/step_q_w": 0.8809043927648579, "calib/step_q_w_n": 1161.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 796.234375, "completions/mean_terminated_length": 828.6016235351562, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 0.176, "grad_norm": 0.03774955868721008, "kl": 0.06166839599609375, "learning_rate": 9.722222222222224e-07, "loss": -0.1753, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017093051224946976, "mask/share_reasoning": 0.8328508734703064, "mask/share_step_conf": 0.11099356412887573, "num_tokens": 49806661.0, "reward": 0.7528131604194641, "reward_std": 0.2334972769021988, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6517425775527954, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.555446207523346, "step": 165 }, { "adv/mean_abs_final_conf": 0.5557419657707214, "adv/mean_abs_reasoning": 0.39997929334640503, "adv/mean_abs_step_conf": 0.7824414968490601, "adv/ratio_final_to_reasoning": 1.3894268403775016, "adv/ratio_step_to_reasoning": 1.9562050082713176, "adv/std_final_conf": 0.791394829750061, "adv/std_reasoning": 0.6817243695259094, "adv/std_step_conf": 0.9336012005805969, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.8174369747899161, "calib/avg_num_step_conf": 9.38671875, "calib/ece": 0.21129166666666663, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.7291666666666666, "calib/gap": 0.2625630252100841, "calib/mean_conf": 0.8571250000000001, "calib/mu_c": 0.9337058823529413, "calib/mu_w": 0.6711428571428572, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1800416666666666, "calib/std_conf": 0.26829023657536755, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8895049309664695, "calib/step_q_c_n": 1521.0, "calib/step_q_gap": 0.019992459311140798, "calib/step_q_w": 0.8695124716553287, "calib/step_q_w_n": 882.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2600.0, "completions/max_terminated_length": 2600.0, "completions/mean_length": 753.671875, "completions/mean_terminated_length": 797.272705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 414.0, "epoch": 0.17706666666666668, "grad_norm": 0.052816737443208694, "kl": 0.0668182373046875, "learning_rate": 9.444444444444445e-07, "loss": -0.168, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017924867570400238, "mask/share_reasoning": 0.81169593334198, "mask/share_step_conf": 0.11569161713123322, "num_tokens": 50105785.0, "reward": 0.8533405065536499, "reward_std": 0.2136755734682083, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.757287859916687, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.6290807127952576, "step": 166 }, { "adv/mean_abs_final_conf": 0.5000762343406677, "adv/mean_abs_reasoning": 0.4703137278556824, "adv/mean_abs_step_conf": 0.762180745601654, "adv/ratio_final_to_reasoning": 1.0632822406028473, "adv/ratio_step_to_reasoning": 1.620579414249061, "adv/std_final_conf": 0.741891086101532, "adv/std_reasoning": 0.7393703460693359, "adv/std_step_conf": 0.933522641658783, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5653275296132438, "calib/avg_num_step_conf": 9.90625, "calib/ece": 0.3084081632653061, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8122448979591836, "calib/gap": 0.062127872127872186, "calib/mean_conf": 0.921469387755102, "calib/mu_c": 0.9445454545454546, "calib/mu_w": 0.8824175824175824, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30065306122448976, "calib/std_conf": 0.17472627012288028, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.898509129213483, "calib/step_q_c_n": 1424.0, "calib/step_q_gap": 0.03991110763074923, "calib/step_q_w": 0.8585980215827338, "calib/step_q_w_n": 1112.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2898.0, "completions/max_terminated_length": 2898.0, "completions/mean_length": 779.24609375, "completions/mean_terminated_length": 807.6397094726562, "completions/min_length": 0.0, "completions/min_terminated_length": 434.0, "epoch": 0.17813333333333334, "grad_norm": 0.04024127870798111, "kl": 0.06053924560546875, "learning_rate": 9.166666666666666e-07, "loss": -0.0904, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017891276627779007, "mask/share_reasoning": 0.8272270560264587, "mask/share_step_conf": 0.11972543597221375, "num_tokens": 50410880.0, "reward": 0.7745072841644287, "reward_std": 0.21950817108154297, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6461308598518372, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.5919462442398071, "step": 167 }, { "adv/mean_abs_final_conf": 0.6446714997291565, "adv/mean_abs_reasoning": 0.5393311977386475, "adv/mean_abs_step_conf": 0.7846331000328064, "adv/ratio_final_to_reasoning": 1.1953165372820793, "adv/ratio_step_to_reasoning": 1.4548260944715994, "adv/std_final_conf": 0.8436398506164551, "adv/std_reasoning": 0.7755438685417175, "adv/std_step_conf": 0.9343622326850891, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7345307154544102, "calib/avg_num_step_conf": 9.32421875, "calib/ece": 0.2300404858299595, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7206477732793523, "calib/gap": 0.21213520749665327, "calib/mean_conf": 0.8814574898785424, "calib/mu_c": 0.9510240963855422, "calib/mu_w": 0.7388888888888889, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2197165991902834, "calib/std_conf": 0.23510765303744627, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9018106796116505, "calib/step_q_c_n": 1442.0, "calib/step_q_gap": 0.10523925104022192, "calib/step_q_w": 0.7965714285714286, "calib/step_q_w_n": 945.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 818.5703125, "completions/mean_terminated_length": 838.216064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 395.0, "epoch": 0.1792, "grad_norm": 0.04501558840274811, "kl": 0.0583648681640625, "learning_rate": 8.88888888888889e-07, "loss": -0.1051, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017265118658542633, "mask/share_reasoning": 0.8477543592453003, "mask/share_step_conf": 0.11154300719499588, "num_tokens": 50725106.0, "reward": 0.8554896712303162, "reward_std": 0.2765381336212158, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7467812299728394, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6415417790412903, "step": 168 }, { "adv/mean_abs_final_conf": 0.5372599959373474, "adv/mean_abs_reasoning": 0.444337397813797, "adv/mean_abs_step_conf": 0.762183427810669, "adv/ratio_final_to_reasoning": 1.2091262148555193, "adv/ratio_step_to_reasoning": 1.7153258572443362, "adv/std_final_conf": 0.7773349285125732, "adv/std_reasoning": 0.7015244364738464, "adv/std_step_conf": 0.9335224628448486, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6206280491994778, "calib/avg_num_step_conf": 9.359375, "calib/ece": 0.3318292682926829, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8089430894308943, "calib/gap": 0.10794681508967241, "calib/mean_conf": 0.9066260162601626, "calib/mu_c": 0.9500680272108846, "calib/mu_w": 0.8421212121212122, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32044715447154465, "calib/std_conf": 0.20670173706004993, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9029939668174963, "calib/step_q_c_n": 1326.0, "calib/step_q_gap": 0.01505938737824386, "calib/step_q_w": 0.8879345794392525, "calib/step_q_w_n": 1070.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 797.59765625, "completions/mean_terminated_length": 820.02001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 426.0, "epoch": 0.18026666666666666, "grad_norm": 0.0497799851000309, "kl": 0.06368255615234375, "learning_rate": 8.611111111111112e-07, "loss": -0.117, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017907004803419113, "mask/share_reasoning": 0.840368390083313, "mask/share_step_conf": 0.11438088119029999, "num_tokens": 51033475.0, "reward": 0.7868397831916809, "reward_std": 0.2144777625799179, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6468933820724487, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6197549104690552, "step": 169 }, { "adv/mean_abs_final_conf": 0.5735569000244141, "adv/mean_abs_reasoning": 0.4856990575790405, "adv/mean_abs_step_conf": 0.7721710205078125, "adv/ratio_final_to_reasoning": 1.180889464524184, "adv/ratio_step_to_reasoning": 1.5898137096594074, "adv/std_final_conf": 0.7941933274269104, "adv/std_reasoning": 0.7576899528503418, "adv/std_step_conf": 0.9353705644607544, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.619972115719763, "calib/avg_num_step_conf": 9.48828125, "calib/ece": 0.3219918699186991, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8577235772357723, "calib/gap": 0.09807110491460413, "calib/mean_conf": 0.9309349593495936, "calib/mu_c": 0.9688079470198673, "calib/mu_w": 0.8707368421052631, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3195528455284552, "calib/std_conf": 0.17873181253029413, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9172869101978691, "calib/step_q_c_n": 1314.0, "calib/step_q_gap": 0.03548421961490944, "calib/step_q_w": 0.8818026905829597, "calib/step_q_w_n": 1115.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2862.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 781.828125, "completions/mean_terminated_length": 807.04833984375, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.18133333333333335, "grad_norm": 0.03491868078708649, "kl": 0.06319427490234375, "learning_rate": 8.333333333333333e-07, "loss": -0.0732, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017963500693440437, "mask/share_reasoning": 0.8349390029907227, "mask/share_step_conf": 0.11584748327732086, "num_tokens": 51337775.0, "reward": 0.7697086930274963, "reward_std": 0.28596359491348267, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6465945243835449, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.5834478139877319, "step": 170 }, { "adv/mean_abs_final_conf": 0.5924351811408997, "adv/mean_abs_reasoning": 0.49389344453811646, "adv/mean_abs_step_conf": 0.75652015209198, "adv/ratio_final_to_reasoning": 1.1995202359791155, "adv/ratio_step_to_reasoning": 1.531747708859487, "adv/std_final_conf": 0.812272846698761, "adv/std_reasoning": 0.7578060626983643, "adv/std_step_conf": 0.9342533349990845, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.6203609000584454, "calib/avg_num_step_conf": 10.51953125, "calib/ece": 0.42354700854700866, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.7735042735042735, "calib/gap": 0.07948421975452946, "calib/mean_conf": 0.8855128205128205, "calib/mu_c": 0.9249152542372879, "calib/mu_w": 0.8454310344827585, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4023931623931625, "calib/std_conf": 0.2280443469911254, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9110245901639344, "calib/step_q_c_n": 1098.0, "calib/step_q_gap": 0.10283650238963982, "calib/step_q_w": 0.8081880877742946, "calib/step_q_w_n": 1595.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 750.37890625, "completions/mean_terminated_length": 800.4042358398438, "completions/min_length": 0.0, "completions/min_terminated_length": 488.0, "epoch": 0.1824, "grad_norm": 0.05241640657186508, "kl": 0.05883026123046875, "learning_rate": 8.055555555555557e-07, "loss": -0.2661, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.017459597438573837, "mask/share_reasoning": 0.8048005104064941, "mask/share_step_conf": 0.11523989588022232, "num_tokens": 51636768.0, "reward": 0.6549869775772095, "reward_std": 0.2620229721069336, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.5414218902587891, "rewards/format_reward_step": 0.91015625, "rewards/step_l1_reward": 0.49433326721191406, "step": 171 }, { "adv/mean_abs_final_conf": 0.5368980169296265, "adv/mean_abs_reasoning": 0.40605199337005615, "adv/mean_abs_step_conf": 0.7671827077865601, "adv/ratio_final_to_reasoning": 1.3222395794036248, "adv/ratio_step_to_reasoning": 1.8893706232526406, "adv/std_final_conf": 0.7779921293258667, "adv/std_reasoning": 0.6816897392272949, "adv/std_step_conf": 0.9344073534011841, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6197683397683398, "calib/avg_num_step_conf": 8.84375, "calib/ece": 0.2395943775100402, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8192771084337349, "calib/gap": 0.07640432432432442, "calib/mean_conf": 0.9343734939759037, "calib/mu_c": 0.9570800000000002, "calib/mu_w": 0.8806756756756757, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2355783132530121, "calib/std_conf": 0.14996382783483647, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8967925973197193, "calib/step_q_c_n": 1567.0, "calib/step_q_gap": 0.0013837020542958323, "calib/step_q_w": 0.8954088952654234, "calib/step_q_w_n": 697.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2237.0, "completions/max_terminated_length": 2237.0, "completions/mean_length": 747.42578125, "completions/mean_terminated_length": 756.28857421875, "completions/min_length": 0.0, "completions/min_terminated_length": 438.0, "epoch": 0.18346666666666667, "grad_norm": 0.027499012649059296, "kl": 0.06461334228515625, "learning_rate": 7.777777777777779e-07, "loss": 0.0052, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019513994455337524, "mask/share_reasoning": 0.8444157242774963, "mask/share_step_conf": 0.12435153126716614, "num_tokens": 51931461.0, "reward": 0.8563653826713562, "reward_std": 0.21522122621536255, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7265148162841797, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6549659371376038, "step": 172 }, { "adv/mean_abs_final_conf": 0.5885812044143677, "adv/mean_abs_reasoning": 0.4769488275051117, "adv/mean_abs_step_conf": 0.7240892648696899, "adv/ratio_final_to_reasoning": 1.234055249686214, "adv/ratio_step_to_reasoning": 1.5181697136301893, "adv/std_final_conf": 0.8601162433624268, "adv/std_reasoning": 0.775355339050293, "adv/std_step_conf": 0.9352255463600159, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5762152777777778, "calib/avg_num_step_conf": 9.84375, "calib/ece": 0.32524390243902435, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8211382113821138, "calib/gap": 0.06849999999999978, "calib/mean_conf": 0.9309349593495935, "calib/mu_c": 0.9576666666666666, "calib/mu_w": 0.8891666666666668, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3232113821138211, "calib/std_conf": 0.1544474146750119, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8994468704512372, "calib/step_q_c_n": 1374.0, "calib/step_q_gap": 0.0021795638776478388, "calib/step_q_w": 0.8972673065735893, "calib/step_q_w_n": 1146.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 806.3203125, "completions/mean_terminated_length": 825.6720581054688, "completions/min_length": 0.0, "completions/min_terminated_length": 360.0, "epoch": 0.18453333333333333, "grad_norm": 0.07306650280952454, "kl": 0.057220458984375, "learning_rate": 7.5e-07, "loss": -0.0718, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01831592619419098, "mask/share_reasoning": 0.8370583057403564, "mask/share_step_conf": 0.12118831276893616, "num_tokens": 52241039.0, "reward": 0.7683874368667603, "reward_std": 0.25729119777679443, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6415566205978394, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.5858431458473206, "step": 173 }, { "adv/mean_abs_final_conf": 0.6525813937187195, "adv/mean_abs_reasoning": 0.581325888633728, "adv/mean_abs_step_conf": 0.7921609282493591, "adv/ratio_final_to_reasoning": 1.1225741128654376, "adv/ratio_step_to_reasoning": 1.3626795980326114, "adv/std_final_conf": 0.843597412109375, "adv/std_reasoning": 0.792957603931427, "adv/std_step_conf": 0.9348820447921753, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.622702205882353, "calib/avg_num_step_conf": 9.7421875, "calib/ece": 0.32099999999999973, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.7291666666666666, "calib/gap": 0.13005656108597285, "calib/mean_conf": 0.8765833333333334, "calib/mu_c": 0.9329411764705883, "calib/mu_w": 0.8028846153846154, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31545833333333306, "calib/std_conf": 0.23909898868227966, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8955775316455697, "calib/step_q_c_n": 1264.0, "calib/step_q_gap": 0.04651493001955331, "calib/step_q_w": 0.8490626016260164, "calib/step_q_w_n": 1230.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 850.3828125, "completions/mean_terminated_length": 884.951171875, "completions/min_length": 0.0, "completions/min_terminated_length": 476.0, "epoch": 0.1856, "grad_norm": 0.03435070812702179, "kl": 0.05667877197265625, "learning_rate": 7.222222222222222e-07, "loss": -0.0622, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.016576357185840607, "mask/share_reasoning": 0.8326107263565063, "mask/share_step_conf": 0.11175040900707245, "num_tokens": 52562969.0, "reward": 0.7409429550170898, "reward_std": 0.2846534252166748, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6235312223434448, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.5638233423233032, "step": 174 }, { "adv/mean_abs_final_conf": 0.5845508575439453, "adv/mean_abs_reasoning": 0.4365823566913605, "adv/mean_abs_step_conf": 0.7559595704078674, "adv/ratio_final_to_reasoning": 1.3389246005586761, "adv/ratio_step_to_reasoning": 1.7315394422644728, "adv/std_final_conf": 0.8250952959060669, "adv/std_reasoning": 0.7392560243606567, "adv/std_step_conf": 0.9347420930862427, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6401459854014598, "calib/avg_num_step_conf": 9.359375, "calib/ece": 0.4161983471074381, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6776859504132231, "calib/gap": 0.12556969064998258, "calib/mean_conf": 0.82900826446281, "calib/mu_c": 0.9000952380952382, "calib/mu_w": 0.7745255474452556, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40566115702479344, "calib/std_conf": 0.2803315774580673, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.908609062170706, "calib/step_q_c_n": 949.0, "calib/step_q_gap": 0.06408936624810757, "calib/step_q_w": 0.8445196959225985, "calib/step_q_w_n": 1447.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 823.83203125, "completions/mean_terminated_length": 857.3211059570312, "completions/min_length": 0.0, "completions/min_terminated_length": 401.0, "epoch": 0.18666666666666668, "grad_norm": 0.05315734073519707, "kl": 0.0612945556640625, "learning_rate": 6.944444444444446e-07, "loss": -0.1763, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.017256654798984528, "mask/share_reasoning": 0.8306593894958496, "mask/share_step_conf": 0.11302149295806885, "num_tokens": 52879694.0, "reward": 0.6704280972480774, "reward_std": 0.21455532312393188, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.5495570302009583, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.5202054381370544, "step": 175 }, { "adv/mean_abs_final_conf": 0.6492704153060913, "adv/mean_abs_reasoning": 0.479910671710968, "adv/mean_abs_step_conf": 0.7665454745292664, "adv/ratio_final_to_reasoning": 1.352898473774974, "adv/ratio_step_to_reasoning": 1.5972669909514485, "adv/std_final_conf": 0.8582326173782349, "adv/std_reasoning": 0.7394721508026123, "adv/std_step_conf": 0.9343740940093994, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6369176598049837, "calib/avg_num_step_conf": 9.859375, "calib/ece": 0.29304878048780497, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6463414634146342, "calib/gap": 0.1483613217768146, "calib/mean_conf": 0.8304471544715448, "calib/mu_c": 0.893169014084507, "calib/mu_w": 0.7448076923076924, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27313008130081307, "calib/std_conf": 0.2756105625093126, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8961136536994663, "calib/step_q_c_n": 1311.0, "calib/step_q_gap": 0.04517883521761801, "calib/step_q_w": 0.8509348184818483, "calib/step_q_w_n": 1212.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 775.703125, "completions/mean_terminated_length": 800.7257690429688, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.18773333333333334, "grad_norm": 0.041484564542770386, "kl": 0.0646820068359375, "learning_rate": 6.666666666666667e-07, "loss": -0.1311, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018372666090726852, "mask/share_reasoning": 0.828335702419281, "mask/share_step_conf": 0.12204164266586304, "num_tokens": 53182338.0, "reward": 0.7937588691711426, "reward_std": 0.254478394985199, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6575058698654175, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6276681423187256, "step": 176 }, { "adv/mean_abs_final_conf": 0.5691676735877991, "adv/mean_abs_reasoning": 0.37612125277519226, "adv/mean_abs_step_conf": 0.7652287483215332, "adv/ratio_final_to_reasoning": 1.5132558168096677, "adv/ratio_step_to_reasoning": 2.034526745498507, "adv/std_final_conf": 0.8127015829086304, "adv/std_reasoning": 0.6817793846130371, "adv/std_step_conf": 0.9344162344932556, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7425963197239793, "calib/avg_num_step_conf": 10.34765625, "calib/ece": 0.2562396694214875, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6859504132231405, "calib/gap": 0.20173519263944817, "calib/mean_conf": 0.8595454545454545, "calib/mu_c": 0.9379054054054055, "calib/mu_w": 0.7361702127659573, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2521074380165288, "calib/std_conf": 0.23704316655393667, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.905525998492841, "calib/step_q_c_n": 1327.0, "calib/step_q_gap": 0.08445262481659288, "calib/step_q_w": 0.8210733736762481, "calib/step_q_w_n": 1322.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2601.0, "completions/max_terminated_length": 2601.0, "completions/mean_length": 766.640625, "completions/mean_terminated_length": 797.8048706054688, "completions/min_length": 0.0, "completions/min_terminated_length": 74.0, "epoch": 0.1888, "grad_norm": 0.05364028364419937, "kl": 0.06444549560546875, "learning_rate": 6.388888888888889e-07, "loss": -0.2257, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.0180435199290514, "mask/share_reasoning": 0.8248641490936279, "mask/share_step_conf": 0.1180298775434494, "num_tokens": 53482430.0, "reward": 0.7916520833969116, "reward_std": 0.22819365561008453, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7000319957733154, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.5793658494949341, "step": 177 }, { "adv/mean_abs_final_conf": 0.6649892330169678, "adv/mean_abs_reasoning": 0.45371317863464355, "adv/mean_abs_step_conf": 0.7933188676834106, "adv/ratio_final_to_reasoning": 1.4656599462641047, "adv/ratio_step_to_reasoning": 1.7485030301979336, "adv/std_final_conf": 0.8735648393630981, "adv/std_reasoning": 0.7208279371261597, "adv/std_step_conf": 0.9342332482337952, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.73158312870143, "calib/avg_num_step_conf": 9.875, "calib/ece": 0.20234817813765182, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6275303643724697, "calib/gap": 0.21397804420049116, "calib/mean_conf": 0.8513360323886638, "calib/mu_c": 0.9258385093167703, "calib/mu_w": 0.7118604651162791, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20093117408906885, "calib/std_conf": 0.21808570084302475, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9033420707732635, "calib/step_q_c_n": 1526.0, "calib/step_q_gap": 0.036481791332145685, "calib/step_q_w": 0.8668602794411178, "calib/step_q_w_n": 1002.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 775.62109375, "completions/mean_terminated_length": 797.4256591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 418.0, "epoch": 0.18986666666666666, "grad_norm": 0.05140404403209686, "kl": 0.067779541015625, "learning_rate": 6.111111111111112e-07, "loss": -0.0966, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018296916037797928, "mask/share_reasoning": 0.8330808877944946, "mask/share_step_conf": 0.12127845734357834, "num_tokens": 53787061.0, "reward": 0.8549030423164368, "reward_std": 0.24177849292755127, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.755286693572998, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6357693076133728, "step": 178 }, { "adv/mean_abs_final_conf": 0.6497317552566528, "adv/mean_abs_reasoning": 0.45641210675239563, "adv/mean_abs_step_conf": 0.7805640697479248, "adv/ratio_final_to_reasoning": 1.4235638048250403, "adv/ratio_step_to_reasoning": 1.7102177137719579, "adv/std_final_conf": 0.8614363670349121, "adv/std_reasoning": 0.7207245230674744, "adv/std_step_conf": 0.9349228143692017, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6598185117967332, "calib/avg_num_step_conf": 9.15234375, "calib/ece": 0.2562916666666666, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6333333333333333, "calib/gap": 0.17090744101633404, "calib/mean_conf": 0.8546250000000002, "calib/mu_c": 0.9222758620689656, "calib/mu_w": 0.7513684210526316, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2533749999999999, "calib/std_conf": 0.24396179900754952, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8930182481751826, "calib/step_q_c_n": 1370.0, "calib/step_q_gap": 0.08982194807240762, "calib/step_q_w": 0.803196300102775, "calib/step_q_w_n": 973.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 804.73046875, "completions/mean_terminated_length": 834.0526733398438, "completions/min_length": 0.0, "completions/min_terminated_length": 426.0, "epoch": 0.19093333333333334, "grad_norm": 0.05745590478181839, "kl": 0.06463623046875, "learning_rate": 5.833333333333334e-07, "loss": -0.0987, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017795901745557785, "mask/share_reasoning": 0.8279945254325867, "mask/share_step_conf": 0.11905330419540405, "num_tokens": 54099336.0, "reward": 0.7860234975814819, "reward_std": 0.24726131558418274, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.675326943397522, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.5959386825561523, "step": 179 }, { "adv/mean_abs_final_conf": 0.6072914004325867, "adv/mean_abs_reasoning": 0.37272268533706665, "adv/mean_abs_step_conf": 0.7750673294067383, "adv/ratio_final_to_reasoning": 1.6293384447029056, "adv/ratio_step_to_reasoning": 2.0794745259624237, "adv/std_final_conf": 0.8151358366012573, "adv/std_reasoning": 0.6614524722099304, "adv/std_step_conf": 0.9343709349632263, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6556508967223253, "calib/avg_num_step_conf": 9.953125, "calib/ece": 0.18306122448979573, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5714285714285714, "calib/gap": 0.18631493506493502, "calib/mean_conf": 0.8098367346938776, "calib/mu_c": 0.8683928571428572, "calib/mu_w": 0.6820779220779222, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15359183673469368, "calib/std_conf": 0.2748758047524688, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8753670647391578, "calib/step_q_c_n": 1591.0, "calib/step_q_gap": -0.004841921676725036, "calib/step_q_w": 0.8802089864158829, "calib/step_q_w_n": 957.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 868.38671875, "completions/mean_terminated_length": 892.7991333007812, "completions/min_length": 0.0, "completions/min_terminated_length": 425.0, "epoch": 0.192, "grad_norm": 0.044392045587301254, "kl": 0.0648193359375, "learning_rate": 5.555555555555555e-07, "loss": -0.046, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.016595961526036263, "mask/share_reasoning": 0.845601499080658, "mask/share_step_conf": 0.11045877635478973, "num_tokens": 54425499.0, "reward": 0.8599202632904053, "reward_std": 0.2037505954504013, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7405816316604614, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.656602680683136, "step": 180 }, { "adv/mean_abs_final_conf": 0.684099018573761, "adv/mean_abs_reasoning": 0.6443017721176147, "adv/mean_abs_step_conf": 0.7614454627037048, "adv/ratio_final_to_reasoning": 1.061768022653958, "adv/ratio_step_to_reasoning": 1.1818149439525458, "adv/std_final_conf": 0.876131534576416, "adv/std_reasoning": 0.8590683937072754, "adv/std_step_conf": 0.9356979131698608, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6834362139917696, "calib/avg_num_step_conf": 9.94921875, "calib/ece": 0.2602880658436213, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5925925925925926, "calib/gap": 0.18396296296296322, "calib/mean_conf": 0.8064609053497943, "calib/mu_c": 0.8882222222222224, "calib/mu_w": 0.7042592592592591, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25559670781892996, "calib/std_conf": 0.2715491026283736, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9053762711864407, "calib/step_q_c_n": 1180.0, "calib/step_q_gap": 0.04352550308110048, "calib/step_q_w": 0.8618507681053402, "calib/step_q_w_n": 1367.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 743.5078125, "completions/mean_terminated_length": 776.8897705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 426.0, "epoch": 0.19306666666666666, "grad_norm": 0.07317768782377243, "kl": 0.07617950439453125, "learning_rate": 5.277777777777779e-07, "loss": -0.1169, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01819986291229725, "mask/share_reasoning": 0.8194365501403809, "mask/share_step_conf": 0.11939479410648346, "num_tokens": 54722101.0, "reward": 0.7572405934333801, "reward_std": 0.29411470890045166, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6693332195281982, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.5506167411804199, "step": 181 }, { "adv/mean_abs_final_conf": 0.6801034212112427, "adv/mean_abs_reasoning": 0.4527340531349182, "adv/mean_abs_step_conf": 0.7628204822540283, "adv/ratio_final_to_reasoning": 1.5022139741906417, "adv/ratio_step_to_reasoning": 1.684919605609393, "adv/std_final_conf": 0.8764859437942505, "adv/std_reasoning": 0.739275336265564, "adv/std_step_conf": 0.9347026348114014, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7248870581982461, "calib/avg_num_step_conf": 9.30078125, "calib/ece": 0.25870967741935486, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.592741935483871, "calib/gap": 0.22101249003454704, "calib/mean_conf": 0.8058870967741936, "calib/mu_c": 0.9003521126760565, "calib/mu_w": 0.6793396226415095, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24600806451612905, "calib/std_conf": 0.278820997314019, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9006148867313915, "calib/step_q_c_n": 1236.0, "calib/step_q_gap": 0.06895986489733019, "calib/step_q_w": 0.8316550218340613, "calib/step_q_w_n": 1145.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 770.171875, "completions/mean_terminated_length": 785.5139770507812, "completions/min_length": 0.0, "completions/min_terminated_length": 413.0, "epoch": 0.19413333333333332, "grad_norm": 0.05326711758971214, "kl": 0.0720672607421875, "learning_rate": 5.000000000000001e-07, "loss": -0.0435, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018472693860530853, "mask/share_reasoning": 0.8421061038970947, "mask/share_step_conf": 0.11988990753889084, "num_tokens": 55025425.0, "reward": 0.784070611000061, "reward_std": 0.24252068996429443, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7084202766418457, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.5550333261489868, "step": 182 }, { "adv/mean_abs_final_conf": 0.6686722040176392, "adv/mean_abs_reasoning": 0.522902250289917, "adv/mean_abs_step_conf": 0.7876584529876709, "adv/ratio_final_to_reasoning": 1.2787709436073411, "adv/ratio_step_to_reasoning": 1.5063206412880474, "adv/std_final_conf": 0.873296320438385, "adv/std_reasoning": 0.7756194472312927, "adv/std_step_conf": 0.9346939921379089, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6828478964401294, "calib/avg_num_step_conf": 9.8125, "calib/ece": 0.21331950207468878, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5020746887966805, "calib/gap": 0.21650415083720298, "calib/mean_conf": 0.7426141078838175, "calib/mu_c": 0.835144927536232, "calib/mu_w": 0.6186407766990291, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1916597510373444, "calib/std_conf": 0.31346843708064276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9056351236146633, "calib/step_q_c_n": 1173.0, "calib/step_q_gap": 0.06097492943990601, "calib/step_q_w": 0.8446601941747572, "calib/step_q_w_n": 1339.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2101.0, "completions/max_terminated_length": 2101.0, "completions/mean_length": 807.72265625, "completions/mean_terminated_length": 850.9341430664062, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.1952, "grad_norm": 0.04934949427843094, "kl": 0.0679779052734375, "learning_rate": 4.7222222222222226e-07, "loss": -0.2123, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.016763310879468918, "mask/share_reasoning": 0.8293523788452148, "mask/share_step_conf": 0.10310307145118713, "num_tokens": 55338882.0, "reward": 0.7845203280448914, "reward_std": 0.2688310742378235, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6910668611526489, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.5818799734115601, "step": 183 }, { "adv/mean_abs_final_conf": 0.6820645332336426, "adv/mean_abs_reasoning": 0.4836152195930481, "adv/mean_abs_step_conf": 0.7581703662872314, "adv/ratio_final_to_reasoning": 1.4103454680512026, "adv/ratio_step_to_reasoning": 1.5677140329148773, "adv/std_final_conf": 0.8702086806297302, "adv/std_reasoning": 0.7394979596138, "adv/std_step_conf": 0.9349130988121033, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.736623475609756, "calib/avg_num_step_conf": 9.62109375, "calib/ece": 0.17921487603305797, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6074380165289256, "calib/gap": 0.24499542682926845, "calib/mean_conf": 0.829297520661157, "calib/mu_c": 0.9123125000000002, "calib/mu_w": 0.6673170731707317, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17367768595041333, "calib/std_conf": 0.2624517317173887, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9109391553328561, "calib/step_q_c_n": 1397.0, "calib/step_q_gap": 0.1102127638381718, "calib/step_q_w": 0.8007263914946843, "calib/step_q_w_n": 1066.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3012.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 809.125, "completions/mean_terminated_length": 838.6072998046875, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.19626666666666667, "grad_norm": 0.0646803006529808, "kl": 0.06688690185546875, "learning_rate": 4.444444444444445e-07, "loss": -0.1008, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.017484432086348534, "mask/share_reasoning": 0.8337000608444214, "mask/share_step_conf": 0.11365921050310135, "num_tokens": 55651298.0, "reward": 0.8425024151802063, "reward_std": 0.24076125025749207, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7454652786254883, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.6254770755767822, "step": 184 }, { "adv/mean_abs_final_conf": 0.6692211627960205, "adv/mean_abs_reasoning": 0.5714598298072815, "adv/mean_abs_step_conf": 0.7615892887115479, "adv/ratio_final_to_reasoning": 1.1710729746685922, "adv/ratio_step_to_reasoning": 1.3327083532159827, "adv/std_final_conf": 0.8775772452354431, "adv/std_reasoning": 0.8268687129020691, "adv/std_step_conf": 0.9354509711265564, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.698912288822032, "calib/avg_num_step_conf": 10.3828125, "calib/ece": 0.24258474576271183, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.1505754840700454, "calib/mean_conf": 0.7552966101694916, "calib/mu_c": 0.8108053691275169, "calib/mu_w": 0.6602298850574715, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18326271186440674, "calib/std_conf": 0.29865006474961586, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8650375375375375, "calib/step_q_c_n": 1332.0, "calib/step_q_gap": 0.04064085578791454, "calib/step_q_w": 0.824396681749623, "calib/step_q_w_n": 1326.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 782.56640625, "completions/mean_terminated_length": 834.737548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 415.0, "epoch": 0.19733333333333333, "grad_norm": 0.06889063119888306, "kl": 0.06777191162109375, "learning_rate": 4.1666666666666667e-07, "loss": -0.2824, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.016981428489089012, "mask/share_reasoning": 0.8096021413803101, "mask/share_step_conf": 0.1109163910150528, "num_tokens": 55958555.0, "reward": 0.7719869017601013, "reward_std": 0.2715288996696472, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6732874512672424, "rewards/format_reward_step": 0.91796875, "rewards/step_l1_reward": 0.5706863403320312, "step": 185 }, { "adv/mean_abs_final_conf": 0.6912783980369568, "adv/mean_abs_reasoning": 0.5170702338218689, "adv/mean_abs_step_conf": 0.7700475454330444, "adv/ratio_final_to_reasoning": 1.336913929327254, "adv/ratio_step_to_reasoning": 1.4892513532278218, "adv/std_final_conf": 0.9046430587768555, "adv/std_reasoning": 0.7755802273750305, "adv/std_step_conf": 0.9337618350982666, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7176522446765223, "calib/avg_num_step_conf": 9.65625, "calib/ece": 0.25356275303643727, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5101214574898786, "calib/gap": 0.22800217008002177, "calib/mean_conf": 0.7499190283400811, "calib/mu_c": 0.8431506849315069, "calib/mu_w": 0.6151485148514851, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20619433198380568, "calib/std_conf": 0.3146426440322263, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8626199750312111, "calib/step_q_c_n": 1335.0, "calib/step_q_gap": 0.03157922451816508, "calib/step_q_w": 0.831040750513046, "calib/step_q_w_n": 1137.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2869.0, "completions/max_terminated_length": 2869.0, "completions/mean_length": 793.97265625, "completions/mean_terminated_length": 816.2931518554688, "completions/min_length": 0.0, "completions/min_terminated_length": 371.0, "epoch": 0.1984, "grad_norm": 0.054222866892814636, "kl": 0.0751190185546875, "learning_rate": 3.8888888888888895e-07, "loss": -0.1349, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018149716779589653, "mask/share_reasoning": 0.8339352607727051, "mask/share_step_conf": 0.12057129293680191, "num_tokens": 56266852.0, "reward": 0.8087828755378723, "reward_std": 0.257120281457901, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7117167711257935, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6003800630569458, "step": 186 }, { "adv/mean_abs_final_conf": 0.7086143493652344, "adv/mean_abs_reasoning": 0.4930305480957031, "adv/mean_abs_step_conf": 0.775668203830719, "adv/ratio_final_to_reasoning": 1.4372625633486789, "adv/ratio_step_to_reasoning": 1.5732660112576888, "adv/std_final_conf": 0.8794147372245789, "adv/std_reasoning": 0.7395173907279968, "adv/std_step_conf": 0.9348717331886292, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6558219178082191, "calib/avg_num_step_conf": 10.26953125, "calib/ece": 0.2400819672131147, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4713114754098361, "calib/gap": 0.14184232597148483, "calib/mean_conf": 0.7438524590163934, "calib/mu_c": 0.8008219178082193, "calib/mu_w": 0.6589795918367345, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1927868852459016, "calib/std_conf": 0.28849509676008184, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8616146230007617, "calib/step_q_c_n": 1313.0, "calib/step_q_gap": 0.14515945582750944, "calib/step_q_w": 0.7164551671732523, "calib/step_q_w_n": 1316.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 860.22265625, "completions/mean_terminated_length": 877.3585815429688, "completions/min_length": 0.0, "completions/min_terminated_length": 451.0, "epoch": 0.19946666666666665, "grad_norm": 0.05153043195605278, "kl": 0.0677947998046875, "learning_rate": 3.611111111111111e-07, "loss": -0.0226, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.0167331974953413, "mask/share_reasoning": 0.8521455526351929, "mask/share_step_conf": 0.11158999800682068, "num_tokens": 56588613.0, "reward": 0.7962475419044495, "reward_std": 0.2418176233768463, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6870421171188354, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.6015465259552002, "step": 187 }, { "adv/mean_abs_final_conf": 0.728238046169281, "adv/mean_abs_reasoning": 0.46052050590515137, "adv/mean_abs_step_conf": 0.7711305022239685, "adv/ratio_final_to_reasoning": 1.5813368500017861, "adv/ratio_step_to_reasoning": 1.6744759295969118, "adv/std_final_conf": 0.9081566333770752, "adv/std_reasoning": 0.7394147515296936, "adv/std_step_conf": 0.9348183274269104, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6867738399707709, "calib/avg_num_step_conf": 9.64453125, "calib/ece": 0.1543089430894308, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.483739837398374, "calib/gap": 0.2231348191450493, "calib/mean_conf": 0.7454471544715447, "calib/mu_c": 0.8225465838509317, "calib/mu_w": 0.5994117647058824, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12264227642276412, "calib/std_conf": 0.30079367109898403, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8945784381478922, "calib/step_q_c_n": 1447.0, "calib/step_q_gap": 0.05269879039838132, "calib/step_q_w": 0.8418796477495109, "calib/step_q_w_n": 1022.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2262.0, "completions/max_terminated_length": 2262.0, "completions/mean_length": 820.75, "completions/mean_terminated_length": 843.8232421875, "completions/min_length": 0.0, "completions/min_terminated_length": 342.0, "epoch": 0.20053333333333334, "grad_norm": 0.10698122531175613, "kl": 0.06858062744140625, "learning_rate": 3.3333333333333335e-07, "loss": -0.1138, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017628012225031853, "mask/share_reasoning": 0.8367440104484558, "mask/share_step_conf": 0.11828421801328659, "num_tokens": 56902797.0, "reward": 0.8345286846160889, "reward_std": 0.2370331734418869, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7457132935523987, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.605375349521637, "step": 188 }, { "adv/mean_abs_final_conf": 0.6068315505981445, "adv/mean_abs_reasoning": 0.39968180656433105, "adv/mean_abs_step_conf": 0.7506299018859863, "adv/ratio_final_to_reasoning": 1.5182866486079885, "adv/ratio_step_to_reasoning": 1.8780687275670833, "adv/std_final_conf": 0.8442689776420593, "adv/std_reasoning": 0.7012975215911865, "adv/std_step_conf": 0.9340417981147766, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7970951343500363, "calib/avg_num_step_conf": 9.78515625, "calib/ece": 0.10344129554655862, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4048582995951417, "calib/gap": 0.3640530137981117, "calib/mean_conf": 0.6695951417004048, "calib/mu_c": 0.7948765432098766, "calib/mu_w": 0.4308235294117648, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05858299595141691, "calib/std_conf": 0.32990956627493656, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8672979166666668, "calib/step_q_c_n": 1440.0, "calib/step_q_gap": 0.03278617957746488, "calib/step_q_w": 0.8345117370892019, "calib/step_q_w_n": 1065.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2958.0, "completions/max_terminated_length": 2958.0, "completions/mean_length": 811.11328125, "completions/mean_terminated_length": 823.9881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.2016, "grad_norm": 0.07192602753639221, "kl": 0.074310302734375, "learning_rate": 3.055555555555556e-07, "loss": -0.0406, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018070422112941742, "mask/share_reasoning": 0.8476791381835938, "mask/share_step_conf": 0.1186254620552063, "num_tokens": 57218210.0, "reward": 0.8747564554214478, "reward_std": 0.18241506814956665, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.800437867641449, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6295437812805176, "step": 189 }, { "adv/mean_abs_final_conf": 0.6713325381278992, "adv/mean_abs_reasoning": 0.45265400409698486, "adv/mean_abs_step_conf": 0.7748057842254639, "adv/ratio_final_to_reasoning": 1.4831030589625815, "adv/ratio_step_to_reasoning": 1.7116954168364218, "adv/std_final_conf": 0.8905377388000488, "adv/std_reasoning": 0.7206981778144836, "adv/std_step_conf": 0.934777557849884, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6814162422184493, "calib/avg_num_step_conf": 9.87890625, "calib/ece": 0.16971428571428568, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4448979591836735, "calib/gap": 0.20692062818336154, "calib/mean_conf": 0.7190204081632652, "calib/mu_c": 0.7975657894736843, "calib/mu_w": 0.5906451612903227, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1341632653061224, "calib/std_conf": 0.311720471969249, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8788965093411997, "calib/step_q_c_n": 1356.0, "calib/step_q_gap": 0.049893951796442626, "calib/step_q_w": 0.829002557544757, "calib/step_q_w_n": 1173.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 827.65625, "completions/mean_terminated_length": 850.9236450195312, "completions/min_length": 0.0, "completions/min_terminated_length": 415.0, "epoch": 0.20266666666666666, "grad_norm": 0.04944543167948723, "kl": 0.07180023193359375, "learning_rate": 2.7777777777777776e-07, "loss": -0.1137, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017103388905525208, "mask/share_reasoning": 0.8414227962493896, "mask/share_step_conf": 0.11413010209798813, "num_tokens": 57535698.0, "reward": 0.8249231576919556, "reward_std": 0.22116057574748993, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7226202487945557, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6170697212219238, "step": 190 }, { "adv/mean_abs_final_conf": 0.690582811832428, "adv/mean_abs_reasoning": 0.426123708486557, "adv/mean_abs_step_conf": 0.7383967638015747, "adv/ratio_final_to_reasoning": 1.6206157932050709, "adv/ratio_step_to_reasoning": 1.7328225327431388, "adv/std_final_conf": 0.8894121646881104, "adv/std_reasoning": 0.7014713287353516, "adv/std_step_conf": 0.935016930103302, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7255667164381703, "calib/avg_num_step_conf": 10.13671875, "calib/ece": 0.2516326530612245, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6081632653061224, "calib/gap": 0.25055314239174675, "calib/mean_conf": 0.7822448979591836, "calib/mu_c": 0.8906474820143884, "calib/mu_w": 0.6400943396226416, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.233265306122449, "calib/std_conf": 0.3042962124689507, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9067540983606558, "calib/step_q_c_n": 1220.0, "calib/step_q_gap": 0.06829591654247391, "calib/step_q_w": 0.8384581818181819, "calib/step_q_w_n": 1375.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 755.5546875, "completions/mean_terminated_length": 789.4775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 409.0, "epoch": 0.20373333333333332, "grad_norm": 0.0591316819190979, "kl": 0.071075439453125, "learning_rate": 2.5000000000000004e-07, "loss": -0.2139, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01827659085392952, "mask/share_reasoning": 0.8171859979629517, "mask/share_step_conf": 0.1215687245130539, "num_tokens": 57833288.0, "reward": 0.7744136452674866, "reward_std": 0.23828265070915222, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7037370800971985, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.545871376991272, "step": 191 }, { "adv/mean_abs_final_conf": 0.7076407074928284, "adv/mean_abs_reasoning": 0.43131551146507263, "adv/mean_abs_step_conf": 0.7656826376914978, "adv/ratio_final_to_reasoning": 1.6406567551654867, "adv/ratio_step_to_reasoning": 1.775226295689349, "adv/std_final_conf": 0.890299916267395, "adv/std_reasoning": 0.7208864688873291, "adv/std_step_conf": 0.9349159002304077, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7080642557313529, "calib/avg_num_step_conf": 9.7890625, "calib/ece": 0.14920502092050214, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5481171548117155, "calib/gap": 0.21411365837907648, "calib/mean_conf": 0.7823430962343098, "calib/mu_c": 0.8504294478527608, "calib/mu_w": 0.6363157894736843, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1247698744769875, "calib/std_conf": 0.28081089890896666, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8886009421265142, "calib/step_q_c_n": 1486.0, "calib/step_q_gap": 0.15244407938141602, "calib/step_q_w": 0.7361568627450982, "calib/step_q_w_n": 1020.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 780.51171875, "completions/mean_terminated_length": 818.8975219726562, "completions/min_length": 0.0, "completions/min_terminated_length": 375.0, "epoch": 0.2048, "grad_norm": 0.04957636818289757, "kl": 0.07782745361328125, "learning_rate": 2.2222222222222224e-07, "loss": -0.2414, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.018003789708018303, "mask/share_reasoning": 0.8114763498306274, "mask/share_step_conf": 0.1236448884010315, "num_tokens": 58138075.0, "reward": 0.8275803923606873, "reward_std": 0.2583814859390259, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7309035062789917, "rewards/format_reward_step": 0.9296875, "rewards/step_l1_reward": 0.6109760403633118, "step": 192 }, { "adv/mean_abs_final_conf": 0.7112095355987549, "adv/mean_abs_reasoning": 0.4996627867221832, "adv/mean_abs_step_conf": 0.7590423226356506, "adv/ratio_final_to_reasoning": 1.4233790358179974, "adv/ratio_step_to_reasoning": 1.5191091728383699, "adv/std_final_conf": 0.8913341760635376, "adv/std_reasoning": 0.7754184007644653, "adv/std_step_conf": 0.9352988004684448, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6429493713897384, "calib/avg_num_step_conf": 9.7421875, "calib/ece": 0.23543032786885248, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4918032786885246, "calib/gap": 0.16506931702344563, "calib/mean_conf": 0.7448155737704918, "calib/mu_c": 0.8185555555555556, "calib/mu_w": 0.65348623853211, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21348360655737708, "calib/std_conf": 0.30274540716703924, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8925401459854014, "calib/step_q_c_n": 1233.0, "calib/step_q_gap": 0.09319438864995322, "calib/step_q_w": 0.7993457573354482, "calib/step_q_w_n": 1261.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 782.25, "completions/mean_terminated_length": 817.3713989257812, "completions/min_length": 0.0, "completions/min_terminated_length": 419.0, "epoch": 0.20586666666666667, "grad_norm": 0.07504615932703018, "kl": 0.072113037109375, "learning_rate": 1.9444444444444447e-07, "loss": -0.1529, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017368696630001068, "mask/share_reasoning": 0.827225923538208, "mask/share_step_conf": 0.11243665218353271, "num_tokens": 58444043.0, "reward": 0.7569782137870789, "reward_std": 0.22785191237926483, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6729967594146729, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.5448658466339111, "step": 193 }, { "adv/mean_abs_final_conf": 0.5718756914138794, "adv/mean_abs_reasoning": 0.4823354482650757, "adv/mean_abs_step_conf": 0.7542473673820496, "adv/ratio_final_to_reasoning": 1.185638943749362, "adv/ratio_step_to_reasoning": 1.563740276803251, "adv/std_final_conf": 0.831807017326355, "adv/std_reasoning": 0.7577318549156189, "adv/std_step_conf": 0.9345336556434631, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7718676122931442, "calib/avg_num_step_conf": 9.95703125, "calib/ece": 0.1869166666666666, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6125, "calib/gap": 0.3344895766172362, "calib/mean_conf": 0.7669166666666667, "calib/mu_c": 0.9048936170212766, "calib/mu_w": 0.5704040404040404, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18316666666666662, "calib/std_conf": 0.319433863351329, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8862143432715551, "calib/step_q_c_n": 1241.0, "calib/step_q_gap": 0.04246816590152458, "calib/step_q_w": 0.8437461773700305, "calib/step_q_w_n": 1308.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 753.66796875, "completions/mean_terminated_length": 793.9876098632812, "completions/min_length": 0.0, "completions/min_terminated_length": 440.0, "epoch": 0.20693333333333333, "grad_norm": 0.051577258855104446, "kl": 0.0688323974609375, "learning_rate": 1.6666666666666668e-07, "loss": -0.2472, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.018072867766022682, "mask/share_reasoning": 0.816690981388092, "mask/share_step_conf": 0.11445486545562744, "num_tokens": 58742926.0, "reward": 0.8116989135742188, "reward_std": 0.2461298704147339, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7364538908004761, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.5892876386642456, "step": 194 }, { "adv/mean_abs_final_conf": 0.7048993110656738, "adv/mean_abs_reasoning": 0.4602048397064209, "adv/mean_abs_step_conf": 0.7671737670898438, "adv/ratio_final_to_reasoning": 1.5317077315296188, "adv/ratio_step_to_reasoning": 1.6670267257059876, "adv/std_final_conf": 0.8785596489906311, "adv/std_reasoning": 0.7207231521606445, "adv/std_step_conf": 0.934549868106842, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6614285714285715, "calib/avg_num_step_conf": 9.59375, "calib/ece": 0.25583673469387763, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5306122448979592, "calib/gap": 0.16304761904761922, "calib/mean_conf": 0.7369795918367347, "calib/mu_c": 0.8068571428571429, "calib/mu_w": 0.6438095238095237, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21069387755102048, "calib/std_conf": 0.32297699986322514, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8810873142250532, "calib/step_q_c_n": 1256.0, "calib/step_q_gap": 0.02071314755838638, "calib/step_q_w": 0.8603741666666668, "calib/step_q_w_n": 1200.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1819.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 786.08984375, "completions/mean_terminated_length": 814.7327880859375, "completions/min_length": 0.0, "completions/min_terminated_length": 420.0, "epoch": 0.208, "grad_norm": 0.0542498379945755, "kl": 0.07525634765625, "learning_rate": 1.3888888888888888e-07, "loss": -0.179, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01755117066204548, "mask/share_reasoning": 0.8345537185668945, "mask/share_step_conf": 0.11273886263370514, "num_tokens": 59050149.0, "reward": 0.7701911926269531, "reward_std": 0.21878653764724731, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6730234622955322, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.5665776133537292, "step": 195 }, { "adv/mean_abs_final_conf": 0.4969114065170288, "adv/mean_abs_reasoning": 0.30748242139816284, "adv/mean_abs_step_conf": 0.7540386915206909, "adv/ratio_final_to_reasoning": 1.6160644379522822, "adv/ratio_step_to_reasoning": 2.4522985349600743, "adv/std_final_conf": 0.7552186846733093, "adv/std_reasoning": 0.6184966564178467, "adv/std_step_conf": 0.932518482208252, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7099794941900205, "calib/avg_num_step_conf": 9.1015625, "calib/ece": 0.2502008032128514, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7028112449799196, "calib/gap": 0.20612098427887915, "calib/mean_conf": 0.8511646586345383, "calib/mu_c": 0.9298051948051947, "calib/mu_w": 0.7236842105263156, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24144578313253012, "calib/std_conf": 0.25713092823111716, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.904220788530466, "calib/step_q_c_n": 1395.0, "calib/step_q_gap": 0.06034913077645532, "calib/step_q_w": 0.8438716577540106, "calib/step_q_w_n": 935.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2028.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 727.0546875, "completions/mean_terminated_length": 747.4939575195312, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.20906666666666668, "grad_norm": 0.04341113567352295, "kl": 0.06845855712890625, "learning_rate": 1.1111111111111112e-07, "loss": -0.0891, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019082296639680862, "mask/share_reasoning": 0.8294082283973694, "mask/share_step_conf": 0.12416577339172363, "num_tokens": 59338819.0, "reward": 0.8234367370605469, "reward_std": 0.16358217597007751, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.720785915851593, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6112438440322876, "step": 196 }, { "adv/mean_abs_final_conf": 0.6498538255691528, "adv/mean_abs_reasoning": 0.533227801322937, "adv/mean_abs_step_conf": 0.7692816257476807, "adv/ratio_final_to_reasoning": 1.218717073560056, "adv/ratio_step_to_reasoning": 1.4426885166885421, "adv/std_final_conf": 0.8588353395462036, "adv/std_reasoning": 0.7929235696792603, "adv/std_step_conf": 0.9352068901062012, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7199796126401631, "calib/avg_num_step_conf": 9.53515625, "calib/ece": 0.22280737704918027, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4385245901639344, "calib/gap": 0.20918008834522617, "calib/mean_conf": 0.7551844262295082, "calib/mu_c": 0.8486296296296298, "calib/mu_w": 0.6394495412844037, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21235655737704914, "calib/std_conf": 0.2742819739481024, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8934718826405867, "calib/step_q_c_n": 1227.0, "calib/step_q_gap": 0.08670499631439232, "calib/step_q_w": 0.8067668863261944, "calib/step_q_w_n": 1214.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1723.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 766.0390625, "completions/mean_terminated_length": 803.7130737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 468.0, "epoch": 0.21013333333333334, "grad_norm": 0.0639037936925888, "kl": 0.075775146484375, "learning_rate": 8.333333333333334e-08, "loss": -0.2186, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017740493640303612, "mask/share_reasoning": 0.8179367780685425, "mask/share_step_conf": 0.11744774132966995, "num_tokens": 59639981.0, "reward": 0.7840213775634766, "reward_std": 0.22477491199970245, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7055456042289734, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.5664034485816956, "step": 197 }, { "adv/mean_abs_final_conf": 0.6817752718925476, "adv/mean_abs_reasoning": 0.5082120895385742, "adv/mean_abs_step_conf": 0.7704319953918457, "adv/ratio_final_to_reasoning": 1.3415172246523261, "adv/ratio_step_to_reasoning": 1.5159655019056144, "adv/std_final_conf": 0.8869988322257996, "adv/std_reasoning": 0.7577569484710693, "adv/std_step_conf": 0.9344480037689209, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.78292589763178, "calib/avg_num_step_conf": 9.609375, "calib/ece": 0.15046025104602515, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.5271966527196653, "calib/gap": 0.29537280366692154, "calib/mean_conf": 0.7789121338912134, "calib/mu_c": 0.8839610389610392, "calib/mu_w": 0.5885882352941176, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14251046025104605, "calib/std_conf": 0.2863236688183009, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8935914985590778, "calib/step_q_c_n": 1388.0, "calib/step_q_gap": 0.04145530452922708, "calib/step_q_w": 0.8521361940298507, "calib/step_q_w_n": 1072.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2894.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 734.08203125, "completions/mean_terminated_length": 783.0208740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.2112, "grad_norm": 0.06836410611867905, "kl": 0.0736541748046875, "learning_rate": 5.555555555555556e-08, "loss": -0.2358, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.018041130155324936, "mask/share_reasoning": 0.7997227311134338, "mask/share_step_conf": 0.11973617970943451, "num_tokens": 59933290.0, "reward": 0.8341923952102661, "reward_std": 0.22902759909629822, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7511105537414551, "rewards/format_reward_step": 0.92578125, "rewards/step_l1_reward": 0.6118054389953613, "step": 198 }, { "adv/mean_abs_final_conf": 0.6972668170928955, "adv/mean_abs_reasoning": 0.6235437393188477, "adv/mean_abs_step_conf": 0.7461246252059937, "adv/ratio_final_to_reasoning": 1.118232407969619, "adv/ratio_step_to_reasoning": 1.1965874695832117, "adv/std_final_conf": 0.8916330337524414, "adv/std_reasoning": 0.8591709136962891, "adv/std_step_conf": 0.9358724355697632, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6319540229885057, "calib/avg_num_step_conf": 9.6484375, "calib/ece": 0.2516595744680851, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.6, "calib/gap": 0.13508045977011496, "calib/mean_conf": 0.7966808510638297, "calib/mu_c": 0.8484137931034483, "calib/mu_w": 0.7133333333333334, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2156595744680851, "calib/std_conf": 0.28745553634953547, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8878342455043002, "calib/step_q_c_n": 1279.0, "calib/step_q_gap": 0.04233886347239424, "calib/step_q_w": 0.845495382031906, "calib/step_q_w_n": 1191.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 781.07421875, "completions/mean_terminated_length": 833.1458740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 393.0, "epoch": 0.21226666666666666, "grad_norm": 0.05004898086190224, "kl": 0.06656646728515625, "learning_rate": 2.777777777777778e-08, "loss": -0.2956, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.017658531665802002, "mask/share_reasoning": 0.8055081367492676, "mask/share_step_conf": 0.11433329433202744, "num_tokens": 60237445.0, "reward": 0.7555822134017944, "reward_std": 0.3044376075267792, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6541687250137329, "rewards/format_reward_step": 0.91796875, "rewards/step_l1_reward": 0.560120701789856, "step": 199 }, { "adv/mean_abs_final_conf": 0.6082898378372192, "adv/mean_abs_reasoning": 0.4397979974746704, "adv/mean_abs_step_conf": 0.7207082509994507, "adv/ratio_final_to_reasoning": 1.3831118862069236, "adv/ratio_step_to_reasoning": 1.6387256311710672, "adv/std_final_conf": 0.8417679071426392, "adv/std_reasoning": 0.7392775416374207, "adv/std_step_conf": 0.9338216781616211, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7269190483315876, "calib/avg_num_step_conf": 9.04296875, "calib/ece": 0.1732244897959183, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5959183673469388, "calib/gap": 0.2493947329043843, "calib/mean_conf": 0.7872653061224489, "calib/mu_c": 0.8707361963190184, "calib/mu_w": 0.6213414634146341, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1475918367346938, "calib/std_conf": 0.2898772981452813, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8987449617790131, "calib/step_q_c_n": 1439.0, "calib/step_q_gap": 0.036130806527871484, "calib/step_q_w": 0.8626141552511416, "calib/step_q_w_n": 876.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3028.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 806.62109375, "completions/mean_terminated_length": 836.0121459960938, "completions/min_length": 0.0, "completions/min_terminated_length": 469.0, "epoch": 0.21333333333333335, "grad_norm": 0.045651182532310486, "kl": 0.06322479248046875, "learning_rate": 0.0, "loss": -0.1477, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01771700009703636, "mask/share_reasoning": 0.8353173732757568, "mask/share_step_conf": 0.11180936545133591, "num_tokens": 60551988.0, "reward": 0.8539600372314453, "reward_std": 0.1936844438314438, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.755567193031311, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6336028575897217, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.1237042972794734, "train_runtime": 21618.204, "train_samples_per_second": 2.368, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 60551988, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }