{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.47760647535324097, "adv/mean_abs_reasoning": 0.4569147527217865, "adv/mean_abs_step_conf": 0.6562294363975525, "adv/ratio_final_to_reasoning": 1.0452857398632815, "adv/ratio_step_to_reasoning": 1.4362185341761724, "adv/std_final_conf": 0.7227410674095154, "adv/std_reasoning": 0.7206857204437256, "adv/std_step_conf": 0.85791015625, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5086206896551725, "calib/avg_num_step_conf": 7.875, "calib/ece": 0.2888991935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001713264989126051, "calib/mean_conf": 0.9905120967741936, "calib/mu_c": 0.9905632183908043, "calib/mu_w": 0.9903918918918917, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2888991935483871, "calib/std_conf": 0.0021794159006610276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9119477557027226, "calib/step_q_c_n": 1359.0, "calib/step_q_gap": 0.0056311651395566376, "calib/step_q_w": 0.9063165905631659, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 755.49609375, "completions/mean_terminated_length": 776.7349243164062, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.0010666666666666667, "grad_norm": 0.02375916764140129, "kl": 0.0005849599838256836, "learning_rate": 2.5000000000000004e-07, "loss": -0.0868, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018737709149718285, "mask/share_reasoning": 0.845859944820404, "mask/share_step_conf": 0.10805858671665192, "num_tokens": 300991.0, "reward": 0.5539767742156982, "reward_std": 0.18287032842636108, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6851503849029541, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.09311568737030029, "step": 1 }, { "adv/mean_abs_final_conf": 0.437887966632843, "adv/mean_abs_reasoning": 0.4207462966442108, "adv/mean_abs_step_conf": 0.5934939384460449, "adv/ratio_final_to_reasoning": 1.0407411072310102, "adv/ratio_step_to_reasoning": 1.4105743607956507, "adv/std_final_conf": 0.6832791566848755, "adv/std_reasoning": 0.6817297339439392, "adv/std_step_conf": 0.8236269354820251, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4872611464968153, "calib/avg_num_step_conf": 7.6953125, "calib/ece": 0.36465737051792824, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024481637078155316, "calib/mean_conf": 0.9901553784860557, "calib/mu_c": 0.990063694267516, "calib/mu_w": 0.9903085106382975, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36465737051792824, "calib/std_conf": 0.001222205307190084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9075405636208369, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": -0.003804868168900244, "calib/step_q_w": 0.9113454317897371, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 840.640625, "completions/mean_terminated_length": 850.6087036132812, "completions/min_length": 0.0, "completions/min_terminated_length": 466.0, "epoch": 0.0021333333333333334, "grad_norm": 0.02313089929521084, "kl": 0.0016820430755615234, "learning_rate": 5.000000000000001e-07, "loss": 0.0108, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01788702979683876, "mask/share_reasoning": 0.8706268668174744, "mask/share_step_conf": 0.09976735711097717, "num_tokens": 619483.0, "reward": 0.5118981599807739, "reward_std": 0.17219942808151245, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6203019618988037, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.08474448323249817, "step": 2 }, { "adv/mean_abs_final_conf": 0.4353755712509155, "adv/mean_abs_reasoning": 0.4266285300254822, "adv/mean_abs_step_conf": 0.6578761339187622, "adv/ratio_final_to_reasoning": 1.0205027104608098, "adv/ratio_step_to_reasoning": 1.5420350202070823, "adv/std_final_conf": 0.7030425667762756, "adv/std_reasoning": 0.7013959288597107, "adv/std_step_conf": 0.8683823347091675, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4994512328967587, "calib/avg_num_step_conf": 7.6484375, "calib/ece": 0.30361111111111105, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.0975342064512006e-05, "calib/mean_conf": 0.9901190476190476, "calib/mu_c": 0.990115606936416, "calib/mu_w": 0.9901265822784805, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30361111111111105, "calib/std_conf": 0.0010845754260886081, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9123809523809524, "calib/step_q_c_n": 1302.0, "calib/step_q_gap": 0.0037376596980255306, "calib/step_q_w": 0.9086432926829269, "calib/step_q_w_n": 656.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2277.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 807.5859375, "completions/mean_terminated_length": 813.9448852539062, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.0032, "grad_norm": 0.036146312952041626, "kl": 0.0005174875259399414, "learning_rate": 7.5e-07, "loss": -0.0001, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01844394952058792, "mask/share_reasoning": 0.8726816177368164, "mask/share_step_conf": 0.10106190294027328, "num_tokens": 931481.0, "reward": 0.5520459413528442, "reward_std": 0.1757698506116867, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6817777156829834, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.09028282761573792, "step": 3 }, { "adv/mean_abs_final_conf": 0.44063931703567505, "adv/mean_abs_reasoning": 0.4074084460735321, "adv/mean_abs_step_conf": 0.6268638372421265, "adv/ratio_final_to_reasoning": 1.0815664753208016, "adv/ratio_step_to_reasoning": 1.5386618595751582, "adv/std_final_conf": 0.7006613612174988, "adv/std_reasoning": 0.6816275715827942, "adv/std_step_conf": 0.8392693400382996, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4958608058608059, "calib/avg_num_step_conf": 7.390625, "calib/ece": 0.2984782608695652, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.0879120879002677e-05, "calib/mean_conf": 0.9901778656126482, "calib/mu_c": 0.9901714285714284, "calib/mu_w": 0.9901923076923074, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2984782608695652, "calib/std_conf": 0.001283824645763773, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9100699844479004, "calib/step_q_c_n": 1286.0, "calib/step_q_gap": 0.002578235272982843, "calib/step_q_w": 0.9074917491749176, "calib/step_q_w_n": 606.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1912.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 762.82421875, "completions/mean_terminated_length": 768.8306884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 458.0, "epoch": 0.004266666666666667, "grad_norm": 0.01895269751548767, "kl": 0.000613868236541748, "learning_rate": 1.0000000000000002e-06, "loss": -0.0135, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019087232649326324, "mask/share_reasoning": 0.8699307441711426, "mask/share_step_conf": 0.10316955298185349, "num_tokens": 1232932.0, "reward": 0.5568417906761169, "reward_std": 0.16131022572517395, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.6893956065177917, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.09069420397281647, "step": 4 }, { "adv/mean_abs_final_conf": 0.4351027309894562, "adv/mean_abs_reasoning": 0.4335722327232361, "adv/mean_abs_step_conf": 0.4304155707359314, "adv/ratio_final_to_reasoning": 1.0035299729796052, "adv/ratio_step_to_reasoning": 0.9927194092493472, "adv/std_final_conf": 0.6995216608047485, "adv/std_reasoning": 0.7013534903526306, "adv/std_step_conf": 0.7158880829811096, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5027740641711229, "calib/avg_num_step_conf": 7.5234375, "calib/ece": 0.4372723577235772, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 4.8796791444094545e-05, "calib/mean_conf": 0.9901178861788618, "calib/mu_c": 0.9901397058823529, "calib/mu_w": 0.9900909090909088, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4372723577235772, "calib/std_conf": 0.0010622519812161315, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9138263358778627, "calib/step_q_c_n": 1048.0, "calib/step_q_gap": 0.004748887130709978, "calib/step_q_w": 0.9090774487471527, "calib/step_q_w_n": 878.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2691.0, "completions/max_terminated_length": 2691.0, "completions/mean_length": 814.140625, "completions/mean_terminated_length": 823.7944946289062, "completions/min_length": 0.0, "completions/min_terminated_length": 437.0, "epoch": 0.005333333333333333, "grad_norm": 0.02160327136516571, "kl": 0.0006667971611022949, "learning_rate": 1.25e-06, "loss": -0.0506, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018197569996118546, "mask/share_reasoning": 0.8682137727737427, "mask/share_step_conf": 0.10186988115310669, "num_tokens": 1548040.0, "reward": 0.44798406958580017, "reward_std": 0.14094765484333038, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5357648134231567, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.06254701316356659, "step": 5 }, { "adv/mean_abs_final_conf": 0.377677321434021, "adv/mean_abs_reasoning": 0.34791645407676697, "adv/mean_abs_step_conf": 0.5846976041793823, "adv/ratio_final_to_reasoning": 1.08554026982204, "adv/ratio_step_to_reasoning": 1.6805689909979658, "adv/std_final_conf": 0.6607995629310608, "adv/std_reasoning": 0.6403562426567078, "adv/std_step_conf": 0.82439786195755, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4936075781664017, "calib/avg_num_step_conf": 8.33984375, "calib/ece": 0.398116, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011870694223614642, "calib/mean_conf": 0.990116, "calib/mu_c": 0.9900675675675674, "calib/mu_w": 0.9901862745098036, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.398116, "calib/std_conf": 0.001053823514636109, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9127594728171335, "calib/step_q_c_n": 1214.0, "calib/step_q_gap": -0.002965825771357289, "calib/step_q_w": 0.9157252985884908, "calib/step_q_w_n": 921.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2861.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 730.74609375, "completions/mean_terminated_length": 742.3452758789062, "completions/min_length": 0.0, "completions/min_terminated_length": 463.0, "epoch": 0.0064, "grad_norm": 1.111169457435608, "kl": 2.625835120677948, "learning_rate": 1.5e-06, "loss": -0.0174, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019656438380479813, "mask/share_reasoning": 0.8467674255371094, "mask/share_step_conf": 0.11795111745595932, "num_tokens": 1841063.0, "reward": 0.4827464818954468, "reward_std": 0.1388968974351883, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5858488082885742, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.06870661675930023, "step": 6 }, { "adv/mean_abs_final_conf": 0.4797869920730591, "adv/mean_abs_reasoning": 0.4646647572517395, "adv/mean_abs_step_conf": 0.6939883232116699, "adv/ratio_final_to_reasoning": 1.0325443980532547, "adv/ratio_step_to_reasoning": 1.493524766793731, "adv/std_final_conf": 0.7410189509391785, "adv/std_reasoning": 0.7392896413803101, "adv/std_step_conf": 0.9044371843338013, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5060975609756098, "calib/avg_num_step_conf": 7.5390625, "calib/ece": 0.3392857142857143, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00012195121951208421, "calib/mean_conf": 0.9900793650793651, "calib/mu_c": 0.9901219512195122, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3392857142857143, "calib/std_conf": 0.0008873285624999172, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.910952762209768, "calib/step_q_c_n": 1249.0, "calib/step_q_gap": 0.006929267349268908, "calib/step_q_w": 0.904023494860499, "calib/step_q_w_n": 681.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2291.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 824.03125, "completions/mean_terminated_length": 837.1111450195312, "completions/min_length": 0.0, "completions/min_terminated_length": 433.0, "epoch": 0.007466666666666667, "grad_norm": 0.026778535917401314, "kl": 0.0014523863792419434, "learning_rate": 1.75e-06, "loss": -0.0064, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.01766549050807953, "mask/share_reasoning": 0.8701153993606567, "mask/share_step_conf": 0.09659412503242493, "num_tokens": 2159439.0, "reward": 0.5411956310272217, "reward_std": 0.19584408402442932, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.647402286529541, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.10998891294002533, "step": 7 }, { "adv/mean_abs_final_conf": 0.40551865100860596, "adv/mean_abs_reasoning": 0.37076336145401, "adv/mean_abs_step_conf": 0.6113842725753784, "adv/ratio_final_to_reasoning": 1.0937398167345806, "adv/ratio_step_to_reasoning": 1.648987834660182, "adv/std_final_conf": 0.6797915697097778, "adv/std_reasoning": 0.661353349685669, "adv/std_step_conf": 0.8538694977760315, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.49281979783896834, "calib/avg_num_step_conf": 7.2578125, "calib/ece": 0.3762967479674797, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00013377483443721694, "calib/mean_conf": 0.9901178861788618, "calib/mu_c": 0.9900662251655625, "calib/mu_w": 0.9901999999999997, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3762967479674797, "calib/std_conf": 0.0010622519812161315, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9090705987488831, "calib/step_q_c_n": 1119.0, "calib/step_q_gap": 0.004450842321278237, "calib/step_q_w": 0.9046197564276048, "calib/step_q_w_n": 739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2854.0, "completions/max_terminated_length": 2854.0, "completions/mean_length": 828.50390625, "completions/mean_terminated_length": 835.0275268554688, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 0.008533333333333334, "grad_norm": 0.022890863940119743, "kl": 0.0005786418914794922, "learning_rate": 2.0000000000000003e-06, "loss": -0.0574, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017660701647400856, "mask/share_reasoning": 0.8782448172569275, "mask/share_step_conf": 0.0962819904088974, "num_tokens": 2478048.0, "reward": 0.49228063225746155, "reward_std": 0.14467144012451172, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.597022294998169, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.07738274335861206, "step": 8 }, { "adv/mean_abs_final_conf": 0.45169568061828613, "adv/mean_abs_reasoning": 0.42771801352500916, "adv/mean_abs_step_conf": 0.6686674356460571, "adv/ratio_final_to_reasoning": 1.0560595213085993, "adv/ratio_step_to_reasoning": 1.5633370924345213, "adv/std_final_conf": 0.7222773432731628, "adv/std_reasoning": 0.7207134962081909, "adv/std_step_conf": 0.885751485824585, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5152286163286687, "calib/avg_num_step_conf": 7.640625, "calib/ece": 0.3262704918032787, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": 0.011993564319389693, "calib/mean_conf": 0.9861065573770492, "calib/mu_c": 0.9901863354037268, "calib/mu_w": 0.9781927710843371, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3262704918032787, "calib/std_conf": 0.06262696054024179, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9094254787676938, "calib/step_q_c_n": 1201.0, "calib/step_q_gap": 0.0076241542643824545, "calib/step_q_w": 0.9018013245033113, "calib/step_q_w_n": 755.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 795.34375, "completions/mean_terminated_length": 824.3239135742188, "completions/min_length": 0.0, "completions/min_terminated_length": 478.0, "epoch": 0.0096, "grad_norm": 0.02588689513504505, "kl": 0.0006021857261657715, "learning_rate": 2.25e-06, "loss": -0.0851, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017839480191469193, "mask/share_reasoning": 0.84670090675354, "mask/share_step_conf": 0.10030336678028107, "num_tokens": 2789192.0, "reward": 0.5188800096511841, "reward_std": 0.18360626697540283, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6390469074249268, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.082306869328022, "step": 9 }, { "adv/mean_abs_final_conf": 0.49289408326148987, "adv/mean_abs_reasoning": 0.48456206917762756, "adv/mean_abs_step_conf": 0.6818910241127014, "adv/ratio_final_to_reasoning": 1.0171949366528068, "adv/ratio_step_to_reasoning": 1.4072315343831387, "adv/std_final_conf": 0.7211974859237671, "adv/std_reasoning": 0.7207997441291809, "adv/std_step_conf": 0.8718588948249817, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4888365349764407, "calib/avg_num_step_conf": 7.5078125, "calib/ece": 0.35478237704918036, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002720913374411049, "calib/mean_conf": 0.9900282786885246, "calib/mu_c": 0.9899290322580645, "calib/mu_w": 0.9902011235955056, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35478237704918036, "calib/std_conf": 0.0016231103477648205, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.909689419795222, "calib/step_q_c_n": 1172.0, "calib/step_q_gap": -0.00035644687144464893, "calib/step_q_w": 0.9100458666666666, "calib/step_q_w_n": 750.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2940.0, "completions/max_terminated_length": 2940.0, "completions/mean_length": 844.05859375, "completions/mean_terminated_length": 857.4564208984375, "completions/min_length": 0.0, "completions/min_terminated_length": 498.0, "epoch": 0.010666666666666666, "grad_norm": 0.02240643836557865, "kl": 0.0006812214851379395, "learning_rate": 2.5e-06, "loss": -0.0539, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017220359295606613, "mask/share_reasoning": 0.8717728853225708, "mask/share_step_conf": 0.09538174420595169, "num_tokens": 3112071.0, "reward": 0.5147616863250732, "reward_std": 0.19408252835273743, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6121847629547119, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.105619877576828, "step": 10 }, { "adv/mean_abs_final_conf": 0.46699559688568115, "adv/mean_abs_reasoning": 0.42676591873168945, "adv/mean_abs_step_conf": 0.599322497844696, "adv/ratio_final_to_reasoning": 1.0942663797370482, "adv/ratio_step_to_reasoning": 1.4043354249698041, "adv/std_final_conf": 0.756411075592041, "adv/std_reasoning": 0.7393324971199036, "adv/std_step_conf": 0.8417291045188904, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.4894845360824743, "calib/avg_num_step_conf": 7.5859375, "calib/ece": 0.3829757085020242, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00016082474226830623, "calib/mean_conf": 0.9902631578947367, "calib/mu_c": 0.9901999999999999, "calib/mu_w": 0.9903608247422682, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3829757085020242, "calib/std_conf": 0.001568793260728549, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.910226480836237, "calib/step_q_c_n": 1148.0, "calib/step_q_gap": 0.0031987730276726856, "calib/step_q_w": 0.9070277078085643, "calib/step_q_w_n": 794.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2955.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 832.9609375, "completions/mean_terminated_length": 849.5538330078125, "completions/min_length": 0.0, "completions/min_terminated_length": 393.0, "epoch": 0.011733333333333333, "grad_norm": 0.020833250135183334, "kl": 0.000508427619934082, "learning_rate": 2.7500000000000004e-06, "loss": -0.003, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017751820385456085, "mask/share_reasoning": 0.8656059503555298, "mask/share_step_conf": 0.09711091220378876, "num_tokens": 3429789.0, "reward": 0.48372191190719604, "reward_std": 0.16617152094841003, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.5930706262588501, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.06499826163053513, "step": 11 }, { "adv/mean_abs_final_conf": 0.4253380596637726, "adv/mean_abs_reasoning": 0.39448434114456177, "adv/mean_abs_step_conf": 0.6090289354324341, "adv/ratio_final_to_reasoning": 1.0782127838831104, "adv/ratio_step_to_reasoning": 1.5438608631850632, "adv/std_final_conf": 0.7034652829170227, "adv/std_reasoning": 0.701582133769989, "adv/std_step_conf": 0.8410204648971558, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.505650406504065, "calib/avg_num_step_conf": 8.0234375, "calib/ece": 0.3040083682008369, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00011780487804902151, "calib/mean_conf": 0.9902008368200838, "calib/mu_c": 0.9902378048780488, "calib/mu_w": 0.9901199999999998, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3040083682008369, "calib/std_conf": 0.0013757612311023004, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9132255125284737, "calib/step_q_c_n": 1317.0, "calib/step_q_gap": 0.01564885038464736, "calib/step_q_w": 0.8975766621438264, "calib/step_q_w_n": 737.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 758.75, "completions/mean_terminated_length": 796.0655517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 388.0, "epoch": 0.0128, "grad_norm": 0.02203812450170517, "kl": 0.0005838871002197266, "learning_rate": 3e-06, "loss": -0.1245, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.018417973071336746, "mask/share_reasoning": 0.8247957229614258, "mask/share_step_conf": 0.10991127789020538, "num_tokens": 3728205.0, "reward": 0.517718493938446, "reward_std": 0.17244943976402283, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6463226079940796, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.07427066564559937, "step": 12 }, { "adv/mean_abs_final_conf": 0.4694608449935913, "adv/mean_abs_reasoning": 0.4375338554382324, "adv/mean_abs_step_conf": 0.6185725927352905, "adv/ratio_final_to_reasoning": 1.0729703294008663, "adv/ratio_step_to_reasoning": 1.4137708089257008, "adv/std_final_conf": 0.740425169467926, "adv/std_reasoning": 0.7206034064292908, "adv/std_step_conf": 0.8563767075538635, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.500970066518847, "calib/avg_num_step_conf": 7.8359375, "calib/ece": 0.33943214285714274, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.4395787139707146e-05, "calib/mean_conf": 0.9902257936507936, "calib/mu_c": 0.9902378048780488, "calib/mu_w": 0.9902034090909091, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33943214285714274, "calib/std_conf": 0.001699862671005265, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9102325962325964, "calib/step_q_c_n": 1221.0, "calib/step_q_gap": -0.006341926060397185, "calib/step_q_w": 0.9165745222929936, "calib/step_q_w_n": 785.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2439.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 810.328125, "completions/mean_terminated_length": 816.7086791992188, "completions/min_length": 0.0, "completions/min_terminated_length": 302.0, "epoch": 0.013866666666666666, "grad_norm": 0.029545899480581284, "kl": 0.0006132125854492188, "learning_rate": 3.2500000000000002e-06, "loss": 0.0067, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.018678510561585426, "mask/share_reasoning": 0.8695738315582275, "mask/share_step_conf": 0.10393518209457397, "num_tokens": 4040241.0, "reward": 0.5299364328384399, "reward_std": 0.18000411987304688, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.64726322889328, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.08760970830917358, "step": 13 }, { "adv/mean_abs_final_conf": 0.490660160779953, "adv/mean_abs_reasoning": 0.4737318754196167, "adv/mean_abs_step_conf": 0.6872192621231079, "adv/ratio_final_to_reasoning": 1.0357338955613695, "adv/ratio_step_to_reasoning": 1.4506502470714913, "adv/std_final_conf": 0.7587310671806335, "adv/std_reasoning": 0.7575871348381042, "adv/std_step_conf": 0.9029594659805298, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5006708407871198, "calib/avg_num_step_conf": 7.43359375, "calib/ece": 0.3454954545454546, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.3088849135534097e-05, "calib/mean_conf": 0.9901235537190083, "calib/mu_c": 0.990128205128205, "calib/mu_w": 0.9901151162790695, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3454954545454546, "calib/std_conf": 0.001102805857579187, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9110016977928693, "calib/step_q_c_n": 1178.0, "calib/step_q_gap": 0.004389973654938317, "calib/step_q_w": 0.906611724137931, "calib/step_q_w_n": 725.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 803.46484375, "completions/mean_terminated_length": 836.1259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 445.0, "epoch": 0.014933333333333333, "grad_norm": 0.0220224317163229, "kl": 0.0008367300033569336, "learning_rate": 3.5e-06, "loss": -0.0476, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.017497293651103973, "mask/share_reasoning": 0.8445008397102356, "mask/share_step_conf": 0.09893938153982162, "num_tokens": 4351328.0, "reward": 0.4997730851173401, "reward_std": 0.19383662939071655, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6159230470657349, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.07268565148115158, "step": 14 }, { "adv/mean_abs_final_conf": 0.40281879901885986, "adv/mean_abs_reasoning": 0.3550412356853485, "adv/mean_abs_step_conf": 0.6045442223548889, "adv/ratio_final_to_reasoning": 1.1345690543276885, "adv/ratio_step_to_reasoning": 1.7027436860620317, "adv/std_final_conf": 0.6806594729423523, "adv/std_reasoning": 0.6402491927146912, "adv/std_step_conf": 0.8389008045196533, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4869388544891641, "calib/avg_num_step_conf": 7.578125, "calib/ece": 0.39179527559055094, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00023774509803908828, "calib/mean_conf": 0.9902204724409447, "calib/mu_c": 0.9901249999999999, "calib/mu_w": 0.990362745098039, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39179527559055094, "calib/std_conf": 0.0014275875020509797, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.91445819112628, "calib/step_q_c_n": 1172.0, "calib/step_q_gap": 0.006006368209613311, "calib/step_q_w": 0.9084518229166667, "calib/step_q_w_n": 768.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 759.3984375, "completions/mean_terminated_length": 765.3779296875, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.016, "grad_norm": 0.024850942194461823, "kl": 0.0008978843688964844, "learning_rate": 3.7500000000000005e-06, "loss": -0.0064, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019321627914905548, "mask/share_reasoning": 0.867222785949707, "mask/share_step_conf": 0.10564309358596802, "num_tokens": 4653614.0, "reward": 0.49917930364608765, "reward_std": 0.1498795747756958, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6013327836990356, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.07983829081058502, "step": 15 }, { "adv/mean_abs_final_conf": 0.35396450757980347, "adv/mean_abs_reasoning": 0.31694984436035156, "adv/mean_abs_step_conf": 0.531715989112854, "adv/ratio_final_to_reasoning": 1.1167839766388041, "adv/ratio_step_to_reasoning": 1.6776029348931536, "adv/std_final_conf": 0.6389526128768921, "adv/std_reasoning": 0.6185487508773804, "adv/std_step_conf": 0.7904064059257507, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4907988900248284, "calib/avg_num_step_conf": 7.8046875, "calib/ece": 0.31943775100401617, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001840221995034197, "calib/mean_conf": 0.990120481927711, "calib/mu_c": 0.9900598802395209, "calib/mu_w": 0.9902439024390243, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.31943775100401617, "calib/std_conf": 0.0010910102576069185, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9121538461538463, "calib/step_q_c_n": 1300.0, "calib/step_q_gap": 0.021580780251267306, "calib/step_q_w": 0.890573065902579, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 897.140625, "completions/mean_terminated_length": 904.2047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 421.0, "epoch": 0.017066666666666667, "grad_norm": 0.015497280284762383, "kl": 0.0007807016372680664, "learning_rate": 4.000000000000001e-06, "loss": -0.0315, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.016737807542085648, "mask/share_reasoning": 0.8816334009170532, "mask/share_step_conf": 0.09381629526615143, "num_tokens": 4992130.0, "reward": 0.5291246771812439, "reward_std": 0.12536513805389404, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6545917987823486, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.0786575973033905, "step": 16 }, { "adv/mean_abs_final_conf": 0.47310739755630493, "adv/mean_abs_reasoning": 0.46891841292381287, "adv/mean_abs_step_conf": 0.6789902448654175, "adv/ratio_final_to_reasoning": 1.0089332909884532, "adv/ratio_step_to_reasoning": 1.4479922864017196, "adv/std_final_conf": 0.7208500504493713, "adv/std_reasoning": 0.7207322120666504, "adv/std_step_conf": 0.883650004863739, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49545604758757433, "calib/avg_num_step_conf": 7.7421875, "calib/ece": 0.2665040650406503, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -9.087904824833792e-05, "calib/mean_conf": 0.990081300813008, "calib/mu_c": 0.9900561797752808, "calib/mu_w": 0.9901470588235292, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2665040650406503, "calib/std_conf": 0.0008979968306656315, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9065855457227138, "calib/step_q_c_n": 1356.0, "calib/step_q_gap": -0.006481546929043347, "calib/step_q_w": 0.9130670926517571, "calib/step_q_w_n": 626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2757.0, "completions/max_terminated_length": 2757.0, "completions/mean_length": 819.2421875, "completions/mean_terminated_length": 842.2730712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 344.0, "epoch": 0.018133333333333335, "grad_norm": 0.019301526248455048, "kl": 0.0012952089309692383, "learning_rate": 4.25e-06, "loss": -0.0088, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017654668539762497, "mask/share_reasoning": 0.855949342250824, "mask/share_step_conf": 0.09905220568180084, "num_tokens": 5305384.0, "reward": 0.564923107624054, "reward_std": 0.18626902997493744, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7004516124725342, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.09814468026161194, "step": 17 }, { "adv/mean_abs_final_conf": 0.5087032318115234, "adv/mean_abs_reasoning": 0.49621325731277466, "adv/mean_abs_step_conf": 0.5594228506088257, "adv/ratio_final_to_reasoning": 1.0251705780018612, "adv/ratio_step_to_reasoning": 1.1273839268994148, "adv/std_final_conf": 0.7751443386077881, "adv/std_reasoning": 0.7754399180412292, "adv/std_step_conf": 0.8083623647689819, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.4954954954954955, "calib/avg_num_step_conf": 7.5703125, "calib/ece": 0.4449467213114754, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -6.306306306302911e-05, "calib/mean_conf": 0.9900286885245901, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9900630630630629, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4449467213114754, "calib/std_conf": 0.0004472098396591778, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8994117647058825, "calib/step_q_c_n": 1088.0, "calib/step_q_gap": -0.00918941176470578, "calib/step_q_w": 0.9086011764705882, "calib/step_q_w_n": 850.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 875.83984375, "completions/mean_terminated_length": 889.7421264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 483.0, "epoch": 0.0192, "grad_norm": 0.017118962481617928, "kl": 0.001046299934387207, "learning_rate": 4.5e-06, "loss": -0.071, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017369866371154785, "mask/share_reasoning": 0.8725413680076599, "mask/share_step_conf": 0.0944637730717659, "num_tokens": 5640319.0, "reward": 0.4441087245941162, "reward_std": 0.18918488919734955, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5278979539871216, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.0665694922208786, "step": 18 }, { "adv/mean_abs_final_conf": 0.420177698135376, "adv/mean_abs_reasoning": 0.3982063829898834, "adv/mean_abs_step_conf": 0.590377926826477, "adv/ratio_final_to_reasoning": 1.055175698040106, "adv/ratio_step_to_reasoning": 1.4825928263472257, "adv/std_final_conf": 0.7004992961883545, "adv/std_reasoning": 0.6815143823623657, "adv/std_step_conf": 0.8227755427360535, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.49434974747474747, "calib/avg_num_step_conf": 7.33984375, "calib/ece": 0.42318110236220463, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011022727272702593, "calib/mean_conf": 0.9901102362204723, "calib/mu_c": 0.9900625000000001, "calib/mu_w": 0.9901727272727271, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42318110236220463, "calib/std_conf": 0.0010096256923672652, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9134383954154727, "calib/step_q_c_n": 1047.0, "calib/step_q_gap": 0.00427613580008801, "calib/step_q_w": 0.9091622596153847, "calib/step_q_w_n": 832.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2863.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 786.2109375, "completions/mean_terminated_length": 789.294189453125, "completions/min_length": 0.0, "completions/min_terminated_length": 371.0, "epoch": 0.020266666666666665, "grad_norm": 0.029656028375029564, "kl": 0.0014562606811523438, "learning_rate": 4.75e-06, "loss": 0.0062, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018753767013549805, "mask/share_reasoning": 0.877987265586853, "mask/share_step_conf": 0.09935271739959717, "num_tokens": 5946349.0, "reward": 0.48736435174942017, "reward_std": 0.1544978767633438, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.57084721326828, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.09294396638870239, "step": 19 }, { "adv/mean_abs_final_conf": 0.3809193968772888, "adv/mean_abs_reasoning": 0.3230840563774109, "adv/mean_abs_step_conf": 0.5315940976142883, "adv/ratio_final_to_reasoning": 1.179010196753001, "adv/ratio_step_to_reasoning": 1.6453739734940875, "adv/std_final_conf": 0.6795678734779358, "adv/std_reasoning": 0.640274167060852, "adv/std_step_conf": 0.7912455201148987, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.48514851485148514, "calib/avg_num_step_conf": 7.9296875, "calib/ece": 0.3954056224899599, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.0005346534653465351, "calib/mean_conf": 0.9897831325301205, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9894653465346533, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3954056224899599, "calib/std_conf": 0.0058124258104265605, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9152345058626465, "calib/step_q_c_n": 1194.0, "calib/step_q_gap": 0.005669912561211188, "calib/step_q_w": 0.9095645933014354, "calib/step_q_w_n": 836.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 783.125, "completions/mean_terminated_length": 795.5556030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 413.0, "epoch": 0.021333333333333333, "grad_norm": 0.018992777913808823, "kl": 0.001955866813659668, "learning_rate": 5e-06, "loss": 0.0101, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019337989389896393, "mask/share_reasoning": 0.8542299270629883, "mask/share_step_conf": 0.11080703139305115, "num_tokens": 6251701.0, "reward": 0.4819880723953247, "reward_std": 0.13258323073387146, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5862253904342651, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.06837578862905502, "step": 20 }, { "adv/mean_abs_final_conf": 0.5759612321853638, "adv/mean_abs_reasoning": 0.5443170070648193, "adv/mean_abs_step_conf": 0.7146705985069275, "adv/ratio_final_to_reasoning": 1.05813565387417, "adv/ratio_step_to_reasoning": 1.3129676075357715, "adv/std_final_conf": 0.7779689431190491, "adv/std_reasoning": 0.7756016254425049, "adv/std_step_conf": 0.8874878883361816, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5050167224080268, "calib/avg_num_step_conf": 7.8515625, "calib/ece": 0.36124193548387107, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 9.030100334450619e-05, "calib/mean_conf": 0.9902741935483872, "calib/mu_c": 0.9903076923076921, "calib/mu_w": 0.9902173913043476, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36124193548387107, "calib/std_conf": 0.0016106435784283791, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9136597510373444, "calib/step_q_c_n": 1205.0, "calib/step_q_gap": 0.0002932914100152173, "calib/step_q_w": 0.9133664596273292, "calib/step_q_w_n": 805.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2148.0, "completions/max_terminated_length": 2148.0, "completions/mean_length": 770.5546875, "completions/mean_terminated_length": 789.0480346679688, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.0224, "grad_norm": 0.019365724176168442, "kl": 0.0024155378341674805, "learning_rate": 4.9722222222222224e-06, "loss": -0.0181, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019052758812904358, "mask/share_reasoning": 0.8532441854476929, "mask/share_step_conf": 0.10426557064056396, "num_tokens": 6551923.0, "reward": 0.49608340859413147, "reward_std": 0.2246149778366089, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6163120865821838, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.06022978574037552, "step": 21 }, { "adv/mean_abs_final_conf": 0.43752947449684143, "adv/mean_abs_reasoning": 0.41081422567367554, "adv/mean_abs_step_conf": 0.6186083555221558, "adv/ratio_final_to_reasoning": 1.065029999336943, "adv/ratio_step_to_reasoning": 1.5058104536368673, "adv/std_final_conf": 0.7573110461235046, "adv/std_reasoning": 0.7390194535255432, "adv/std_step_conf": 0.8727254867553711, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4995361450082067, "calib/avg_num_step_conf": 7.80078125, "calib/ece": 0.30901141732283466, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.4208235210255715e-05, "calib/mean_conf": 0.9901137795275591, "calib/mu_c": 0.9901092485549132, "calib/mu_w": 0.9901234567901235, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30901141732283466, "calib/std_conf": 0.0010418823283065964, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9171899462778205, "calib/step_q_c_n": 1303.0, "calib/step_q_gap": -0.007838872165983557, "calib/step_q_w": 0.925028818443804, "calib/step_q_w_n": 694.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 740.640625, "completions/mean_terminated_length": 746.472412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 400.0, "epoch": 0.023466666666666667, "grad_norm": 0.027902813628315926, "kl": 0.009773969650268555, "learning_rate": 4.944444444444445e-06, "loss": -0.0236, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01975053921341896, "mask/share_reasoning": 0.863436222076416, "mask/share_step_conf": 0.1090007796883583, "num_tokens": 6843343.0, "reward": 0.5620276927947998, "reward_std": 0.16170284152030945, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6819331645965576, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.10852846503257751, "step": 22 }, { "adv/mean_abs_final_conf": 0.5960447788238525, "adv/mean_abs_reasoning": 0.5879702568054199, "adv/mean_abs_step_conf": 0.6331772804260254, "adv/ratio_final_to_reasoning": 1.0137328749625931, "adv/ratio_step_to_reasoning": 1.0768865824373937, "adv/std_final_conf": 0.8256348371505737, "adv/std_reasoning": 0.826520562171936, "adv/std_step_conf": 0.8576090335845947, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.49056603773584906, "calib/avg_num_step_conf": 7.83984375, "calib/ece": 0.4157791164658634, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001792452830191582, "calib/mean_conf": 0.9900763052208835, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9901792452830189, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4157791164658634, "calib/std_conf": 0.0008491673237872526, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9190516431924883, "calib/step_q_c_n": 1065.0, "calib/step_q_gap": -0.005096072004111107, "calib/step_q_w": 0.9241477151965994, "calib/step_q_w_n": 941.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2615.0, "completions/max_terminated_length": 2615.0, "completions/mean_length": 775.57421875, "completions/mean_terminated_length": 787.8849487304688, "completions/min_length": 0.0, "completions/min_terminated_length": 374.0, "epoch": 0.024533333333333334, "grad_norm": 0.019075796008110046, "kl": 0.006722688674926758, "learning_rate": 4.9166666666666665e-06, "loss": -0.0164, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018941929563879967, "mask/share_reasoning": 0.8601007461547852, "mask/share_step_conf": 0.10533229261636734, "num_tokens": 7145826.0, "reward": 0.4781431257724762, "reward_std": 0.21690842509269714, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5626464486122131, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.08895233273506165, "step": 23 }, { "adv/mean_abs_final_conf": 0.44700348377227783, "adv/mean_abs_reasoning": 0.438474178314209, "adv/mean_abs_step_conf": 0.5623143911361694, "adv/ratio_final_to_reasoning": 1.0194522411578746, "adv/ratio_step_to_reasoning": 1.2824344487013715, "adv/std_final_conf": 0.701201856136322, "adv/std_reasoning": 0.7014211416244507, "adv/std_step_conf": 0.8045052289962769, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5034965034965035, "calib/avg_num_step_conf": 7.6796875, "calib/ece": 0.41574297188755016, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 6.993006992994655e-05, "calib/mean_conf": 0.9900401606425703, "calib/mu_c": 0.9900699300699299, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41574297188755016, "calib/std_conf": 0.0006324504316475355, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9175092592592593, "calib/step_q_c_n": 1080.0, "calib/step_q_gap": -0.0005889348716661136, "calib/step_q_w": 0.9180981941309254, "calib/step_q_w_n": 886.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3042.0, "completions/max_terminated_length": 3042.0, "completions/mean_length": 811.51171875, "completions/mean_terminated_length": 824.3928833007812, "completions/min_length": 0.0, "completions/min_terminated_length": 454.0, "epoch": 0.0256, "grad_norm": 0.013942546211183071, "kl": 0.0044553279876708984, "learning_rate": 4.888888888888889e-06, "loss": -0.0631, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018382534384727478, "mask/share_reasoning": 0.8635358810424805, "mask/share_step_conf": 0.10245657712221146, "num_tokens": 7458085.0, "reward": 0.4683263599872589, "reward_std": 0.16613644361495972, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5667780637741089, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.06362461298704147, "step": 24 }, { "adv/mean_abs_final_conf": 0.44899922609329224, "adv/mean_abs_reasoning": 0.44442665576934814, "adv/mean_abs_step_conf": 0.5876548886299133, "adv/ratio_final_to_reasoning": 1.0102886950289436, "adv/ratio_step_to_reasoning": 1.3222764228950723, "adv/std_final_conf": 0.7194580435752869, "adv/std_reasoning": 0.7205483913421631, "adv/std_step_conf": 0.815241813659668, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.49756515775034293, "calib/avg_num_step_conf": 7.40625, "calib/ece": 0.347218253968254, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -3.8271604938300285e-05, "calib/mean_conf": 0.9900753968253969, "calib/mu_c": 0.9900617283950617, "calib/mu_w": 0.9901, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.347218253968254, "calib/std_conf": 0.0008441381918702944, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9194467728415759, "calib/step_q_c_n": 1193.0, "calib/step_q_gap": 0.0006089350037381536, "calib/step_q_w": 0.9188378378378378, "calib/step_q_w_n": 703.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 757.90234375, "completions/mean_terminated_length": 760.8745727539062, "completions/min_length": 0.0, "completions/min_terminated_length": 350.0, "epoch": 0.02666666666666667, "grad_norm": 0.02139492705464363, "kl": 0.005379199981689453, "learning_rate": 4.861111111111111e-06, "loss": 0.0378, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.019525248557329178, "mask/share_reasoning": 0.8722825050354004, "mask/share_step_conf": 0.10428602993488312, "num_tokens": 7755332.0, "reward": 0.5215007066726685, "reward_std": 0.1627618968486786, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6395202875137329, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.08160613477230072, "step": 25 }, { "adv/mean_abs_final_conf": 0.36643359065055847, "adv/mean_abs_reasoning": 0.3592587411403656, "adv/mean_abs_step_conf": 0.6197366118431091, "adv/ratio_final_to_reasoning": 1.019971259397665, "adv/ratio_step_to_reasoning": 1.725042541417169, "adv/std_final_conf": 0.6395753622055054, "adv/std_reasoning": 0.6402528882026672, "adv/std_step_conf": 0.8247919678688049, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5030864197530864, "calib/avg_num_step_conf": 7.1171875, "calib/ece": 0.34971936758893274, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.555555555569747e-05, "calib/mean_conf": 0.9900355731225295, "calib/mu_c": 0.9900555555555556, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34971936758893274, "calib/std_conf": 0.0005647058134288065, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9190464912280704, "calib/step_q_c_n": 1140.0, "calib/step_q_gap": 0.001803089468539465, "calib/step_q_w": 0.9172434017595309, "calib/step_q_w_n": 682.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2333.0, "completions/max_terminated_length": 2333.0, "completions/mean_length": 770.80078125, "completions/mean_terminated_length": 773.8235473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 486.0, "epoch": 0.027733333333333332, "grad_norm": 0.028968777507543564, "kl": 0.00805807113647461, "learning_rate": 4.833333333333333e-06, "loss": 0.0343, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.019127503037452698, "mask/share_reasoning": 0.8790621757507324, "mask/share_step_conf": 0.09790406376123428, "num_tokens": 8057897.0, "reward": 0.5272977352142334, "reward_std": 0.14071059226989746, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6398234367370605, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.09055326879024506, "step": 26 }, { "adv/mean_abs_final_conf": 0.4889024496078491, "adv/mean_abs_reasoning": 0.4714679419994354, "adv/mean_abs_step_conf": 0.619888186454773, "adv/ratio_final_to_reasoning": 1.0369792006100693, "adv/ratio_step_to_reasoning": 1.3148045311965564, "adv/std_final_conf": 0.7212886214256287, "adv/std_reasoning": 0.7207216620445251, "adv/std_step_conf": 0.8250879049301147, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5026298701298701, "calib/avg_num_step_conf": 7.7734375, "calib/ece": 0.43011200000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.389610389616717e-05, "calib/mean_conf": 0.9901120000000001, "calib/mu_c": 0.9901357142857141, "calib/mu_w": 0.990081818181818, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43011200000000005, "calib/std_conf": 0.0010175735845628076, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9208622540250446, "calib/step_q_c_n": 1118.0, "calib/step_q_gap": 0.0015342723736684327, "calib/step_q_w": 0.9193279816513762, "calib/step_q_w_n": 872.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 769.74609375, "completions/mean_terminated_length": 785.0797119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 451.0, "epoch": 0.0288, "grad_norm": 0.01685965247452259, "kl": 0.006510257720947266, "learning_rate": 4.805555555555556e-06, "loss": -0.0527, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01864682137966156, "mask/share_reasoning": 0.8566619157791138, "mask/share_step_conf": 0.10516005754470825, "num_tokens": 8360168.0, "reward": 0.47186827659606934, "reward_std": 0.19281867146492004, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5553019642829895, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.08374712616205215, "step": 27 }, { "adv/mean_abs_final_conf": 0.37701892852783203, "adv/mean_abs_reasoning": 0.3589438199996948, "adv/mean_abs_step_conf": 0.6417146921157837, "adv/ratio_final_to_reasoning": 1.0503563720031523, "adv/ratio_step_to_reasoning": 1.787785877233739, "adv/std_final_conf": 0.6619911193847656, "adv/std_reasoning": 0.6612706780433655, "adv/std_step_conf": 0.8566312789916992, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4962452107279694, "calib/avg_num_step_conf": 6.76171875, "calib/ece": 0.2912570281124499, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.0689655172323462e-05, "calib/mean_conf": 0.9900522088353415, "calib/mu_c": 0.9900459770114941, "calib/mu_w": 0.9900666666666664, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2912570281124499, "calib/std_conf": 0.0005955702782945109, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9177899505766064, "calib/step_q_c_n": 1214.0, "calib/step_q_gap": 0.0013682871336663238, "calib/step_q_w": 0.9164216634429401, "calib/step_q_w_n": 517.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 777.7890625, "completions/mean_terminated_length": 793.2828979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 438.0, "epoch": 0.029866666666666666, "grad_norm": 0.02047092281281948, "kl": 0.005346775054931641, "learning_rate": 4.777777777777778e-06, "loss": -0.075, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018771663308143616, "mask/share_reasoning": 0.8672536015510559, "mask/share_step_conf": 0.09444352984428406, "num_tokens": 8666226.0, "reward": 0.5469997525215149, "reward_std": 0.14946556091308594, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6854112148284912, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.07811960577964783, "step": 28 }, { "adv/mean_abs_final_conf": 0.3974316418170929, "adv/mean_abs_reasoning": 0.385578989982605, "adv/mean_abs_step_conf": 0.5162817239761353, "adv/ratio_final_to_reasoning": 1.0307398798752563, "adv/ratio_step_to_reasoning": 1.3389778421262704, "adv/std_final_conf": 0.6608829498291016, "adv/std_reasoning": 0.6612251400947571, "adv/std_step_conf": 0.7739951014518738, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5020087884494664, "calib/avg_num_step_conf": 7.30078125, "calib/ece": 0.45664031620553347, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.5630885122241978e-05, "calib/mean_conf": 0.9902371541501975, "calib/mu_c": 0.9902444444444441, "calib/mu_w": 0.9902288135593219, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45664031620553347, "calib/std_conf": 0.0014166859270012572, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9178138832997988, "calib/step_q_c_n": 994.0, "calib/step_q_gap": -0.0007678309859153742, "calib/step_q_w": 0.9185817142857142, "calib/step_q_w_n": 875.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 856.39453125, "completions/mean_terminated_length": 863.1378173828125, "completions/min_length": 0.0, "completions/min_terminated_length": 440.0, "epoch": 0.030933333333333334, "grad_norm": 0.053197044879198074, "kl": 0.011476516723632812, "learning_rate": 4.75e-06, "loss": -0.0211, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01776573807001114, "mask/share_reasoning": 0.8802124857902527, "mask/share_step_conf": 0.09420927613973618, "num_tokens": 8992591.0, "reward": 0.4661726951599121, "reward_std": 0.14427511394023895, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5362553596496582, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.09296499192714691, "step": 29 }, { "adv/mean_abs_final_conf": 0.5404569506645203, "adv/mean_abs_reasoning": 0.5381417870521545, "adv/mean_abs_step_conf": 0.705502986907959, "adv/ratio_final_to_reasoning": 1.0043021442825464, "adv/ratio_step_to_reasoning": 1.3109983351647518, "adv/std_final_conf": 0.7751840353012085, "adv/std_reasoning": 0.775407075881958, "adv/std_step_conf": 0.903175950050354, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.48936170212765956, "calib/avg_num_step_conf": 7.55859375, "calib/ece": 0.3691048387096775, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00019148936170199082, "calib/mean_conf": 0.9900725806451613, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9901914893617019, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3691048387096775, "calib/std_conf": 0.0008049582948200391, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9205236486486488, "calib/step_q_c_n": 1184.0, "calib/step_q_gap": 0.007987030805772566, "calib/step_q_w": 0.9125366178428762, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2632.0, "completions/max_terminated_length": 2632.0, "completions/mean_length": 820.296875, "completions/mean_terminated_length": 833.3175048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 472.0, "epoch": 0.032, "grad_norm": 0.026751358062028885, "kl": 0.006610393524169922, "learning_rate": 4.722222222222222e-06, "loss": -0.0139, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01762595772743225, "mask/share_reasoning": 0.8690439462661743, "mask/share_step_conf": 0.09770509600639343, "num_tokens": 9309571.0, "reward": 0.5059753060340881, "reward_std": 0.2025582194328308, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6086695194244385, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.0892186313867569, "step": 30 }, { "adv/mean_abs_final_conf": 0.5019617080688477, "adv/mean_abs_reasoning": 0.470305860042572, "adv/mean_abs_step_conf": 0.5981417894363403, "adv/ratio_final_to_reasoning": 1.067309065686339, "adv/ratio_step_to_reasoning": 1.2718144515192658, "adv/std_final_conf": 0.7537896037101746, "adv/std_reasoning": 0.7574817538261414, "adv/std_step_conf": 0.8251199126243591, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.47157302618362024, "calib/avg_num_step_conf": 7.546875, "calib/ece": 0.43336585365853664, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0004755909730125296, "calib/mean_conf": 0.9902764227642277, "calib/mu_c": 0.990065693430657, "calib/mu_w": 0.9905412844036695, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43336585365853664, "calib/std_conf": 0.001526464713138631, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9185205078125, "calib/step_q_c_n": 1024.0, "calib/step_q_gap": 0.00890597036756624, "calib/step_q_w": 0.9096145374449338, "calib/step_q_w_n": 908.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3021.0, "completions/max_terminated_length": 3021.0, "completions/mean_length": 851.1875, "completions/mean_terminated_length": 882.2024536132812, "completions/min_length": 0.0, "completions/min_terminated_length": 495.0, "epoch": 0.03306666666666667, "grad_norm": 0.034380145370960236, "kl": 0.0059261322021484375, "learning_rate": 4.694444444444445e-06, "loss": -0.1133, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.0168706513941288, "mask/share_reasoning": 0.8561941981315613, "mask/share_step_conf": 0.09177893400192261, "num_tokens": 9633387.0, "reward": 0.466298907995224, "reward_std": 0.15734335780143738, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5431178212165833, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.09026122093200684, "step": 31 }, { "adv/mean_abs_final_conf": 0.46154144406318665, "adv/mean_abs_reasoning": 0.42569875717163086, "adv/mean_abs_step_conf": 0.4990188479423523, "adv/ratio_final_to_reasoning": 1.0841973021713684, "adv/ratio_step_to_reasoning": 1.1722346836478093, "adv/std_final_conf": 0.7329168915748596, "adv/std_reasoning": 0.7013481259346008, "adv/std_step_conf": 0.7360641360282898, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4900101936799184, "calib/avg_num_step_conf": 7.546875, "calib/ece": 0.4365450819672131, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": 0.0005603126061841657, "calib/mean_conf": 0.9898237704918033, "calib/mu_c": 0.9900740740740739, "calib/mu_w": 0.9895137614678897, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4365450819672131, "calib/std_conf": 0.005914493912619059, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9203695652173913, "calib/step_q_c_n": 1012.0, "calib/step_q_gap": 0.012544565217391335, "calib/step_q_w": 0.907825, "calib/step_q_w_n": 920.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2641.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 789.859375, "completions/mean_terminated_length": 821.9674682617188, "completions/min_length": 0.0, "completions/min_terminated_length": 413.0, "epoch": 0.034133333333333335, "grad_norm": 0.03487340360879898, "kl": 0.00685882568359375, "learning_rate": 4.666666666666667e-06, "loss": -0.1287, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017978699877858162, "mask/share_reasoning": 0.845503032207489, "mask/share_step_conf": 0.09745573997497559, "num_tokens": 9942295.0, "reward": 0.46160465478897095, "reward_std": 0.12649011611938477, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5322355031967163, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.09566129744052887, "step": 32 }, { "adv/mean_abs_final_conf": 0.5081977248191833, "adv/mean_abs_reasoning": 0.4854148030281067, "adv/mean_abs_step_conf": 0.613527774810791, "adv/ratio_final_to_reasoning": 1.0469349547005007, "adv/ratio_step_to_reasoning": 1.263924731968395, "adv/std_final_conf": 0.7568607330322266, "adv/std_reasoning": 0.7392957210540771, "adv/std_step_conf": 0.8239318132400513, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4872881355932203, "calib/avg_num_step_conf": 7.64453125, "calib/ece": 0.4565098814229248, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00022881355932202752, "calib/mean_conf": 0.9901067193675889, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9902288135593219, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4565098814229248, "calib/std_conf": 0.000974210082519071, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9192660550458716, "calib/step_q_c_n": 981.0, "calib/step_q_gap": 0.027317284554068344, "calib/step_q_w": 0.8919487704918032, "calib/step_q_w_n": 976.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2640.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 770.78515625, "completions/mean_terminated_length": 776.8543090820312, "completions/min_length": 0.0, "completions/min_terminated_length": 352.0, "epoch": 0.0352, "grad_norm": 0.029613912105560303, "kl": 0.00690460205078125, "learning_rate": 4.638888888888889e-06, "loss": -0.0021, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.01927175186574459, "mask/share_reasoning": 0.8702056407928467, "mask/share_step_conf": 0.10271012783050537, "num_tokens": 10246488.0, "reward": 0.46980202198028564, "reward_std": 0.1874239593744278, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5362539291381836, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.10022514313459396, "step": 33 }, { "adv/mean_abs_final_conf": 0.4541032910346985, "adv/mean_abs_reasoning": 0.4353451728820801, "adv/mean_abs_step_conf": 0.7151603698730469, "adv/ratio_final_to_reasoning": 1.0430879203929966, "adv/ratio_step_to_reasoning": 1.6427433090357453, "adv/std_final_conf": 0.7026164531707764, "adv/std_reasoning": 0.7015430927276611, "adv/std_step_conf": 0.8879923224449158, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49871031746031746, "calib/avg_num_step_conf": 7.421875, "calib/ece": 0.4117590361445784, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.3214285714279193e-05, "calib/mean_conf": 0.9900722891566266, "calib/mu_c": 0.9900625000000001, "calib/mu_w": 0.9900857142857143, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4117590361445784, "calib/std_conf": 0.000803353401357574, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9170599812558576, "calib/step_q_c_n": 1067.0, "calib/step_q_gap": 0.00049095364481333, "calib/step_q_w": 0.9165690276110443, "calib/step_q_w_n": 833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 749.9453125, "completions/mean_terminated_length": 764.8844604492188, "completions/min_length": 0.0, "completions/min_terminated_length": 378.0, "epoch": 0.03626666666666667, "grad_norm": 0.024869216606020927, "kl": 0.008152961730957031, "learning_rate": 4.611111111111112e-06, "loss": -0.0622, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018939541652798653, "mask/share_reasoning": 0.8582547307014465, "mask/share_step_conf": 0.10327447950839996, "num_tokens": 10543586.0, "reward": 0.484887033700943, "reward_std": 0.1882270872592926, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5705362558364868, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.09220652282238007, "step": 34 }, { "adv/mean_abs_final_conf": 0.5379250049591064, "adv/mean_abs_reasoning": 0.5143882632255554, "adv/mean_abs_step_conf": 0.6165981292724609, "adv/ratio_final_to_reasoning": 1.045756762772852, "adv/ratio_step_to_reasoning": 1.1987017849240609, "adv/std_final_conf": 0.7575049996376038, "adv/std_reasoning": 0.757745623588562, "adv/std_step_conf": 0.8392533659934998, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.49374401750307667, "calib/avg_num_step_conf": 7.1953125, "calib/ece": 0.4105183673469388, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011137699986329963, "calib/mean_conf": 0.9901102040816326, "calib/mu_c": 0.9900633802816899, "calib/mu_w": 0.9901747572815532, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4105183673469388, "calib/std_conf": 0.0013401560825880053, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9123455809334657, "calib/step_q_c_n": 1007.0, "calib/step_q_gap": 0.006536718658016638, "calib/step_q_w": 0.9058088622754491, "calib/step_q_w_n": 835.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 843.5703125, "completions/mean_terminated_length": 867.2850952148438, "completions/min_length": 0.0, "completions/min_terminated_length": 433.0, "epoch": 0.037333333333333336, "grad_norm": 0.044989511370658875, "kl": 0.007879257202148438, "learning_rate": 4.583333333333333e-06, "loss": -0.0743, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017067985609173775, "mask/share_reasoning": 0.8678691387176514, "mask/share_step_conf": 0.08771911263465881, "num_tokens": 10868796.0, "reward": 0.47709739208221436, "reward_std": 0.1962638944387436, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5624983906745911, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.08935261517763138, "step": 35 }, { "adv/mean_abs_final_conf": 0.38720130920410156, "adv/mean_abs_reasoning": 0.37243080139160156, "adv/mean_abs_step_conf": 0.666766881942749, "adv/ratio_final_to_reasoning": 1.0396597374795786, "adv/ratio_step_to_reasoning": 1.7903107891488828, "adv/std_final_conf": 0.6617724299430847, "adv/std_reasoning": 0.6613468527793884, "adv/std_step_conf": 0.8568493127822876, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.48504681392600674, "calib/avg_num_step_conf": 8.03125, "calib/ece": 0.21810799999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002691573493318611, "calib/mean_conf": 0.990108, "calib/mu_c": 0.9900466321243524, "calib/mu_w": 0.9903157894736843, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21810799999999997, "calib/std_conf": 0.0009799673463947672, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9171544818817546, "calib/step_q_c_n": 1573.0, "calib/step_q_gap": 0.002127566767882949, "calib/step_q_w": 0.9150269151138717, "calib/step_q_w_n": 483.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2566.0, "completions/max_terminated_length": 2566.0, "completions/mean_length": 781.16796875, "completions/mean_terminated_length": 793.5675048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 331.0, "epoch": 0.0384, "grad_norm": 0.03296198695898056, "kl": 0.008137226104736328, "learning_rate": 4.555555555555556e-06, "loss": -0.0435, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019157538190484047, "mask/share_reasoning": 0.8529132604598999, "mask/share_step_conf": 0.11230425536632538, "num_tokens": 11171487.0, "reward": 0.608841598033905, "reward_std": 0.15445595979690552, "rewards/accuracy_reward_step": 0.75390625, "rewards/final_brier_reward_step": 0.7581222653388977, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.11346717923879623, "step": 36 }, { "adv/mean_abs_final_conf": 0.4047836661338806, "adv/mean_abs_reasoning": 0.39348751306533813, "adv/mean_abs_step_conf": 0.5305731296539307, "adv/ratio_final_to_reasoning": 1.0287077802813702, "adv/ratio_step_to_reasoning": 1.348386192793441, "adv/std_final_conf": 0.7002204656600952, "adv/std_reasoning": 0.7013662457466125, "adv/std_step_conf": 0.8072525858879089, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5042735042735043, "calib/avg_num_step_conf": 7.9453125, "calib/ece": 0.5085555555555556, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 7.692307692308553e-05, "calib/mean_conf": 0.990037037037037, "calib/mu_c": 0.990076923076923, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5085555555555556, "calib/std_conf": 0.0005761610809668171, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9135027685492803, "calib/step_q_c_n": 903.0, "calib/step_q_gap": 0.00091214078623858, "calib/step_q_w": 0.9125906277630417, "calib/step_q_w_n": 1131.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 787.68359375, "completions/mean_terminated_length": 819.7032470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 433.0, "epoch": 0.039466666666666664, "grad_norm": 0.026742523536086082, "kl": 0.0076446533203125, "learning_rate": 4.527777777777778e-06, "loss": -0.1265, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017800282686948776, "mask/share_reasoning": 0.8447290658950806, "mask/share_step_conf": 0.09840816259384155, "num_tokens": 11480230.0, "reward": 0.4046792984008789, "reward_std": 0.14460505545139313, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.46670272946357727, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.06218715012073517, "step": 37 }, { "adv/mean_abs_final_conf": 0.5496396422386169, "adv/mean_abs_reasoning": 0.512334942817688, "adv/mean_abs_step_conf": 0.6126781702041626, "adv/ratio_final_to_reasoning": 1.0728131077996834, "adv/ratio_step_to_reasoning": 1.195854740718283, "adv/std_final_conf": 0.8069299459457397, "adv/std_reasoning": 0.7928873896598816, "adv/std_step_conf": 0.8423981070518494, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.49514563106796117, "calib/avg_num_step_conf": 7.82421875, "calib/ece": 0.41742323651452296, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -8.737864077645874e-05, "calib/mean_conf": 0.9900373443983403, "calib/mu_c": 0.99, "calib/mu_w": 0.9900873786407764, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41742323651452296, "calib/std_conf": 0.0005785369313836811, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9172990654205607, "calib/step_q_c_n": 1070.0, "calib/step_q_gap": 0.0016591940379240278, "calib/step_q_w": 0.9156398713826367, "calib/step_q_w_n": 933.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2853.0, "completions/max_terminated_length": 2853.0, "completions/mean_length": 836.31640625, "completions/mean_terminated_length": 863.2943115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 465.0, "epoch": 0.04053333333333333, "grad_norm": 0.023593056946992874, "kl": 0.007700443267822266, "learning_rate": 4.5e-06, "loss": -0.0804, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017227908596396446, "mask/share_reasoning": 0.8561974763870239, "mask/share_step_conf": 0.09532465040683746, "num_tokens": 11801215.0, "reward": 0.4685192108154297, "reward_std": 0.1908518224954605, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.546945333480835, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.09399936348199844, "step": 38 }, { "adv/mean_abs_final_conf": 0.5071004033088684, "adv/mean_abs_reasoning": 0.41346973180770874, "adv/mean_abs_step_conf": 0.6606694459915161, "adv/ratio_final_to_reasoning": 1.226451090124063, "adv/ratio_step_to_reasoning": 1.5978665308897917, "adv/std_final_conf": 0.7393047213554382, "adv/std_reasoning": 0.6817445755004883, "adv/std_step_conf": 0.8568510413169861, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.46403485952133194, "calib/avg_num_step_conf": 7.953125, "calib/ece": 0.48999556451612913, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": -7.25806451606914e-06, "calib/mean_conf": 0.9899955645161291, "calib/mu_c": 0.9899919354838709, "calib/mu_w": 0.989999193548387, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.48999556451612913, "calib/std_conf": 0.006054525473856693, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9141459353574928, "calib/step_q_c_n": 1021.0, "calib/step_q_gap": -0.0014997789282215068, "calib/step_q_w": 0.9156457142857143, "calib/step_q_w_n": 1015.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 841.85546875, "completions/mean_terminated_length": 858.6255493164062, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.0416, "grad_norm": 0.04179238900542259, "kl": 0.008008956909179688, "learning_rate": 4.472222222222223e-06, "loss": -0.0299, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017614908516407013, "mask/share_reasoning": 0.861843466758728, "mask/share_step_conf": 0.10101038217544556, "num_tokens": 12122818.0, "reward": 0.43144068121910095, "reward_std": 0.1641264408826828, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.4939308166503906, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.07832558453083038, "step": 39 }, { "adv/mean_abs_final_conf": 0.5284343957901001, "adv/mean_abs_reasoning": 0.5218814611434937, "adv/mean_abs_step_conf": 0.5959415435791016, "adv/ratio_final_to_reasoning": 1.0125563660227521, "adv/ratio_step_to_reasoning": 1.1419097782728955, "adv/std_final_conf": 0.7917024493217468, "adv/std_reasoning": 0.7927063703536987, "adv/std_step_conf": 0.8258811831474304, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5044642857142857, "calib/avg_num_step_conf": 8.0234375, "calib/ece": 0.43943775100401616, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.000803571428571237, "calib/mean_conf": 0.9896385542168675, "calib/mu_c": 0.99, "calib/mu_w": 0.9891964285714288, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43943775100401616, "calib/std_conf": 0.005692053884827813, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9147750229568412, "calib/step_q_c_n": 1089.0, "calib/step_q_gap": -0.001535857872174362, "calib/step_q_w": 0.9163108808290156, "calib/step_q_w_n": 965.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3050.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 834.62890625, "completions/mean_terminated_length": 847.8770141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 446.0, "epoch": 0.042666666666666665, "grad_norm": 0.06637033820152283, "kl": 0.008520126342773438, "learning_rate": 4.444444444444444e-06, "loss": -0.0376, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01767815835773945, "mask/share_reasoning": 0.8664635419845581, "mask/share_step_conf": 0.100233294069767, "num_tokens": 12443243.0, "reward": 0.4634931683540344, "reward_std": 0.18711933493614197, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.54447340965271, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.08095047622919083, "step": 40 }, { "adv/mean_abs_final_conf": 0.3372285068035126, "adv/mean_abs_reasoning": 0.3330034017562866, "adv/mean_abs_step_conf": 0.6218752861022949, "adv/ratio_final_to_reasoning": 1.0126878735320493, "adv/ratio_step_to_reasoning": 1.867474274504329, "adv/std_final_conf": 0.6196356415748596, "adv/std_reasoning": 0.6186854243278503, "adv/std_step_conf": 0.8399285674095154, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4888888888888889, "calib/avg_num_step_conf": 8.2890625, "calib/ece": 0.17003600000000008, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00020000000000031104, "calib/mean_conf": 0.990036, "calib/mu_c": 0.99, "calib/mu_w": 0.9902000000000003, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17003600000000008, "calib/std_conf": 0.0005680704181701424, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9153654188948306, "calib/step_q_c_n": 1683.0, "calib/step_q_gap": -0.005192667665533834, "calib/step_q_w": 0.9205580865603644, "calib/step_q_w_n": 439.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 754.05078125, "completions/mean_terminated_length": 772.1480102539062, "completions/min_length": 0.0, "completions/min_terminated_length": 451.0, "epoch": 0.04373333333333333, "grad_norm": 0.020984956994652748, "kl": 0.0063877105712890625, "learning_rate": 4.416666666666667e-06, "loss": -0.09, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.018900573253631592, "mask/share_reasoning": 0.846361517906189, "mask/share_step_conf": 0.11130040884017944, "num_tokens": 12743528.0, "reward": 0.6339575052261353, "reward_std": 0.1435820609331131, "rewards/accuracy_reward_step": 0.80078125, "rewards/final_brier_reward_step": 0.8041292428970337, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.10831693559885025, "step": 41 }, { "adv/mean_abs_final_conf": 0.4162845015525818, "adv/mean_abs_reasoning": 0.3924916982650757, "adv/mean_abs_step_conf": 0.6317875385284424, "adv/ratio_final_to_reasoning": 1.060619889268173, "adv/ratio_step_to_reasoning": 1.609683825979306, "adv/std_final_conf": 0.6985313892364502, "adv/std_reasoning": 0.6815931797027588, "adv/std_step_conf": 0.8386624455451965, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4731182795698925, "calib/avg_num_step_conf": 7.7734375, "calib/ece": 0.36218000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00048387096774182847, "calib/mean_conf": 0.9901800000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9904838709677417, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36218000000000006, "calib/std_conf": 0.0012600000000000014, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9125294117647058, "calib/step_q_c_n": 1190.0, "calib/step_q_gap": -0.006409338235294326, "calib/step_q_w": 0.9189387500000001, "calib/step_q_w_n": 800.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2643.0, "completions/max_terminated_length": 2643.0, "completions/mean_length": 751.9609375, "completions/mean_terminated_length": 766.9402465820312, "completions/min_length": 0.0, "completions/min_terminated_length": 449.0, "epoch": 0.0448, "grad_norm": 0.01770918071269989, "kl": 0.0061492919921875, "learning_rate": 4.388888888888889e-06, "loss": -0.0379, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01905568316578865, "mask/share_reasoning": 0.8554246425628662, "mask/share_step_conf": 0.10598836839199066, "num_tokens": 13040398.0, "reward": 0.5209317207336426, "reward_std": 0.1415383666753769, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6200218200683594, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.10465407371520996, "step": 42 }, { "adv/mean_abs_final_conf": 0.5328917503356934, "adv/mean_abs_reasoning": 0.5094095468521118, "adv/mean_abs_step_conf": 0.7035216093063354, "adv/ratio_final_to_reasoning": 1.046096904992632, "adv/ratio_step_to_reasoning": 1.3810530518199668, "adv/std_final_conf": 0.7593136429786682, "adv/std_reasoning": 0.7577374577522278, "adv/std_step_conf": 0.9018055200576782, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5057803468208093, "calib/avg_num_step_conf": 8.0390625, "calib/ece": 0.2868211382113821, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00010404624277449148, "calib/mean_conf": 0.9900731707317073, "calib/mu_c": 0.9901040462427745, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2868211382113821, "calib/std_conf": 0.0008081971475990684, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.91319, "calib/step_q_c_n": 1400.0, "calib/step_q_gap": 0.006487872340425471, "calib/step_q_w": 0.9067021276595745, "calib/step_q_w_n": 658.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 824.4453125, "completions/mean_terminated_length": 837.5317993164062, "completions/min_length": 0.0, "completions/min_terminated_length": 457.0, "epoch": 0.04586666666666667, "grad_norm": 0.03040093369781971, "kl": 0.005013942718505859, "learning_rate": 4.361111111111112e-06, "loss": 0.0023, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01777583733201027, "mask/share_reasoning": 0.8648500442504883, "mask/share_step_conf": 0.10174912214279175, "num_tokens": 13356680.0, "reward": 0.5546459555625916, "reward_std": 0.215305894613266, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6735773086547852, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.1091521829366684, "step": 43 }, { "adv/mean_abs_final_conf": 0.4123830199241638, "adv/mean_abs_reasoning": 0.3843456506729126, "adv/mean_abs_step_conf": 0.5376432538032532, "adv/ratio_final_to_reasoning": 1.072948319311441, "adv/ratio_step_to_reasoning": 1.398853487380297, "adv/std_final_conf": 0.7005766034126282, "adv/std_reasoning": 0.6815363168716431, "adv/std_step_conf": 0.7874688506126404, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5202967171717172, "calib/avg_num_step_conf": 7.74609375, "calib/ece": 0.4647539682539684, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9880952380952381, "calib/gap": 0.0031598484848484043, "calib/mean_conf": 0.9885634920634921, "calib/mu_c": 0.9900681818181818, "calib/mu_w": 0.9869083333333334, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4647539682539684, "calib/std_conf": 0.012156712191113027, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9141165523996083, "calib/step_q_c_n": 1021.0, "calib/step_q_gap": 0.0032911885742445968, "calib/step_q_w": 0.9108253638253637, "calib/step_q_w_n": 962.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 811.140625, "completions/mean_terminated_length": 824.0159301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.046933333333333334, "grad_norm": 0.018216362223029137, "kl": 0.006135463714599609, "learning_rate": 4.333333333333334e-06, "loss": -0.0304, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.017669133841991425, "mask/share_reasoning": 0.8677611947059631, "mask/share_step_conf": 0.09894467145204544, "num_tokens": 13670652.0, "reward": 0.4570249915122986, "reward_std": 0.14610937237739563, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.527624249458313, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.08642578125, "step": 44 }, { "adv/mean_abs_final_conf": 0.45895838737487793, "adv/mean_abs_reasoning": 0.404518187046051, "adv/mean_abs_step_conf": 0.66061931848526, "adv/ratio_final_to_reasoning": 1.1345803528053222, "adv/ratio_step_to_reasoning": 1.633101649419916, "adv/std_final_conf": 0.7361606359481812, "adv/std_reasoning": 0.7014278769493103, "adv/std_step_conf": 0.8715624809265137, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5007148692810457, "calib/avg_num_step_conf": 8.35546875, "calib/ece": 0.37032128514056234, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9799196787148594, "calib/gap": 0.008964460784313566, "calib/mean_conf": 0.9847791164658636, "calib/mu_c": 0.988235294117647, "calib/mu_w": 0.9792708333333334, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37032128514056234, "calib/std_conf": 0.06042728882351692, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.913917525773196, "calib/step_q_c_n": 1261.0, "calib/step_q_gap": 0.0008537444520113624, "calib/step_q_w": 0.9130637813211846, "calib/step_q_w_n": 878.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 815.9453125, "completions/mean_terminated_length": 832.19921875, "completions/min_length": 0.0, "completions/min_terminated_length": 404.0, "epoch": 0.048, "grad_norm": 0.02665361762046814, "kl": 0.005267143249511719, "learning_rate": 4.305555555555556e-06, "loss": -0.0478, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01788826286792755, "mask/share_reasoning": 0.8560677766799927, "mask/share_step_conf": 0.10651271045207977, "num_tokens": 13984582.0, "reward": 0.5027612447738647, "reward_std": 0.1638513207435608, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6094261407852173, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.08203388005495071, "step": 45 }, { "adv/mean_abs_final_conf": 0.4618817865848541, "adv/mean_abs_reasoning": 0.4293670654296875, "adv/mean_abs_step_conf": 0.6539697647094727, "adv/ratio_final_to_reasoning": 1.0757270964009027, "adv/ratio_step_to_reasoning": 1.5231018337538647, "adv/std_final_conf": 0.7171543836593628, "adv/std_reasoning": 0.7015634179115295, "adv/std_step_conf": 0.8558611869812012, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.5004960317460317, "calib/avg_num_step_conf": 8.74609375, "calib/ece": 0.4598319327731092, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.9915966386554622, "calib/gap": 8.928571428545862e-05, "calib/mean_conf": 0.9892436974789915, "calib/mu_c": 0.9892857142857142, "calib/mu_w": 0.9891964285714288, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4598319327731092, "calib/std_conf": 0.008215548270655624, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9102967359050446, "calib/step_q_c_n": 1011.0, "calib/step_q_gap": -0.013925576798538675, "calib/step_q_w": 0.9242223127035832, "calib/step_q_w_n": 1228.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 837.4296875, "completions/mean_terminated_length": 871.4714965820312, "completions/min_length": 0.0, "completions/min_terminated_length": 391.0, "epoch": 0.04906666666666667, "grad_norm": 0.03318821266293526, "kl": 0.006639957427978516, "learning_rate": 4.277777777777778e-06, "loss": -0.1445, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.017392775043845177, "mask/share_reasoning": 0.8425112366676331, "mask/share_step_conf": 0.10103348642587662, "num_tokens": 14303732.0, "reward": 0.42180848121643066, "reward_std": 0.16216880083084106, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5013925433158875, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.05863059312105179, "step": 46 }, { "adv/mean_abs_final_conf": 0.4371130168437958, "adv/mean_abs_reasoning": 0.3986102342605591, "adv/mean_abs_step_conf": 0.6053522229194641, "adv/ratio_final_to_reasoning": 1.0965925590311578, "adv/ratio_step_to_reasoning": 1.5186570009734477, "adv/std_final_conf": 0.7007460594177246, "adv/std_reasoning": 0.6818104982376099, "adv/std_step_conf": 0.8250387907028198, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.4814814814814815, "calib/avg_num_step_conf": 7.7734375, "calib/ece": 0.32482231404958684, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00033333333333340764, "calib/mean_conf": 0.990111570247934, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9903333333333333, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32482231404958684, "calib/std_conf": 0.000995833475627794, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9095280716029294, "calib/step_q_c_n": 1229.0, "calib/step_q_gap": -0.0013523488964136687, "calib/step_q_w": 0.910880420499343, "calib/step_q_w_n": 761.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2255.0, "completions/max_terminated_length": 2255.0, "completions/mean_length": 831.9140625, "completions/mean_terminated_length": 862.2267456054688, "completions/min_length": 0.0, "completions/min_terminated_length": 477.0, "epoch": 0.050133333333333335, "grad_norm": 0.017103997990489006, "kl": 0.006692409515380859, "learning_rate": 4.25e-06, "loss": -0.1294, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.016787182539701462, "mask/share_reasoning": 0.8566350936889648, "mask/share_step_conf": 0.09142149239778519, "num_tokens": 14622678.0, "reward": 0.5265778303146362, "reward_std": 0.1641303300857544, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6349300146102905, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.1033819317817688, "step": 47 }, { "adv/mean_abs_final_conf": 0.5257114768028259, "adv/mean_abs_reasoning": 0.5039148330688477, "adv/mean_abs_step_conf": 0.5479354858398438, "adv/ratio_final_to_reasoning": 1.04325461824816, "adv/ratio_step_to_reasoning": 1.0873573268381678, "adv/std_final_conf": 0.7919268012046814, "adv/std_reasoning": 0.775353193283081, "adv/std_step_conf": 0.8077905178070068, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5047619047619047, "calib/avg_num_step_conf": 7.58984375, "calib/ece": 0.4194631147540984, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": 0.0020095238095236834, "calib/mean_conf": 0.9891352459016393, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9879904761904762, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4194631147540984, "calib/std_conf": 0.012297296931634134, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9099449035812672, "calib/step_q_c_n": 1089.0, "calib/step_q_gap": 0.010992912948948774, "calib/step_q_w": 0.8989519906323185, "calib/step_q_w_n": 854.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2672.0, "completions/max_terminated_length": 2672.0, "completions/mean_length": 821.234375, "completions/mean_terminated_length": 844.3212280273438, "completions/min_length": 0.0, "completions/min_terminated_length": 459.0, "epoch": 0.0512, "grad_norm": 0.017073359340429306, "kl": 0.006745338439941406, "learning_rate": 4.222222222222223e-06, "loss": -0.0713, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017865825444459915, "mask/share_reasoning": 0.8551771640777588, "mask/share_step_conf": 0.09961327910423279, "num_tokens": 14936602.0, "reward": 0.47426408529281616, "reward_std": 0.18687653541564941, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5525636672973633, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.09674571454524994, "step": 48 }, { "adv/mean_abs_final_conf": 0.4595940411090851, "adv/mean_abs_reasoning": 0.449293851852417, "adv/mean_abs_step_conf": 0.6292790174484253, "adv/ratio_final_to_reasoning": 1.022925284230356, "adv/ratio_step_to_reasoning": 1.4005956566152378, "adv/std_final_conf": 0.7192720770835876, "adv/std_reasoning": 0.7208415269851685, "adv/std_step_conf": 0.8543146848678589, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.502266081871345, "calib/avg_num_step_conf": 7.8125, "calib/ece": 0.3611570247933884, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9917355371900827, "calib/gap": 0.0004078947368421071, "calib/mean_conf": 0.9892561983471074, "calib/mu_c": 0.989407894736842, "calib/mu_w": 0.9889999999999999, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3611570247933884, "calib/std_conf": 0.008147938871977678, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9114995857497927, "calib/step_q_c_n": 1207.0, "calib/step_q_gap": -2.6265448189621843e-05, "calib/step_q_w": 0.9115258511979824, "calib/step_q_w_n": 793.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 788.55078125, "completions/mean_terminated_length": 813.9878540039062, "completions/min_length": 0.0, "completions/min_terminated_length": 468.0, "epoch": 0.05226666666666667, "grad_norm": 0.023175053298473358, "kl": 0.007882118225097656, "learning_rate": 4.194444444444445e-06, "loss": -0.1371, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.017746923491358757, "mask/share_reasoning": 0.8509671688079834, "mask/share_step_conf": 0.10003594309091568, "num_tokens": 15243007.0, "reward": 0.5011695623397827, "reward_std": 0.16718772053718567, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.601312518119812, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.09321413934230804, "step": 49 }, { "adv/mean_abs_final_conf": 0.4921131432056427, "adv/mean_abs_reasoning": 0.48984989523887634, "adv/mean_abs_step_conf": 0.5728212594985962, "adv/ratio_final_to_reasoning": 1.0046202887634847, "adv/ratio_step_to_reasoning": 1.1693812024176482, "adv/std_final_conf": 0.7738939523696899, "adv/std_reasoning": 0.7753498554229736, "adv/std_step_conf": 0.8254184126853943, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5031446540880503, "calib/avg_num_step_conf": 7.66015625, "calib/ece": 0.3514819277108435, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.660377358485125e-05, "calib/mean_conf": 0.9900361445783133, "calib/mu_c": 0.9900566037735848, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3514819277108435, "calib/std_conf": 0.0005692053884827821, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9104567901234567, "calib/step_q_c_n": 1215.0, "calib/step_q_gap": -0.000709429715685439, "calib/step_q_w": 0.9111662198391421, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 796.5390625, "completions/mean_terminated_length": 812.4063720703125, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 0.05333333333333334, "grad_norm": 0.011566097848117352, "kl": 0.008397102355957031, "learning_rate": 4.166666666666667e-06, "loss": -0.034, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01855967566370964, "mask/share_reasoning": 0.8587042689323425, "mask/share_step_conf": 0.10320483148097992, "num_tokens": 15552281.0, "reward": 0.5237906575202942, "reward_std": 0.1779598593711853, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.628028154373169, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.1008031815290451, "step": 50 }, { "adv/mean_abs_final_conf": 0.4773007929325104, "adv/mean_abs_reasoning": 0.46361416578292847, "adv/mean_abs_step_conf": 0.6320586204528809, "adv/ratio_final_to_reasoning": 1.0295215896314742, "adv/ratio_step_to_reasoning": 1.3633289642595192, "adv/std_final_conf": 0.7180315852165222, "adv/std_reasoning": 0.7207627296447754, "adv/std_step_conf": 0.8397784233093262, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.495, "calib/avg_num_step_conf": 7.92578125, "calib/ece": 0.4032603305785123, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -9.00000000000345e-05, "calib/mean_conf": 0.9900371900826446, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9900899999999999, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4032603305785123, "calib/std_conf": 0.0005773453399435551, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9121254355400696, "calib/step_q_c_n": 1148.0, "calib/step_q_gap": 0.004233267549150321, "calib/step_q_w": 0.9078921679909193, "calib/step_q_w_n": 881.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 796.359375, "completions/mean_terminated_length": 805.8024291992188, "completions/min_length": 0.0, "completions/min_terminated_length": 70.0, "epoch": 0.0544, "grad_norm": 0.01405671238899231, "kl": 0.008150100708007812, "learning_rate": 4.138888888888889e-06, "loss": 0.0027, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018297288566827774, "mask/share_reasoning": 0.8663866519927979, "mask/share_step_conf": 0.10359735786914825, "num_tokens": 15865445.0, "reward": 0.47372934222221375, "reward_std": 0.16854144632816315, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5623355507850647, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.08512316644191742, "step": 51 }, { "adv/mean_abs_final_conf": 0.4914446175098419, "adv/mean_abs_reasoning": 0.4863370358943939, "adv/mean_abs_step_conf": 0.6917616128921509, "adv/ratio_final_to_reasoning": 1.0105021440656992, "adv/ratio_step_to_reasoning": 1.422391390818042, "adv/std_final_conf": 0.7571762204170227, "adv/std_reasoning": 0.7577103972434998, "adv/std_step_conf": 0.8891385197639465, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.12109375, "calib/ece": 0.28629629629629627, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28629629629629627, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9075261885576147, "calib/step_q_c_n": 1241.0, "calib/step_q_gap": 0.004897322578233387, "calib/step_q_w": 0.9026288659793813, "calib/step_q_w_n": 582.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2762.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 831.3828125, "completions/mean_terminated_length": 834.6431884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 462.0, "epoch": 0.055466666666666664, "grad_norm": 0.016431264579296112, "kl": 0.008275032043457031, "learning_rate": 4.111111111111111e-06, "loss": -0.0237, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017841722816228867, "mask/share_reasoning": 0.8831788897514343, "mask/share_step_conf": 0.09507312625646591, "num_tokens": 16186231.0, "reward": 0.5599114894866943, "reward_std": 0.19265946745872498, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.673498809337616, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.12288664281368256, "step": 52 }, { "adv/mean_abs_final_conf": 0.4204365909099579, "adv/mean_abs_reasoning": 0.4040074646472931, "adv/mean_abs_step_conf": 0.7082507610321045, "adv/ratio_final_to_reasoning": 1.0406654027469708, "adv/ratio_step_to_reasoning": 1.7530635520569455, "adv/std_final_conf": 0.6820698976516724, "adv/std_reasoning": 0.6816956996917725, "adv/std_step_conf": 0.8729038238525391, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4939759036144578, "calib/avg_num_step_conf": 7.24609375, "calib/ece": 0.3233694779116466, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00010843373493973907, "calib/mean_conf": 0.9900361445783132, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9901084337349396, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3233694779116466, "calib/std_conf": 0.000569205388482782, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9095084745762713, "calib/step_q_c_n": 1180.0, "calib/step_q_gap": 0.006305511613308279, "calib/step_q_w": 0.903202962962963, "calib/step_q_w_n": 675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2658.0, "completions/max_terminated_length": 2658.0, "completions/mean_length": 810.41796875, "completions/mean_terminated_length": 826.561767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.05653333333333333, "grad_norm": 0.02731546014547348, "kl": 0.025946617126464844, "learning_rate": 4.083333333333334e-06, "loss": -0.0271, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017761999741196632, "mask/share_reasoning": 0.8694026470184326, "mask/share_step_conf": 0.09330415725708008, "num_tokens": 16499522.0, "reward": 0.5239921808242798, "reward_std": 0.1667671799659729, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.654754638671875, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.06901100277900696, "step": 53 }, { "adv/mean_abs_final_conf": 0.3426434397697449, "adv/mean_abs_reasoning": 0.3180202841758728, "adv/mean_abs_step_conf": 0.5529361963272095, "adv/ratio_final_to_reasoning": 1.0774263681251692, "adv/ratio_step_to_reasoning": 1.7386821653848425, "adv/std_final_conf": 0.6070969700813293, "adv/std_reasoning": 0.5961459875106812, "adv/std_step_conf": 0.8079472780227661, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49731182795698925, "calib/avg_num_step_conf": 7.73046875, "calib/ece": 0.23684210526315785, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00016129032258083154, "calib/mean_conf": 0.9898785425101214, "calib/mu_c": 0.9898387096774194, "calib/mu_w": 0.9900000000000002, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23684210526315785, "calib/std_conf": 0.0019049862924726479, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9124880708929788, "calib/step_q_c_n": 1467.0, "calib/step_q_gap": -0.00012911660702119576, "calib/step_q_w": 0.9126171875, "calib/step_q_w_n": 512.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 761.671875, "completions/mean_terminated_length": 770.70361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 404.0, "epoch": 0.0576, "grad_norm": 0.010707957670092583, "kl": 0.0113067626953125, "learning_rate": 4.055555555555556e-06, "loss": -0.0217, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01893680728971958, "mask/share_reasoning": 0.8616507649421692, "mask/share_step_conf": 0.10769366472959518, "num_tokens": 16800742.0, "reward": 0.5761798024177551, "reward_std": 0.12148773670196533, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7312257885932922, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.08207134902477264, "step": 54 }, { "adv/mean_abs_final_conf": 0.42023879289627075, "adv/mean_abs_reasoning": 0.40138623118400574, "adv/mean_abs_step_conf": 0.6069879531860352, "adv/ratio_final_to_reasoning": 1.04696863082885, "adv/ratio_step_to_reasoning": 1.5122291350043253, "adv/std_final_conf": 0.6802883148193359, "adv/std_reasoning": 0.6816930174827576, "adv/std_step_conf": 0.8228247761726379, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5106805929919137, "calib/avg_num_step_conf": 7.4296875, "calib/ece": 0.40886178861788613, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9878048780487805, "calib/gap": 0.027095687331536333, "calib/mean_conf": 0.9779674796747967, "calib/mu_c": 0.989642857142857, "calib/mu_w": 0.9625471698113207, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40886178861788613, "calib/std_conf": 0.10648836227565918, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9107555970149254, "calib/step_q_c_n": 1072.0, "calib/step_q_gap": 0.006092946412515676, "calib/step_q_w": 0.9046626506024097, "calib/step_q_w_n": 830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2273.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 792.6484375, "completions/mean_terminated_length": 805.230224609375, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.058666666666666666, "grad_norm": 0.021472781896591187, "kl": 0.011422157287597656, "learning_rate": 4.027777777777779e-06, "loss": -0.0787, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018600594252347946, "mask/share_reasoning": 0.8633120059967041, "mask/share_step_conf": 0.10246237367391586, "num_tokens": 17111484.0, "reward": 0.4833521842956543, "reward_std": 0.15108683705329895, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5664495825767517, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.0994734913110733, "step": 55 }, { "adv/mean_abs_final_conf": 0.5721842050552368, "adv/mean_abs_reasoning": 0.5660943388938904, "adv/mean_abs_step_conf": 0.6459726691246033, "adv/ratio_final_to_reasoning": 1.0107576877967825, "adv/ratio_step_to_reasoning": 1.1411042731619427, "adv/std_final_conf": 0.8073209524154663, "adv/std_reasoning": 0.8099371194839478, "adv/std_step_conf": 0.8578875660896301, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.5051020408163265, "calib/avg_num_step_conf": 6.96484375, "calib/ece": 0.41237068965517254, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00010204081632658735, "calib/mean_conf": 0.9899568965517243, "calib/mu_c": 0.99, "calib/mu_w": 0.9898979591836734, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41237068965517254, "calib/std_conf": 0.0006551156962745981, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9094709543568464, "calib/step_q_c_n": 964.0, "calib/step_q_gap": 0.007370832256724302, "calib/step_q_w": 0.9021001221001221, "calib/step_q_w_n": 819.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 790.390625, "completions/mean_terminated_length": 825.8775024414062, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.05973333333333333, "grad_norm": 0.019272800534963608, "kl": 0.01163482666015625, "learning_rate": 4.000000000000001e-06, "loss": -0.1155, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.01745789125561714, "mask/share_reasoning": 0.847215473651886, "mask/share_step_conf": 0.09235785901546478, "num_tokens": 17420664.0, "reward": 0.454298198223114, "reward_std": 0.18432661890983582, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5310801267623901, "rewards/format_reward_step": 0.90625, "rewards/step_margin_reward": 0.09157884120941162, "step": 56 }, { "adv/mean_abs_final_conf": 0.394421249628067, "adv/mean_abs_reasoning": 0.38101887702941895, "adv/mean_abs_step_conf": 0.6382747292518616, "adv/ratio_final_to_reasoning": 1.0351750881823456, "adv/ratio_step_to_reasoning": 1.6751787581447823, "adv/std_final_conf": 0.6921825408935547, "adv/std_reasoning": 0.6815177798271179, "adv/std_step_conf": 0.8579009175300598, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5110789980732178, "calib/avg_num_step_conf": 7.140625, "calib/ece": 0.2828163265306123, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9918367346938776, "calib/gap": 0.003021034039820125, "calib/mean_conf": 0.988938775510204, "calib/mu_c": 0.9898265895953756, "calib/mu_w": 0.9868055555555555, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2828163265306123, "calib/std_conf": 0.0107521391573867, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9091732283464568, "calib/step_q_c_n": 1270.0, "calib/step_q_gap": 0.0070047695650946595, "calib/step_q_w": 0.9021684587813621, "calib/step_q_w_n": 558.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2927.0, "completions/max_terminated_length": 2927.0, "completions/mean_length": 776.515625, "completions/mean_terminated_length": 798.3453369140625, "completions/min_length": 0.0, "completions/min_terminated_length": 433.0, "epoch": 0.0608, "grad_norm": 0.017944002524018288, "kl": 0.012668609619140625, "learning_rate": 3.972222222222223e-06, "loss": 0.0063, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018619872629642487, "mask/share_reasoning": 0.8578202128410339, "mask/share_step_conf": 0.09621616452932358, "num_tokens": 17726244.0, "reward": 0.5478638410568237, "reward_std": 0.1483166217803955, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6790695190429688, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.09087696671485901, "step": 57 }, { "adv/mean_abs_final_conf": 0.5725237131118774, "adv/mean_abs_reasoning": 0.5564943552017212, "adv/mean_abs_step_conf": 0.6501367092132568, "adv/ratio_final_to_reasoning": 1.028804169818301, "adv/ratio_step_to_reasoning": 1.1682718847661835, "adv/std_final_conf": 0.7908139824867249, "adv/std_reasoning": 0.7930740714073181, "adv/std_step_conf": 0.8374752402305603, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.505871990604815, "calib/avg_num_step_conf": 7.296875, "calib/ece": 0.4280425531914893, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.9914893617021276, "calib/gap": 0.010019817968291433, "calib/mean_conf": 0.9854893617021278, "calib/mu_c": 0.9899236641221375, "calib/mu_w": 0.9799038461538461, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4280425531914893, "calib/std_conf": 0.06273954005909367, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9080383795309168, "calib/step_q_c_n": 938.0, "calib/step_q_gap": 0.005651282756723219, "calib/step_q_w": 0.9023870967741936, "calib/step_q_w_n": 930.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 829.25390625, "completions/mean_terminated_length": 866.4856567382812, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.06186666666666667, "grad_norm": 0.03738880529999733, "kl": 0.01822662353515625, "learning_rate": 3.944444444444445e-06, "loss": -0.2312, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.01653483882546425, "mask/share_reasoning": 0.853093147277832, "mask/share_step_conf": 0.08740321546792984, "num_tokens": 18044853.0, "reward": 0.43696129322052, "reward_std": 0.2052234709262848, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5241624712944031, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.06460385769605637, "step": 58 }, { "adv/mean_abs_final_conf": 0.5244466066360474, "adv/mean_abs_reasoning": 0.4894862473011017, "adv/mean_abs_step_conf": 0.6316576600074768, "adv/ratio_final_to_reasoning": 1.0714225568699998, "adv/ratio_step_to_reasoning": 1.2904502700336749, "adv/std_final_conf": 0.7405335903167725, "adv/std_reasoning": 0.7208307981491089, "adv/std_step_conf": 0.8248673677444458, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4893758765778401, "calib/avg_num_step_conf": 7.3359375, "calib/ece": 0.36076923076923084, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9838056680161943, "calib/gap": -0.0011507713884990611, "calib/mean_conf": 0.9882995951417005, "calib/mu_c": 0.9878709677419355, "calib/mu_w": 0.9890217391304346, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36076923076923084, "calib/std_conf": 0.011643460043277919, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9067229129662523, "calib/step_q_c_n": 1126.0, "calib/step_q_gap": -0.005218576395449759, "calib/step_q_w": 0.911941489361702, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 804.5546875, "completions/mean_terminated_length": 823.864013671875, "completions/min_length": 0.0, "completions/min_terminated_length": 364.0, "epoch": 0.06293333333333333, "grad_norm": 0.01749482750892639, "kl": 0.014242172241210938, "learning_rate": 3.916666666666667e-06, "loss": -0.0826, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018056441098451614, "mask/share_reasoning": 0.8629554510116577, "mask/share_step_conf": 0.09555056691169739, "num_tokens": 18357067.0, "reward": 0.5010087490081787, "reward_std": 0.21150478720664978, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6130964756011963, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.07485847175121307, "step": 59 }, { "adv/mean_abs_final_conf": 0.47424575686454773, "adv/mean_abs_reasoning": 0.46962666511535645, "adv/mean_abs_step_conf": 0.5367334485054016, "adv/ratio_final_to_reasoning": 1.009835667546809, "adv/ratio_step_to_reasoning": 1.1428938950337528, "adv/std_final_conf": 0.7570666670799255, "adv/std_reasoning": 0.7576452493667603, "adv/std_step_conf": 0.8085055351257324, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5095238095238095, "calib/avg_num_step_conf": 7.65625, "calib/ece": 0.4199180327868852, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": 0.0009523809523807047, "calib/mean_conf": 0.9895901639344262, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9890476190476192, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4199180327868852, "calib/std_conf": 0.005782611155559759, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9071698113207548, "calib/step_q_c_n": 1060.0, "calib/step_q_gap": 0.008580922431865923, "calib/step_q_w": 0.8985888888888889, "calib/step_q_w_n": 900.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2912.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 788.81640625, "completions/mean_terminated_length": 807.748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 417.0, "epoch": 0.064, "grad_norm": 0.014264144003391266, "kl": 0.017797470092773438, "learning_rate": 3.88888888888889e-06, "loss": -0.084, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018555093556642532, "mask/share_reasoning": 0.8566738367080688, "mask/share_step_conf": 0.1013336181640625, "num_tokens": 18667860.0, "reward": 0.4674496352672577, "reward_std": 0.1731988787651062, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5517401695251465, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.08472155779600143, "step": 60 }, { "adv/mean_abs_final_conf": 0.37328630685806274, "adv/mean_abs_reasoning": 0.3615089952945709, "adv/mean_abs_step_conf": 0.6297774314880371, "adv/ratio_final_to_reasoning": 1.032578197822976, "adv/ratio_step_to_reasoning": 1.7420795600808525, "adv/std_final_conf": 0.6613009572029114, "adv/std_reasoning": 0.6612700819969177, "adv/std_step_conf": 0.8570879101753235, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.6875, "calib/ece": 0.2883870967741936, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2883870967741936, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9083868501529052, "calib/step_q_c_n": 1308.0, "calib/step_q_gap": -0.0008101195440644027, "calib/step_q_w": 0.9091969696969696, "calib/step_q_w_n": 660.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 772.40625, "completions/mean_terminated_length": 781.5652465820312, "completions/min_length": 0.0, "completions/min_terminated_length": 320.0, "epoch": 0.06506666666666666, "grad_norm": 0.013438318856060505, "kl": 0.016721725463867188, "learning_rate": 3.861111111111112e-06, "loss": 0.0283, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019184596836566925, "mask/share_reasoning": 0.8619869947433472, "mask/share_step_conf": 0.10710963606834412, "num_tokens": 18969660.0, "reward": 0.555734395980835, "reward_std": 0.15007826685905457, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6853718757629395, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.09640936553478241, "step": 61 }, { "adv/mean_abs_final_conf": 0.574912965297699, "adv/mean_abs_reasoning": 0.5460503697395325, "adv/mean_abs_step_conf": 0.6357508301734924, "adv/ratio_final_to_reasoning": 1.0528570204465462, "adv/ratio_step_to_reasoning": 1.1642714031614836, "adv/std_final_conf": 0.7941304445266724, "adv/std_reasoning": 0.7756006717681885, "adv/std_step_conf": 0.850304901599884, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5096281982942431, "calib/avg_num_step_conf": 7.46875, "calib/ece": 0.4432723577235772, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9796747967479674, "calib/gap": 0.0015511727078889592, "calib/mean_conf": 0.9879878048780487, "calib/mu_c": 0.9886940298507462, "calib/mu_w": 0.9871428571428572, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4432723577235772, "calib/std_conf": 0.014402597357025365, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9045140280561121, "calib/step_q_c_n": 998.0, "calib/step_q_gap": 0.002511839872304633, "calib/step_q_w": 0.9020021881838075, "calib/step_q_w_n": 914.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 775.1953125, "completions/mean_terminated_length": 787.5000610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 425.0, "epoch": 0.06613333333333334, "grad_norm": 0.015270399861037731, "kl": 0.014867782592773438, "learning_rate": 3.833333333333334e-06, "loss": -0.0732, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018467646092176437, "mask/share_reasoning": 0.8652955889701843, "mask/share_step_conf": 0.10061179101467133, "num_tokens": 19275190.0, "reward": 0.45150452852249146, "reward_std": 0.2253757119178772, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5343495011329651, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.07178457081317902, "step": 62 }, { "adv/mean_abs_final_conf": 0.48102137446403503, "adv/mean_abs_reasoning": 0.46375536918640137, "adv/mean_abs_step_conf": 0.6127632856369019, "adv/ratio_final_to_reasoning": 1.0372308471768739, "adv/ratio_step_to_reasoning": 1.3213071510350716, "adv/std_final_conf": 0.7569454908370972, "adv/std_reasoning": 0.7576775550842285, "adv/std_step_conf": 0.8411734700202942, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5075443363844394, "calib/avg_num_step_conf": 7.0546875, "calib/ece": 0.3661475409836066, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9918032786885246, "calib/gap": 0.0008209382151029843, "calib/mean_conf": 0.9890983606557378, "calib/mu_c": 0.989407894736842, "calib/mu_w": 0.988586956521739, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3661475409836066, "calib/std_conf": 0.008493401330739609, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9051300448430494, "calib/step_q_c_n": 1115.0, "calib/step_q_gap": 0.0038420564204733054, "calib/step_q_w": 0.9012879884225761, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2872.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 852.64453125, "completions/mean_terminated_length": 873.1080322265625, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 0.0672, "grad_norm": 0.010692139156162739, "kl": 0.01459503173828125, "learning_rate": 3.8055555555555556e-06, "loss": -0.1328, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01723906397819519, "mask/share_reasoning": 0.868358850479126, "mask/share_step_conf": 0.09096457064151764, "num_tokens": 19602107.0, "reward": 0.5032086372375488, "reward_std": 0.1856970191001892, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6016933917999268, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.09534888714551926, "step": 63 }, { "adv/mean_abs_final_conf": 0.47164392471313477, "adv/mean_abs_reasoning": 0.4207276701927185, "adv/mean_abs_step_conf": 0.6345499753952026, "adv/ratio_final_to_reasoning": 1.1210195053182348, "adv/ratio_step_to_reasoning": 1.508220210723342, "adv/std_final_conf": 0.7202219367027283, "adv/std_reasoning": 0.681753396987915, "adv/std_step_conf": 0.8401838541030884, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5150887573964497, "calib/avg_num_step_conf": 7.6796875, "calib/ece": 0.2461133603238866, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9757085020242915, "calib/gap": 0.021131868131867848, "calib/mean_conf": 0.9829554655870445, "calib/mu_c": 0.9885164835164834, "calib/mu_w": 0.9673846153846155, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2461133603238866, "calib/std_conf": 0.06830339883190079, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9066546242774566, "calib/step_q_c_n": 1384.0, "calib/step_q_gap": -8.07709115469013e-05, "calib/step_q_w": 0.9067353951890035, "calib/step_q_w_n": 582.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 759.03125, "completions/mean_terminated_length": 774.1514282226562, "completions/min_length": 0.0, "completions/min_terminated_length": 459.0, "epoch": 0.06826666666666667, "grad_norm": 0.022808127105236053, "kl": 0.017221450805664062, "learning_rate": 3.777777777777778e-06, "loss": -0.0417, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01875794306397438, "mask/share_reasoning": 0.8583047389984131, "mask/share_step_conf": 0.10340610891580582, "num_tokens": 19900195.0, "reward": 0.5784550309181213, "reward_std": 0.1765058934688568, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.722718358039856, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.09903548657894135, "step": 64 }, { "adv/mean_abs_final_conf": 0.37748831510543823, "adv/mean_abs_reasoning": 0.3057681620121002, "adv/mean_abs_step_conf": 0.5369272232055664, "adv/ratio_final_to_reasoning": 1.2345572953749835, "adv/ratio_step_to_reasoning": 1.7559945406752926, "adv/std_final_conf": 0.6599483489990234, "adv/std_reasoning": 0.5960834622383118, "adv/std_step_conf": 0.7724637389183044, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5141004323245759, "calib/avg_num_step_conf": 7.6015625, "calib/ece": 0.3729365079365079, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9801587301587301, "calib/gap": 0.0021376787495842597, "calib/mean_conf": 0.988015873015873, "calib/mu_c": 0.9888387096774193, "calib/mu_w": 0.986701030927835, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3729365079365079, "calib/std_conf": 0.012911164190179741, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9081849315068495, "calib/step_q_c_n": 1168.0, "calib/step_q_gap": 0.00015151248371325643, "calib/step_q_w": 0.9080334190231363, "calib/step_q_w_n": 778.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 710.98046875, "completions/mean_terminated_length": 716.5787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 384.0, "epoch": 0.06933333333333333, "grad_norm": 0.026061657816171646, "kl": 0.018686294555664062, "learning_rate": 3.7500000000000005e-06, "loss": -0.0164, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02051321603357792, "mask/share_reasoning": 0.8589115142822266, "mask/share_step_conf": 0.11276276409626007, "num_tokens": 20187230.0, "reward": 0.5171620845794678, "reward_std": 0.13296625018119812, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6152414083480835, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.1003328263759613, "step": 65 }, { "adv/mean_abs_final_conf": 0.5113359689712524, "adv/mean_abs_reasoning": 0.47039955854415894, "adv/mean_abs_step_conf": 0.6045668721199036, "adv/ratio_final_to_reasoning": 1.087024763700433, "adv/ratio_step_to_reasoning": 1.2852198968701831, "adv/std_final_conf": 0.7546648383140564, "adv/std_reasoning": 0.720639705657959, "adv/std_step_conf": 0.8251810669898987, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5253801745987046, "calib/avg_num_step_conf": 7.38671875, "calib/ece": 0.43100000000000005, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006646015206983602, "calib/mean_conf": 0.9893333333333334, "calib/mu_c": 0.9896268656716417, "calib/mu_w": 0.9889622641509433, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43100000000000005, "calib/std_conf": 0.004027681991198195, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9069639278557116, "calib/step_q_c_n": 998.0, "calib/step_q_gap": 0.0039964026597428814, "calib/step_q_w": 0.9029675251959687, "calib/step_q_w_n": 893.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 859.7734375, "completions/mean_terminated_length": 876.9004516601562, "completions/min_length": 0.0, "completions/min_terminated_length": 400.0, "epoch": 0.0704, "grad_norm": 0.02952716499567032, "kl": 0.018499374389648438, "learning_rate": 3.7222222222222225e-06, "loss": -0.0558, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.01736615225672722, "mask/share_reasoning": 0.8694962859153748, "mask/share_step_conf": 0.09360626339912415, "num_tokens": 20513684.0, "reward": 0.4557473063468933, "reward_std": 0.1637571156024933, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5324562191963196, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.08685079962015152, "step": 66 }, { "adv/mean_abs_final_conf": 0.43661728501319885, "adv/mean_abs_reasoning": 0.32266223430633545, "adv/mean_abs_step_conf": 0.5627602934837341, "adv/ratio_final_to_reasoning": 1.3531713308557658, "adv/ratio_step_to_reasoning": 1.7441157769627593, "adv/std_final_conf": 0.7150957584381104, "adv/std_reasoning": 0.6185758113861084, "adv/std_step_conf": 0.8077980875968933, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5259920352723653, "calib/avg_num_step_conf": 7.80859375, "calib/ece": 0.3436437246963563, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.951417004048583, "calib/gap": 0.00782463376475584, "calib/mean_conf": 0.9833198380566801, "calib/mu_c": 0.986139240506329, "calib/mu_w": 0.9783146067415731, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3436437246963563, "calib/std_conf": 0.034727261968954794, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9065346534653466, "calib/step_q_c_n": 1212.0, "calib/step_q_gap": -0.0034526400543485325, "calib/step_q_w": 0.9099872935196951, "calib/step_q_w_n": 787.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2409.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 798.78515625, "completions/mean_terminated_length": 817.9560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.07146666666666666, "grad_norm": 0.02919713407754898, "kl": 0.016553878784179688, "learning_rate": 3.694444444444445e-06, "loss": -0.075, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017789488658308983, "mask/share_reasoning": 0.8611921072006226, "mask/share_step_conf": 0.0975809246301651, "num_tokens": 20823181.0, "reward": 0.5265329480171204, "reward_std": 0.1243123710155487, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.630833625793457, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.10582610964775085, "step": 67 }, { "adv/mean_abs_final_conf": 0.6287695169448853, "adv/mean_abs_reasoning": 0.4485987424850464, "adv/mean_abs_step_conf": 0.5390548706054688, "adv/ratio_final_to_reasoning": 1.4016301371282704, "adv/ratio_step_to_reasoning": 1.201641510672397, "adv/std_final_conf": 0.8670635223388672, "adv/std_reasoning": 0.7392467260360718, "adv/std_step_conf": 0.8085440993309021, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5182761426827613, "calib/avg_num_step_conf": 7.3984375, "calib/ece": 0.38748987854251005, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8825910931174089, "calib/gap": 0.000952800759528194, "calib/mean_conf": 0.978582995951417, "calib/mu_c": 0.9789726027397261, "calib/mu_w": 0.9780198019801979, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38748987854251005, "calib/std_conf": 0.028921508224638584, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9041187214611873, "calib/step_q_c_n": 1095.0, "calib/step_q_gap": 0.005232613826644084, "calib/step_q_w": 0.8988861076345432, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2886.0, "completions/max_terminated_length": 2886.0, "completions/mean_length": 802.921875, "completions/mean_terminated_length": 815.6666870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 352.0, "epoch": 0.07253333333333334, "grad_norm": 0.0696231871843338, "kl": 0.020893096923828125, "learning_rate": 3.6666666666666666e-06, "loss": 0.0043, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01892545074224472, "mask/share_reasoning": 0.8621610403060913, "mask/share_step_conf": 0.10328847169876099, "num_tokens": 21132817.0, "reward": 0.4874420166015625, "reward_std": 0.16237157583236694, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5864066481590271, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.08144616335630417, "step": 68 }, { "adv/mean_abs_final_conf": 0.7065532207489014, "adv/mean_abs_reasoning": 0.5465849041938782, "adv/mean_abs_step_conf": 0.5947775840759277, "adv/ratio_final_to_reasoning": 1.2926687424544772, "adv/ratio_step_to_reasoning": 1.0881705285167467, "adv/std_final_conf": 0.8855745196342468, "adv/std_reasoning": 0.7928979992866516, "adv/std_step_conf": 0.8079348802566528, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.49295774647887325, "calib/avg_num_step_conf": 7.5625, "calib/ece": 0.38721991701244807, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8713692946058091, "calib/gap": -0.0014276568501920828, "calib/mean_conf": 0.9764315352697096, "calib/mu_c": 0.9758450704225352, "calib/mu_w": 0.9772727272727273, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38721991701244807, "calib/std_conf": 0.0349707121284594, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.905909090909091, "calib/step_q_c_n": 1034.0, "calib/step_q_gap": 0.0053325942350334055, "calib/step_q_w": 0.9005764966740576, "calib/step_q_w_n": 902.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 812.6171875, "completions/mean_terminated_length": 838.8306274414062, "completions/min_length": 0.0, "completions/min_terminated_length": 254.0, "epoch": 0.0736, "grad_norm": 0.06695166975259781, "kl": 0.018238067626953125, "learning_rate": 3.638888888888889e-06, "loss": -0.0969, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017266161739826202, "mask/share_reasoning": 0.858033299446106, "mask/share_step_conf": 0.09345056116580963, "num_tokens": 21445343.0, "reward": 0.4726163148880005, "reward_std": 0.20219215750694275, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.570591390132904, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.0754224956035614, "step": 69 }, { "adv/mean_abs_final_conf": 0.5190539956092834, "adv/mean_abs_reasoning": 0.4108893871307373, "adv/mean_abs_step_conf": 0.6084713935852051, "adv/ratio_final_to_reasoning": 1.2632450773038102, "adv/ratio_step_to_reasoning": 1.4808642243942916, "adv/std_final_conf": 0.7847069501876831, "adv/std_reasoning": 0.7014268636703491, "adv/std_step_conf": 0.8406656384468079, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.56004243281471, "calib/avg_num_step_conf": 7.71484375, "calib/ece": 0.37646887966804987, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.941908713692946, "calib/gap": 0.07408203677510639, "calib/mean_conf": 0.9573817427385893, "calib/mu_c": 0.9884285714285714, "calib/mu_w": 0.914346534653465, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37646887966804987, "calib/std_conf": 0.16554911619861123, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9075068744271311, "calib/step_q_c_n": 1091.0, "calib/step_q_gap": 0.008716150445230442, "calib/step_q_w": 0.8987907239819006, "calib/step_q_w_n": 884.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 810.859375, "completions/mean_terminated_length": 833.6546020507812, "completions/min_length": 0.0, "completions/min_terminated_length": 334.0, "epoch": 0.07466666666666667, "grad_norm": 0.04748120531439781, "kl": 0.020498275756835938, "learning_rate": 3.6111111111111115e-06, "loss": -0.0639, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01832003891468048, "mask/share_reasoning": 0.8515934944152832, "mask/share_step_conf": 0.10274268686771393, "num_tokens": 21759915.0, "reward": 0.48681142926216125, "reward_std": 0.1618727445602417, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5869503617286682, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.08901624381542206, "step": 70 }, { "adv/mean_abs_final_conf": 0.6313654780387878, "adv/mean_abs_reasoning": 0.6012966632843018, "adv/mean_abs_step_conf": 0.5767719745635986, "adv/ratio_final_to_reasoning": 1.0500066216736499, "adv/ratio_step_to_reasoning": 0.9592136623763244, "adv/std_final_conf": 0.8586423993110657, "adv/std_reasoning": 0.8429471850395203, "adv/std_step_conf": 0.8093698620796204, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5328379674017258, "calib/avg_num_step_conf": 7.40234375, "calib/ece": 0.389919028340081, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9554655870445344, "calib/gap": -0.008865223941925904, "calib/mean_conf": 0.9776113360323888, "calib/mu_c": 0.9740939597315436, "calib/mu_w": 0.9829591836734695, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3821457489878543, "calib/std_conf": 0.0882784215397571, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9034542157751586, "calib/step_q_c_n": 1103.0, "calib/step_q_gap": 0.003555225876168633, "calib/step_q_w": 0.89989898989899, "calib/step_q_w_n": 792.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2621.0, "completions/max_terminated_length": 2621.0, "completions/mean_length": 770.86328125, "completions/mean_terminated_length": 789.364013671875, "completions/min_length": 0.0, "completions/min_terminated_length": 448.0, "epoch": 0.07573333333333333, "grad_norm": 0.03665672615170479, "kl": 0.020742416381835938, "learning_rate": 3.5833333333333335e-06, "loss": -0.0754, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01877676509320736, "mask/share_reasoning": 0.8564556241035461, "mask/share_step_conf": 0.10133013874292374, "num_tokens": 22061664.0, "reward": 0.488927960395813, "reward_std": 0.2223735898733139, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5870753526687622, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.08140552043914795, "step": 71 }, { "adv/mean_abs_final_conf": 0.5326349139213562, "adv/mean_abs_reasoning": 0.5320241451263428, "adv/mean_abs_step_conf": 0.643271267414093, "adv/ratio_final_to_reasoning": 1.001148009541688, "adv/ratio_step_to_reasoning": 1.2091016419214051, "adv/std_final_conf": 0.7801859378814697, "adv/std_reasoning": 0.7927574515342712, "adv/std_step_conf": 0.8576951622962952, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5295555265901982, "calib/avg_num_step_conf": 7.40625, "calib/ece": 0.4335742971887551, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9718875502008032, "calib/gap": 0.012054223149113641, "calib/mean_conf": 0.9837751004016065, "calib/mu_c": 0.9891970802919708, "calib/mu_w": 0.9771428571428572, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4335742971887551, "calib/std_conf": 0.06212603479449744, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9053515624999999, "calib/step_q_c_n": 1024.0, "calib/step_q_gap": 0.004009819380733859, "calib/step_q_w": 0.9013417431192661, "calib/step_q_w_n": 872.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2912.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 772.890625, "completions/mean_terminated_length": 778.9763793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 403.0, "epoch": 0.0768, "grad_norm": 0.020385466516017914, "kl": 0.019151687622070312, "learning_rate": 3.555555555555556e-06, "loss": 0.0452, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019251573830842972, "mask/share_reasoning": 0.8693620562553406, "mask/share_step_conf": 0.10357387363910675, "num_tokens": 22363932.0, "reward": 0.47804099321365356, "reward_std": 0.18993939459323883, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5511460900306702, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.1033734381198883, "step": 72 }, { "adv/mean_abs_final_conf": 0.5937614440917969, "adv/mean_abs_reasoning": 0.5734392404556274, "adv/mean_abs_step_conf": 0.6711593866348267, "adv/ratio_final_to_reasoning": 1.0354391576342463, "adv/ratio_step_to_reasoning": 1.1704106368820444, "adv/std_final_conf": 0.81130450963974, "adv/std_reasoning": 0.8100379109382629, "adv/std_step_conf": 0.8734548687934875, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.551749165336043, "calib/avg_num_step_conf": 7.55078125, "calib/ece": 0.32040160642570287, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9799196787148594, "calib/gap": 0.008253012048192732, "calib/mean_conf": 0.9870682730923696, "calib/mu_c": 0.9898192771084335, "calib/mu_w": 0.9815662650602408, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32040160642570287, "calib/std_conf": 0.019549118382881366, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9048349834983497, "calib/step_q_c_n": 1212.0, "calib/step_q_gap": 0.004252459226504968, "calib/step_q_w": 0.9005825242718447, "calib/step_q_w_n": 721.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2513.0, "completions/max_terminated_length": 2513.0, "completions/mean_length": 758.43359375, "completions/mean_terminated_length": 770.4722900390625, "completions/min_length": 0.0, "completions/min_terminated_length": 439.0, "epoch": 0.07786666666666667, "grad_norm": 0.021740859374403954, "kl": 0.019433975219726562, "learning_rate": 3.5277777777777784e-06, "loss": -0.0335, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018972348421812057, "mask/share_reasoning": 0.8640187978744507, "mask/share_step_conf": 0.10138389468193054, "num_tokens": 22665123.0, "reward": 0.5364750623703003, "reward_std": 0.23655082285404205, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6598562002182007, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.08887514472007751, "step": 73 }, { "adv/mean_abs_final_conf": 0.46485772728919983, "adv/mean_abs_reasoning": 0.4380958676338196, "adv/mean_abs_step_conf": 0.6244065761566162, "adv/ratio_final_to_reasoning": 1.0610867657801077, "adv/ratio_step_to_reasoning": 1.4252738322529068, "adv/std_final_conf": 0.7419576048851013, "adv/std_reasoning": 0.7206048369407654, "adv/std_step_conf": 0.8554706573486328, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5075736325385694, "calib/avg_num_step_conf": 7.453125, "calib/ece": 0.35550607287449393, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.979757085020243, "calib/gap": 0.014538569424965009, "calib/mean_conf": 0.9830364372469635, "calib/mu_c": 0.9884516129032257, "calib/mu_w": 0.9739130434782607, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35550607287449393, "calib/std_conf": 0.06634539338141213, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9016083916083917, "calib/step_q_c_n": 1144.0, "calib/step_q_gap": 0.018702632446088052, "calib/step_q_w": 0.8829057591623036, "calib/step_q_w_n": 764.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2939.0, "completions/max_terminated_length": 2939.0, "completions/mean_length": 751.9140625, "completions/mean_terminated_length": 769.9600219726562, "completions/min_length": 0.0, "completions/min_terminated_length": 386.0, "epoch": 0.07893333333333333, "grad_norm": 0.029611723497509956, "kl": 0.021173477172851562, "learning_rate": 3.5e-06, "loss": -0.0513, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019501352682709694, "mask/share_reasoning": 0.8510788679122925, "mask/share_step_conf": 0.10598224401473999, "num_tokens": 22961541.0, "reward": 0.5154587030410767, "reward_std": 0.15994706749916077, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6196941137313843, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.09716075658798218, "step": 74 }, { "adv/mean_abs_final_conf": 0.4644741714000702, "adv/mean_abs_reasoning": 0.3942159116268158, "adv/mean_abs_step_conf": 0.6243501305580139, "adv/ratio_final_to_reasoning": 1.1782227903569866, "adv/ratio_step_to_reasoning": 1.58377709306938, "adv/std_final_conf": 0.7165085077285767, "adv/std_reasoning": 0.6818145513534546, "adv/std_step_conf": 0.8415147066116333, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.557784911717496, "calib/avg_num_step_conf": 7.57421875, "calib/ece": 0.2627822580645161, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9798387096774194, "calib/gap": 0.030386837881219875, "calib/mean_conf": 0.9805241935483872, "calib/mu_c": 0.9891011235955056, "calib/mu_w": 0.9587142857142857, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2627822580645161, "calib/std_conf": 0.08635149135847918, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9025000000000001, "calib/step_q_c_n": 1316.0, "calib/step_q_gap": 0.013286516853932673, "calib/step_q_w": 0.8892134831460674, "calib/step_q_w_n": 623.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2951.0, "completions/max_terminated_length": 2951.0, "completions/mean_length": 774.23046875, "completions/mean_terminated_length": 780.3267822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.08, "grad_norm": 0.01695588044822216, "kl": 0.020875930786132812, "learning_rate": 3.4722222222222224e-06, "loss": 0.0265, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019423652440309525, "mask/share_reasoning": 0.8641567230224609, "mask/share_step_conf": 0.10860709846019745, "num_tokens": 23264496.0, "reward": 0.566847562789917, "reward_std": 0.1740376055240631, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7102996110916138, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.09058299660682678, "step": 75 }, { "adv/mean_abs_final_conf": 0.42014482617378235, "adv/mean_abs_reasoning": 0.3789152503013611, "adv/mean_abs_step_conf": 0.613991379737854, "adv/ratio_final_to_reasoning": 1.1088094919368654, "adv/ratio_step_to_reasoning": 1.6203923680810703, "adv/std_final_conf": 0.705521285533905, "adv/std_reasoning": 0.6816098093986511, "adv/std_step_conf": 0.8410729169845581, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.506560449859419, "calib/avg_num_step_conf": 7.1171875, "calib/ece": 0.37545816733067733, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9880478087649402, "calib/gap": 0.004651894497255338, "calib/mean_conf": 0.9815139442231077, "calib/mu_c": 0.9833116883116884, "calib/mu_w": 0.978659793814433, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3717131474103586, "calib/std_conf": 0.08551751762940091, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9019761029411765, "calib/step_q_c_n": 1088.0, "calib/step_q_gap": 0.0035701083907676567, "calib/step_q_w": 0.8984059945504088, "calib/step_q_w_n": 734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2848.0, "completions/max_terminated_length": 2848.0, "completions/mean_length": 803.703125, "completions/mean_terminated_length": 813.2332153320312, "completions/min_length": 0.0, "completions/min_terminated_length": 426.0, "epoch": 0.08106666666666666, "grad_norm": 0.018733568489551544, "kl": 0.018390655517578125, "learning_rate": 3.444444444444445e-06, "loss": 0.0051, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01879020780324936, "mask/share_reasoning": 0.873236894607544, "mask/share_step_conf": 0.09625409543514252, "num_tokens": 23573300.0, "reward": 0.5043225288391113, "reward_std": 0.15315832197666168, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6102288961410522, "rewards/format_reward_step": 0.98046875, "rewards/step_margin_reward": 0.08200985938310623, "step": 76 }, { "adv/mean_abs_final_conf": 0.5138951539993286, "adv/mean_abs_reasoning": 0.4246634542942047, "adv/mean_abs_step_conf": 0.7056369185447693, "adv/ratio_final_to_reasoning": 1.2101233313175677, "adv/ratio_step_to_reasoning": 1.661637966275072, "adv/std_final_conf": 0.7437146306037903, "adv/std_reasoning": 0.6818997859954834, "adv/std_step_conf": 0.8842389583587646, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5391908490710886, "calib/avg_num_step_conf": 7.48046875, "calib/ece": 0.3071836734693878, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9918367346938776, "calib/gap": 0.0033417779824964633, "calib/mean_conf": 0.9888163265306122, "calib/mu_c": 0.9898802395209577, "calib/mu_w": 0.9865384615384613, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3071836734693878, "calib/std_conf": 0.00865594869035812, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8999682791435369, "calib/step_q_c_n": 1261.0, "calib/step_q_gap": 0.023607422874423634, "calib/step_q_w": 0.8763608562691133, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 819.3359375, "completions/mean_terminated_length": 825.7874145507812, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.08213333333333334, "grad_norm": 0.01720540039241314, "kl": 0.0196380615234375, "learning_rate": 3.416666666666667e-06, "loss": 0.0143, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018724631518125534, "mask/share_reasoning": 0.870591402053833, "mask/share_step_conf": 0.10287146270275116, "num_tokens": 23887714.0, "reward": 0.539682149887085, "reward_std": 0.18761983513832092, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.656449556350708, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.10182103514671326, "step": 77 }, { "adv/mean_abs_final_conf": 0.5096384286880493, "adv/mean_abs_reasoning": 0.4717886447906494, "adv/mean_abs_step_conf": 0.6330742835998535, "adv/ratio_final_to_reasoning": 1.080226144302806, "adv/ratio_step_to_reasoning": 1.3418599421373796, "adv/std_final_conf": 0.7696775794029236, "adv/std_reasoning": 0.7393324375152588, "adv/std_step_conf": 0.8575802445411682, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5233198924731183, "calib/avg_num_step_conf": 7.30859375, "calib/ece": 0.36806324110671956, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": -0.01576680107526862, "calib/mean_conf": 0.9772332015810277, "calib/mu_c": 0.9714375000000001, "calib/mu_w": 0.9872043010752687, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35644268774703575, "calib/std_conf": 0.10558819330606622, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8986626916524704, "calib/step_q_c_n": 1174.0, "calib/step_q_gap": -0.0005195178166831349, "calib/step_q_w": 0.8991822094691535, "calib/step_q_w_n": 697.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 822.328125, "completions/mean_terminated_length": 832.0791015625, "completions/min_length": 0.0, "completions/min_terminated_length": 484.0, "epoch": 0.0832, "grad_norm": 0.02704591117799282, "kl": 0.023164749145507812, "learning_rate": 3.3888888888888893e-06, "loss": -0.0058, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.017867308109998703, "mask/share_reasoning": 0.8763711452484131, "mask/share_step_conf": 0.0940428078174591, "num_tokens": 24206254.0, "reward": 0.5272241830825806, "reward_std": 0.1896364688873291, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6226117014884949, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.1099616065621376, "step": 78 }, { "adv/mean_abs_final_conf": 0.5069111585617065, "adv/mean_abs_reasoning": 0.48231005668640137, "adv/mean_abs_step_conf": 0.608548641204834, "adv/ratio_final_to_reasoning": 1.0510068192322617, "adv/ratio_step_to_reasoning": 1.261737408889471, "adv/std_final_conf": 0.756949782371521, "adv/std_reasoning": 0.7394082546234131, "adv/std_step_conf": 0.8254485726356506, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.505041957548833, "calib/avg_num_step_conf": 7.296875, "calib/ece": 0.3366800000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": 0.001853889006417364, "calib/mean_conf": 0.9886800000000002, "calib/mu_c": 0.9893251533742331, "calib/mu_w": 0.9874712643678157, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3366800000000002, "calib/std_conf": 0.00886891199640632, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8989408649602825, "calib/step_q_c_n": 1133.0, "calib/step_q_gap": -0.00022920306692852943, "calib/step_q_w": 0.899170068027211, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3012.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 795.4375, "completions/mean_terminated_length": 804.8695678710938, "completions/min_length": 0.0, "completions/min_terminated_length": 391.0, "epoch": 0.08426666666666667, "grad_norm": 0.027482660487294197, "kl": 0.019435882568359375, "learning_rate": 3.3611111111111117e-06, "loss": -0.0119, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01822996512055397, "mask/share_reasoning": 0.8748025894165039, "mask/share_step_conf": 0.09524869173765182, "num_tokens": 24516262.0, "reward": 0.5263484120368958, "reward_std": 0.19288992881774902, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6450324058532715, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.08500824868679047, "step": 79 }, { "adv/mean_abs_final_conf": 0.43031853437423706, "adv/mean_abs_reasoning": 0.39704400300979614, "adv/mean_abs_step_conf": 0.6708499193191528, "adv/ratio_final_to_reasoning": 1.0838056515454282, "adv/ratio_step_to_reasoning": 1.6896110109553806, "adv/std_final_conf": 0.6989228129386902, "adv/std_reasoning": 0.6612381935119629, "adv/std_step_conf": 0.8723685145378113, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5160907859078591, "calib/avg_num_step_conf": 7.4609375, "calib/ece": 0.3433858267716535, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9921259842519685, "calib/gap": 0.0009457994579946671, "calib/mean_conf": 0.9890551181102362, "calib/mu_c": 0.989390243902439, "calib/mu_w": 0.9884444444444443, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3433858267716535, "calib/std_conf": 0.008222599055181423, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9033737864077669, "calib/step_q_c_n": 1236.0, "calib/step_q_gap": 0.001786249315778865, "calib/step_q_w": 0.901587537091988, "calib/step_q_w_n": 674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2258.0, "completions/max_terminated_length": 2258.0, "completions/mean_length": 732.890625, "completions/mean_terminated_length": 738.6614379882812, "completions/min_length": 0.0, "completions/min_terminated_length": 370.0, "epoch": 0.08533333333333333, "grad_norm": 0.018796490505337715, "kl": 0.025392532348632812, "learning_rate": 3.3333333333333333e-06, "loss": 0.012, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.019986746832728386, "mask/share_reasoning": 0.865434467792511, "mask/share_step_conf": 0.10676628351211548, "num_tokens": 24806042.0, "reward": 0.536179780960083, "reward_std": 0.15412630140781403, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6485639810562134, "rewards/format_reward_step": 0.9921875, "rewards/step_margin_reward": 0.09723307192325592, "step": 80 }, { "adv/mean_abs_final_conf": 0.5034351348876953, "adv/mean_abs_reasoning": 0.4152452051639557, "adv/mean_abs_step_conf": 0.606434166431427, "adv/ratio_final_to_reasoning": 1.212380368579858, "adv/ratio_step_to_reasoning": 1.4604242478657452, "adv/std_final_conf": 0.7494115233421326, "adv/std_reasoning": 0.7014899849891663, "adv/std_step_conf": 0.8409460186958313, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5178936605316973, "calib/avg_num_step_conf": 6.8984375, "calib/ece": 0.3246558704453443, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9838056680161943, "calib/gap": 0.019398919076833443, "calib/mean_conf": 0.9772064777327936, "calib/mu_c": 0.9838036809815952, "calib/mu_w": 0.9644047619047618, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3209716599190285, "calib/std_conf": 0.10573767229636917, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9000181323662738, "calib/step_q_c_n": 1103.0, "calib/step_q_gap": 0.009595809591009807, "calib/step_q_w": 0.890422322775264, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2926.0, "completions/max_terminated_length": 2926.0, "completions/mean_length": 767.01953125, "completions/mean_terminated_length": 779.1945190429688, "completions/min_length": 0.0, "completions/min_terminated_length": 399.0, "epoch": 0.0864, "grad_norm": 0.020435577258467674, "kl": 0.021213531494140625, "learning_rate": 3.3055555555555558e-06, "loss": 0.0138, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019544146955013275, "mask/share_reasoning": 0.8684887886047363, "mask/share_step_conf": 0.0963420569896698, "num_tokens": 25108647.0, "reward": 0.5301726460456848, "reward_std": 0.16873633861541748, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6487894058227539, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.0912434458732605, "step": 81 }, { "adv/mean_abs_final_conf": 0.49838700890541077, "adv/mean_abs_reasoning": 0.4357077181339264, "adv/mean_abs_step_conf": 0.6757500171661377, "adv/ratio_final_to_reasoning": 1.1438562783324813, "adv/ratio_step_to_reasoning": 1.5509250560450891, "adv/std_final_conf": 0.7433985471725464, "adv/std_reasoning": 0.7014827728271484, "adv/std_step_conf": 0.8716046214103699, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5236158578263842, "calib/avg_num_step_conf": 7.47265625, "calib/ece": 0.36212851405622487, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9839357429718876, "calib/gap": 0.021057416267942886, "calib/mean_conf": 0.9806024096385543, "calib/mu_c": 0.9886363636363636, "calib/mu_w": 0.9675789473684208, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36212851405622487, "calib/std_conf": 0.08624262120940633, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9026831421006178, "calib/step_q_c_n": 1133.0, "calib/step_q_gap": 0.008978013895489467, "calib/step_q_w": 0.8937051282051284, "calib/step_q_w_n": 780.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 735.109375, "completions/mean_terminated_length": 743.8261108398438, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.08746666666666666, "grad_norm": 0.0355253629386425, "kl": 0.024700164794921875, "learning_rate": 3.277777777777778e-06, "loss": -0.0334, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019872203469276428, "mask/share_reasoning": 0.8601292371749878, "mask/share_step_conf": 0.10827983170747757, "num_tokens": 25402387.0, "reward": 0.5043740272521973, "reward_std": 0.18331748247146606, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6180245876312256, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.075879767537117, "step": 82 }, { "adv/mean_abs_final_conf": 0.40140241384506226, "adv/mean_abs_reasoning": 0.3419298529624939, "adv/mean_abs_step_conf": 0.5910751819610596, "adv/ratio_final_to_reasoning": 1.17393205175651, "adv/ratio_step_to_reasoning": 1.7286445650766102, "adv/std_final_conf": 0.6834880113601685, "adv/std_reasoning": 0.6402841806411743, "adv/std_step_conf": 0.8230851888656616, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5375874125874126, "calib/avg_num_step_conf": 6.84765625, "calib/ece": 0.4096761133603239, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9838056680161943, "calib/gap": 0.00564685314685287, "calib/mean_conf": 0.9807692307692307, "calib/mu_c": 0.983146853146853, "calib/mu_w": 0.9775000000000001, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.405748987854251, "calib/std_conf": 0.08792603272860709, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9010440122824975, "calib/step_q_c_n": 977.0, "calib/step_q_gap": 0.015348135993837775, "calib/step_q_w": 0.8856958762886598, "calib/step_q_w_n": 776.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2623.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 810.12109375, "completions/mean_terminated_length": 816.5, "completions/min_length": 0.0, "completions/min_terminated_length": 379.0, "epoch": 0.08853333333333334, "grad_norm": 0.02385982684791088, "kl": 0.02167510986328125, "learning_rate": 3.2500000000000002e-06, "loss": 0.001, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.01879962347447872, "mask/share_reasoning": 0.8773390054702759, "mask/share_step_conf": 0.09604892134666443, "num_tokens": 25717042.0, "reward": 0.48794054985046387, "reward_std": 0.12468407303094864, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5690589547157288, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.10213463008403778, "step": 83 }, { "adv/mean_abs_final_conf": 0.5460108518600464, "adv/mean_abs_reasoning": 0.44815874099731445, "adv/mean_abs_step_conf": 0.649663507938385, "adv/ratio_final_to_reasoning": 1.218342524447868, "adv/ratio_step_to_reasoning": 1.449628108318606, "adv/std_final_conf": 0.7695329189300537, "adv/std_reasoning": 0.7206275463104248, "adv/std_step_conf": 0.8577162623405457, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5526623555310953, "calib/avg_num_step_conf": 7.07421875, "calib/ece": 0.3469999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.984, "calib/gap": 0.026967528893780734, "calib/mean_conf": 0.979, "calib/mu_c": 0.9889240506329112, "calib/mu_w": 0.9619565217391305, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3469999999999998, "calib/std_conf": 0.08903594779638166, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9004436557231589, "calib/step_q_c_n": 1127.0, "calib/step_q_gap": 0.005648334085732065, "calib/step_q_w": 0.8947953216374268, "calib/step_q_w_n": 684.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2495.0, "completions/max_terminated_length": 2495.0, "completions/mean_length": 718.1171875, "completions/mean_terminated_length": 723.7716674804688, "completions/min_length": 0.0, "completions/min_terminated_length": 309.0, "epoch": 0.0896, "grad_norm": 0.018636994063854218, "kl": 0.025386810302734375, "learning_rate": 3.2222222222222227e-06, "loss": 0.0276, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02051534876227379, "mask/share_reasoning": 0.8655624389648438, "mask/share_step_conf": 0.10610969364643097, "num_tokens": 26006800.0, "reward": 0.5129927396774292, "reward_std": 0.18007919192314148, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6324530839920044, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.07556359469890594, "step": 84 }, { "adv/mean_abs_final_conf": 0.5838895440101624, "adv/mean_abs_reasoning": 0.47307288646698, "adv/mean_abs_step_conf": 0.6337040662765503, "adv/ratio_final_to_reasoning": 1.2342485919469774, "adv/ratio_step_to_reasoning": 1.3395484805929208, "adv/std_final_conf": 0.7927996516227722, "adv/std_reasoning": 0.7393242716789246, "adv/std_step_conf": 0.841560959815979, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5883410005750431, "calib/avg_num_step_conf": 6.98046875, "calib/ece": 0.3593388429752067, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.9710743801652892, "calib/gap": 0.04445227142035624, "calib/mean_conf": 0.970909090909091, "calib/mu_c": 0.9881756756756754, "calib/mu_w": 0.9437234042553192, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3593388429752067, "calib/std_conf": 0.12250822202125829, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9013618290258449, "calib/step_q_c_n": 1006.0, "calib/step_q_gap": 0.01279588792469255, "calib/step_q_w": 0.8885659411011524, "calib/step_q_w_n": 781.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2911.0, "completions/max_terminated_length": 2911.0, "completions/mean_length": 790.48828125, "completions/mean_terminated_length": 803.0357666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 442.0, "epoch": 0.09066666666666667, "grad_norm": 0.0355561263859272, "kl": 0.026716232299804688, "learning_rate": 3.1944444444444443e-06, "loss": -0.0004, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018777258694171906, "mask/share_reasoning": 0.8689888119697571, "mask/share_step_conf": 0.09660893678665161, "num_tokens": 26316989.0, "reward": 0.5025588274002075, "reward_std": 0.16871732473373413, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6043878793716431, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.0968235582113266, "step": 85 }, { "adv/mean_abs_final_conf": 0.5974839329719543, "adv/mean_abs_reasoning": 0.44355282187461853, "adv/mean_abs_step_conf": 0.6157060265541077, "adv/ratio_final_to_reasoning": 1.3470412169778696, "adv/ratio_step_to_reasoning": 1.3881233444799335, "adv/std_final_conf": 0.7756420969963074, "adv/std_reasoning": 0.7016515731811523, "adv/std_step_conf": 0.8253141045570374, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5479437229437228, "calib/avg_num_step_conf": 7.046875, "calib/ece": 0.40096234309623435, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.99581589958159, "calib/gap": 0.0021883116883114617, "calib/mean_conf": 0.9867364016736402, "calib/mu_c": 0.987642857142857, "calib/mu_w": 0.9854545454545456, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40096234309623435, "calib/std_conf": 0.01027944150214303, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9007905544147844, "calib/step_q_c_n": 974.0, "calib/step_q_gap": 0.011573686944904904, "calib/step_q_w": 0.8892168674698795, "calib/step_q_w_n": 830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2950.0, "completions/max_terminated_length": 2950.0, "completions/mean_length": 769.515625, "completions/mean_terminated_length": 797.5546875, "completions/min_length": 0.0, "completions/min_terminated_length": 361.0, "epoch": 0.09173333333333333, "grad_norm": 0.06356382369995117, "kl": 0.02785491943359375, "learning_rate": 3.1666666666666667e-06, "loss": -0.0772, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.019127480685710907, "mask/share_reasoning": 0.8466145992279053, "mask/share_step_conf": 0.09910169243812561, "num_tokens": 26619497.0, "reward": 0.4679451584815979, "reward_std": 0.17001873254776, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5578621029853821, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.08193445205688477, "step": 86 }, { "adv/mean_abs_final_conf": 0.47159144282341003, "adv/mean_abs_reasoning": 0.37070751190185547, "adv/mean_abs_step_conf": 0.6526124477386475, "adv/ratio_final_to_reasoning": 1.2721388903180995, "adv/ratio_step_to_reasoning": 1.760451101706906, "adv/std_final_conf": 0.6977677345275879, "adv/std_reasoning": 0.6613085269927979, "adv/std_step_conf": 0.8724237680435181, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5246654772524533, "calib/avg_num_step_conf": 6.84375, "calib/ece": 0.22361445783132528, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007404103479036195, "calib/mean_conf": 0.9866666666666667, "calib/mu_c": 0.9868421052631579, "calib/mu_w": 0.9861016949152542, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22361445783132528, "calib/std_conf": 0.006122357911697838, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9011706948640484, "calib/step_q_c_n": 1324.0, "calib/step_q_gap": 0.0022220967332072794, "calib/step_q_w": 0.8989485981308412, "calib/step_q_w_n": 428.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 701.76171875, "completions/mean_terminated_length": 718.60400390625, "completions/min_length": 0.0, "completions/min_terminated_length": 374.0, "epoch": 0.0928, "grad_norm": 0.03184402734041214, "kl": 0.03104400634765625, "learning_rate": 3.138888888888889e-06, "loss": -0.066, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020320512354373932, "mask/share_reasoning": 0.8536198139190674, "mask/share_step_conf": 0.10262220352888107, "num_tokens": 26904644.0, "reward": 0.5931057929992676, "reward_std": 0.15207459032535553, "rewards/accuracy_reward_step": 0.7421875, "rewards/final_brier_reward_step": 0.7444796562194824, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.09954449534416199, "step": 87 }, { "adv/mean_abs_final_conf": 0.48698878288269043, "adv/mean_abs_reasoning": 0.3855929374694824, "adv/mean_abs_step_conf": 0.5684686899185181, "adv/ratio_final_to_reasoning": 1.262960846945577, "adv/ratio_step_to_reasoning": 1.4742715300990419, "adv/std_final_conf": 0.7024607062339783, "adv/std_reasoning": 0.6615065932273865, "adv/std_step_conf": 0.807574450969696, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5275767226404169, "calib/avg_num_step_conf": 6.65625, "calib/ece": 0.34718367346938783, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0013122466705268732, "calib/mean_conf": 0.9880000000000001, "calib/mu_c": 0.9884713375796178, "calib/mu_w": 0.9871590909090909, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34718367346938783, "calib/std_conf": 0.005390846361175208, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9024738344433872, "calib/step_q_c_n": 1051.0, "calib/step_q_gap": 0.011264033524551076, "calib/step_q_w": 0.8912098009188362, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 732.18359375, "completions/mean_terminated_length": 749.7560424804688, "completions/min_length": 0.0, "completions/min_terminated_length": 405.0, "epoch": 0.09386666666666667, "grad_norm": 0.02061421051621437, "kl": 0.02970123291015625, "learning_rate": 3.1111111111111116e-06, "loss": -0.0356, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019529426470398903, "mask/share_reasoning": 0.8609858155250549, "mask/share_step_conf": 0.09604723751544952, "num_tokens": 27201931.0, "reward": 0.5190150141716003, "reward_std": 0.15574511885643005, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6219437122344971, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.10202378034591675, "step": 88 }, { "adv/mean_abs_final_conf": 0.4684966206550598, "adv/mean_abs_reasoning": 0.35824376344680786, "adv/mean_abs_step_conf": 0.577095627784729, "adv/ratio_final_to_reasoning": 1.3077593204902291, "adv/ratio_step_to_reasoning": 1.6109020914481778, "adv/std_final_conf": 0.7253900170326233, "adv/std_reasoning": 0.6402668952941895, "adv/std_step_conf": 0.807966411113739, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5316287510532115, "calib/avg_num_step_conf": 6.72265625, "calib/ece": 0.43244000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.001245058007647959, "calib/mean_conf": 0.9884400000000001, "calib/mu_c": 0.9889928057553955, "calib/mu_w": 0.9877477477477475, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43244000000000005, "calib/std_conf": 0.00554674679429303, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9037346711259755, "calib/step_q_c_n": 897.0, "calib/step_q_gap": 0.01057933132015021, "calib/step_q_w": 0.8931553398058253, "calib/step_q_w_n": 824.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 791.8828125, "completions/mean_terminated_length": 794.98828125, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.09493333333333333, "grad_norm": 0.03952114284038544, "kl": 0.0265960693359375, "learning_rate": 3.0833333333333336e-06, "loss": -0.0024, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01961899921298027, "mask/share_reasoning": 0.8797329664230347, "mask/share_step_conf": 0.09674181044101715, "num_tokens": 27513541.0, "reward": 0.4682702124118805, "reward_std": 0.14151982963085175, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5534331798553467, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.07920098304748535, "step": 89 }, { "adv/mean_abs_final_conf": 0.536230742931366, "adv/mean_abs_reasoning": 0.39513659477233887, "adv/mean_abs_step_conf": 0.5973080992698669, "adv/ratio_final_to_reasoning": 1.3570768944858667, "adv/ratio_step_to_reasoning": 1.5116496603257181, "adv/std_final_conf": 0.7722048759460449, "adv/std_reasoning": 0.7014068961143494, "adv/std_step_conf": 0.8403605818748474, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5374828309115882, "calib/avg_num_step_conf": 7.2890625, "calib/ece": 0.34495934959349617, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.991869918699187, "calib/gap": -0.0036087616569074887, "calib/mean_conf": 0.9834146341463416, "calib/mu_c": 0.9821383647798742, "calib/mu_w": 0.9857471264367816, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34101626016260184, "calib/std_conf": 0.06148341396137519, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8977544154751893, "calib/step_q_c_n": 1189.0, "calib/step_q_gap": 0.00041320424919222276, "calib/step_q_w": 0.8973412112259971, "calib/step_q_w_n": 677.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 742.421875, "completions/mean_terminated_length": 754.2064208984375, "completions/min_length": 0.0, "completions/min_terminated_length": 274.0, "epoch": 0.096, "grad_norm": 0.033743906766176224, "kl": 0.034824371337890625, "learning_rate": 3.055555555555556e-06, "loss": -0.0221, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02043253928422928, "mask/share_reasoning": 0.8547654151916504, "mask/share_step_conf": 0.10917702317237854, "num_tokens": 27806921.0, "reward": 0.5162938833236694, "reward_std": 0.1559755802154541, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6268843412399292, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.0892971009016037, "step": 90 }, { "adv/mean_abs_final_conf": 0.4670163691043854, "adv/mean_abs_reasoning": 0.4009590148925781, "adv/mean_abs_step_conf": 0.6211557388305664, "adv/ratio_final_to_reasoning": 1.164748395118401, "adv/ratio_step_to_reasoning": 1.5491751419954523, "adv/std_final_conf": 0.717831552028656, "adv/std_reasoning": 0.6817478537559509, "adv/std_step_conf": 0.8571966290473938, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5670557228915662, "calib/avg_num_step_conf": 6.75390625, "calib/ece": 0.31243902439024396, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": 0.002387048192770913, "calib/mean_conf": 0.9872357723577236, "calib/mu_c": 0.988012048192771, "calib/mu_w": 0.9856250000000001, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31243902439024396, "calib/std_conf": 0.00783994493594091, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9048003714020427, "calib/step_q_c_n": 1077.0, "calib/step_q_gap": 0.02969300943885267, "calib/step_q_w": 0.8751073619631901, "calib/step_q_w_n": 652.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2386.0, "completions/max_terminated_length": 2386.0, "completions/mean_length": 772.70703125, "completions/mean_terminated_length": 778.7913208007812, "completions/min_length": 0.0, "completions/min_terminated_length": 421.0, "epoch": 0.09706666666666666, "grad_norm": 0.023183386772871017, "kl": 0.032321929931640625, "learning_rate": 3.0277777777777776e-06, "loss": -0.0135, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.019044730812311172, "mask/share_reasoning": 0.8802785873413086, "mask/share_step_conf": 0.09286415576934814, "num_tokens": 28112446.0, "reward": 0.5405303239822388, "reward_std": 0.16771180927753448, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6570515632629395, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.10291542112827301, "step": 91 }, { "adv/mean_abs_final_conf": 0.5757607221603394, "adv/mean_abs_reasoning": 0.4646397829055786, "adv/mean_abs_step_conf": 0.640633225440979, "adv/ratio_final_to_reasoning": 1.2391550257704518, "adv/ratio_step_to_reasoning": 1.378773942762376, "adv/std_final_conf": 0.8052237629890442, "adv/std_reasoning": 0.7573723793029785, "adv/std_step_conf": 0.8726325631141663, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.598082042181421, "calib/avg_num_step_conf": 7.09765625, "calib/ece": 0.3409920634920636, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9841269841269841, "calib/gap": 0.0247539417104633, "calib/mean_conf": 0.9798809523809524, "calib/mu_c": 0.9888198757763974, "calib/mu_w": 0.9640659340659341, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3409920634920636, "calib/std_conf": 0.08541905173374947, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9062241379310345, "calib/step_q_c_n": 1160.0, "calib/step_q_gap": 0.0040780192095732914, "calib/step_q_w": 0.9021461187214612, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 698.2890625, "completions/mean_terminated_length": 706.5692138671875, "completions/min_length": 0.0, "completions/min_terminated_length": 427.0, "epoch": 0.09813333333333334, "grad_norm": 0.026200029999017715, "kl": 0.03961181640625, "learning_rate": 3e-06, "loss": -0.0573, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.020919611677527428, "mask/share_reasoning": 0.8589800596237183, "mask/share_step_conf": 0.10838152468204498, "num_tokens": 28397928.0, "reward": 0.5315855741500854, "reward_std": 0.17773553729057312, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6468722820281982, "rewards/format_reward_step": 0.984375, "rewards/step_margin_reward": 0.0928613543510437, "step": 92 }, { "adv/mean_abs_final_conf": 0.5948590040206909, "adv/mean_abs_reasoning": 0.4218021035194397, "adv/mean_abs_step_conf": 0.5686092376708984, "adv/ratio_final_to_reasoning": 1.4102798422703349, "adv/ratio_step_to_reasoning": 1.3480474206423505, "adv/std_final_conf": 0.7911317348480225, "adv/std_reasoning": 0.6817976832389832, "adv/std_step_conf": 0.8076203465461731, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6285464709993012, "calib/avg_num_step_conf": 7.41796875, "calib/ece": 0.344698795180723, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.013272536687630732, "calib/mean_conf": 0.9832530120481928, "calib/mu_c": 0.9880503144654086, "calib/mu_w": 0.9747777777777779, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.344698795180723, "calib/std_conf": 0.06074057879716255, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9069128113879004, "calib/step_q_c_n": 1124.0, "calib/step_q_gap": -9.364022500291114e-05, "calib/step_q_w": 0.9070064516129033, "calib/step_q_w_n": 775.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2901.0, "completions/max_terminated_length": 2901.0, "completions/mean_length": 758.98828125, "completions/mean_terminated_length": 767.9881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 367.0, "epoch": 0.0992, "grad_norm": 0.04559861496090889, "kl": 0.034160614013671875, "learning_rate": 2.9722222222222225e-06, "loss": -0.0194, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.019911646842956543, "mask/share_reasoning": 0.8632267713546753, "mask/share_step_conf": 0.10514278709888458, "num_tokens": 28698005.0, "reward": 0.5239921808242798, "reward_std": 0.17210304737091064, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6349667310714722, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.0942675769329071, "step": 93 }, { "adv/mean_abs_final_conf": 0.6306023597717285, "adv/mean_abs_reasoning": 0.39281824231147766, "adv/mean_abs_step_conf": 0.6336207389831543, "adv/ratio_final_to_reasoning": 1.6053286020044468, "adv/ratio_step_to_reasoning": 1.6130125099453425, "adv/std_final_conf": 0.779155969619751, "adv/std_reasoning": 0.6403993368148804, "adv/std_step_conf": 0.8244649171829224, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6701030927835052, "calib/avg_num_step_conf": 6.73046875, "calib/ece": 0.37558704453441305, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": 0.012082474226803863, "calib/mean_conf": 0.9812550607287449, "calib/mu_c": 0.9859999999999999, "calib/mu_w": 0.973917525773196, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3747773279352227, "calib/std_conf": 0.06198834065550579, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9045000000000002, "calib/step_q_c_n": 1020.0, "calib/step_q_gap": 0.017871266002845165, "calib/step_q_w": 0.886628733997155, "calib/step_q_w_n": 703.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 710.078125, "completions/mean_terminated_length": 724.22314453125, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 0.10026666666666667, "grad_norm": 0.03576529771089554, "kl": 0.0402679443359375, "learning_rate": 2.944444444444445e-06, "loss": -0.0045, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.020559193566441536, "mask/share_reasoning": 0.8584494590759277, "mask/share_step_conf": 0.10146008431911469, "num_tokens": 28988465.0, "reward": 0.49986836314201355, "reward_std": 0.1569095402956009, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6016566753387451, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.08714261651039124, "step": 94 }, { "adv/mean_abs_final_conf": 0.6070525646209717, "adv/mean_abs_reasoning": 0.3995746374130249, "adv/mean_abs_step_conf": 0.5804909467697144, "adv/ratio_final_to_reasoning": 1.5192469886257691, "adv/ratio_step_to_reasoning": 1.4527722543352601, "adv/std_final_conf": 0.7939587831497192, "adv/std_reasoning": 0.6817163228988647, "adv/std_step_conf": 0.8087968230247498, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6079562151043196, "calib/avg_num_step_conf": 6.89453125, "calib/ece": 0.3145161290322582, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9919354838709677, "calib/gap": 0.004325595063179533, "calib/mean_conf": 0.9838709677419357, "calib/mu_c": 0.9853012048192771, "calib/mu_w": 0.9809756097560975, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3145161290322582, "calib/std_conf": 0.012326844389861613, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9030408858603067, "calib/step_q_c_n": 1174.0, "calib/step_q_gap": 0.017321765724942928, "calib/step_q_w": 0.8857191201353638, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2936.0, "completions/max_terminated_length": 2936.0, "completions/mean_length": 738.203125, "completions/mean_terminated_length": 741.0980834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 311.0, "epoch": 0.10133333333333333, "grad_norm": 0.02444363757967949, "kl": 0.04119110107421875, "learning_rate": 2.916666666666667e-06, "loss": -0.0508, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.020546311512589455, "mask/share_reasoning": 0.8693224787712097, "mask/share_step_conf": 0.10622496902942657, "num_tokens": 29283573.0, "reward": 0.5394783616065979, "reward_std": 0.17019683122634888, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6602257490158081, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.09529343992471695, "step": 95 }, { "adv/mean_abs_final_conf": 0.48972174525260925, "adv/mean_abs_reasoning": 0.21803729236125946, "adv/mean_abs_step_conf": 0.4690099358558655, "adv/ratio_final_to_reasoning": 2.2460458022988288, "adv/ratio_step_to_reasoning": 2.1510537522121536, "adv/std_final_conf": 0.6662155389785767, "adv/std_reasoning": 0.5227459669113159, "adv/std_step_conf": 0.7550627589225769, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6268187365061486, "calib/avg_num_step_conf": 6.8046875, "calib/ece": 0.1920078740157481, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9921259842519685, "calib/gap": 0.006374730122970518, "calib/mean_conf": 0.9833464566929135, "calib/mu_c": 0.9846766169154231, "calib/mu_w": 0.9783018867924526, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1920078740157481, "calib/std_conf": 0.01112836904857111, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.90975073313783, "calib/step_q_c_n": 1364.0, "calib/step_q_gap": 0.014750733137830041, "calib/step_q_w": 0.8949999999999999, "calib/step_q_w_n": 378.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 689.03515625, "completions/mean_terminated_length": 689.03515625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.1024, "grad_norm": 0.0627409890294075, "kl": 0.047210693359375, "learning_rate": 2.888888888888889e-06, "loss": 0.0412, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.021980471909046173, "mask/share_reasoning": 0.8722586035728455, "mask/share_step_conf": 0.10576092451810837, "num_tokens": 29565782.0, "reward": 0.6128716468811035, "reward_std": 0.09135686606168747, "rewards/accuracy_reward_step": 0.78515625, "rewards/final_brier_reward_step": 0.7930004000663757, "rewards/format_reward_step": 0.98828125, "rewards/step_margin_reward": 0.07805542647838593, "step": 96 }, { "adv/mean_abs_final_conf": 0.6451533436775208, "adv/mean_abs_reasoning": 0.5259956121444702, "adv/mean_abs_step_conf": 0.6510102152824402, "adv/ratio_final_to_reasoning": 1.2265375010396904, "adv/ratio_step_to_reasoning": 1.2376723308171502, "adv/std_final_conf": 0.8117048144340515, "adv/std_reasoning": 0.7754231691360474, "adv/std_step_conf": 0.8423149585723877, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5260401721664274, "calib/avg_num_step_conf": 7.00390625, "calib/ece": 0.32012048192771114, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9879518072289156, "calib/gap": 0.018271162123385976, "calib/mean_conf": 0.9714457831325303, "calib/mu_c": 0.9776829268292685, "calib/mu_w": 0.9594117647058825, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31646586345381555, "calib/std_conf": 0.10419474194644517, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9067301038062284, "calib/step_q_c_n": 1156.0, "calib/step_q_gap": 0.005552709771691644, "calib/step_q_w": 0.9011773940345368, "calib/step_q_w_n": 637.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3021.0, "completions/max_terminated_length": 3021.0, "completions/mean_length": 703.01171875, "completions/mean_terminated_length": 708.5472412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 324.0, "epoch": 0.10346666666666667, "grad_norm": 0.023143086582422256, "kl": 0.055233001708984375, "learning_rate": 2.861111111111111e-06, "loss": 0.0085, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.020746413618326187, "mask/share_reasoning": 0.8638022541999817, "mask/share_step_conf": 0.10763886570930481, "num_tokens": 29850825.0, "reward": 0.5442949533462524, "reward_std": 0.18832965195178986, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6562254428863525, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.10970824956893921, "step": 97 }, { "adv/mean_abs_final_conf": 0.6230227947235107, "adv/mean_abs_reasoning": 0.48194950819015503, "adv/mean_abs_step_conf": 0.6910111308097839, "adv/ratio_final_to_reasoning": 1.2927138302581165, "adv/ratio_step_to_reasoning": 1.4337832471387082, "adv/std_final_conf": 0.7662563323974609, "adv/std_reasoning": 0.7207810878753662, "adv/std_step_conf": 0.8865677118301392, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5703787878787878, "calib/avg_num_step_conf": 6.6796875, "calib/ece": 0.3069387755102043, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": 0.01341287878787878, "calib/mean_conf": 0.9804081632653063, "calib/mu_c": 0.9847878787878789, "calib/mu_w": 0.9713750000000001, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3069387755102043, "calib/std_conf": 0.06118420443330623, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.907230499561788, "calib/step_q_c_n": 1141.0, "calib/step_q_gap": 0.005332432777956586, "calib/step_q_w": 0.9018980667838314, "calib/step_q_w_n": 569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 732.6328125, "completions/mean_terminated_length": 744.261962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 287.0, "epoch": 0.10453333333333334, "grad_norm": 0.039286430925130844, "kl": 0.04454803466796875, "learning_rate": 2.8333333333333335e-06, "loss": -0.0071, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.020196489989757538, "mask/share_reasoning": 0.8626416921615601, "mask/share_step_conf": 0.10153677314519882, "num_tokens": 30144563.0, "reward": 0.5446101427078247, "reward_std": 0.19345897436141968, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6584718823432922, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.11043588817119598, "step": 98 }, { "adv/mean_abs_final_conf": 0.7042887210845947, "adv/mean_abs_reasoning": 0.5367194414138794, "adv/mean_abs_step_conf": 0.569164514541626, "adv/ratio_final_to_reasoning": 1.3122101916585838, "adv/ratio_step_to_reasoning": 1.060450713397444, "adv/std_final_conf": 0.8553628921508789, "adv/std_reasoning": 0.7755724191665649, "adv/std_step_conf": 0.7922757267951965, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6152421652421651, "calib/avg_num_step_conf": 7.17578125, "calib/ece": 0.48556962025316475, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.9915611814345991, "calib/gap": 0.011797008547008492, "calib/mean_conf": 0.9792405063291141, "calib/mu_c": 0.9852136752136754, "calib/mu_w": 0.9734166666666669, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.48556962025316475, "calib/std_conf": 0.06443730119579529, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9094976635514018, "calib/step_q_c_n": 856.0, "calib/step_q_gap": 0.015940069259862577, "calib/step_q_w": 0.8935575942915392, "calib/step_q_w_n": 981.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2959.0, "completions/max_terminated_length": 2959.0, "completions/mean_length": 794.03125, "completions/mean_terminated_length": 822.9635620117188, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.1056, "grad_norm": 0.04899228736758232, "kl": 0.038173675537109375, "learning_rate": 2.805555555555556e-06, "loss": -0.0632, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01818915829062462, "mask/share_reasoning": 0.8507558107376099, "mask/share_step_conf": 0.09589883685112, "num_tokens": 30453635.0, "reward": 0.39999672770500183, "reward_std": 0.19157007336616516, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.47771012783050537, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.04572080075740814, "step": 99 }, { "adv/mean_abs_final_conf": 0.6158119440078735, "adv/mean_abs_reasoning": 0.34032493829727173, "adv/mean_abs_step_conf": 0.5820865035057068, "adv/ratio_final_to_reasoning": 1.8094822762296827, "adv/ratio_step_to_reasoning": 1.7103845119844194, "adv/std_final_conf": 0.8010197281837463, "adv/std_reasoning": 0.6187408566474915, "adv/std_step_conf": 0.8245059847831726, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6804332386363636, "calib/avg_num_step_conf": 7.15625, "calib/ece": 0.3326209677419356, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.967741935483871, "calib/gap": 0.029079545454545386, "calib/mean_conf": 0.9645564516129034, "calib/mu_c": 0.9748750000000002, "calib/mu_w": 0.9457954545454548, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3260080645161292, "calib/std_conf": 0.12064596315715925, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9084753363228701, "calib/step_q_c_n": 1115.0, "calib/step_q_gap": 0.008168502292186797, "calib/step_q_w": 0.9003068340306833, "calib/step_q_w_n": 717.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 803.80859375, "completions/mean_terminated_length": 813.3399658203125, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 0.10666666666666667, "grad_norm": 0.030806303024291992, "kl": 0.03832244873046875, "learning_rate": 2.7777777777777783e-06, "loss": -0.028, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01913512498140335, "mask/share_reasoning": 0.8721370697021484, "mask/share_step_conf": 0.0970090925693512, "num_tokens": 30766818.0, "reward": 0.5343170166015625, "reward_std": 0.15101656317710876, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6469480395317078, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.10293594002723694, "step": 100 }, { "adv/mean_abs_final_conf": 0.6107825040817261, "adv/mean_abs_reasoning": 0.4879143238067627, "adv/mean_abs_step_conf": 0.6642156839370728, "adv/ratio_final_to_reasoning": 1.2518232695370202, "adv/ratio_step_to_reasoning": 1.3613367173867472, "adv/std_final_conf": 0.856407642364502, "adv/std_reasoning": 0.7754284739494324, "adv/std_step_conf": 0.8865810632705688, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.595035137948985, "calib/avg_num_step_conf": 7.10546875, "calib/ece": 0.42983935742971885, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9718875502008032, "calib/gap": 0.020963690786048983, "calib/mean_conf": 0.9687951807228916, "calib/mu_c": 0.9783088235294118, "calib/mu_w": 0.9573451327433629, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42622489959839355, "calib/std_conf": 0.10942783935072162, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9006729758149317, "calib/step_q_c_n": 951.0, "calib/step_q_gap": 0.0044056947089408105, "calib/step_q_w": 0.8962672811059909, "calib/step_q_w_n": 868.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 810.4296875, "completions/mean_terminated_length": 820.03955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.10773333333333333, "grad_norm": 0.029020655900239944, "kl": 0.038661956787109375, "learning_rate": 2.7500000000000004e-06, "loss": -0.018, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018506541848182678, "mask/share_reasoning": 0.8732171058654785, "mask/share_step_conf": 0.0965576022863388, "num_tokens": 31081280.0, "reward": 0.46975669264793396, "reward_std": 0.20999515056610107, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.556312084197998, "rewards/format_reward_step": 0.97265625, "rewards/step_margin_reward": 0.08242004364728928, "step": 101 }, { "adv/mean_abs_final_conf": 0.5437313914299011, "adv/mean_abs_reasoning": 0.3697284460067749, "adv/mean_abs_step_conf": 0.5517661571502686, "adv/ratio_final_to_reasoning": 1.4706236355423348, "adv/ratio_step_to_reasoning": 1.4923551679876912, "adv/std_final_conf": 0.7598369121551514, "adv/std_reasoning": 0.6611893177032471, "adv/std_step_conf": 0.8055055737495422, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6532316630355846, "calib/avg_num_step_conf": 7.4609375, "calib/ece": 0.3122267206477735, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9757085020242915, "calib/gap": 0.0538823529411766, "calib/mean_conf": 0.9614574898785426, "calib/mu_c": 0.9800000000000001, "calib/mu_w": 0.9261176470588235, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30890688259109333, "calib/std_conf": 0.14719888366011583, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9087746710526318, "calib/step_q_c_n": 1216.0, "calib/step_q_gap": 0.017319339640527898, "calib/step_q_w": 0.8914553314121039, "calib/step_q_w_n": 694.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 717.65234375, "completions/mean_terminated_length": 723.3031616210938, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.1088, "grad_norm": 0.03661997243762016, "kl": 0.04579925537109375, "learning_rate": 2.7222222222222224e-06, "loss": 0.0228, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02077307738363743, "mask/share_reasoning": 0.8582336902618408, "mask/share_step_conf": 0.11318069696426392, "num_tokens": 31371695.0, "reward": 0.5324283838272095, "reward_std": 0.1493452787399292, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6595359444618225, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.08500830084085464, "step": 102 }, { "adv/mean_abs_final_conf": 0.535736083984375, "adv/mean_abs_reasoning": 0.3531227707862854, "adv/mean_abs_step_conf": 0.5209876894950867, "adv/ratio_final_to_reasoning": 1.5171383108245082, "adv/ratio_step_to_reasoning": 1.4753726822402946, "adv/std_final_conf": 0.7622296214103699, "adv/std_reasoning": 0.640420138835907, "adv/std_step_conf": 0.7732493877410889, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6995632572555648, "calib/avg_num_step_conf": 6.85546875, "calib/ece": 0.3472874493927127, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9757085020242915, "calib/gap": 0.02220695970695974, "calib/mean_conf": 0.9714979757085022, "calib/mu_c": 0.9796794871794873, "calib/mu_w": 0.9574725274725275, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34360323886639693, "calib/std_conf": 0.10629852069514006, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9108103130755065, "calib/step_q_c_n": 1086.0, "calib/step_q_gap": 0.005593571670424291, "calib/step_q_w": 0.9052167414050822, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2778.0, "completions/max_terminated_length": 2778.0, "completions/mean_length": 846.1796875, "completions/mean_terminated_length": 849.4981079101562, "completions/min_length": 0.0, "completions/min_terminated_length": 368.0, "epoch": 0.10986666666666667, "grad_norm": 0.024845009669661522, "kl": 0.035243988037109375, "learning_rate": 2.6944444444444444e-06, "loss": 0.0076, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018439754843711853, "mask/share_reasoning": 0.8834730982780457, "mask/share_step_conf": 0.0941808819770813, "num_tokens": 31692869.0, "reward": 0.5128688216209412, "reward_std": 0.14414164423942566, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6279234886169434, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.0829705148935318, "step": 103 }, { "adv/mean_abs_final_conf": 0.5839576721191406, "adv/mean_abs_reasoning": 0.4554803967475891, "adv/mean_abs_step_conf": 0.5735897421836853, "adv/ratio_final_to_reasoning": 1.2820698240560044, "adv/ratio_step_to_reasoning": 1.259307198025359, "adv/std_final_conf": 0.8070527911186218, "adv/std_reasoning": 0.7392352223396301, "adv/std_step_conf": 0.8255413770675659, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6267127799736495, "calib/avg_num_step_conf": 7.31640625, "calib/ece": 0.4272580645161291, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9838709677419355, "calib/gap": 0.008400527009222558, "calib/mean_conf": 0.9761290322580647, "calib/mu_c": 0.9798550724637681, "calib/mu_w": 0.9714545454545456, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4234677419354839, "calib/std_conf": 0.08728623360234324, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.908976076555024, "calib/step_q_c_n": 1045.0, "calib/step_q_gap": 0.0012466079559901866, "calib/step_q_w": 0.9077294685990338, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 748.828125, "completions/mean_terminated_length": 754.7244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 393.0, "epoch": 0.11093333333333333, "grad_norm": 0.05281037092208862, "kl": 0.041156768798828125, "learning_rate": 2.666666666666667e-06, "loss": -0.0175, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.019355978816747665, "mask/share_reasoning": 0.8669632077217102, "mask/share_step_conf": 0.10586832463741302, "num_tokens": 31991249.0, "reward": 0.47446608543395996, "reward_std": 0.16201914846897125, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5556609630584717, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.09170868992805481, "step": 104 }, { "adv/mean_abs_final_conf": 0.6759294271469116, "adv/mean_abs_reasoning": 0.5426207780838013, "adv/mean_abs_step_conf": 0.6598316431045532, "adv/ratio_final_to_reasoning": 1.2456755333510698, "adv/ratio_step_to_reasoning": 1.2160088034864196, "adv/std_final_conf": 0.8506894111633301, "adv/std_reasoning": 0.7930552363395691, "adv/std_step_conf": 0.8581895232200623, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.631141130465695, "calib/avg_num_step_conf": 8.09375, "calib/ece": 0.36276859504132253, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9586776859504132, "calib/gap": 0.0609605403483825, "calib/mean_conf": 0.9550826446280992, "calib/mu_c": 0.9795172413793104, "calib/mu_w": 0.9185567010309279, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35933884297520685, "calib/std_conf": 0.16161693751639863, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9089204025617567, "calib/step_q_c_n": 1093.0, "calib/step_q_gap": 0.004129799905985498, "calib/step_q_w": 0.9047906026557712, "calib/step_q_w_n": 979.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 815.9140625, "completions/mean_terminated_length": 845.6437377929688, "completions/min_length": 0.0, "completions/min_terminated_length": 372.0, "epoch": 0.112, "grad_norm": 0.024312181398272514, "kl": 0.041378021240234375, "learning_rate": 2.6388888888888893e-06, "loss": -0.1082, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.017779836431145668, "mask/share_reasoning": 0.8484976291656494, "mask/share_step_conf": 0.09856632351875305, "num_tokens": 32305883.0, "reward": 0.49662870168685913, "reward_std": 0.22006474435329437, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.601526141166687, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.08938749134540558, "step": 105 }, { "adv/mean_abs_final_conf": 0.6190087199211121, "adv/mean_abs_reasoning": 0.4825575649738312, "adv/mean_abs_step_conf": 0.5808240175247192, "adv/ratio_final_to_reasoning": 1.2827665854843258, "adv/ratio_step_to_reasoning": 1.2036367465428026, "adv/std_final_conf": 0.8221418857574463, "adv/std_reasoning": 0.7393922209739685, "adv/std_step_conf": 0.8415817022323608, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7253136933987998, "calib/avg_num_step_conf": 6.96875, "calib/ece": 0.3819591836734696, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9673469387755103, "calib/gap": 0.07162234042553162, "calib/mean_conf": 0.957469387755102, "calib/mu_c": 0.9878723404255318, "calib/mu_w": 0.9162500000000002, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3819591836734696, "calib/std_conf": 0.1596223990778871, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9103830227743271, "calib/step_q_c_n": 966.0, "calib/step_q_gap": 0.007106739155745112, "calib/step_q_w": 0.903276283618582, "calib/step_q_w_n": 818.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2982.0, "completions/max_terminated_length": 2982.0, "completions/mean_length": 741.48046875, "completions/mean_terminated_length": 759.2760620117188, "completions/min_length": 0.0, "completions/min_terminated_length": 456.0, "epoch": 0.11306666666666666, "grad_norm": 0.029701529070734978, "kl": 0.04486083984375, "learning_rate": 2.6111111111111113e-06, "loss": -0.0522, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019311219453811646, "mask/share_reasoning": 0.8597928285598755, "mask/share_step_conf": 0.09745845943689346, "num_tokens": 32600286.0, "reward": 0.4921899735927582, "reward_std": 0.16975721716880798, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5927125215530396, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.09010495245456696, "step": 106 }, { "adv/mean_abs_final_conf": 0.5777794122695923, "adv/mean_abs_reasoning": 0.44087734818458557, "adv/mean_abs_step_conf": 0.6997914910316467, "adv/ratio_final_to_reasoning": 1.3105218824435698, "adv/ratio_step_to_reasoning": 1.5872702326694714, "adv/std_final_conf": 0.7838694453239441, "adv/std_reasoning": 0.7206186652183533, "adv/std_step_conf": 0.9182903170585632, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6142122235872236, "calib/avg_num_step_conf": 7.91796875, "calib/ece": 0.2766800000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.976, "calib/gap": 0.02731879606879628, "calib/mean_conf": 0.9718000000000002, "calib/mu_c": 0.9798863636363638, "calib/mu_w": 0.9525675675675676, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2722400000000001, "calib/std_conf": 0.10596395613603711, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9099112426035503, "calib/step_q_c_n": 1352.0, "calib/step_q_gap": -0.002355424063116396, "calib/step_q_w": 0.9122666666666667, "calib/step_q_w_n": 675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 751.265625, "completions/mean_terminated_length": 760.1739501953125, "completions/min_length": 0.0, "completions/min_terminated_length": 335.0, "epoch": 0.11413333333333334, "grad_norm": 0.035284675657749176, "kl": 0.0480804443359375, "learning_rate": 2.5833333333333337e-06, "loss": 0.019, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.019474536180496216, "mask/share_reasoning": 0.8588442802429199, "mask/share_step_conf": 0.10996241867542267, "num_tokens": 32897226.0, "reward": 0.5779565572738647, "reward_std": 0.19086697697639465, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7031800746917725, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.1199205219745636, "step": 107 }, { "adv/mean_abs_final_conf": 0.5465351939201355, "adv/mean_abs_reasoning": 0.4012470543384552, "adv/mean_abs_step_conf": 0.6311435699462891, "adv/ratio_final_to_reasoning": 1.3620914795778876, "adv/ratio_step_to_reasoning": 1.5729550238988528, "adv/std_final_conf": 0.7537654638290405, "adv/std_reasoning": 0.6816881895065308, "adv/std_step_conf": 0.8558666110038757, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6519782569951881, "calib/avg_num_step_conf": 7.51171875, "calib/ece": 0.212798353909465, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9465020576131687, "calib/gap": 0.10581714489395821, "calib/mean_conf": 0.9513991769547326, "calib/mu_c": 0.9783977900552485, "calib/mu_w": 0.8725806451612903, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2096707818930041, "calib/std_conf": 0.16139979602125026, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9119541080680978, "calib/step_q_c_n": 1351.0, "calib/step_q_gap": 0.008947115061104904, "calib/step_q_w": 0.9030069930069929, "calib/step_q_w_n": 572.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 789.66015625, "completions/mean_terminated_length": 808.612060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 310.0, "epoch": 0.1152, "grad_norm": 0.03416075184941292, "kl": 0.04317474365234375, "learning_rate": 2.5555555555555557e-06, "loss": -0.025, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018498994410037994, "mask/share_reasoning": 0.8550485372543335, "mask/share_step_conf": 0.1030149906873703, "num_tokens": 33202611.0, "reward": 0.5849625468254089, "reward_std": 0.15807197988033295, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7417808771133423, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.09689418971538544, "step": 108 }, { "adv/mean_abs_final_conf": 0.5549604892730713, "adv/mean_abs_reasoning": 0.4143076539039612, "adv/mean_abs_step_conf": 0.4917936325073242, "adv/ratio_final_to_reasoning": 1.33948886544518, "adv/ratio_step_to_reasoning": 1.1870252163415853, "adv/std_final_conf": 0.7719733715057373, "adv/std_reasoning": 0.7013838291168213, "adv/std_step_conf": 0.7514364719390869, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.7433721014231021, "calib/avg_num_step_conf": 8.18359375, "calib/ece": 0.42466101694915265, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.9152542372881356, "calib/gap": 0.04861373979628669, "calib/mean_conf": 0.9398305084745763, "calib/mu_c": 0.962283464566929, "calib/mu_w": 0.9136697247706423, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41317796610169505, "calib/std_conf": 0.18462276266203417, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9148893360160967, "calib/step_q_c_n": 994.0, "calib/step_q_gap": 0.016410680248612475, "calib/step_q_w": 0.8984786557674842, "calib/step_q_w_n": 1101.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 833.2890625, "completions/mean_terminated_length": 860.1693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 408.0, "epoch": 0.11626666666666667, "grad_norm": 0.047447409480810165, "kl": 0.041332244873046875, "learning_rate": 2.5277777777777778e-06, "loss": -0.0616, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.017474649474024773, "mask/share_reasoning": 0.8517318964004517, "mask/share_step_conf": 0.09954346716403961, "num_tokens": 33520533.0, "reward": 0.4513227939605713, "reward_std": 0.12802022695541382, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.534849226474762, "rewards/format_reward_step": 0.921875, "rewards/step_margin_reward": 0.08420264720916748, "step": 109 }, { "adv/mean_abs_final_conf": 0.6413172483444214, "adv/mean_abs_reasoning": 0.46961185336112976, "adv/mean_abs_step_conf": 0.6421517133712769, "adv/ratio_final_to_reasoning": 1.3656325830669584, "adv/ratio_step_to_reasoning": 1.3674095080335729, "adv/std_final_conf": 0.8469480872154236, "adv/std_reasoning": 0.739577054977417, "adv/std_step_conf": 0.8569457530975342, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6070996225877662, "calib/avg_num_step_conf": 8.15234375, "calib/ece": 0.3432377049180329, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9631147540983607, "calib/gap": 0.06185216834009821, "calib/mean_conf": 0.9556967213114756, "calib/mu_c": 0.979271523178808, "calib/mu_w": 0.9174193548387097, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34004098360655755, "calib/std_conf": 0.16116102269206362, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9149534292972059, "calib/step_q_c_n": 1181.0, "calib/step_q_gap": 0.00016314232148839647, "calib/step_q_w": 0.9147902869757175, "calib/step_q_w_n": 906.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2732.0, "completions/max_terminated_length": 2732.0, "completions/mean_length": 712.703125, "completions/mean_terminated_length": 729.8080444335938, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.11733333333333333, "grad_norm": 0.024848058819770813, "kl": 0.04810333251953125, "learning_rate": 2.5e-06, "loss": -0.0532, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.019908972084522247, "mask/share_reasoning": 0.8381432294845581, "mask/share_step_conf": 0.11851028352975845, "num_tokens": 33807905.0, "reward": 0.5132585763931274, "reward_std": 0.20372839272022247, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6232175827026367, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.0939246416091919, "step": 110 }, { "adv/mean_abs_final_conf": 0.5431433916091919, "adv/mean_abs_reasoning": 0.4445968270301819, "adv/mean_abs_step_conf": 0.611305296421051, "adv/ratio_final_to_reasoning": 1.2216537739085576, "adv/ratio_step_to_reasoning": 1.3749654951530996, "adv/std_final_conf": 0.7826384902000427, "adv/std_reasoning": 0.720780611038208, "adv/std_step_conf": 0.8417488932609558, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.6398247710075666, "calib/avg_num_step_conf": 7.79296875, "calib/ece": 0.29635593220338974, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.8940677966101694, "calib/gap": 0.0925655117483074, "calib/mean_conf": 0.9247457627118645, "calib/mu_c": 0.956516129032258, "calib/mu_w": 0.8639506172839506, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.28216101694915247, "calib/std_conf": 0.21463888421634741, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9072383465259455, "calib/step_q_c_n": 1137.0, "calib/step_q_gap": -0.012516898229299356, "calib/step_q_w": 0.9197552447552448, "calib/step_q_w_n": 858.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 746.828125, "completions/mean_terminated_length": 786.7818603515625, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.1184, "grad_norm": 0.029776204377412796, "kl": 0.0422210693359375, "learning_rate": 2.4722222222222226e-06, "loss": -0.1322, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.01845252886414528, "mask/share_reasoning": 0.8304966688156128, "mask/share_step_conf": 0.10026955604553223, "num_tokens": 34106501.0, "reward": 0.5177097916603088, "reward_std": 0.1772458851337433, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6399652361869812, "rewards/format_reward_step": 0.91796875, "rewards/step_margin_reward": 0.09076686948537827, "step": 111 }, { "adv/mean_abs_final_conf": 0.5834466814994812, "adv/mean_abs_reasoning": 0.45243269205093384, "adv/mean_abs_step_conf": 0.592110276222229, "adv/ratio_final_to_reasoning": 1.289576751968662, "adv/ratio_step_to_reasoning": 1.3087256659949114, "adv/std_final_conf": 0.8077910542488098, "adv/std_reasoning": 0.7207841277122498, "adv/std_step_conf": 0.8251468539237976, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.7726471883717515, "calib/avg_num_step_conf": 8.78125, "calib/ece": 0.33729957805907174, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.8185654008438819, "calib/gap": 0.1558640434591101, "calib/mean_conf": 0.8708016877637129, "calib/mu_c": 0.935251798561151, "calib/mu_w": 0.779387755102041, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3108016877637131, "calib/std_conf": 0.2917092628038922, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9141239892183289, "calib/step_q_c_n": 1113.0, "calib/step_q_gap": -0.0030037640856358516, "calib/step_q_w": 0.9171277533039648, "calib/step_q_w_n": 1135.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2227.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 764.14453125, "completions/mean_terminated_length": 815.0875244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 428.0, "epoch": 0.11946666666666667, "grad_norm": 0.057774025946855545, "kl": 0.0446014404296875, "learning_rate": 2.4444444444444447e-06, "loss": -0.1685, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.017471184954047203, "mask/share_reasoning": 0.8221666216850281, "mask/share_step_conf": 0.09786219894886017, "num_tokens": 34410042.0, "reward": 0.4895625710487366, "reward_std": 0.18455946445465088, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6176429986953735, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.06773216277360916, "step": 112 }, { "adv/mean_abs_final_conf": 0.597976565361023, "adv/mean_abs_reasoning": 0.3947388529777527, "adv/mean_abs_step_conf": 0.6773931384086609, "adv/ratio_final_to_reasoning": 1.5148662485340014, "adv/ratio_step_to_reasoning": 1.716053875362602, "adv/std_final_conf": 0.8128007650375366, "adv/std_reasoning": 0.7012818455696106, "adv/std_step_conf": 0.8878843188285828, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7030959752321981, "calib/avg_num_step_conf": 8.203125, "calib/ece": 0.29709677419354846, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8387096774193549, "calib/gap": 0.16902786377708978, "calib/mean_conf": 0.8870161290322581, "calib/mu_c": 0.951764705882353, "calib/mu_w": 0.7827368421052632, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28358870967741945, "calib/std_conf": 0.2722841845557857, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9182191780821918, "calib/step_q_c_n": 1241.0, "calib/step_q_gap": 0.003930133793147439, "calib/step_q_w": 0.9142890442890443, "calib/step_q_w_n": 858.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3064.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 739.1796875, "completions/mean_terminated_length": 753.9044189453125, "completions/min_length": 0.0, "completions/min_terminated_length": 421.0, "epoch": 0.12053333333333334, "grad_norm": 0.04569617658853531, "kl": 0.05890655517578125, "learning_rate": 2.4166666666666667e-06, "loss": 0.0051, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01979427970945835, "mask/share_reasoning": 0.8440899848937988, "mask/share_step_conf": 0.11658456176519394, "num_tokens": 34704472.0, "reward": 0.5492851138114929, "reward_std": 0.15636232495307922, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6708121299743652, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.1152581125497818, "step": 113 }, { "adv/mean_abs_final_conf": 0.5460219979286194, "adv/mean_abs_reasoning": 0.3543413281440735, "adv/mean_abs_step_conf": 0.5728147029876709, "adv/ratio_final_to_reasoning": 1.5409492332957826, "adv/ratio_step_to_reasoning": 1.6165619347533946, "adv/std_final_conf": 0.7545836567878723, "adv/std_reasoning": 0.6186301708221436, "adv/std_step_conf": 0.8066951036453247, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7095818520399526, "calib/avg_num_step_conf": 8.25390625, "calib/ece": 0.2344081632653063, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8081632653061225, "calib/gap": 0.19621466057220238, "calib/mean_conf": 0.8503265306122448, "calib/mu_c": 0.9031843575418994, "calib/mu_w": 0.706969696969697, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1770612244897961, "calib/std_conf": 0.3217602978866824, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9194342105263158, "calib/step_q_c_n": 1520.0, "calib/step_q_gap": 0.021440955888879065, "calib/step_q_w": 0.8979932546374367, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 755.85546875, "completions/mean_terminated_length": 773.9960327148438, "completions/min_length": 0.0, "completions/min_terminated_length": 394.0, "epoch": 0.1216, "grad_norm": 0.04572848975658417, "kl": 0.05167388916015625, "learning_rate": 2.388888888888889e-06, "loss": -0.0136, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.019356999546289444, "mask/share_reasoning": 0.838664174079895, "mask/share_step_conf": 0.11854131519794464, "num_tokens": 35002995.0, "reward": 0.5850042104721069, "reward_std": 0.16218672692775726, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7297917604446411, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.10896667838096619, "step": 114 }, { "adv/mean_abs_final_conf": 0.7146841287612915, "adv/mean_abs_reasoning": 0.5172688961029053, "adv/mean_abs_step_conf": 0.6105026602745056, "adv/ratio_final_to_reasoning": 1.381649146402788, "adv/ratio_step_to_reasoning": 1.1802423553281898, "adv/std_final_conf": 0.8772038817405701, "adv/std_reasoning": 0.7928091287612915, "adv/std_step_conf": 0.8541765809059143, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6585798816568047, "calib/avg_num_step_conf": 8.765625, "calib/ece": 0.3560323886639676, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6963562753036437, "calib/gap": 0.1992564102564105, "calib/mean_conf": 0.7507692307692306, "calib/mu_c": 0.8451538461538463, "calib/mu_w": 0.6458974358974358, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29024291497975707, "calib/std_conf": 0.4010005393706118, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9103193612774451, "calib/step_q_c_n": 1002.0, "calib/step_q_gap": -0.009728947901298968, "calib/step_q_w": 0.920048309178744, "calib/step_q_w_n": 1242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2246.0, "completions/max_terminated_length": 2246.0, "completions/mean_length": 752.640625, "completions/mean_terminated_length": 773.7991943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 409.0, "epoch": 0.12266666666666666, "grad_norm": 0.0434345006942749, "kl": 0.050384521484375, "learning_rate": 2.361111111111111e-06, "loss": -0.0688, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018630973994731903, "mask/share_reasoning": 0.8468644618988037, "mask/share_step_conf": 0.10716083645820618, "num_tokens": 35300935.0, "reward": 0.49607956409454346, "reward_std": 0.2259339690208435, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6164039373397827, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.0812239944934845, "step": 115 }, { "adv/mean_abs_final_conf": 0.6588397026062012, "adv/mean_abs_reasoning": 0.5030173063278198, "adv/mean_abs_step_conf": 0.6287224292755127, "adv/ratio_final_to_reasoning": 1.3097754178995002, "adv/ratio_step_to_reasoning": 1.2499021830190669, "adv/std_final_conf": 0.849608838558197, "adv/std_reasoning": 0.7755253314971924, "adv/std_step_conf": 0.8579614162445068, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6190443213296398, "calib/avg_num_step_conf": 8.8984375, "calib/ece": 0.3493522267206478, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5506072874493927, "calib/gap": 0.18693421052631565, "calib/mean_conf": 0.6146153846153848, "calib/mu_c": 0.6865131578947368, "calib/mu_w": 0.4995789473684211, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1742914979757085, "calib/std_conf": 0.456469192088652, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.917745803357314, "calib/step_q_c_n": 1251.0, "calib/step_q_gap": 0.005545219131413259, "calib/step_q_w": 0.9122005842259008, "calib/step_q_w_n": 1027.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2718.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 806.85546875, "completions/mean_terminated_length": 822.9282836914062, "completions/min_length": 0.0, "completions/min_terminated_length": 410.0, "epoch": 0.12373333333333333, "grad_norm": 0.042107515037059784, "kl": 0.05181121826171875, "learning_rate": 2.3333333333333336e-06, "loss": -0.0284, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017941061407327652, "mask/share_reasoning": 0.8542428016662598, "mask/share_step_conf": 0.10828490555286407, "num_tokens": 35612010.0, "reward": 0.520144522190094, "reward_std": 0.1907690167427063, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6205867528915405, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.10876478254795074, "step": 116 }, { "adv/mean_abs_final_conf": 0.7517051696777344, "adv/mean_abs_reasoning": 0.5047839283943176, "adv/mean_abs_step_conf": 0.6694271564483643, "adv/ratio_final_to_reasoning": 1.489162248229368, "adv/ratio_step_to_reasoning": 1.3261657489329448, "adv/std_final_conf": 0.9063663482666016, "adv/std_reasoning": 0.7577282786369324, "adv/std_step_conf": 0.8747488856315613, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6470985611024792, "calib/avg_num_step_conf": 7.91015625, "calib/ece": 0.34774590163934427, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5614754098360656, "calib/gap": 0.2177713976896576, "calib/mean_conf": 0.6325819672131148, "calib/mu_c": 0.7334351145038169, "calib/mu_w": 0.5156637168141593, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2217213114754098, "calib/std_conf": 0.4505464503656794, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.918167330677291, "calib/step_q_c_n": 1004.0, "calib/step_q_gap": 0.010867624506869733, "calib/step_q_w": 0.9072997061704212, "calib/step_q_w_n": 1021.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 757.97265625, "completions/mean_terminated_length": 779.2810668945312, "completions/min_length": 0.0, "completions/min_terminated_length": 428.0, "epoch": 0.1248, "grad_norm": 0.04477640986442566, "kl": 0.05652618408203125, "learning_rate": 2.305555555555556e-06, "loss": -0.0612, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01890769973397255, "mask/share_reasoning": 0.8459818363189697, "mask/share_step_conf": 0.10776673257350922, "num_tokens": 35912651.0, "reward": 0.5056103467941284, "reward_std": 0.23193931579589844, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.617074191570282, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.10195904970169067, "step": 117 }, { "adv/mean_abs_final_conf": 0.7256252765655518, "adv/mean_abs_reasoning": 0.40705668926239014, "adv/mean_abs_step_conf": 0.517852246761322, "adv/ratio_final_to_reasoning": 1.7826147947118276, "adv/ratio_step_to_reasoning": 1.2721870452484143, "adv/std_final_conf": 0.8958174586296082, "adv/std_reasoning": 0.7204546928405762, "adv/std_step_conf": 0.7915481328964233, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6148611111111111, "calib/avg_num_step_conf": 8.8984375, "calib/ece": 0.3820731707317074, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.43089430894308944, "calib/gap": 0.18881250000000016, "calib/mean_conf": 0.47898373983739845, "calib/mu_c": 0.5526666666666668, "calib/mu_w": 0.3638541666666666, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12565040650406506, "calib/std_conf": 0.46938225245805687, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9042331768388108, "calib/step_q_c_n": 1278.0, "calib/step_q_gap": 0.03996117683881084, "calib/step_q_w": 0.8642719999999999, "calib/step_q_w_n": 1000.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3064.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 800.22265625, "completions/mean_terminated_length": 819.4280395507812, "completions/min_length": 0.0, "completions/min_terminated_length": 415.0, "epoch": 0.12586666666666665, "grad_norm": 0.06547056883573532, "kl": 0.05860137939453125, "learning_rate": 2.277777777777778e-06, "loss": -0.0507, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018557041883468628, "mask/share_reasoning": 0.8442280888557434, "mask/share_step_conf": 0.11377738416194916, "num_tokens": 36221516.0, "reward": 0.5010180473327637, "reward_std": 0.19814561307430267, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.59047931432724, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.10218191146850586, "step": 118 }, { "adv/mean_abs_final_conf": 0.7921119928359985, "adv/mean_abs_reasoning": 0.5211715698242188, "adv/mean_abs_step_conf": 0.5580976009368896, "adv/ratio_final_to_reasoning": 1.5198680025910907, "adv/ratio_step_to_reasoning": 1.0708519674722958, "adv/std_final_conf": 0.9064111709594727, "adv/std_reasoning": 0.7577606439590454, "adv/std_step_conf": 0.7907903790473938, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6426352210139812, "calib/avg_num_step_conf": 8.0546875, "calib/ece": 0.34921810699588474, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.39094650205761317, "calib/gap": 0.23328767123287686, "calib/mean_conf": 0.48016460905349795, "calib/mu_c": 0.5732876712328768, "calib/mu_w": 0.33999999999999997, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11427983539094644, "calib/std_conf": 0.4640756706054439, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9131447746883989, "calib/step_q_c_n": 1043.0, "calib/step_q_gap": 0.09312514760302115, "calib/step_q_w": 0.8200196270853778, "calib/step_q_w_n": 1019.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2927.0, "completions/max_terminated_length": 2927.0, "completions/mean_length": 836.3203125, "completions/mean_terminated_length": 859.831298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 376.0, "epoch": 0.12693333333333334, "grad_norm": 1.6998445987701416, "kl": 1.7955398559570312, "learning_rate": 2.25e-06, "loss": -0.0452, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01771821826696396, "mask/share_reasoning": 0.8580576181411743, "mask/share_step_conf": 0.09688045084476471, "num_tokens": 36540678.0, "reward": 0.5004833936691284, "reward_std": 0.2134743481874466, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6017242074012756, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.09689880907535553, "step": 119 }, { "adv/mean_abs_final_conf": 0.7166979908943176, "adv/mean_abs_reasoning": 0.3981080949306488, "adv/mean_abs_step_conf": 0.591509222984314, "adv/ratio_final_to_reasoning": 1.8002597787396606, "adv/ratio_step_to_reasoning": 1.485800541401591, "adv/std_final_conf": 0.8850848078727722, "adv/std_reasoning": 0.6817933917045593, "adv/std_step_conf": 0.8239685297012329, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5817554557124519, "calib/avg_num_step_conf": 8.66796875, "calib/ece": 0.4027916666666666, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.36666666666666664, "calib/gap": 0.14995827984595628, "calib/mean_conf": 0.4697083333333333, "calib/mu_c": 0.5171951219512194, "calib/mu_w": 0.36723684210526314, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09458333333333332, "calib/std_conf": 0.45939035499658554, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9023473142016188, "calib/step_q_c_n": 1359.0, "calib/step_q_gap": -0.0190654764960555, "calib/step_q_w": 0.9214127906976743, "calib/step_q_w_n": 860.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 747.6015625, "completions/mean_terminated_length": 781.1672973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.128, "grad_norm": 0.13198651373386383, "kl": 0.0618438720703125, "learning_rate": 2.222222222222222e-06, "loss": -0.1479, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.018665369600057602, "mask/share_reasoning": 0.8298088312149048, "mask/share_step_conf": 0.10855702310800552, "num_tokens": 36838752.0, "reward": 0.4868927597999573, "reward_std": 0.19931383430957794, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.5547672510147095, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.10339336097240448, "step": 120 }, { "adv/mean_abs_final_conf": 0.7934297919273376, "adv/mean_abs_reasoning": 0.48773372173309326, "adv/mean_abs_step_conf": 0.6476555466651917, "adv/ratio_final_to_reasoning": 1.6267683708807263, "adv/ratio_step_to_reasoning": 1.3278875702172872, "adv/std_final_conf": 0.9295187592506409, "adv/std_reasoning": 0.739499032497406, "adv/std_step_conf": 0.8416915535926819, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6733449477351915, "calib/avg_num_step_conf": 8.109375, "calib/ece": 0.3030705394190871, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.3983402489626556, "calib/gap": 0.26758473234082986, "calib/mean_conf": 0.5517012448132781, "calib/mu_c": 0.6371951219512195, "calib/mu_w": 0.36961038961038967, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08713692946058094, "calib/std_conf": 0.4516626843728425, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9108955916473318, "calib/step_q_c_n": 1293.0, "calib/step_q_gap": -0.0034339102683771383, "calib/step_q_w": 0.9143295019157089, "calib/step_q_w_n": 783.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 821.71484375, "completions/mean_terminated_length": 851.6558837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.12906666666666666, "grad_norm": 0.11628983914852142, "kl": 0.05675506591796875, "learning_rate": 2.1944444444444445e-06, "loss": -0.0551, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017633387818932533, "mask/share_reasoning": 0.84507155418396, "mask/share_step_conf": 0.10213877260684967, "num_tokens": 37154167.0, "reward": 0.5350307822227478, "reward_std": 0.21541568636894226, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6386016011238098, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.11505372822284698, "step": 121 }, { "adv/mean_abs_final_conf": 0.6203608512878418, "adv/mean_abs_reasoning": 0.37967678904533386, "adv/mean_abs_step_conf": 0.5931355953216553, "adv/ratio_final_to_reasoning": 1.6339182936299272, "adv/ratio_step_to_reasoning": 1.562211892944644, "adv/std_final_conf": 0.8358514904975891, "adv/std_reasoning": 0.6612572073936462, "adv/std_step_conf": 0.8236783146858215, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7160174440091656, "calib/avg_num_step_conf": 8.3515625, "calib/ece": 0.30764227642276415, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6463414634146342, "calib/gap": 0.16523911597309482, "calib/mean_conf": 0.7708130081300814, "calib/mu_c": 0.8265644171779142, "calib/mu_w": 0.6613253012048194, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2079268292682926, "calib/std_conf": 0.37039221904401487, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9133486943164362, "calib/step_q_c_n": 1302.0, "calib/step_q_gap": 0.02909032111069454, "calib/step_q_w": 0.8842583732057416, "calib/step_q_w_n": 836.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1662.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 751.8515625, "completions/mean_terminated_length": 772.9879150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.13013333333333332, "grad_norm": 0.046760763972997665, "kl": 0.062774658203125, "learning_rate": 2.166666666666667e-06, "loss": -0.0863, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018509387969970703, "mask/share_reasoning": 0.8496177196502686, "mask/share_step_conf": 0.10452916473150253, "num_tokens": 37453985.0, "reward": 0.5595579147338867, "reward_std": 0.14614179730415344, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.67402184009552, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.12556278705596924, "step": 122 }, { "adv/mean_abs_final_conf": 0.7070633172988892, "adv/mean_abs_reasoning": 0.5265301465988159, "adv/mean_abs_step_conf": 0.6078084707260132, "adv/ratio_final_to_reasoning": 1.3428733793615593, "adv/ratio_step_to_reasoning": 1.1543659459049482, "adv/std_final_conf": 0.8862820267677307, "adv/std_reasoning": 0.7756414413452148, "adv/std_step_conf": 0.8420189619064331, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6118337444868057, "calib/avg_num_step_conf": 8.0078125, "calib/ece": 0.33516806722689063, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.6470588235294118, "calib/gap": 0.047467294610151756, "calib/mean_conf": 0.8623949579831933, "calib/mu_c": 0.8805442176870748, "calib/mu_w": 0.833076923076923, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.28995798319327726, "calib/std_conf": 0.2610075409737548, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9046708746618575, "calib/step_q_c_n": 1109.0, "calib/step_q_gap": 0.0014190149381592132, "calib/step_q_w": 0.9032518597236983, "calib/step_q_w_n": 941.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 823.140625, "completions/mean_terminated_length": 863.6229248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.1312, "grad_norm": 0.04716081544756889, "kl": 0.05249786376953125, "learning_rate": 2.138888888888889e-06, "loss": -0.1441, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.016694001853466034, "mask/share_reasoning": 0.8423705101013184, "mask/share_step_conf": 0.0940604954957962, "num_tokens": 37769997.0, "reward": 0.4957738220691681, "reward_std": 0.21281671524047852, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6119519472122192, "rewards/format_reward_step": 0.9296875, "rewards/step_margin_reward": 0.07803315669298172, "step": 123 }, { "adv/mean_abs_final_conf": 0.616827130317688, "adv/mean_abs_reasoning": 0.3860151171684265, "adv/mean_abs_step_conf": 0.660730242729187, "adv/ratio_final_to_reasoning": 1.597935165965413, "adv/ratio_step_to_reasoning": 1.7116693449103872, "adv/std_final_conf": 0.8458890914916992, "adv/std_reasoning": 0.6816806793212891, "adv/std_step_conf": 0.8720849752426147, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6382839721254355, "calib/avg_num_step_conf": 7.890625, "calib/ece": 0.27410887096774195, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7620967741935484, "calib/gap": 0.11188704994192833, "calib/mean_conf": 0.9016088709677419, "calib/mu_c": 0.9395060975609757, "calib/mu_w": 0.8276190476190474, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25721370967741936, "calib/std_conf": 0.2211914134249312, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9153998416468724, "calib/step_q_c_n": 1263.0, "calib/step_q_gap": 0.035835772954666445, "calib/step_q_w": 0.8795640686922059, "calib/step_q_w_n": 757.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2955.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 834.609375, "completions/mean_terminated_length": 847.857177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 414.0, "epoch": 0.13226666666666667, "grad_norm": 0.04040057957172394, "kl": 0.060516357421875, "learning_rate": 2.1111111111111114e-06, "loss": -0.0405, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017735231667757034, "mask/share_reasoning": 0.8659318089485168, "mask/share_step_conf": 0.10070796310901642, "num_tokens": 38090473.0, "reward": 0.5623254179954529, "reward_std": 0.16345149278640747, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6955687999725342, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.10798833519220352, "step": 124 }, { "adv/mean_abs_final_conf": 0.6533343195915222, "adv/mean_abs_reasoning": 0.5057482719421387, "adv/mean_abs_step_conf": 0.6745232343673706, "adv/ratio_final_to_reasoning": 1.291817205983985, "adv/ratio_step_to_reasoning": 1.3337133743969392, "adv/std_final_conf": 0.8712226748466492, "adv/std_reasoning": 0.7929260730743408, "adv/std_step_conf": 0.9033569693565369, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6195589919816724, "calib/avg_num_step_conf": 8.453125, "calib/ece": 0.33647302904564325, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8340248962655602, "calib/gap": 0.08465349369988517, "calib/mean_conf": 0.9339834024896266, "calib/mu_c": 0.9680555555555554, "calib/mu_w": 0.8834020618556703, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33647302904564325, "calib/std_conf": 0.16902669795268496, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9162532299741601, "calib/step_q_c_n": 1161.0, "calib/step_q_gap": 0.004787626783731347, "calib/step_q_w": 0.9114656031904288, "calib/step_q_w_n": 1003.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2897.0, "completions/max_terminated_length": 2897.0, "completions/mean_length": 828.75, "completions/mean_terminated_length": 862.43896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.13333333333333333, "grad_norm": 0.03484358265995979, "kl": 0.05750274658203125, "learning_rate": 2.0833333333333334e-06, "loss": -0.1218, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017188692465424538, "mask/share_reasoning": 0.8439902067184448, "mask/share_step_conf": 0.0997585728764534, "num_tokens": 38407441.0, "reward": 0.4982278347015381, "reward_std": 0.20007047057151794, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6198605298995972, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.07581391930580139, "step": 125 }, { "adv/mean_abs_final_conf": 0.5603800415992737, "adv/mean_abs_reasoning": 0.44012451171875, "adv/mean_abs_step_conf": 0.48066073656082153, "adv/ratio_final_to_reasoning": 1.2732307033091803, "adv/ratio_step_to_reasoning": 1.0921017206784773, "adv/std_final_conf": 0.8002556562423706, "adv/std_reasoning": 0.7015246152877808, "adv/std_step_conf": 0.7370073795318604, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7394120065789473, "calib/avg_num_step_conf": 7.82421875, "calib/ece": 0.3948760330578513, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.8181818181818182, "calib/gap": 0.12361019736842116, "calib/mean_conf": 0.923801652892562, "calib/mu_c": 0.9820312500000001, "calib/mu_w": 0.858421052631579, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3948760330578513, "calib/std_conf": 0.19001633769839177, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9115487571701721, "calib/step_q_c_n": 1046.0, "calib/step_q_gap": 0.02963653146484302, "calib/step_q_w": 0.8819122257053291, "calib/step_q_w_n": 957.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2913.0, "completions/max_terminated_length": 2913.0, "completions/mean_length": 812.41796875, "completions/mean_terminated_length": 831.916015625, "completions/min_length": 0.0, "completions/min_terminated_length": 453.0, "epoch": 0.1344, "grad_norm": 0.035288866609334946, "kl": 0.05678558349609375, "learning_rate": 2.0555555555555555e-06, "loss": -0.0092, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018142499029636383, "mask/share_reasoning": 0.8539996147155762, "mask/share_step_conf": 0.10442037135362625, "num_tokens": 38720884.0, "reward": 0.46661850810050964, "reward_std": 0.17263664305210114, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5864734649658203, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.05770111829042435, "step": 126 }, { "adv/mean_abs_final_conf": 0.5590083003044128, "adv/mean_abs_reasoning": 0.46475133299827576, "adv/mean_abs_step_conf": 0.604527473449707, "adv/ratio_final_to_reasoning": 1.2028116126059325, "adv/ratio_step_to_reasoning": 1.3007546843377205, "adv/std_final_conf": 0.8077696561813354, "adv/std_reasoning": 0.7576107382774353, "adv/std_step_conf": 0.8422495722770691, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5680446378439079, "calib/avg_num_step_conf": 8.1328125, "calib/ece": 0.39438174273858934, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.8713692946058091, "calib/gap": 0.04174747332959006, "calib/mean_conf": 0.9441742738589213, "calib/mu_c": 0.9621897810218978, "calib/mu_w": 0.9204423076923077, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3850456431535271, "calib/std_conf": 0.15941850609340694, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9184539711191336, "calib/step_q_c_n": 1108.0, "calib/step_q_gap": 0.06268189719716233, "calib/step_q_w": 0.8557720739219713, "calib/step_q_w_n": 974.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 755.1015625, "completions/mean_terminated_length": 785.7966918945312, "completions/min_length": 0.0, "completions/min_terminated_length": 461.0, "epoch": 0.13546666666666668, "grad_norm": 0.028602657839655876, "kl": 0.06744384765625, "learning_rate": 2.027777777777778e-06, "loss": -0.0554, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.01856895349919796, "mask/share_reasoning": 0.8355052471160889, "mask/share_step_conf": 0.10686329007148743, "num_tokens": 39017862.0, "reward": 0.4707579016685486, "reward_std": 0.18518038094043732, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5729377865791321, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.07248419523239136, "step": 127 }, { "adv/mean_abs_final_conf": 0.6069778203964233, "adv/mean_abs_reasoning": 0.5077593326568604, "adv/mean_abs_step_conf": 0.6239622831344604, "adv/ratio_final_to_reasoning": 1.1954045575497358, "adv/ratio_step_to_reasoning": 1.228854386328196, "adv/std_final_conf": 0.8155607581138611, "adv/std_reasoning": 0.7576378583908081, "adv/std_step_conf": 0.8576443791389465, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6337553879310345, "calib/avg_num_step_conf": 8.01953125, "calib/ece": 0.4188524590163935, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8114754098360656, "calib/gap": 0.06631734913793086, "calib/mean_conf": 0.9057377049180327, "calib/mu_c": 0.937265625, "calib/mu_w": 0.8709482758620691, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4000000000000001, "calib/std_conf": 0.22061545073482344, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9117813765182187, "calib/step_q_c_n": 988.0, "calib/step_q_gap": 0.013828324875026277, "calib/step_q_w": 0.8979530516431924, "calib/step_q_w_n": 1065.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 813.53515625, "completions/mean_terminated_length": 833.06005859375, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.13653333333333334, "grad_norm": 0.9542741179466248, "kl": 0.25077056884765625, "learning_rate": 2.0000000000000003e-06, "loss": -0.056, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018129844218492508, "mask/share_reasoning": 0.858094334602356, "mask/share_step_conf": 0.10033835470676422, "num_tokens": 39332791.0, "reward": 0.46389466524124146, "reward_std": 0.19403664767742157, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.562094509601593, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.0750698372721672, "step": 128 }, { "adv/mean_abs_final_conf": 0.5394831895828247, "adv/mean_abs_reasoning": 0.398083359003067, "adv/mean_abs_step_conf": 0.5923358201980591, "adv/ratio_final_to_reasoning": 1.3552015611350092, "adv/ratio_step_to_reasoning": 1.4879693079395853, "adv/std_final_conf": 0.7606017589569092, "adv/std_reasoning": 0.6613805294036865, "adv/std_step_conf": 0.8078027367591858, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.601674992653541, "calib/avg_num_step_conf": 8.953125, "calib/ece": 0.3150607287449395, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9392712550607287, "calib/gap": 0.013018660005877036, "calib/mean_conf": 0.9711740890688259, "calib/mu_c": 0.9755487804878048, "calib/mu_w": 0.9625301204819278, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.31113360323886663, "calib/std_conf": 0.07812995284174179, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9164421553090333, "calib/step_q_c_n": 1262.0, "calib/step_q_gap": -0.006295708768636721, "calib/step_q_w": 0.92273786407767, "calib/step_q_w_n": 1030.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 746.35546875, "completions/mean_terminated_length": 767.3373413085938, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.1376, "grad_norm": 0.029538558796048164, "kl": 0.071075439453125, "learning_rate": 1.9722222222222224e-06, "loss": -0.0328, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019183719530701637, "mask/share_reasoning": 0.8433767557144165, "mask/share_step_conf": 0.11009576916694641, "num_tokens": 39626242.0, "reward": 0.5408724546432495, "reward_std": 0.14939114451408386, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.650418758392334, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.11179494857788086, "step": 129 }, { "adv/mean_abs_final_conf": 0.5202195048332214, "adv/mean_abs_reasoning": 0.3178079426288605, "adv/mean_abs_step_conf": 0.5504122972488403, "adv/ratio_final_to_reasoning": 1.636899004254086, "adv/ratio_step_to_reasoning": 1.731902269955593, "adv/std_final_conf": 0.758277177810669, "adv/std_reasoning": 0.5961102247238159, "adv/std_step_conf": 0.7907390594482422, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6473863636363637, "calib/avg_num_step_conf": 8.5078125, "calib/ece": 0.3219591836734694, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9142857142857143, "calib/gap": 0.0012575757575759017, "calib/mean_conf": 0.9553469387755102, "calib/mu_c": 0.9557575757575757, "calib/mu_w": 0.9544999999999998, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3019183673469388, "calib/std_conf": 0.14168800894353764, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9155015432098765, "calib/step_q_c_n": 1296.0, "calib/step_q_gap": -0.0016639896699420786, "calib/step_q_w": 0.9171655328798186, "calib/step_q_w_n": 882.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 730.03515625, "completions/mean_terminated_length": 735.783447265625, "completions/min_length": 0.0, "completions/min_terminated_length": 372.0, "epoch": 0.13866666666666666, "grad_norm": 0.021633949130773544, "kl": 0.08086395263671875, "learning_rate": 1.944444444444445e-06, "loss": -0.0076, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.020096562802791595, "mask/share_reasoning": 0.8546808958053589, "mask/share_step_conf": 0.11741002649068832, "num_tokens": 39918419.0, "reward": 0.5351709127426147, "reward_std": 0.13933923840522766, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6518476009368896, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.09740039706230164, "step": 130 }, { "adv/mean_abs_final_conf": 0.5365281105041504, "adv/mean_abs_reasoning": 0.3920038640499115, "adv/mean_abs_step_conf": 0.5234619379043579, "adv/ratio_final_to_reasoning": 1.3686806679942254, "adv/ratio_step_to_reasoning": 1.3353489235955813, "adv/std_final_conf": 0.7930212616920471, "adv/std_reasoning": 0.7205480933189392, "adv/std_step_conf": 0.7906890511512756, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7028908679672038, "calib/avg_num_step_conf": 8.67578125, "calib/ece": 0.4771966527196655, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.8242677824267782, "calib/gap": 0.09408326265196487, "calib/mean_conf": 0.9290794979079499, "calib/mu_c": 0.9806481481481482, "calib/mu_w": 0.8865648854961833, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4771966527196655, "calib/std_conf": 0.16774404552707597, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9052445961319682, "calib/step_q_c_n": 879.0, "calib/step_q_gap": 0.007263970200522496, "calib/step_q_w": 0.8979806259314457, "calib/step_q_w_n": 1342.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2944.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 775.9140625, "completions/mean_terminated_length": 807.4552612304688, "completions/min_length": 0.0, "completions/min_terminated_length": 468.0, "epoch": 0.13973333333333332, "grad_norm": 0.03940298780798912, "kl": 0.08701324462890625, "learning_rate": 1.916666666666667e-06, "loss": -0.058, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.018266182392835617, "mask/share_reasoning": 0.8353662490844727, "mask/share_step_conf": 0.10730510950088501, "num_tokens": 40223261.0, "reward": 0.4245801568031311, "reward_std": 0.14697477221488953, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.5070035457611084, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.07106305658817291, "step": 131 }, { "adv/mean_abs_final_conf": 0.5450819134712219, "adv/mean_abs_reasoning": 0.48999106884002686, "adv/mean_abs_step_conf": 0.758597731590271, "adv/ratio_final_to_reasoning": 1.1124323444539788, "adv/ratio_step_to_reasoning": 1.5481868544790545, "adv/std_final_conf": 0.7662321925163269, "adv/std_reasoning": 0.7578213214874268, "adv/std_step_conf": 0.9190864562988281, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.715741507870754, "calib/avg_num_step_conf": 8.42578125, "calib/ece": 0.2532365145228216, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.9045643153526971, "calib/gap": 0.08132062966031495, "calib/mean_conf": 0.958630705394191, "calib/mu_c": 0.9825882352941178, "calib/mu_w": 0.9012676056338028, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2532365145228216, "calib/std_conf": 0.11600987461154433, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9107216494845363, "calib/step_q_c_n": 1358.0, "calib/step_q_gap": 0.018368708308065695, "calib/step_q_w": 0.8923529411764706, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2837.0, "completions/max_terminated_length": 2837.0, "completions/mean_length": 751.55078125, "completions/mean_terminated_length": 775.7943115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 330.0, "epoch": 0.1408, "grad_norm": 0.02585168555378914, "kl": 0.0943603515625, "learning_rate": 1.888888888888889e-06, "loss": -0.1164, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01907270774245262, "mask/share_reasoning": 0.8382309675216675, "mask/share_step_conf": 0.11144635081291199, "num_tokens": 40521250.0, "reward": 0.567676305770874, "reward_std": 0.21228492259979248, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6992738246917725, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.11654753983020782, "step": 132 }, { "adv/mean_abs_final_conf": 0.7107321619987488, "adv/mean_abs_reasoning": 0.6219977140426636, "adv/mean_abs_step_conf": 0.7526808381080627, "adv/ratio_final_to_reasoning": 1.14266040847539, "adv/ratio_step_to_reasoning": 1.210102257797744, "adv/std_final_conf": 0.876602828502655, "adv/std_reasoning": 0.8432193994522095, "adv/std_step_conf": 0.9198958277702332, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6375985360360361, "calib/avg_num_step_conf": 8.67578125, "calib/ece": 0.41757322175732225, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.9037656903765691, "calib/gap": 0.06002463400900915, "calib/mean_conf": 0.9531380753138076, "calib/mu_c": 0.9810156250000002, "calib/mu_w": 0.920990990990991, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41757322175732225, "calib/std_conf": 0.1353091368872253, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9036509528585758, "calib/step_q_c_n": 997.0, "calib/step_q_gap": -0.015785321651228257, "calib/step_q_w": 0.919436274509804, "calib/step_q_w_n": 1224.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 843.09765625, "completions/mean_terminated_length": 873.8178100585938, "completions/min_length": 0.0, "completions/min_terminated_length": 324.0, "epoch": 0.14186666666666667, "grad_norm": 0.03142853453755379, "kl": 0.0968475341796875, "learning_rate": 1.8611111111111113e-06, "loss": -0.0891, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.016789088025689125, "mask/share_reasoning": 0.8545149564743042, "mask/share_step_conf": 0.09353969991207123, "num_tokens": 40843427.0, "reward": 0.4552901089191437, "reward_std": 0.2543225884437561, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5454761981964111, "rewards/format_reward_step": 0.9296875, "rewards/step_margin_reward": 0.07916656881570816, "step": 133 }, { "adv/mean_abs_final_conf": 0.7164932489395142, "adv/mean_abs_reasoning": 0.5575861930847168, "adv/mean_abs_step_conf": 0.6128450632095337, "adv/ratio_final_to_reasoning": 1.284991016322841, "adv/ratio_step_to_reasoning": 1.0991037274777375, "adv/std_final_conf": 0.8968268036842346, "adv/std_reasoning": 0.8100833296775818, "adv/std_step_conf": 0.8422779440879822, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6791346973572037, "calib/avg_num_step_conf": 8.4140625, "calib/ece": 0.37463888888888885, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.825, "calib/gap": 0.06022733731173635, "calib/mean_conf": 0.9400555555555556, "calib/mu_c": 0.9656521739130434, "calib/mu_w": 0.905424836601307, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3698472222222222, "calib/std_conf": 0.11987569436197411, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9041947926711668, "calib/step_q_c_n": 1037.0, "calib/step_q_gap": 0.0016164578457414924, "calib/step_q_w": 0.9025783348254253, "calib/step_q_w_n": 1117.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 833.6171875, "completions/mean_terminated_length": 857.0521850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.14293333333333333, "grad_norm": 0.04132247716188431, "kl": 0.09771728515625, "learning_rate": 1.8333333333333333e-06, "loss": -0.1138, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017063282430171967, "mask/share_reasoning": 0.8586098551750183, "mask/share_step_conf": 0.09698310494422913, "num_tokens": 41165785.0, "reward": 0.4857555627822876, "reward_std": 0.2255382090806961, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5929381847381592, "rewards/format_reward_step": 0.9296875, "rewards/step_margin_reward": 0.08482285588979721, "step": 134 }, { "adv/mean_abs_final_conf": 0.6448278427124023, "adv/mean_abs_reasoning": 0.504493236541748, "adv/mean_abs_step_conf": 0.7012089490890503, "adv/ratio_final_to_reasoning": 1.2781694500656429, "adv/ratio_step_to_reasoning": 1.3899273534285004, "adv/std_final_conf": 0.8628575205802917, "adv/std_reasoning": 0.7755191922187805, "adv/std_step_conf": 0.9047689437866211, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5996099290780141, "calib/avg_num_step_conf": 8.078125, "calib/ece": 0.3548360655737706, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9098360655737705, "calib/gap": 0.02526241134751772, "calib/mean_conf": 0.9589344262295083, "calib/mu_c": 0.9686666666666666, "calib/mu_w": 0.9434042553191488, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3495081967213116, "calib/std_conf": 0.09651983043547963, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9050951199338297, "calib/step_q_c_n": 1209.0, "calib/step_q_gap": 0.014839008175971724, "calib/step_q_w": 0.890256111757858, "calib/step_q_w_n": 859.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 820.71484375, "completions/mean_terminated_length": 830.4466552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 425.0, "epoch": 0.144, "grad_norm": 0.026004180312156677, "kl": 0.1096954345703125, "learning_rate": 1.8055555555555557e-06, "loss": -0.0105, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018162589520215988, "mask/share_reasoning": 0.8634178042411804, "mask/share_step_conf": 0.1067008525133133, "num_tokens": 41481768.0, "reward": 0.5172268152236938, "reward_std": 0.2088211178779602, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6170132756233215, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.10962791740894318, "step": 135 }, { "adv/mean_abs_final_conf": 0.691309928894043, "adv/mean_abs_reasoning": 0.6264765858650208, "adv/mean_abs_step_conf": 0.5632041692733765, "adv/ratio_final_to_reasoning": 1.1034888525634237, "adv/ratio_step_to_reasoning": 0.8990027432481302, "adv/std_final_conf": 0.9061822295188904, "adv/std_reasoning": 0.8748161792755127, "adv/std_step_conf": 0.8094339370727539, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.7285932721712538, "calib/avg_num_step_conf": 8.4921875, "calib/ece": 0.39331914893617037, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.7957446808510639, "calib/gap": 0.09732634338138924, "calib/mean_conf": 0.922, "calib/mu_c": 0.967142857142857, "calib/mu_w": 0.8698165137614677, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.38957446808510654, "calib/std_conf": 0.17193219841281568, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.9084947267497604, "calib/step_q_c_n": 1043.0, "calib/step_q_gap": 0.01617819270908838, "calib/step_q_w": 0.892316534040672, "calib/step_q_w_n": 1131.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2429.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 794.80078125, "completions/mean_terminated_length": 830.4856567382812, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.14506666666666668, "grad_norm": 0.03397757560014725, "kl": 0.118072509765625, "learning_rate": 1.777777777777778e-06, "loss": -0.187, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.017359191551804543, "mask/share_reasoning": 0.8350170850753784, "mask/share_step_conf": 0.10465492308139801, "num_tokens": 41793725.0, "reward": 0.463240385055542, "reward_std": 0.20691561698913574, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5662656426429749, "rewards/format_reward_step": 0.91015625, "rewards/step_margin_reward": 0.07896509021520615, "step": 136 }, { "adv/mean_abs_final_conf": 0.6818667054176331, "adv/mean_abs_reasoning": 0.5092138648033142, "adv/mean_abs_step_conf": 0.5852429866790771, "adv/ratio_final_to_reasoning": 1.339057619102745, "adv/ratio_step_to_reasoning": 1.1493068573557585, "adv/std_final_conf": 0.8761315941810608, "adv/std_reasoning": 0.775528609752655, "adv/std_step_conf": 0.841597318649292, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6587566844919784, "calib/avg_num_step_conf": 10.30859375, "calib/ece": 0.3525531914893619, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.8468085106382979, "calib/gap": 0.09921642899584082, "calib/mean_conf": 0.9248936170212767, "calib/mu_c": 0.9666911764705882, "calib/mu_w": 0.8674747474747474, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3493617021276597, "calib/std_conf": 0.17544196276741522, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8931043557168784, "calib/step_q_c_n": 1102.0, "calib/step_q_gap": 0.033820035612779464, "calib/step_q_w": 0.8592843201040989, "calib/step_q_w_n": 1537.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 761.203125, "completions/mean_terminated_length": 808.5809326171875, "completions/min_length": 0.0, "completions/min_terminated_length": 452.0, "epoch": 0.14613333333333334, "grad_norm": 0.04539915919303894, "kl": 0.1297454833984375, "learning_rate": 1.75e-06, "loss": -0.1721, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.017398172989487648, "mask/share_reasoning": 0.8216606378555298, "mask/share_step_conf": 0.10234744846820831, "num_tokens": 42095577.0, "reward": 0.4870629906654358, "reward_std": 0.17607498168945312, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5963343381881714, "rewards/format_reward_step": 0.91015625, "rewards/step_margin_reward": 0.08951041102409363, "step": 137 }, { "adv/mean_abs_final_conf": 0.6759997010231018, "adv/mean_abs_reasoning": 0.4773511588573456, "adv/mean_abs_step_conf": 0.6257574558258057, "adv/ratio_final_to_reasoning": 1.4161476063895375, "adv/ratio_step_to_reasoning": 1.3108954366502559, "adv/std_final_conf": 0.8648813366889954, "adv/std_reasoning": 0.7578563690185547, "adv/std_step_conf": 0.8564469218254089, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7212722298221614, "calib/avg_num_step_conf": 9.46875, "calib/ece": 0.30046025104602525, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.8326359832635983, "calib/gap": 0.08562395500835984, "calib/mean_conf": 0.9406276150627615, "calib/mu_c": 0.9714379084967322, "calib/mu_w": 0.8858139534883723, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30046025104602525, "calib/std_conf": 0.12576137758057834, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8965384615384616, "calib/step_q_c_n": 1248.0, "calib/step_q_gap": -0.004039769754055289, "calib/step_q_w": 0.9005782312925169, "calib/step_q_w_n": 1176.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 826.40234375, "completions/mean_terminated_length": 849.634521484375, "completions/min_length": 0.0, "completions/min_terminated_length": 337.0, "epoch": 0.1472, "grad_norm": 0.02785976231098175, "kl": 0.127197265625, "learning_rate": 1.7222222222222224e-06, "loss": -0.0407, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.017682313919067383, "mask/share_reasoning": 0.8470098972320557, "mask/share_step_conf": 0.10796407610177994, "num_tokens": 42411472.0, "reward": 0.5343175530433655, "reward_std": 0.2138931304216385, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6563183069229126, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.10606680810451508, "step": 138 }, { "adv/mean_abs_final_conf": 0.6428974866867065, "adv/mean_abs_reasoning": 0.4926608204841614, "adv/mean_abs_step_conf": 0.6243624687194824, "adv/ratio_final_to_reasoning": 1.3049494905133727, "adv/ratio_step_to_reasoning": 1.2673272214053708, "adv/std_final_conf": 0.8446921110153198, "adv/std_reasoning": 0.7755197286605835, "adv/std_step_conf": 0.8580498099327087, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6711105675146771, "calib/avg_num_step_conf": 10.51171875, "calib/ece": 0.2494190871369295, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.8298755186721992, "calib/gap": 0.07924657534246571, "calib/mean_conf": 0.9309958506224066, "calib/mu_c": 0.955, "calib/mu_w": 0.8757534246575343, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24165975103734444, "calib/std_conf": 0.16118288105319495, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9026022304832714, "calib/step_q_c_n": 1345.0, "calib/step_q_gap": 0.03460817401967542, "calib/step_q_w": 0.867994056463596, "calib/step_q_w_n": 1346.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 753.72265625, "completions/mean_terminated_length": 781.186279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 440.0, "epoch": 0.14826666666666666, "grad_norm": 0.02467586100101471, "kl": 0.1418914794921875, "learning_rate": 1.6944444444444446e-06, "loss": -0.0956, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01858021691441536, "mask/share_reasoning": 0.8354074954986572, "mask/share_step_conf": 0.11085603386163712, "num_tokens": 42707521.0, "reward": 0.5628869533538818, "reward_std": 0.1800568401813507, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6942644119262695, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.11275951564311981, "step": 139 }, { "adv/mean_abs_final_conf": 0.6934506893157959, "adv/mean_abs_reasoning": 0.6130822896957397, "adv/mean_abs_step_conf": 0.6529911756515503, "adv/ratio_final_to_reasoning": 1.1310890902752082, "adv/ratio_step_to_reasoning": 1.0650954800465962, "adv/std_final_conf": 0.888580322265625, "adv/std_reasoning": 0.8431400656700134, "adv/std_step_conf": 0.8747093081474304, "calib/answer_extract_rate": 0.90234375, "calib/auroc": 0.6419773774612485, "calib/avg_num_step_conf": 10.08984375, "calib/ece": 0.2891375000000002, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.8620689655172413, "calib/gap": 0.05986658567239245, "calib/mean_conf": 0.9407762931034483, "calib/mu_c": 0.960645806451613, "calib/mu_w": 0.9007792207792206, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2809051724137933, "calib/std_conf": 0.14409159861501758, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8979984301412872, "calib/step_q_c_n": 1274.0, "calib/step_q_gap": -0.0024752138617685215, "calib/step_q_w": 0.9004736440030557, "calib/step_q_w_n": 1309.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2629.0, "completions/max_terminated_length": 2629.0, "completions/mean_length": 774.52734375, "completions/mean_terminated_length": 812.6188354492188, "completions/min_length": 0.0, "completions/min_terminated_length": 303.0, "epoch": 0.14933333333333335, "grad_norm": 0.021134378388524055, "kl": 0.13348388671875, "learning_rate": 1.6666666666666667e-06, "loss": -0.1278, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.017613347619771957, "mask/share_reasoning": 0.8253074884414673, "mask/share_step_conf": 0.11020419746637344, "num_tokens": 43010816.0, "reward": 0.5066386461257935, "reward_std": 0.23156294226646423, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6316012144088745, "rewards/format_reward_step": 0.890625, "rewards/step_margin_reward": 0.08245730400085449, "step": 140 }, { "adv/mean_abs_final_conf": 0.6344403028488159, "adv/mean_abs_reasoning": 0.4509612023830414, "adv/mean_abs_step_conf": 0.6370654106140137, "adv/ratio_final_to_reasoning": 1.4068622743956796, "adv/ratio_step_to_reasoning": 1.4126834132238664, "adv/std_final_conf": 0.8541744947433472, "adv/std_reasoning": 0.7394145727157593, "adv/std_step_conf": 0.858633816242218, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7733057228915663, "calib/avg_num_step_conf": 8.640625, "calib/ece": 0.2811934156378602, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.8065843621399177, "calib/gap": 0.10425677710843406, "calib/mean_conf": 0.9238271604938272, "calib/mu_c": 0.9594375000000002, "calib/mu_w": 0.8551807228915661, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.273292181069959, "calib/std_conf": 0.16561570794794106, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.903386709367494, "calib/step_q_c_n": 1249.0, "calib/step_q_gap": 0.08223406139241618, "calib/step_q_w": 0.8211526479750778, "calib/step_q_w_n": 963.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2702.0, "completions/max_terminated_length": 2702.0, "completions/mean_length": 788.52734375, "completions/mean_terminated_length": 813.9636840820312, "completions/min_length": 0.0, "completions/min_terminated_length": 419.0, "epoch": 0.1504, "grad_norm": 0.027693144977092743, "kl": 0.130859375, "learning_rate": 1.638888888888889e-06, "loss": -0.072, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.017691563814878464, "mask/share_reasoning": 0.8506156206130981, "mask/share_step_conf": 0.1004428118467331, "num_tokens": 43319775.0, "reward": 0.5400452613830566, "reward_std": 0.18913422524929047, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6792542934417725, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.08833617717027664, "step": 141 }, { "adv/mean_abs_final_conf": 0.7354959845542908, "adv/mean_abs_reasoning": 0.5655615925788879, "adv/mean_abs_step_conf": 0.5950552821159363, "adv/ratio_final_to_reasoning": 1.300470177263141, "adv/ratio_step_to_reasoning": 1.0521493855382946, "adv/std_final_conf": 0.8831775784492493, "adv/std_reasoning": 0.7932071089744568, "adv/std_step_conf": 0.825971245765686, "calib/answer_extract_rate": 0.8828125, "calib/auroc": 0.7189526973580129, "calib/avg_num_step_conf": 9.9375, "calib/ece": 0.3627459618208518, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.8828125, "calib/frac_conf_gt_0.9": 0.8193832599118943, "calib/gap": 0.0847057427622212, "calib/mean_conf": 0.931027900146843, "calib/mu_c": 0.9675968992248063, "calib/mu_w": 0.8828911564625851, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3627459618208518, "calib/std_conf": 0.14102006086777882, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.901234084231146, "calib/step_q_c_n": 1021.0, "calib/step_q_gap": 0.003125088827337641, "calib/step_q_w": 0.8981089954038084, "calib/step_q_w_n": 1523.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 828.59765625, "completions/mean_terminated_length": 862.2804565429688, "completions/min_length": 0.0, "completions/min_terminated_length": 323.0, "epoch": 0.15146666666666667, "grad_norm": 0.023893464356660843, "kl": 0.1291046142578125, "learning_rate": 1.6111111111111113e-06, "loss": -0.0921, "mask/has_final_conf_rate": 0.88671875, "mask/share_final_conf": 0.016964614391326904, "mask/share_reasoning": 0.8385289907455444, "mask/share_step_conf": 0.10544387996196747, "num_tokens": 43637056.0, "reward": 0.4579760432243347, "reward_std": 0.21751856803894043, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5716378688812256, "rewards/format_reward_step": 0.8828125, "rewards/step_margin_reward": 0.06697048991918564, "step": 142 }, { "adv/mean_abs_final_conf": 0.7062772512435913, "adv/mean_abs_reasoning": 0.5316286683082581, "adv/mean_abs_step_conf": 0.5760251879692078, "adv/ratio_final_to_reasoning": 1.3285161116143298, "adv/ratio_step_to_reasoning": 1.083510394204714, "adv/std_final_conf": 0.8750473260879517, "adv/std_reasoning": 0.7929207682609558, "adv/std_step_conf": 0.8259369730949402, "calib/answer_extract_rate": 0.90234375, "calib/auroc": 0.7046296296296296, "calib/avg_num_step_conf": 9.9453125, "calib/ece": 0.3384356725146199, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.8026315789473685, "calib/gap": 0.061054750402576685, "calib/mean_conf": 0.9262134502923977, "calib/mu_c": 0.9503140096618359, "calib/mu_w": 0.8892592592592592, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.32969298245614037, "calib/std_conf": 0.14742606186738186, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8921758865248228, "calib/step_q_c_n": 1175.0, "calib/step_q_gap": -0.012849642286263951, "calib/step_q_w": 0.9050255288110868, "calib/step_q_w_n": 1371.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2923.0, "completions/max_terminated_length": 2923.0, "completions/mean_length": 821.39453125, "completions/mean_terminated_length": 861.7909545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 390.0, "epoch": 0.15253333333333333, "grad_norm": 0.025743544101715088, "kl": 0.1334686279296875, "learning_rate": 1.5833333333333333e-06, "loss": -0.0919, "mask/has_final_conf_rate": 0.890625, "mask/share_final_conf": 0.016596755012869835, "mask/share_reasoning": 0.8329101800918579, "mask/share_step_conf": 0.10361805558204651, "num_tokens": 43954669.0, "reward": 0.4721832871437073, "reward_std": 0.1786777377128601, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5830044746398926, "rewards/format_reward_step": 0.87890625, "rewards/step_margin_reward": 0.07776840031147003, "step": 143 }, { "adv/mean_abs_final_conf": 0.7158745527267456, "adv/mean_abs_reasoning": 0.48347049951553345, "adv/mean_abs_step_conf": 0.5655519962310791, "adv/ratio_final_to_reasoning": 1.4806995534248626, "adv/ratio_step_to_reasoning": 1.169775605332272, "adv/std_final_conf": 0.8891063332557678, "adv/std_reasoning": 0.7578825950622559, "adv/std_step_conf": 0.791719913482666, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.702331645957969, "calib/avg_num_step_conf": 8.91796875, "calib/ece": 0.2610373443983404, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.7759336099585062, "calib/gap": 0.10897530296057667, "calib/mean_conf": 0.9153112033195022, "calib/mu_c": 0.9523899371069181, "calib/mu_w": 0.8434146341463414, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2582987551867222, "calib/std_conf": 0.15926701476033325, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8921519939804364, "calib/step_q_c_n": 1329.0, "calib/step_q_gap": -0.011977985055203089, "calib/step_q_w": 0.9041299790356395, "calib/step_q_w_n": 954.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2765.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 782.4296875, "completions/mean_terminated_length": 804.4256591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 366.0, "epoch": 0.1536, "grad_norm": 0.028950883075594902, "kl": 0.150909423828125, "learning_rate": 1.5555555555555558e-06, "loss": -0.1276, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.018203455954790115, "mask/share_reasoning": 0.8445795178413391, "mask/share_step_conf": 0.10987326502799988, "num_tokens": 44259099.0, "reward": 0.543576717376709, "reward_std": 0.20246967673301697, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6907746195793152, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.08387883007526398, "step": 144 }, { "adv/mean_abs_final_conf": 0.718525230884552, "adv/mean_abs_reasoning": 0.6111539006233215, "adv/mean_abs_step_conf": 0.6051175594329834, "adv/ratio_final_to_reasoning": 1.1756862390172451, "adv/ratio_step_to_reasoning": 0.9901230423561371, "adv/std_final_conf": 0.8936732411384583, "adv/std_reasoning": 0.8432523012161255, "adv/std_step_conf": 0.8428249359130859, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.6748837582170915, "calib/avg_num_step_conf": 12.046875, "calib/ece": 0.24582978723404258, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.7531914893617021, "calib/gap": 0.11617764951098297, "calib/mean_conf": 0.8925531914893617, "calib/mu_c": 0.9325974025974026, "calib/mu_w": 0.8164197530864197, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24153191489361703, "calib/std_conf": 0.19602505359966252, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8844444444444445, "calib/step_q_c_n": 1350.0, "calib/step_q_gap": -0.00669742406766638, "calib/step_q_w": 0.8911418685121109, "calib/step_q_w_n": 1734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2307.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 774.7265625, "completions/mean_terminated_length": 822.9461059570312, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.15466666666666667, "grad_norm": 0.02300332859158516, "kl": 0.1427459716796875, "learning_rate": 1.527777777777778e-06, "loss": -0.2306, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.01733851432800293, "mask/share_reasoning": 0.8131154775619507, "mask/share_step_conf": 0.11095225065946579, "num_tokens": 44560133.0, "reward": 0.5210059285163879, "reward_std": 0.23228174448013306, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6677339673042297, "rewards/format_reward_step": 0.90625, "rewards/step_margin_reward": 0.07271534949541092, "step": 145 }, { "adv/mean_abs_final_conf": 0.7613563537597656, "adv/mean_abs_reasoning": 0.5813404321670532, "adv/mean_abs_step_conf": 0.516336977481842, "adv/ratio_final_to_reasoning": 1.3096566342748086, "adv/ratio_step_to_reasoning": 0.8881834961265315, "adv/std_final_conf": 0.9185754656791687, "adv/std_reasoning": 0.8268218636512756, "adv/std_step_conf": 0.7569028735160828, "calib/answer_extract_rate": 0.8671875, "calib/auroc": 0.7219461697722566, "calib/avg_num_step_conf": 10.96875, "calib/ece": 0.3987727272727273, "calib/final_conf_rate": 0.859375, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.6863636363636364, "calib/gap": 0.14056728778467908, "calib/mean_conf": 0.8760454545454545, "calib/mu_c": 0.9495238095238095, "calib/mu_w": 0.8089565217391305, "calib/nonempty_final_conf_rate": 0.859375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3987727272727273, "calib/std_conf": 0.2181581947438241, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8933727175080559, "calib/step_q_c_n": 931.0, "calib/step_q_gap": 0.01878028277177457, "calib/step_q_w": 0.8745924347362813, "calib/step_q_w_n": 1877.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 791.16796875, "completions/mean_terminated_length": 858.2161254882812, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.15573333333333333, "grad_norm": 0.02888815850019455, "kl": 0.142608642578125, "learning_rate": 1.5e-06, "loss": -0.2562, "mask/has_final_conf_rate": 0.859375, "mask/share_final_conf": 0.016367482021450996, "mask/share_reasoning": 0.8004570007324219, "mask/share_step_conf": 0.10505051910877228, "num_tokens": 44869888.0, "reward": 0.4238643944263458, "reward_std": 0.20759667456150055, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.5241339802742004, "rewards/format_reward_step": 0.8515625, "rewards/step_margin_reward": 0.07046981155872345, "step": 146 }, { "adv/mean_abs_final_conf": 0.7660461068153381, "adv/mean_abs_reasoning": 0.5582988262176514, "adv/mean_abs_step_conf": 0.5253020524978638, "adv/ratio_final_to_reasoning": 1.372107679332102, "adv/ratio_step_to_reasoning": 0.9408976480510746, "adv/std_final_conf": 0.9019193053245544, "adv/std_reasoning": 0.7931223511695862, "adv/std_step_conf": 0.7755138278007507, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6936363636363636, "calib/avg_num_step_conf": 10.67578125, "calib/ece": 0.36917730496453893, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.676595744680851, "calib/gap": 0.12923757575757588, "calib/mean_conf": 0.8706524822695035, "calib/mu_c": 0.9311466666666666, "calib/mu_w": 0.8019090909090907, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3539574468085106, "calib/std_conf": 0.2383865184663548, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8811808118081181, "calib/step_q_c_n": 1084.0, "calib/step_q_gap": 0.035262073178645714, "calib/step_q_w": 0.8459187386294724, "calib/step_q_w_n": 1649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2566.0, "completions/max_terminated_length": 2566.0, "completions/mean_length": 777.2890625, "completions/mean_terminated_length": 829.1083984375, "completions/min_length": 0.0, "completions/min_terminated_length": 436.0, "epoch": 0.1568, "grad_norm": 0.023427121341228485, "kl": 0.140838623046875, "learning_rate": 1.4722222222222225e-06, "loss": -0.1953, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.016856852918863297, "mask/share_reasoning": 0.8128437995910645, "mask/share_step_conf": 0.10779933631420135, "num_tokens": 45172554.0, "reward": 0.46643000841140747, "reward_std": 0.22191259264945984, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5909908413887024, "rewards/format_reward_step": 0.91796875, "rewards/step_margin_reward": 0.06061914935708046, "step": 147 }, { "adv/mean_abs_final_conf": 0.7195539474487305, "adv/mean_abs_reasoning": 0.5210443139076233, "adv/mean_abs_step_conf": 0.6254093647003174, "adv/ratio_final_to_reasoning": 1.3809841663031779, "adv/ratio_step_to_reasoning": 1.200299759554035, "adv/std_final_conf": 0.8986973166465759, "adv/std_reasoning": 0.7756775617599487, "adv/std_step_conf": 0.8422386050224304, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.701277355123509, "calib/avg_num_step_conf": 10.55078125, "calib/ece": 0.1885344827586207, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.1579618671926366, "calib/mean_conf": 0.892844827586207, "calib/mu_c": 0.9357396449704143, "calib/mu_w": 0.7777777777777777, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17646551724137932, "calib/std_conf": 0.21257982693114066, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8895997286295794, "calib/step_q_c_n": 1474.0, "calib/step_q_gap": -0.0056569950868020635, "calib/step_q_w": 0.8952567237163814, "calib/step_q_w_n": 1227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 767.38671875, "completions/mean_terminated_length": 801.8407592773438, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.15786666666666666, "grad_norm": 0.03625670447945595, "kl": 0.1369476318359375, "learning_rate": 1.4444444444444445e-06, "loss": -0.1425, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.0180515144020319, "mask/share_reasoning": 0.8227970004081726, "mask/share_step_conf": 0.11618275940418243, "num_tokens": 45474117.0, "reward": 0.5595434904098511, "reward_std": 0.20942556858062744, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7142660021781921, "rewards/format_reward_step": 0.90234375, "rewards/step_margin_reward": 0.09232106059789658, "step": 148 }, { "adv/mean_abs_final_conf": 0.7409499883651733, "adv/mean_abs_reasoning": 0.5377918481826782, "adv/mean_abs_step_conf": 0.6655032634735107, "adv/ratio_final_to_reasoning": 1.3777635173701739, "adv/ratio_step_to_reasoning": 1.2374736912104536, "adv/std_final_conf": 0.893731951713562, "adv/std_reasoning": 0.7929414510726929, "adv/std_step_conf": 0.8731259703636169, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7618329383035265, "calib/avg_num_step_conf": 10.0390625, "calib/ece": 0.28487704918032797, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7377049180327869, "calib/gap": 0.14380593262946206, "calib/mean_conf": 0.9009426229508198, "calib/mu_c": 0.9545751633986929, "calib/mu_w": 0.8107692307692308, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2793852459016395, "calib/std_conf": 0.1819620208607914, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9007949125596185, "calib/step_q_c_n": 1258.0, "calib/step_q_gap": 0.033782717437667364, "calib/step_q_w": 0.8670121951219512, "calib/step_q_w_n": 1312.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2947.0, "completions/max_terminated_length": 2947.0, "completions/mean_length": 806.10546875, "completions/mean_terminated_length": 835.477783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 321.0, "epoch": 0.15893333333333334, "grad_norm": 0.031530484557151794, "kl": 0.1332244873046875, "learning_rate": 1.4166666666666667e-06, "loss": -0.0885, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.017923608422279358, "mask/share_reasoning": 0.8353755474090576, "mask/share_step_conf": 0.11154457926750183, "num_tokens": 45784936.0, "reward": 0.5386008620262146, "reward_std": 0.21099933981895447, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6873722672462463, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.08045439422130585, "step": 149 }, { "adv/mean_abs_final_conf": 0.7494795322418213, "adv/mean_abs_reasoning": 0.5564857721328735, "adv/mean_abs_step_conf": 0.6270158290863037, "adv/ratio_final_to_reasoning": 1.3468080762770447, "adv/ratio_step_to_reasoning": 1.126741887188069, "adv/std_final_conf": 0.9043705463409424, "adv/std_reasoning": 0.8101388216018677, "adv/std_step_conf": 0.8577750325202942, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.689269860707179, "calib/avg_num_step_conf": 11.828125, "calib/ece": 0.3133905579399142, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.7811158798283262, "calib/gap": 0.14401500076534557, "calib/mean_conf": 0.9099570815450644, "calib/mu_c": 0.9680575539568346, "calib/mu_w": 0.824042553191489, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3133905579399142, "calib/std_conf": 0.189712902256198, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.8931949882537198, "calib/step_q_c_n": 1277.0, "calib/step_q_gap": 0.0018985861977517393, "calib/step_q_w": 0.891296402055968, "calib/step_q_w_n": 1751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2834.0, "completions/max_terminated_length": 2834.0, "completions/mean_length": 761.71875, "completions/mean_terminated_length": 795.9183349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 320.0, "epoch": 0.16, "grad_norm": 0.026372069492936134, "kl": 0.1349639892578125, "learning_rate": 1.3888888888888892e-06, "loss": -0.128, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.018884621560573578, "mask/share_reasoning": 0.8096184134483337, "mask/share_step_conf": 0.12852820754051208, "num_tokens": 46084896.0, "reward": 0.49053722620010376, "reward_std": 0.22207728028297424, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6167457103729248, "rewards/format_reward_step": 0.89453125, "rewards/step_margin_reward": 0.0768287256360054, "step": 150 }, { "adv/mean_abs_final_conf": 0.7335193157196045, "adv/mean_abs_reasoning": 0.6208381652832031, "adv/mean_abs_step_conf": 0.6437610387802124, "adv/ratio_final_to_reasoning": 1.1814984270256652, "adv/ratio_step_to_reasoning": 1.0369224618891668, "adv/std_final_conf": 0.9019107222557068, "adv/std_reasoning": 0.8432009220123291, "adv/std_step_conf": 0.8425126075744629, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.6885245901639345, "calib/avg_num_step_conf": 9.66015625, "calib/ece": 0.37636363636363634, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.7142857142857143, "calib/gap": 0.09675966310723405, "calib/mean_conf": 0.8845887445887445, "calib/mu_c": 0.9302459016393442, "calib/mu_w": 0.8334862385321101, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3664069264069264, "calib/std_conf": 0.20576260783341227, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8911137218045113, "calib/step_q_c_n": 1064.0, "calib/step_q_gap": 0.015634658639145815, "calib/step_q_w": 0.8754790631653655, "calib/step_q_w_n": 1409.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2672.0, "completions/max_terminated_length": 2672.0, "completions/mean_length": 807.22265625, "completions/mean_terminated_length": 853.9214477539062, "completions/min_length": 0.0, "completions/min_terminated_length": 360.0, "epoch": 0.16106666666666666, "grad_norm": 0.022941017523407936, "kl": 0.130828857421875, "learning_rate": 1.3611111111111112e-06, "loss": -0.227, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.01679162308573723, "mask/share_reasoning": 0.8220335245132446, "mask/share_step_conf": 0.10648736357688904, "num_tokens": 46398569.0, "reward": 0.45801666378974915, "reward_std": 0.23174870014190674, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.5681366920471191, "rewards/format_reward_step": 0.90234375, "rewards/step_margin_reward": 0.07211536914110184, "step": 151 }, { "adv/mean_abs_final_conf": 0.6697338819503784, "adv/mean_abs_reasoning": 0.5552310943603516, "adv/mean_abs_step_conf": 0.5894858837127686, "adv/ratio_final_to_reasoning": 1.206225459548404, "adv/ratio_step_to_reasoning": 1.0616946523715136, "adv/std_final_conf": 0.8452982902526855, "adv/std_reasoning": 0.8100466728210449, "adv/std_step_conf": 0.8252595067024231, "calib/answer_extract_rate": 0.8828125, "calib/auroc": 0.7348987982289689, "calib/avg_num_step_conf": 11.234375, "calib/ece": 0.35623893805309753, "calib/final_conf_rate": 0.8828125, "calib/format_rate": 0.875, "calib/frac_conf_gt_0.9": 0.6991150442477876, "calib/gap": 0.12718058191018355, "calib/mean_conf": 0.8870353982300885, "calib/mu_c": 0.9444354838709678, "calib/mu_w": 0.8172549019607842, "calib/nonempty_final_conf_rate": 0.8828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3473008849557524, "calib/std_conf": 0.21500477167800355, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8937307692307692, "calib/step_q_c_n": 1040.0, "calib/step_q_gap": 0.019073906485670977, "calib/step_q_w": 0.8746568627450982, "calib/step_q_w_n": 1836.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2687.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 753.68359375, "completions/mean_terminated_length": 814.10546875, "completions/min_length": 0.0, "completions/min_terminated_length": 432.0, "epoch": 0.16213333333333332, "grad_norm": 0.027036553248763084, "kl": 0.12664794921875, "learning_rate": 1.3333333333333334e-06, "loss": -0.2308, "mask/has_final_conf_rate": 0.8828125, "mask/share_final_conf": 0.017170913517475128, "mask/share_reasoning": 0.801292359828949, "mask/share_step_conf": 0.10731793195009232, "num_tokens": 46696904.0, "reward": 0.45713406801223755, "reward_std": 0.20017015933990479, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.573637843132019, "rewards/format_reward_step": 0.875, "rewards/step_margin_reward": 0.06875520944595337, "step": 152 }, { "adv/mean_abs_final_conf": 0.6873051524162292, "adv/mean_abs_reasoning": 0.4994828701019287, "adv/mean_abs_step_conf": 0.6231411695480347, "adv/ratio_final_to_reasoning": 1.3760334809399408, "adv/ratio_step_to_reasoning": 1.2475726533341798, "adv/std_final_conf": 0.9070894122123718, "adv/std_reasoning": 0.7928266525268555, "adv/std_step_conf": 0.873490571975708, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5885488571866851, "calib/avg_num_step_conf": 9.5390625, "calib/ece": 0.24746887966804973, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.7344398340248963, "calib/gap": 0.09906120570639676, "calib/mean_conf": 0.8995020746887967, "calib/mu_c": 0.9332075471698112, "calib/mu_w": 0.8341463414634145, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.243609958506224, "calib/std_conf": 0.1849811053372621, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9003977272727273, "calib/step_q_c_n": 1232.0, "calib/step_q_gap": 0.004587809917355523, "calib/step_q_w": 0.8958099173553717, "calib/step_q_w_n": 1210.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2970.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 798.7109375, "completions/mean_terminated_length": 817.8800659179688, "completions/min_length": 0.0, "completions/min_terminated_length": 420.0, "epoch": 0.1632, "grad_norm": 0.043053604662418365, "kl": 0.12396240234375, "learning_rate": 1.3055555555555556e-06, "loss": -0.0107, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01817367784678936, "mask/share_reasoning": 0.8484771847724915, "mask/share_step_conf": 0.10991162061691284, "num_tokens": 47008694.0, "reward": 0.54566490650177, "reward_std": 0.17580105364322662, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6826929450035095, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.09691811352968216, "step": 153 }, { "adv/mean_abs_final_conf": 0.6787465810775757, "adv/mean_abs_reasoning": 0.5282330513000488, "adv/mean_abs_step_conf": 0.6091671586036682, "adv/ratio_final_to_reasoning": 1.2849377361130545, "adv/ratio_step_to_reasoning": 1.1532166665914414, "adv/std_final_conf": 0.8813791871070862, "adv/std_reasoning": 0.8099303841590881, "adv/std_step_conf": 0.8264101147651672, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6794193460860127, "calib/avg_num_step_conf": 9.22265625, "calib/ece": 0.39432098765432094, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.7078189300411523, "calib/gap": 0.09575702075702075, "calib/mean_conf": 0.8846090534979423, "calib/mu_c": 0.9307142857142856, "calib/mu_w": 0.8349572649572649, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3802057613168724, "calib/std_conf": 0.22132632240199768, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9010103092783506, "calib/step_q_c_n": 970.0, "calib/step_q_gap": 0.005985147524216861, "calib/step_q_w": 0.8950251617541337, "calib/step_q_w_n": 1391.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2544.0, "completions/max_terminated_length": 2544.0, "completions/mean_length": 726.19921875, "completions/mean_terminated_length": 758.8040771484375, "completions/min_length": 0.0, "completions/min_terminated_length": 408.0, "epoch": 0.16426666666666667, "grad_norm": 0.030010627582669258, "kl": 0.140289306640625, "learning_rate": 1.2777777777777779e-06, "loss": -0.1675, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01895446516573429, "mask/share_reasoning": 0.8261541128158569, "mask/share_step_conf": 0.11192267388105392, "num_tokens": 47299041.0, "reward": 0.46878811717033386, "reward_std": 0.21039001643657684, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5803776979446411, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.07047979533672333, "step": 154 }, { "adv/mean_abs_final_conf": 0.7753806114196777, "adv/mean_abs_reasoning": 0.6197185516357422, "adv/mean_abs_step_conf": 0.6528646945953369, "adv/ratio_final_to_reasoning": 1.2511818621099315, "adv/ratio_step_to_reasoning": 1.0534858007269683, "adv/std_final_conf": 0.9207301139831543, "adv/std_reasoning": 0.8591168522834778, "adv/std_step_conf": 0.8739998936653137, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.6355042016806722, "calib/avg_num_step_conf": 9.9296875, "calib/ece": 0.34849645390070927, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.6680851063829787, "calib/gap": 0.17985197527286756, "calib/mean_conf": 0.8380567375886527, "calib/mu_c": 0.9268347338935572, "calib/mu_w": 0.7469827586206896, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3400851063829788, "calib/std_conf": 0.27345172422314257, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8926479750778817, "calib/step_q_c_n": 963.0, "calib/step_q_gap": 0.0399563981304466, "calib/step_q_w": 0.8526915769474351, "calib/step_q_w_n": 1579.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 718.68359375, "completions/mean_terminated_length": 760.2603149414062, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.16533333333333333, "grad_norm": 0.032451026141643524, "kl": 0.1468048095703125, "learning_rate": 1.25e-06, "loss": -0.1853, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.018368728458881378, "mask/share_reasoning": 0.8137243986129761, "mask/share_step_conf": 0.11321938782930374, "num_tokens": 47590240.0, "reward": 0.4808383882045746, "reward_std": 0.23973357677459717, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6014244556427002, "rewards/format_reward_step": 0.91796875, "rewards/step_margin_reward": 0.08368983864784241, "step": 155 }, { "adv/mean_abs_final_conf": 0.6914392709732056, "adv/mean_abs_reasoning": 0.5390540361404419, "adv/mean_abs_step_conf": 0.6534163951873779, "adv/ratio_final_to_reasoning": 1.2826900915608062, "adv/ratio_step_to_reasoning": 1.2121537942017018, "adv/std_final_conf": 0.87925124168396, "adv/std_reasoning": 0.7929775714874268, "adv/std_step_conf": 0.8582495450973511, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6796161529680365, "calib/avg_num_step_conf": 9.91796875, "calib/ece": 0.28479338842975205, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6239669421487604, "calib/gap": 0.16742865296803644, "calib/mean_conf": 0.8237190082644629, "calib/mu_c": 0.8901369863013697, "calib/mu_w": 0.7227083333333333, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2526033057851239, "calib/std_conf": 0.2897038448793171, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8891600633914423, "calib/step_q_c_n": 1262.0, "calib/step_q_gap": 0.014281441621669333, "calib/step_q_w": 0.8748786217697729, "calib/step_q_w_n": 1277.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2846.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 794.78125, "completions/mean_terminated_length": 823.7409057617188, "completions/min_length": 0.0, "completions/min_terminated_length": 346.0, "epoch": 0.1664, "grad_norm": 0.05786724016070366, "kl": 0.1220245361328125, "learning_rate": 1.2222222222222223e-06, "loss": -0.0627, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018169675022363663, "mask/share_reasoning": 0.8339545130729675, "mask/share_step_conf": 0.1127195730805397, "num_tokens": 47898464.0, "reward": 0.5359795689582825, "reward_std": 0.20994602143764496, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6656608581542969, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.10395440459251404, "step": 156 }, { "adv/mean_abs_final_conf": 0.6915332674980164, "adv/mean_abs_reasoning": 0.4938841462135315, "adv/mean_abs_step_conf": 0.5931864976882935, "adv/ratio_final_to_reasoning": 1.4001932898632283, "adv/ratio_step_to_reasoning": 1.2010640597315883, "adv/std_final_conf": 0.8813050985336304, "adv/std_reasoning": 0.7579091191291809, "adv/std_step_conf": 0.842310905456543, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.7553385416666667, "calib/avg_num_step_conf": 10.6015625, "calib/ece": 0.19064655172413786, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.6681034482758621, "calib/gap": 0.24574305555555565, "calib/mean_conf": 0.8264224137931035, "calib/mu_c": 0.9026875000000001, "calib/mu_w": 0.6569444444444444, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16370689655172405, "calib/std_conf": 0.2910691543277476, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8778259871441689, "calib/step_q_c_n": 1452.0, "calib/step_q_gap": 0.008428205844644543, "calib/step_q_w": 0.8693977812995244, "calib/step_q_w_n": 1262.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2853.0, "completions/max_terminated_length": 2853.0, "completions/mean_length": 782.43359375, "completions/mean_terminated_length": 824.2921752929688, "completions/min_length": 0.0, "completions/min_terminated_length": 317.0, "epoch": 0.16746666666666668, "grad_norm": 0.030279651284217834, "kl": 0.1249542236328125, "learning_rate": 1.1944444444444446e-06, "loss": -0.2256, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.017455583438277245, "mask/share_reasoning": 0.8157730102539062, "mask/share_step_conf": 0.11599016934633255, "num_tokens": 48202495.0, "reward": 0.5462800860404968, "reward_std": 0.1933847963809967, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7099796533584595, "rewards/format_reward_step": 0.90234375, "rewards/step_margin_reward": 0.07711178064346313, "step": 157 }, { "adv/mean_abs_final_conf": 0.7298696041107178, "adv/mean_abs_reasoning": 0.6030030250549316, "adv/mean_abs_step_conf": 0.6138399839401245, "adv/ratio_final_to_reasoning": 1.2103912812779487, "adv/ratio_step_to_reasoning": 1.0179716492868434, "adv/std_final_conf": 0.8874096870422363, "adv/std_reasoning": 0.810146152973175, "adv/std_step_conf": 0.8261474370956421, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6337298964529632, "calib/avg_num_step_conf": 9.0859375, "calib/ece": 0.28541322314049594, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.731404958677686, "calib/gap": 0.10318498935154585, "calib/mean_conf": 0.8915289256198347, "calib/mu_c": 0.9294771241830065, "calib/mu_w": 0.8262921348314607, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.27235537190082654, "calib/std_conf": 0.2134983528835051, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9025806451612904, "calib/step_q_c_n": 1302.0, "calib/step_q_gap": -0.002399823588709604, "calib/step_q_w": 0.90498046875, "calib/step_q_w_n": 1024.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2614.0, "completions/max_terminated_length": 2614.0, "completions/mean_length": 766.0234375, "completions/mean_terminated_length": 784.4080200195312, "completions/min_length": 0.0, "completions/min_terminated_length": 352.0, "epoch": 0.16853333333333334, "grad_norm": 0.02617565542459488, "kl": 0.1327972412109375, "learning_rate": 1.1666666666666668e-06, "loss": -0.0683, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.019105687737464905, "mask/share_reasoning": 0.8394445180892944, "mask/share_step_conf": 0.11801233887672424, "num_tokens": 48503837.0, "reward": 0.5290356874465942, "reward_std": 0.22228708863258362, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6603983640670776, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.0906417965888977, "step": 158 }, { "adv/mean_abs_final_conf": 0.6971362829208374, "adv/mean_abs_reasoning": 0.5892915725708008, "adv/mean_abs_step_conf": 0.7132741212844849, "adv/ratio_final_to_reasoning": 1.1830073861052537, "adv/ratio_step_to_reasoning": 1.2103925365380788, "adv/std_final_conf": 0.8710833787918091, "adv/std_reasoning": 0.8102442622184753, "adv/std_step_conf": 0.8896806240081787, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.714888155482215, "calib/avg_num_step_conf": 9.9453125, "calib/ece": 0.30097457627118657, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.6822033898305084, "calib/gap": 0.1881987532086543, "calib/mean_conf": 0.8471610169491526, "calib/mu_c": 0.9277037037037038, "calib/mu_w": 0.7395049504950495, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28805084745762727, "calib/std_conf": 0.2681369960483135, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9040033500837522, "calib/step_q_c_n": 1194.0, "calib/step_q_gap": 0.025302659748446477, "calib/step_q_w": 0.8787006903353057, "calib/step_q_w_n": 1352.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2998.0, "completions/max_terminated_length": 2998.0, "completions/mean_length": 761.53515625, "completions/mean_terminated_length": 792.4918212890625, "completions/min_length": 0.0, "completions/min_terminated_length": 445.0, "epoch": 0.1696, "grad_norm": 0.05064807087182999, "kl": 0.1299896240234375, "learning_rate": 1.138888888888889e-06, "loss": -0.1321, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.01853277161717415, "mask/share_reasoning": 0.8229720592498779, "mask/share_step_conf": 0.11943262815475464, "num_tokens": 48803574.0, "reward": 0.518351674079895, "reward_std": 0.24158057570457458, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6450753808021545, "rewards/format_reward_step": 0.921875, "rewards/step_margin_reward": 0.1017841324210167, "step": 159 }, { "adv/mean_abs_final_conf": 0.7091639041900635, "adv/mean_abs_reasoning": 0.5300674438476562, "adv/mean_abs_step_conf": 0.6332898139953613, "adv/ratio_final_to_reasoning": 1.337874854268319, "adv/ratio_step_to_reasoning": 1.1947344085092908, "adv/std_final_conf": 0.879418134689331, "adv/std_reasoning": 0.7756585478782654, "adv/std_step_conf": 0.8588519096374512, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.6800014560279557, "calib/avg_num_step_conf": 9.77734375, "calib/ece": 0.30569620253164553, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.5991561181434599, "calib/gap": 0.15435861968549824, "calib/mean_conf": 0.8364978902953587, "calib/mu_c": 0.902279411764706, "calib/mu_w": 0.7479207920792078, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2841772151898734, "calib/std_conf": 0.268625138781579, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8942211055276384, "calib/step_q_c_n": 1194.0, "calib/step_q_gap": 0.02555036450395609, "calib/step_q_w": 0.8686707410236824, "calib/step_q_w_n": 1309.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2810.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 813.01953125, "completions/mean_terminated_length": 832.5320434570312, "completions/min_length": 0.0, "completions/min_terminated_length": 462.0, "epoch": 0.17066666666666666, "grad_norm": 0.03158262372016907, "kl": 0.1331634521484375, "learning_rate": 1.111111111111111e-06, "loss": -0.1071, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01779044046998024, "mask/share_reasoning": 0.8432353734970093, "mask/share_step_conf": 0.11553666740655899, "num_tokens": 49116547.0, "reward": 0.49313122034072876, "reward_std": 0.21372725069522858, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6346972584724426, "rewards/format_reward_step": 0.921875, "rewards/step_margin_reward": 0.060940176248550415, "step": 160 }, { "adv/mean_abs_final_conf": 0.6245907545089722, "adv/mean_abs_reasoning": 0.3743476867675781, "adv/mean_abs_step_conf": 0.6349760293960571, "adv/ratio_final_to_reasoning": 1.6684776655151683, "adv/ratio_step_to_reasoning": 1.6962199897078454, "adv/std_final_conf": 0.848501980304718, "adv/std_reasoning": 0.6816536784172058, "adv/std_step_conf": 0.8736690282821655, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6707924836601307, "calib/avg_num_step_conf": 9.42578125, "calib/ece": 0.20596774193548392, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.657258064516129, "calib/gap": 0.14180392156862742, "calib/mean_conf": 0.853951612903226, "calib/mu_c": 0.8928333333333333, "calib/mu_w": 0.7510294117647058, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16705645161290328, "calib/std_conf": 0.2540695185167517, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.879764030612245, "calib/step_q_c_n": 1568.0, "calib/step_q_gap": -0.021312892464678268, "calib/step_q_w": 0.9010769230769232, "calib/step_q_w_n": 845.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1952.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 731.30078125, "completions/mean_terminated_length": 742.9087524414062, "completions/min_length": 0.0, "completions/min_terminated_length": 336.0, "epoch": 0.17173333333333332, "grad_norm": 0.026658998802304268, "kl": 0.1280975341796875, "learning_rate": 1.0833333333333335e-06, "loss": -0.0085, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.020110558718442917, "mask/share_reasoning": 0.8370461463928223, "mask/share_step_conf": 0.1272183060646057, "num_tokens": 49407680.0, "reward": 0.5983608961105347, "reward_std": 0.17839132249355316, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7521929740905762, "rewards/format_reward_step": 0.96875, "rewards/step_margin_reward": 0.11015383899211884, "step": 161 }, { "adv/mean_abs_final_conf": 0.6977721452713013, "adv/mean_abs_reasoning": 0.5885578989982605, "adv/mean_abs_step_conf": 0.6228806972503662, "adv/ratio_final_to_reasoning": 1.1855624509651914, "adv/ratio_step_to_reasoning": 1.058316774459274, "adv/std_final_conf": 0.882885754108429, "adv/std_reasoning": 0.8265968561172485, "adv/std_step_conf": 0.858915388584137, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7146706586826347, "calib/avg_num_step_conf": 10.29296875, "calib/ece": 0.22425101214574897, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6720647773279352, "calib/gap": 0.18242964071856282, "calib/mean_conf": 0.8410931174089069, "calib/mu_c": 0.9001796407185628, "calib/mu_w": 0.71775, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19461538461538458, "calib/std_conf": 0.2689686770311461, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8863685636856369, "calib/step_q_c_n": 1476.0, "calib/step_q_gap": 0.012805146947069135, "calib/step_q_w": 0.8735634167385677, "calib/step_q_w_n": 1159.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2071.0, "completions/max_terminated_length": 2071.0, "completions/mean_length": 723.3203125, "completions/mean_terminated_length": 743.6546020507812, "completions/min_length": 0.0, "completions/min_terminated_length": 372.0, "epoch": 0.1728, "grad_norm": 0.02563856728374958, "kl": 0.1303558349609375, "learning_rate": 1.0555555555555557e-06, "loss": -0.0963, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.019811637699604034, "mask/share_reasoning": 0.826894998550415, "mask/share_step_conf": 0.12594959139823914, "num_tokens": 49696994.0, "reward": 0.5756694674491882, "reward_std": 0.22317549586296082, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7345855236053467, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.09331589937210083, "step": 162 }, { "adv/mean_abs_final_conf": 0.6574447751045227, "adv/mean_abs_reasoning": 0.4014560878276825, "adv/mean_abs_step_conf": 0.5452881455421448, "adv/ratio_final_to_reasoning": 1.637650530253557, "adv/ratio_step_to_reasoning": 1.358275941193846, "adv/std_final_conf": 0.8560632467269897, "adv/std_reasoning": 0.7016843557357788, "adv/std_step_conf": 0.7915970087051392, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.7470642413078517, "calib/avg_num_step_conf": 9.37890625, "calib/ece": 0.28895652173913033, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.5434782608695652, "calib/gap": 0.20775500805894542, "calib/mean_conf": 0.7836521739130435, "calib/mu_c": 0.8748837209302327, "calib/mu_w": 0.6671287128712873, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2558695652173912, "calib/std_conf": 0.3100037319124109, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8922811671087534, "calib/step_q_c_n": 1131.0, "calib/step_q_gap": 0.011107938762296543, "calib/step_q_w": 0.8811732283464568, "calib/step_q_w_n": 1270.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2910.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 811.58984375, "completions/mean_terminated_length": 855.0081787109375, "completions/min_length": 0.0, "completions/min_terminated_length": 340.0, "epoch": 0.17386666666666667, "grad_norm": 0.0375203937292099, "kl": 0.11700439453125, "learning_rate": 1.0277777777777777e-06, "loss": -0.1507, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.0175506342202425, "mask/share_reasoning": 0.8228362798690796, "mask/share_step_conf": 0.10883183032274246, "num_tokens": 50009593.0, "reward": 0.4967234134674072, "reward_std": 0.16443182528018951, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6381679773330688, "rewards/format_reward_step": 0.8984375, "rewards/step_margin_reward": 0.07324765622615814, "step": 163 }, { "adv/mean_abs_final_conf": 0.690467357635498, "adv/mean_abs_reasoning": 0.5424333214759827, "adv/mean_abs_step_conf": 0.6274310350418091, "adv/ratio_final_to_reasoning": 1.2729073423378727, "adv/ratio_step_to_reasoning": 1.15669707261815, "adv/std_final_conf": 0.8795118927955627, "adv/std_reasoning": 0.8100107908248901, "adv/std_step_conf": 0.8425417542457581, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.7660706201901312, "calib/avg_num_step_conf": 9.25, "calib/ece": 0.22578723404255313, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.5063829787234042, "calib/gap": 0.25039007092198595, "calib/mean_conf": 0.7679999999999999, "calib/mu_c": 0.8681560283687944, "calib/mu_w": 0.6177659574468084, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19689361702127656, "calib/std_conf": 0.31162006326580965, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8877343750000001, "calib/step_q_c_n": 1152.0, "calib/step_q_gap": 0.005004111842105252, "calib/step_q_w": 0.8827302631578948, "calib/step_q_w_n": 1216.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 827.125, "completions/mean_terminated_length": 871.3744506835938, "completions/min_length": 0.0, "completions/min_terminated_length": 390.0, "epoch": 0.17493333333333333, "grad_norm": 0.05669071525335312, "kl": 0.11883544921875, "learning_rate": 1.0000000000000002e-06, "loss": -0.1809, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.016391105949878693, "mask/share_reasoning": 0.8361638784408569, "mask/share_step_conf": 0.09666381031274796, "num_tokens": 50327473.0, "reward": 0.5372678637504578, "reward_std": 0.213059201836586, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6890285015106201, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.09253846853971481, "step": 164 }, { "adv/mean_abs_final_conf": 0.7203365564346313, "adv/mean_abs_reasoning": 0.5179213285446167, "adv/mean_abs_step_conf": 0.6181538105010986, "adv/ratio_final_to_reasoning": 1.390822344503191, "adv/ratio_step_to_reasoning": 1.1935283921172737, "adv/std_final_conf": 0.9021759033203125, "adv/std_reasoning": 0.7930563688278198, "adv/std_step_conf": 0.8590736985206604, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6774078052273541, "calib/avg_num_step_conf": 9.6171875, "calib/ece": 0.2903361344537815, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.5168067226890757, "calib/gap": 0.1662255639097745, "calib/mean_conf": 0.7526050420168068, "calib/mu_c": 0.8259398496240603, "calib/mu_w": 0.6597142857142858, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24205882352941172, "calib/std_conf": 0.3251837546069797, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8825730442978323, "calib/step_q_c_n": 1061.0, "calib/step_q_gap": 0.05238032481175092, "calib/step_q_w": 0.8301927194860814, "calib/step_q_w_n": 1401.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2946.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 795.234375, "completions/mean_terminated_length": 834.34423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 393.0, "epoch": 0.176, "grad_norm": 0.034762658178806305, "kl": 0.124237060546875, "learning_rate": 9.722222222222224e-07, "loss": -0.1189, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.017579592764377594, "mask/share_reasoning": 0.8248994946479797, "mask/share_step_conf": 0.11064592003822327, "num_tokens": 50636629.0, "reward": 0.5113325119018555, "reward_std": 0.20932233333587646, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6395570039749146, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.09404556453227997, "step": 165 }, { "adv/mean_abs_final_conf": 0.6928807497024536, "adv/mean_abs_reasoning": 0.437917023897171, "adv/mean_abs_step_conf": 0.6622142791748047, "adv/ratio_final_to_reasoning": 1.5822192604806147, "adv/ratio_step_to_reasoning": 1.5121912212535993, "adv/std_final_conf": 0.8943244218826294, "adv/std_reasoning": 0.7208465933799744, "adv/std_step_conf": 0.8748304843902588, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.8110718395528522, "calib/avg_num_step_conf": 11.20703125, "calib/ece": 0.16765957446808516, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.5148936170212766, "calib/gap": 0.36726943942133805, "calib/mean_conf": 0.7297872340425532, "calib/mu_c": 0.8501265822784808, "calib/mu_w": 0.48285714285714276, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11255319148936177, "calib/std_conf": 0.34458293854913646, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.881169757489301, "calib/step_q_c_n": 1402.0, "calib/step_q_gap": 0.04444174112938282, "calib/step_q_w": 0.8367280163599182, "calib/step_q_w_n": 1467.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2711.0, "completions/max_terminated_length": 2711.0, "completions/mean_length": 757.82421875, "completions/mean_terminated_length": 801.665283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 382.0, "epoch": 0.17706666666666668, "grad_norm": 0.05696016550064087, "kl": 0.1289215087890625, "learning_rate": 9.444444444444445e-07, "loss": -0.148, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.018113480880856514, "mask/share_reasoning": 0.8088841438293457, "mask/share_step_conf": 0.11831484735012054, "num_tokens": 50936816.0, "reward": 0.5796314477920532, "reward_std": 0.18654602766036987, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7484402060508728, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.10457274317741394, "step": 166 }, { "adv/mean_abs_final_conf": 0.7203253507614136, "adv/mean_abs_reasoning": 0.5225661396980286, "adv/mean_abs_step_conf": 0.6850904226303101, "adv/ratio_final_to_reasoning": 1.3784386243962585, "adv/ratio_step_to_reasoning": 1.311011890334491, "adv/std_final_conf": 0.8873981237411499, "adv/std_reasoning": 0.757948637008667, "adv/std_step_conf": 0.8701576590538025, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5047265625, "calib/avg_num_step_conf": 9.8359375, "calib/ece": 0.27308333333333323, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6125, "calib/gap": 0.046250000000000235, "calib/mean_conf": 0.8085833333333333, "calib/mu_c": 0.8240000000000002, "calib/mu_w": 0.7777499999999999, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2074999999999999, "calib/std_conf": 0.2874030150425627, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8687977369165488, "calib/step_q_c_n": 1414.0, "calib/step_q_gap": 0.009513316626693635, "calib/step_q_w": 0.8592844202898552, "calib/step_q_w_n": 1104.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 749.890625, "completions/mean_terminated_length": 777.214599609375, "completions/min_length": 0.0, "completions/min_terminated_length": 465.0, "epoch": 0.17813333333333334, "grad_norm": 0.030544927343726158, "kl": 0.12530517578125, "learning_rate": 9.166666666666666e-07, "loss": -0.085, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.018237832933664322, "mask/share_reasoning": 0.8299527168273926, "mask/share_step_conf": 0.11665323376655579, "num_tokens": 51234396.0, "reward": 0.5179564356803894, "reward_std": 0.20450013875961304, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6521179676055908, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.07129494845867157, "step": 167 }, { "adv/mean_abs_final_conf": 0.7917367219924927, "adv/mean_abs_reasoning": 0.592582643032074, "adv/mean_abs_step_conf": 0.6729299426078796, "adv/ratio_final_to_reasoning": 1.336078150958666, "adv/ratio_step_to_reasoning": 1.1355883445466641, "adv/std_final_conf": 0.93386310338974, "adv/std_reasoning": 0.8268343806266785, "adv/std_step_conf": 0.8747602701187134, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.7033836416747808, "calib/avg_num_step_conf": 10.84765625, "calib/ece": 0.180468085106383, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.3617021276595745, "calib/gap": 0.25802661473547545, "calib/mean_conf": 0.6473617021276595, "calib/mu_c": 0.7341025641025641, "calib/mu_w": 0.4760759493670887, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08200000000000003, "calib/std_conf": 0.3600021528027779, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8628933631618194, "calib/step_q_c_n": 1341.0, "calib/step_q_gap": 0.041180271239813826, "calib/step_q_w": 0.8217130919220056, "calib/step_q_w_n": 1436.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2708.0, "completions/max_terminated_length": 2708.0, "completions/mean_length": 832.90625, "completions/mean_terminated_length": 877.4649658203125, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 0.1792, "grad_norm": 0.04968951642513275, "kl": 0.115997314453125, "learning_rate": 8.88888888888889e-07, "loss": -0.1728, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.016532883048057556, "mask/share_reasoning": 0.8280484080314636, "mask/share_step_conf": 0.10463744401931763, "num_tokens": 51552292.0, "reward": 0.5492472648620605, "reward_std": 0.22687752544879913, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6980816721916199, "rewards/format_reward_step": 0.9140625, "rewards/step_margin_reward": 0.09494408965110779, "step": 168 }, { "adv/mean_abs_final_conf": 0.7144399285316467, "adv/mean_abs_reasoning": 0.4918302297592163, "adv/mean_abs_step_conf": 0.5391664505004883, "adv/ratio_final_to_reasoning": 1.452614917308789, "adv/ratio_step_to_reasoning": 1.0962450412298694, "adv/std_final_conf": 0.9200153946876526, "adv/std_reasoning": 0.757727324962616, "adv/std_step_conf": 0.7924664616584778, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6603001849480723, "calib/avg_num_step_conf": 10.24609375, "calib/ece": 0.2731535269709543, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.4854771784232365, "calib/gap": 0.13812846777635535, "calib/mean_conf": 0.7589626556016599, "calib/mu_c": 0.8157042253521127, "calib/mu_w": 0.6775757575757574, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22145228215767632, "calib/std_conf": 0.30494432294957036, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8821566975568661, "calib/step_q_c_n": 1187.0, "calib/step_q_gap": 0.03588232429781313, "calib/step_q_w": 0.846274373259053, "calib/step_q_w_n": 1436.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 777.56640625, "completions/mean_terminated_length": 812.4775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 416.0, "epoch": 0.18026666666666666, "grad_norm": 0.04508724436163902, "kl": 0.1221771240234375, "learning_rate": 8.611111111111112e-07, "loss": -0.1666, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017777567729353905, "mask/share_reasoning": 0.8302462697029114, "mask/share_step_conf": 0.10900741815567017, "num_tokens": 51855533.0, "reward": 0.5315976738929749, "reward_std": 0.18220046162605286, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6588956713676453, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.10586215555667877, "step": 169 }, { "adv/mean_abs_final_conf": 0.6918748021125793, "adv/mean_abs_reasoning": 0.5032524466514587, "adv/mean_abs_step_conf": 0.6352858543395996, "adv/ratio_final_to_reasoning": 1.3748066337604041, "adv/ratio_step_to_reasoning": 1.2623601903312438, "adv/std_final_conf": 0.8777769207954407, "adv/std_reasoning": 0.7577748894691467, "adv/std_step_conf": 0.8588606715202332, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6978388278388279, "calib/avg_num_step_conf": 9.5390625, "calib/ece": 0.20398340248962665, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5145228215767634, "calib/gap": 0.2134087912087912, "calib/mean_conf": 0.7716182572614109, "calib/mu_c": 0.8522000000000001, "calib/mu_w": 0.6387912087912089, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17659751037344407, "calib/std_conf": 0.3062750972334261, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8830372720063443, "calib/step_q_c_n": 1261.0, "calib/step_q_gap": 0.05963337700211058, "calib/step_q_w": 0.8234038950042337, "calib/step_q_w_n": 1181.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2995.0, "completions/max_terminated_length": 2995.0, "completions/mean_length": 797.0, "completions/mean_terminated_length": 819.4055786132812, "completions/min_length": 0.0, "completions/min_terminated_length": 412.0, "epoch": 0.18133333333333335, "grad_norm": 0.04358909651637077, "kl": 0.1205902099609375, "learning_rate": 8.333333333333333e-07, "loss": -0.0837, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.017811400815844536, "mask/share_reasoning": 0.8396157026290894, "mask/share_step_conf": 0.11522920429706573, "num_tokens": 52163717.0, "reward": 0.5652769804000854, "reward_std": 0.18775799870491028, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7053242325782776, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.1197609230875969, "step": 170 }, { "adv/mean_abs_final_conf": 0.6722425222396851, "adv/mean_abs_reasoning": 0.5818113088607788, "adv/mean_abs_step_conf": 0.6223161220550537, "adv/ratio_final_to_reasoning": 1.1554304840790668, "adv/ratio_step_to_reasoning": 1.0696184700733744, "adv/std_final_conf": 0.862605094909668, "adv/std_reasoning": 0.8433414697647095, "adv/std_step_conf": 0.8730627298355103, "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.6797431681152611, "calib/avg_num_step_conf": 10.6171875, "calib/ece": 0.246359649122807, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.5350877192982456, "calib/gap": 0.23424242424242425, "calib/mean_conf": 0.7349561403508772, "calib/mu_c": 0.8366666666666667, "calib/mu_w": 0.6024242424242424, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20776315789473682, "calib/std_conf": 0.34193014805147937, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8850319634703198, "calib/step_q_c_n": 1095.0, "calib/step_q_gap": 0.015192160636062102, "calib/step_q_w": 0.8698398028342577, "calib/step_q_w_n": 1623.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2974.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 756.71875, "completions/mean_terminated_length": 803.8174438476562, "completions/min_length": 0.0, "completions/min_terminated_length": 416.0, "epoch": 0.1824, "grad_norm": 0.043682727962732315, "kl": 0.1182098388671875, "learning_rate": 8.055555555555557e-07, "loss": -0.1868, "mask/has_final_conf_rate": 0.890625, "mask/share_final_conf": 0.01802668906748295, "mask/share_reasoning": 0.8102461099624634, "mask/share_step_conf": 0.11313347518444061, "num_tokens": 52464333.0, "reward": 0.5145975351333618, "reward_std": 0.23285827040672302, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6447129249572754, "rewards/format_reward_step": 0.890625, "rewards/step_margin_reward": 0.10557597130537033, "step": 171 }, { "adv/mean_abs_final_conf": 0.6428580284118652, "adv/mean_abs_reasoning": 0.4294288456439972, "adv/mean_abs_step_conf": 0.6253763437271118, "adv/ratio_final_to_reasoning": 1.4970070942667972, "adv/ratio_step_to_reasoning": 1.456297941022709, "adv/std_final_conf": 0.8712030053138733, "adv/std_reasoning": 0.7393924593925476, "adv/std_step_conf": 0.8585172295570374, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7319500402900885, "calib/avg_num_step_conf": 9.94921875, "calib/ece": 0.17716049382716065, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5432098765432098, "calib/gap": 0.2573545527800162, "calib/mean_conf": 0.7608641975308642, "calib/mu_c": 0.8381764705882353, "calib/mu_w": 0.5808219178082191, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11921810699588492, "calib/std_conf": 0.3273004592285005, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8736363636363637, "calib/step_q_c_n": 1474.0, "calib/step_q_gap": 0.07356180632042697, "calib/step_q_w": 0.8000745573159367, "calib/step_q_w_n": 1073.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3051.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 738.86328125, "completions/mean_terminated_length": 765.7854614257812, "completions/min_length": 0.0, "completions/min_terminated_length": 413.0, "epoch": 0.18346666666666667, "grad_norm": 0.029050162062048912, "kl": 0.12353515625, "learning_rate": 7.777777777777779e-07, "loss": -0.1587, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01907893642783165, "mask/share_reasoning": 0.823893666267395, "mask/share_step_conf": 0.12187115848064423, "num_tokens": 52756834.0, "reward": 0.610542893409729, "reward_std": 0.17611366510391235, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7432527542114258, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.15595799684524536, "step": 172 }, { "adv/mean_abs_final_conf": 0.7335048317909241, "adv/mean_abs_reasoning": 0.5627208948135376, "adv/mean_abs_step_conf": 0.6173565983772278, "adv/ratio_final_to_reasoning": 1.3034967042302157, "adv/ratio_step_to_reasoning": 1.0970920114523102, "adv/std_final_conf": 0.9194482564926147, "adv/std_reasoning": 0.8100591897964478, "adv/std_step_conf": 0.8428692817687988, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.62815259625574, "calib/avg_num_step_conf": 10.25, "calib/ece": 0.24184426229508202, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.4959016393442623, "calib/gap": 0.17763970328505818, "calib/mean_conf": 0.7425819672131148, "calib/mu_c": 0.8117449664429529, "calib/mu_w": 0.6341052631578947, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1868852459016394, "calib/std_conf": 0.32937019732807243, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.86306463878327, "calib/step_q_c_n": 1315.0, "calib/step_q_gap": 0.00031444779778477816, "calib/step_q_w": 0.8627501909854852, "calib/step_q_w_n": 1309.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2646.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 795.515625, "completions/mean_terminated_length": 814.6080322265625, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.18453333333333333, "grad_norm": 0.06134408712387085, "kl": 0.1226654052734375, "learning_rate": 7.5e-07, "loss": -0.0304, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.018563395366072655, "mask/share_reasoning": 0.837020993232727, "mask/share_step_conf": 0.12097810208797455, "num_tokens": 53063646.0, "reward": 0.5331622362136841, "reward_std": 0.20329146087169647, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6742647886276245, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.08815346658229828, "step": 173 }, { "adv/mean_abs_final_conf": 0.7394042015075684, "adv/mean_abs_reasoning": 0.5299354195594788, "adv/mean_abs_step_conf": 0.595130443572998, "adv/ratio_final_to_reasoning": 1.3952722807662403, "adv/ratio_step_to_reasoning": 1.1230244697886285, "adv/std_final_conf": 0.9050006866455078, "adv/std_reasoning": 0.7930909395217896, "adv/std_step_conf": 0.8261666297912598, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.6056763285024154, "calib/avg_num_step_conf": 10.421875, "calib/ece": 0.27093220338983054, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.3855932203389831, "calib/gap": 0.12093297101449263, "calib/mean_conf": 0.6838983050847458, "calib/mu_c": 0.7310416666666667, "calib/mu_w": 0.6101086956521741, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17233050847457626, "calib/std_conf": 0.33854132236150997, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8738634471273938, "calib/step_q_c_n": 1201.0, "calib/step_q_gap": 0.04451784385541013, "calib/step_q_w": 0.8293456032719837, "calib/step_q_w_n": 1467.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 854.65625, "completions/mean_terminated_length": 893.0285034179688, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.1856, "grad_norm": 0.7679807543754578, "kl": 1.7789306640625, "learning_rate": 7.222222222222222e-07, "loss": -0.137, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.017254464328289032, "mask/share_reasoning": 0.8324068784713745, "mask/share_step_conf": 0.10736991465091705, "num_tokens": 53386670.0, "reward": 0.5143245458602905, "reward_std": 0.20822112262248993, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.641096830368042, "rewards/format_reward_step": 0.91796875, "rewards/step_margin_reward": 0.09145841747522354, "step": 174 }, { "adv/mean_abs_final_conf": 0.745948851108551, "adv/mean_abs_reasoning": 0.5892815589904785, "adv/mean_abs_step_conf": 0.5366685390472412, "adv/ratio_final_to_reasoning": 1.2658615219292886, "adv/ratio_step_to_reasoning": 0.9107166699168887, "adv/std_final_conf": 0.9064469933509827, "adv/std_reasoning": 0.8431714773178101, "adv/std_step_conf": 0.7931908369064331, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.7054147465437787, "calib/avg_num_step_conf": 9.69140625, "calib/ece": 0.3047598253275109, "calib/final_conf_rate": 0.89453125, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.49344978165938863, "calib/gap": 0.22457142857142864, "calib/mean_conf": 0.6929694323144104, "calib/mu_c": 0.8145714285714286, "calib/mu_w": 0.59, "calib/nonempty_final_conf_rate": 0.89453125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2696069868995633, "calib/std_conf": 0.3631556307783941, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8837430167597766, "calib/step_q_c_n": 895.0, "calib/step_q_gap": 0.09064717817213486, "calib/step_q_w": 0.7930958385876418, "calib/step_q_w_n": 1586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 809.4921875, "completions/mean_terminated_length": 856.322265625, "completions/min_length": 0.0, "completions/min_terminated_length": 338.0, "epoch": 0.18666666666666668, "grad_norm": 0.044385261833667755, "kl": 0.11968994140625, "learning_rate": 6.944444444444446e-07, "loss": -0.137, "mask/has_final_conf_rate": 0.89453125, "mask/share_final_conf": 0.01704726368188858, "mask/share_reasoning": 0.8160483241081238, "mask/share_step_conf": 0.11221694201231003, "num_tokens": 53699724.0, "reward": 0.4492078423500061, "reward_std": 0.21994549036026, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.6050456762313843, "rewards/format_reward_step": 0.89453125, "rewards/step_margin_reward": 0.032432470470666885, "step": 175 }, { "adv/mean_abs_final_conf": 0.7688947319984436, "adv/mean_abs_reasoning": 0.5796619653701782, "adv/mean_abs_step_conf": 0.6438271999359131, "adv/ratio_final_to_reasoning": 1.326453653910895, "adv/ratio_step_to_reasoning": 1.1106942293941233, "adv/std_final_conf": 0.9263163805007935, "adv/std_reasoning": 0.8100887537002563, "adv/std_step_conf": 0.8563393354415894, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6597462939744818, "calib/avg_num_step_conf": 11.171875, "calib/ece": 0.23804166666666662, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5125, "calib/gap": 0.23044841064975286, "calib/mean_conf": 0.708125, "calib/mu_c": 0.795503355704698, "calib/mu_w": 0.5650549450549451, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16266666666666663, "calib/std_conf": 0.3583774561013383, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8748981077147017, "calib/step_q_c_n": 1374.0, "calib/step_q_gap": 0.010934446880246806, "calib/step_q_w": 0.8639636608344549, "calib/step_q_w_n": 1486.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 737.20703125, "completions/mean_terminated_length": 779.8553466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 299.0, "epoch": 0.18773333333333334, "grad_norm": 0.04479358717799187, "kl": 0.118072509765625, "learning_rate": 6.666666666666667e-07, "loss": -0.2052, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.01866709068417549, "mask/share_reasoning": 0.8016656041145325, "mask/share_step_conf": 0.12497982382774353, "num_tokens": 53992513.0, "reward": 0.5420329570770264, "reward_std": 0.20010113716125488, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6909761428833008, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.08918342739343643, "step": 176 }, { "adv/mean_abs_final_conf": 0.7447647452354431, "adv/mean_abs_reasoning": 0.45483875274658203, "adv/mean_abs_step_conf": 0.5685096979141235, "adv/ratio_final_to_reasoning": 1.6374258805744204, "adv/ratio_step_to_reasoning": 1.249914820320674, "adv/std_final_conf": 0.9233677387237549, "adv/std_reasoning": 0.7394517660140991, "adv/std_step_conf": 0.809304416179657, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7086515863689777, "calib/avg_num_step_conf": 10.6640625, "calib/ece": 0.23116666666666663, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5166666666666667, "calib/gap": 0.22249706227967114, "calib/mean_conf": 0.73775, "calib/mu_c": 0.8230405405405407, "calib/mu_w": 0.6005434782608695, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17612499999999998, "calib/std_conf": 0.336591895574844, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8616353677621283, "calib/step_q_c_n": 1278.0, "calib/step_q_gap": 0.05520974792741751, "calib/step_q_w": 0.8064256198347108, "calib/step_q_w_n": 1452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2883.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 783.15625, "completions/mean_terminated_length": 818.318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 304.0, "epoch": 0.1888, "grad_norm": 0.041428253054618835, "kl": 0.1178741455078125, "learning_rate": 6.388888888888889e-07, "loss": -0.1572, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.018425092101097107, "mask/share_reasoning": 0.8225729465484619, "mask/share_step_conf": 0.11603318154811859, "num_tokens": 54296833.0, "reward": 0.5523170232772827, "reward_std": 0.22074581682682037, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6918632388114929, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.11042701452970505, "step": 177 }, { "adv/mean_abs_final_conf": 0.7356642484664917, "adv/mean_abs_reasoning": 0.5754988193511963, "adv/mean_abs_step_conf": 0.6615732908248901, "adv/ratio_final_to_reasoning": 1.2783071376164805, "adv/ratio_step_to_reasoning": 1.1495649835923767, "adv/std_final_conf": 0.9213371872901917, "adv/std_reasoning": 0.8100489974021912, "adv/std_step_conf": 0.8748651742935181, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7234873129472998, "calib/avg_num_step_conf": 9.1328125, "calib/ece": 0.2090243902439024, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.47560975609756095, "calib/gap": 0.23845586640641914, "calib/mean_conf": 0.7197560975609756, "calib/mu_c": 0.8040880503144652, "calib/mu_w": 0.565632183908046, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1412195121951219, "calib/std_conf": 0.339978391026928, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8770305980528512, "calib/step_q_c_n": 1438.0, "calib/step_q_gap": 0.01588615360840684, "calib/step_q_w": 0.8611444444444444, "calib/step_q_w_n": 900.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 777.96875, "completions/mean_terminated_length": 790.3175048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 331.0, "epoch": 0.18986666666666666, "grad_norm": 0.04370216280221939, "kl": 0.1238861083984375, "learning_rate": 6.111111111111112e-07, "loss": -0.0689, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.0190714281052351, "mask/share_reasoning": 0.8445122241973877, "mask/share_step_conf": 0.12079129368066788, "num_tokens": 54602065.0, "reward": 0.5797073841094971, "reward_std": 0.21497192978858948, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7258832454681396, "rewards/format_reward_step": 0.95703125, "rewards/step_margin_reward": 0.11712530255317688, "step": 178 }, { "adv/mean_abs_final_conf": 0.6700806021690369, "adv/mean_abs_reasoning": 0.5245306491851807, "adv/mean_abs_step_conf": 0.6180737018585205, "adv/ratio_final_to_reasoning": 1.2774860786685338, "adv/ratio_step_to_reasoning": 1.178336676452848, "adv/std_final_conf": 0.8638622760772705, "adv/std_reasoning": 0.757753849029541, "adv/std_step_conf": 0.8429971933364868, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7330273892773892, "calib/avg_num_step_conf": 9.640625, "calib/ece": 0.2402231520223152, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.5648535564853556, "calib/gap": 0.21421692890442878, "calib/mean_conf": 0.8128591352859135, "calib/mu_c": 0.8989044289044287, "calib/mu_w": 0.6846874999999999, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2273779637377964, "calib/std_conf": 0.27860574390767745, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8812111292962356, "calib/step_q_c_n": 1222.0, "calib/step_q_gap": 0.0577841630041005, "calib/step_q_w": 0.8234269662921351, "calib/step_q_w_n": 1246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 822.890625, "completions/mean_terminated_length": 846.0240478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 416.0, "epoch": 0.19093333333333334, "grad_norm": 0.0315001904964447, "kl": 0.11480712890625, "learning_rate": 5.833333333333334e-07, "loss": -0.0478, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.01795131526887417, "mask/share_reasoning": 0.8423312306404114, "mask/share_step_conf": 0.1123737245798111, "num_tokens": 54918989.0, "reward": 0.5374990105628967, "reward_std": 0.2064266949892044, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6836533546447754, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.09446967393159866, "step": 179 }, { "adv/mean_abs_final_conf": 0.6589535474777222, "adv/mean_abs_reasoning": 0.4871194064617157, "adv/mean_abs_step_conf": 0.6457631587982178, "adv/ratio_final_to_reasoning": 1.3527556872844717, "adv/ratio_step_to_reasoning": 1.3256773395435857, "adv/std_final_conf": 0.8544555902481079, "adv/std_reasoning": 0.757696270942688, "adv/std_step_conf": 0.874000608921051, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6901719901719902, "calib/avg_num_step_conf": 10.79296875, "calib/ece": 0.18008368200836816, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5481171548117155, "calib/gap": 0.28092137592137567, "calib/mean_conf": 0.7502928870292888, "calib/mu_c": 0.8372727272727272, "calib/mu_w": 0.5563513513513515, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11999999999999995, "calib/std_conf": 0.3319605732116072, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8613936781609195, "calib/step_q_c_n": 1392.0, "calib/step_q_gap": 0.01669637692094872, "calib/step_q_w": 0.8446973012399708, "calib/step_q_w_n": 1371.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 816.4765625, "completions/mean_terminated_length": 849.6666259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.192, "grad_norm": 0.05409874767065048, "kl": 0.11113739013671875, "learning_rate": 5.555555555555555e-07, "loss": -0.1265, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.017763523384928703, "mask/share_reasoning": 0.8341307640075684, "mask/share_step_conf": 0.10904324799776077, "num_tokens": 55231863.0, "reward": 0.5864684581756592, "reward_std": 0.17747780680656433, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7399226427078247, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.1173892617225647, "step": 180 }, { "adv/mean_abs_final_conf": 0.7162941098213196, "adv/mean_abs_reasoning": 0.5559822916984558, "adv/mean_abs_step_conf": 0.6380290389060974, "adv/ratio_final_to_reasoning": 1.2883397916022314, "adv/ratio_step_to_reasoning": 1.1475707921505902, "adv/std_final_conf": 0.9164659380912781, "adv/std_reasoning": 0.8098906874656677, "adv/std_step_conf": 0.8589956164360046, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7135375494071146, "calib/avg_num_step_conf": 8.8828125, "calib/ece": 0.25395161290322577, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5201612903225806, "calib/gap": 0.22732806324110688, "calib/mean_conf": 0.7539516129032259, "calib/mu_c": 0.8547826086956523, "calib/mu_w": 0.6274545454545454, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2257258064516129, "calib/std_conf": 0.32277855823875423, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8858549222797927, "calib/step_q_c_n": 1158.0, "calib/step_q_gap": 0.018543094322803322, "calib/step_q_w": 0.8673118279569894, "calib/step_q_w_n": 1116.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 746.71875, "completions/mean_terminated_length": 755.5731811523438, "completions/min_length": 0.0, "completions/min_terminated_length": 395.0, "epoch": 0.19306666666666666, "grad_norm": 0.032625097781419754, "kl": 0.135833740234375, "learning_rate": 5.277777777777779e-07, "loss": -0.0057, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01943790167570114, "mask/share_reasoning": 0.8480738401412964, "mask/share_step_conf": 0.12076946347951889, "num_tokens": 55529287.0, "reward": 0.5502763986587524, "reward_std": 0.20317913591861725, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6994858980178833, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.09950438886880875, "step": 181 }, { "adv/mean_abs_final_conf": 0.7068451642990112, "adv/mean_abs_reasoning": 0.4585656523704529, "adv/mean_abs_step_conf": 0.5878832340240479, "adv/ratio_final_to_reasoning": 1.541426316264929, "adv/ratio_step_to_reasoning": 1.2820045090274785, "adv/std_final_conf": 0.8891746401786804, "adv/std_reasoning": 0.7394094467163086, "adv/std_step_conf": 0.8260961771011353, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7947052947052947, "calib/avg_num_step_conf": 8.5859375, "calib/ece": 0.2035102040816326, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6163265306122448, "calib/gap": 0.2901948051948052, "calib/mean_conf": 0.7838367346938776, "calib/mu_c": 0.8916233766233767, "calib/mu_w": 0.6014285714285714, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17938775510204075, "calib/std_conf": 0.3214266251965697, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8847572078907436, "calib/step_q_c_n": 1318.0, "calib/step_q_gap": 0.02775720789074354, "calib/step_q_w": 0.8570000000000001, "calib/step_q_w_n": 880.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2471.0, "completions/max_terminated_length": 2471.0, "completions/mean_length": 756.31640625, "completions/mean_terminated_length": 774.468017578125, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.19413333333333332, "grad_norm": 0.03317157179117203, "kl": 0.1144256591796875, "learning_rate": 5.000000000000001e-07, "loss": -0.0833, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018733307719230652, "mask/share_reasoning": 0.8430584073066711, "mask/share_step_conf": 0.1147707849740982, "num_tokens": 55829064.0, "reward": 0.5790822505950928, "reward_std": 0.19087812304496765, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7412495613098145, "rewards/format_reward_step": 0.953125, "rewards/step_margin_reward": 0.1059773862361908, "step": 182 }, { "adv/mean_abs_final_conf": 0.7296533584594727, "adv/mean_abs_reasoning": 0.5593553781509399, "adv/mean_abs_step_conf": 0.6811344027519226, "adv/ratio_final_to_reasoning": 1.3044539964404855, "adv/ratio_step_to_reasoning": 1.2177131558179477, "adv/std_final_conf": 0.8898025155067444, "adv/std_reasoning": 0.7930997014045715, "adv/std_step_conf": 0.8903030157089233, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.7486428990610328, "calib/avg_num_step_conf": 8.8984375, "calib/ece": 0.21281512605042008, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.5126050420168067, "calib/gap": 0.2929592136150234, "calib/mean_conf": 0.7123949579831933, "calib/mu_c": 0.8305633802816901, "calib/mu_w": 0.5376041666666667, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16428571428571423, "calib/std_conf": 0.3581132565437828, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8857435897435899, "calib/step_q_c_n": 1170.0, "calib/step_q_gap": 0.04959738035730832, "calib/step_q_w": 0.8361462093862816, "calib/step_q_w_n": 1108.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 809.51953125, "completions/mean_terminated_length": 842.4268188476562, "completions/min_length": 0.0, "completions/min_terminated_length": 418.0, "epoch": 0.1952, "grad_norm": 0.07171082496643066, "kl": 0.12462615966796875, "learning_rate": 4.7222222222222226e-07, "loss": -0.1082, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.017574355006217957, "mask/share_reasoning": 0.8402279019355774, "mask/share_step_conf": 0.10313522815704346, "num_tokens": 56142981.0, "reward": 0.5524033308029175, "reward_std": 0.21205949783325195, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7050496339797974, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.10366328060626984, "step": 183 }, { "adv/mean_abs_final_conf": 0.6614429354667664, "adv/mean_abs_reasoning": 0.5752705335617065, "adv/mean_abs_step_conf": 0.7010568976402283, "adv/ratio_final_to_reasoning": 1.1497945694724454, "adv/ratio_step_to_reasoning": 1.2186560178908055, "adv/std_final_conf": 0.8730801343917847, "adv/std_reasoning": 0.8267854452133179, "adv/std_step_conf": 0.8901596069335938, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.7002822580645162, "calib/avg_num_step_conf": 9.65625, "calib/ece": 0.21629787234042563, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.6851063829787234, "calib/gap": 0.21309274193548378, "calib/mean_conf": 0.8464255319148937, "calib/mu_c": 0.9189677419354838, "calib/mu_w": 0.705875, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20157446808510648, "calib/std_conf": 0.2716396035959013, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8873322422258594, "calib/step_q_c_n": 1222.0, "calib/step_q_gap": 0.0556362422258595, "calib/step_q_w": 0.8316959999999999, "calib/step_q_w_n": 1250.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2827.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 786.3125, "completions/mean_terminated_length": 831.8016357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 351.0, "epoch": 0.19626666666666667, "grad_norm": 0.03393423184752464, "kl": 0.1053619384765625, "learning_rate": 4.444444444444445e-07, "loss": -0.1485, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.01755671016871929, "mask/share_reasoning": 0.8220027089118958, "mask/share_step_conf": 0.10575306415557861, "num_tokens": 56449557.0, "reward": 0.5497719049453735, "reward_std": 0.2198198437690735, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.69991135597229, "rewards/format_reward_step": 0.91796875, "rewards/step_margin_reward": 0.09494496136903763, "step": 184 }, { "adv/mean_abs_final_conf": 0.669137179851532, "adv/mean_abs_reasoning": 0.5341732501983643, "adv/mean_abs_step_conf": 0.5832904577255249, "adv/ratio_final_to_reasoning": 1.252659468820367, "adv/ratio_step_to_reasoning": 1.0919499572637175, "adv/std_final_conf": 0.8619014024734497, "adv/std_reasoning": 0.775561511516571, "adv/std_step_conf": 0.8265411853790283, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.7486979166666666, "calib/avg_num_step_conf": 10.265625, "calib/ece": 0.24618644067796602, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.6059322033898306, "calib/gap": 0.24150892857142858, "calib/mean_conf": 0.7948305084745763, "calib/mu_c": 0.8930714285714286, "calib/mu_w": 0.6515625, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22389830508474567, "calib/std_conf": 0.30630447091046753, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8659555908009514, "calib/step_q_c_n": 1261.0, "calib/step_q_gap": 0.07792340352955418, "calib/step_q_w": 0.7880321872713972, "calib/step_q_w_n": 1367.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2974.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 763.078125, "completions/mean_terminated_length": 813.9500122070312, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.19733333333333333, "grad_norm": 0.03766736388206482, "kl": 0.1071929931640625, "learning_rate": 4.1666666666666667e-07, "loss": -0.2526, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.01759970560669899, "mask/share_reasoning": 0.8113294839859009, "mask/share_step_conf": 0.10857082158327103, "num_tokens": 56751825.0, "reward": 0.5245174765586853, "reward_std": 0.21171888709068298, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6829047203063965, "rewards/format_reward_step": 0.921875, "rewards/step_margin_reward": 0.07238027453422546, "step": 185 }, { "adv/mean_abs_final_conf": 0.6287456750869751, "adv/mean_abs_reasoning": 0.534726619720459, "adv/mean_abs_step_conf": 0.6588317155838013, "adv/ratio_final_to_reasoning": 1.1758263978248675, "adv/ratio_step_to_reasoning": 1.2320907381200157, "adv/std_final_conf": 0.8337107300758362, "adv/std_reasoning": 0.7755080461502075, "adv/std_step_conf": 0.8746791481971741, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7532038682286343, "calib/avg_num_step_conf": 9.73046875, "calib/ece": 0.21229166666666668, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6333333333333333, "calib/gap": 0.25686217469926864, "calib/mean_conf": 0.7873750000000002, "calib/mu_c": 0.8719254658385093, "calib/mu_w": 0.6150632911392406, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1644166666666667, "calib/std_conf": 0.3269470283929799, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8626144658261446, "calib/step_q_c_n": 1507.0, "calib/step_q_gap": 0.02363072598874638, "calib/step_q_w": 0.8389837398373983, "calib/step_q_w_n": 984.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2890.0, "completions/max_terminated_length": 2890.0, "completions/mean_length": 747.94921875, "completions/mean_terminated_length": 787.9629516601562, "completions/min_length": 0.0, "completions/min_terminated_length": 323.0, "epoch": 0.1984, "grad_norm": 0.03523175045847893, "kl": 0.1146240234375, "learning_rate": 3.8888888888888895e-07, "loss": -0.1019, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.018348462879657745, "mask/share_reasoning": 0.8129653334617615, "mask/share_step_conf": 0.11790493130683899, "num_tokens": 57048340.0, "reward": 0.5777019262313843, "reward_std": 0.17776593565940857, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7238870859146118, "rewards/format_reward_step": 0.9375, "rewards/step_margin_reward": 0.11823556572198868, "step": 186 }, { "adv/mean_abs_final_conf": 0.7016821503639221, "adv/mean_abs_reasoning": 0.533699095249176, "adv/mean_abs_step_conf": 0.7027453184127808, "adv/ratio_final_to_reasoning": 1.3147523700341244, "adv/ratio_step_to_reasoning": 1.3167444439542841, "adv/std_final_conf": 0.8603917360305786, "adv/std_reasoning": 0.7756333947181702, "adv/std_step_conf": 0.8903680443763733, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6942536138543008, "calib/avg_num_step_conf": 9.48828125, "calib/ece": 0.30104602510460254, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5523012552301255, "calib/gap": 0.12569486188636037, "calib/mean_conf": 0.8151882845188284, "calib/mu_c": 0.8688321167883212, "calib/mu_w": 0.7431372549019608, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2715062761506276, "calib/std_conf": 0.27137775341953146, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8607933740191805, "calib/step_q_c_n": 1147.0, "calib/step_q_gap": 0.06179181395677813, "calib/step_q_w": 0.7990015600624024, "calib/step_q_w_n": 1282.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 822.83984375, "completions/mean_terminated_length": 842.5880126953125, "completions/min_length": 0.0, "completions/min_terminated_length": 404.0, "epoch": 0.19946666666666665, "grad_norm": 0.030599793419241905, "kl": 0.10474395751953125, "learning_rate": 3.611111111111111e-07, "loss": -0.0718, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.017855819314718246, "mask/share_reasoning": 0.8483026623725891, "mask/share_step_conf": 0.11040402948856354, "num_tokens": 57360531.0, "reward": 0.5265020132064819, "reward_std": 0.2111303061246872, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6392011642456055, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.12005294859409332, "step": 187 }, { "adv/mean_abs_final_conf": 0.6758257150650024, "adv/mean_abs_reasoning": 0.4870375096797943, "adv/mean_abs_step_conf": 0.7226362228393555, "adv/ratio_final_to_reasoning": 1.3876255968648659, "adv/ratio_step_to_reasoning": 1.4837383332434846, "adv/std_final_conf": 0.8640143871307373, "adv/std_reasoning": 0.7578036785125732, "adv/std_step_conf": 0.9045403599739075, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6517767946339375, "calib/avg_num_step_conf": 9.05078125, "calib/ece": 0.29319502074688797, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6473029045643154, "calib/gap": 0.14823248180391035, "calib/mean_conf": 0.7960165975103736, "calib/mu_c": 0.8562937062937062, "calib/mu_w": 0.7080612244897958, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24792531120331954, "calib/std_conf": 0.31101077307600766, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8802759197324415, "calib/step_q_c_n": 1196.0, "calib/step_q_gap": 0.060971727047338775, "calib/step_q_w": 0.8193041926851027, "calib/step_q_w_n": 1121.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 809.97265625, "completions/mean_terminated_length": 836.1007690429688, "completions/min_length": 0.0, "completions/min_terminated_length": 342.0, "epoch": 0.20053333333333334, "grad_norm": 0.04199248552322388, "kl": 0.1089019775390625, "learning_rate": 3.3333333333333335e-07, "loss": -0.1571, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01780654862523079, "mask/share_reasoning": 0.8402260541915894, "mask/share_step_conf": 0.11071738600730896, "num_tokens": 57671956.0, "reward": 0.5369012355804443, "reward_std": 0.2067958414554596, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6518781185150146, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.12192434072494507, "step": 188 }, { "adv/mean_abs_final_conf": 0.6806594729423523, "adv/mean_abs_reasoning": 0.4600512683391571, "adv/mean_abs_step_conf": 0.5683373212814331, "adv/ratio_final_to_reasoning": 1.4795296085143261, "adv/ratio_step_to_reasoning": 1.235378229329097, "adv/std_final_conf": 0.8793020844459534, "adv/std_reasoning": 0.7576866745948792, "adv/std_step_conf": 0.8246341347694397, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7665191950906236, "calib/avg_num_step_conf": 8.625, "calib/ece": 0.21946058091286308, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5435684647302904, "calib/gap": 0.3174832310546599, "calib/mean_conf": 0.7121576763485477, "calib/mu_c": 0.8412587412587414, "calib/mu_w": 0.5237755102040815, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1691286307053942, "calib/std_conf": 0.3612362246410801, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8582800982800983, "calib/step_q_c_n": 1221.0, "calib/step_q_gap": -0.022054248224460893, "calib/step_q_w": 0.8803343465045592, "calib/step_q_w_n": 987.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 772.9765625, "completions/mean_terminated_length": 785.24609375, "completions/min_length": 0.0, "completions/min_terminated_length": 51.0, "epoch": 0.2016, "grad_norm": 0.026472102850675583, "kl": 0.124603271484375, "learning_rate": 3.055555555555556e-07, "loss": -0.1296, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01839999482035637, "mask/share_reasoning": 0.8569129109382629, "mask/share_step_conf": 0.1090620756149292, "num_tokens": 57977606.0, "reward": 0.5555186867713928, "reward_std": 0.20296701788902283, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7223590016365051, "rewards/format_reward_step": 0.94140625, "rewards/step_margin_reward": 0.08867847919464111, "step": 189 }, { "adv/mean_abs_final_conf": 0.6543399095535278, "adv/mean_abs_reasoning": 0.47864142060279846, "adv/mean_abs_step_conf": 0.6116296648979187, "adv/ratio_final_to_reasoning": 1.3670774851233218, "adv/ratio_step_to_reasoning": 1.2778452481768827, "adv/std_final_conf": 0.8639841675758362, "adv/std_reasoning": 0.757647693157196, "adv/std_step_conf": 0.8420366048812866, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7143939393939394, "calib/avg_num_step_conf": 9.7734375, "calib/ece": 0.23644351464435143, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.602510460251046, "calib/gap": 0.2598715728715729, "calib/mean_conf": 0.7836401673640166, "calib/mu_c": 0.8912857142857143, "calib/mu_w": 0.6314141414141414, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21715481171548112, "calib/std_conf": 0.32542172865808633, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8704513888888888, "calib/step_q_c_n": 1152.0, "calib/step_q_gap": 0.05475509259259248, "calib/step_q_w": 0.8156962962962964, "calib/step_q_w_n": 1350.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 804.6484375, "completions/mean_terminated_length": 833.9676513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.20266666666666666, "grad_norm": 0.04170762002468109, "kl": 0.1046142578125, "learning_rate": 2.7777777777777776e-07, "loss": -0.148, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.017385059967637062, "mask/share_reasoning": 0.8406679630279541, "mask/share_step_conf": 0.10679075121879578, "num_tokens": 58289204.0, "reward": 0.5498544573783875, "reward_std": 0.17992237210273743, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6893831491470337, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.11423200368881226, "step": 190 }, { "adv/mean_abs_final_conf": 0.6690695285797119, "adv/mean_abs_reasoning": 0.5088257789611816, "adv/mean_abs_step_conf": 0.5422240495681763, "adv/ratio_final_to_reasoning": 1.3149285202209757, "adv/ratio_step_to_reasoning": 1.0656379294995244, "adv/std_final_conf": 0.8677650690078735, "adv/std_reasoning": 0.7928370237350464, "adv/std_step_conf": 0.8096816539764404, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.716376691521679, "calib/avg_num_step_conf": 10.89453125, "calib/ece": 0.26278688524590177, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.6147540983606558, "calib/gap": 0.2648329190831261, "calib/mean_conf": 0.7719672131147541, "calib/mu_c": 0.8826760563380281, "calib/mu_w": 0.617843137254902, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22639344262295097, "calib/std_conf": 0.3475668100518378, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8723001631321371, "calib/step_q_c_n": 1226.0, "calib/step_q_gap": 0.0888196768877354, "calib/step_q_w": 0.7834804862444017, "calib/step_q_w_n": 1563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2770.0, "completions/max_terminated_length": 2770.0, "completions/mean_length": 768.87890625, "completions/mean_terminated_length": 800.1340942382812, "completions/min_length": 0.0, "completions/min_terminated_length": 439.0, "epoch": 0.20373333333333332, "grad_norm": 0.03425566852092743, "kl": 0.1134490966796875, "learning_rate": 2.5000000000000004e-07, "loss": -0.0533, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01871422305703163, "mask/share_reasoning": 0.8217412829399109, "mask/share_step_conf": 0.12048199772834778, "num_tokens": 58590205.0, "reward": 0.5533047318458557, "reward_std": 0.18823924660682678, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6944394111633301, "rewards/format_reward_step": 0.94921875, "rewards/step_margin_reward": 0.11138871312141418, "step": 191 }, { "adv/mean_abs_final_conf": 0.6132959127426147, "adv/mean_abs_reasoning": 0.5046427249908447, "adv/mean_abs_step_conf": 0.7104313373565674, "adv/ratio_final_to_reasoning": 1.215307151715585, "adv/ratio_step_to_reasoning": 1.407790704541428, "adv/std_final_conf": 0.8096941113471985, "adv/std_reasoning": 0.7756152749061584, "adv/std_step_conf": 0.9055757522583008, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.765743670886076, "calib/avg_num_step_conf": 8.26171875, "calib/ece": 0.20294117647058807, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.6722689075630253, "calib/gap": 0.2902215189873416, "calib/mean_conf": 0.8101680672268907, "calib/mu_c": 0.9077215189873417, "calib/mu_w": 0.6175, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17462184873949566, "calib/std_conf": 0.31895834451369953, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8800899550224888, "calib/step_q_c_n": 1334.0, "calib/step_q_gap": 0.02710660034899315, "calib/step_q_w": 0.8529833546734956, "calib/step_q_w_n": 781.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2524.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 790.390625, "completions/mean_terminated_length": 812.6104125976562, "completions/min_length": 0.0, "completions/min_terminated_length": 283.0, "epoch": 0.2048, "grad_norm": 0.03434360399842262, "kl": 0.11631011962890625, "learning_rate": 2.2222222222222224e-07, "loss": -0.1059, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.01865743286907673, "mask/share_reasoning": 0.8368434906005859, "mask/share_step_conf": 0.1171552985906601, "num_tokens": 58897521.0, "reward": 0.5786335468292236, "reward_std": 0.21570603549480438, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7242991924285889, "rewards/format_reward_step": 0.92578125, "rewards/step_margin_reward": 0.12437406182289124, "step": 192 }, { "adv/mean_abs_final_conf": 0.7365913987159729, "adv/mean_abs_reasoning": 0.6295239925384521, "adv/mean_abs_step_conf": 0.6144850254058838, "adv/ratio_final_to_reasoning": 1.1700767682352964, "adv/ratio_step_to_reasoning": 0.9761105735272675, "adv/std_final_conf": 0.9079707860946655, "adv/std_reasoning": 0.8591470122337341, "adv/std_step_conf": 0.8432223200798035, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.7285806544202067, "calib/avg_num_step_conf": 8.59765625, "calib/ece": 0.2454481792717087, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.5966386554621849, "calib/gap": 0.26407242632988914, "calib/mean_conf": 0.7737114845938374, "calib/mu_c": 0.8891044776119404, "calib/mu_w": 0.6250320512820513, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2280672268907563, "calib/std_conf": 0.33006874315649326, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8711301989150091, "calib/step_q_c_n": 1106.0, "calib/step_q_gap": 0.11124902889672761, "calib/step_q_w": 0.7598811700182815, "calib/step_q_w_n": 1094.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 763.2421875, "completions/mean_terminated_length": 800.7786254882812, "completions/min_length": 0.0, "completions/min_terminated_length": 433.0, "epoch": 0.20586666666666667, "grad_norm": 0.04226591810584068, "kl": 0.1104583740234375, "learning_rate": 1.9444444444444447e-07, "loss": -0.1568, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.017796780914068222, "mask/share_reasoning": 0.8287982940673828, "mask/share_step_conf": 0.10652987658977509, "num_tokens": 59198623.0, "reward": 0.5382182598114014, "reward_std": 0.22335919737815857, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.675937831401825, "rewards/format_reward_step": 0.91796875, "rewards/step_margin_reward": 0.11221752315759659, "step": 193 }, { "adv/mean_abs_final_conf": 0.5812814235687256, "adv/mean_abs_reasoning": 0.47916898131370544, "adv/mean_abs_step_conf": 0.6441755294799805, "adv/ratio_final_to_reasoning": 1.213103197905393, "adv/ratio_step_to_reasoning": 1.3443598283717941, "adv/std_final_conf": 0.8021941781044006, "adv/std_reasoning": 0.7755130529403687, "adv/std_step_conf": 0.8861175179481506, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7469745793054063, "calib/avg_num_step_conf": 8.78515625, "calib/ece": 0.22046818181818187, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6528925619834711, "calib/gap": 0.3055363480128893, "calib/mean_conf": 0.7725070247933885, "calib/mu_c": 0.8924489795918367, "calib/mu_w": 0.5869126315789474, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19276859504132238, "calib/std_conf": 0.3436059382758407, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8584826472962067, "calib/step_q_c_n": 1239.0, "calib/step_q_gap": 0.02332423145462259, "calib/step_q_w": 0.8351584158415841, "calib/step_q_w_n": 1010.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 785.57421875, "completions/mean_terminated_length": 804.4280395507812, "completions/min_length": 0.0, "completions/min_terminated_length": 295.0, "epoch": 0.20693333333333333, "grad_norm": 0.024011608213186264, "kl": 0.1150360107421875, "learning_rate": 1.6666666666666668e-07, "loss": -0.0477, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018756553530693054, "mask/share_reasoning": 0.8442188501358032, "mask/share_step_conf": 0.11358709633350372, "num_tokens": 59505674.0, "reward": 0.5598720908164978, "reward_std": 0.18959592282772064, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.72027587890625, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.0955621674656868, "step": 194 }, { "adv/mean_abs_final_conf": 0.706782341003418, "adv/mean_abs_reasoning": 0.4451354742050171, "adv/mean_abs_step_conf": 0.5628293752670288, "adv/ratio_final_to_reasoning": 1.5877915420371407, "adv/ratio_step_to_reasoning": 1.2644001834996532, "adv/std_final_conf": 0.8860529065132141, "adv/std_reasoning": 0.7015165686607361, "adv/std_step_conf": 0.7926657795906067, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6485927152317881, "calib/avg_num_step_conf": 9.07421875, "calib/ece": 0.26105263157894726, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.6032388663967612, "calib/gap": 0.13642315121412796, "calib/mean_conf": 0.8041295546558704, "calib/mu_c": 0.8571523178807946, "calib/mu_w": 0.7207291666666666, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2269230769230768, "calib/std_conf": 0.2915808361763642, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8598809523809522, "calib/step_q_c_n": 1260.0, "calib/step_q_gap": 0.047369193208797844, "calib/step_q_w": 0.8125117591721543, "calib/step_q_w_n": 1063.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 793.7421875, "completions/mean_terminated_length": 809.5538330078125, "completions/min_length": 0.0, "completions/min_terminated_length": 388.0, "epoch": 0.208, "grad_norm": 0.034561559557914734, "kl": 0.10968017578125, "learning_rate": 1.3888888888888888e-07, "loss": -0.0458, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018744606524705887, "mask/share_reasoning": 0.8473308086395264, "mask/share_step_conf": 0.11439335346221924, "num_tokens": 59814856.0, "reward": 0.542776346206665, "reward_std": 0.1786074936389923, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6802500486373901, "rewards/format_reward_step": 0.96484375, "rewards/step_margin_reward": 0.0943652093410492, "step": 195 }, { "adv/mean_abs_final_conf": 0.554034948348999, "adv/mean_abs_reasoning": 0.3748331665992737, "adv/mean_abs_step_conf": 0.5898422002792358, "adv/ratio_final_to_reasoning": 1.478084112394745, "adv/ratio_step_to_reasoning": 1.5736126171295397, "adv/std_final_conf": 0.7945897579193115, "adv/std_reasoning": 0.661367654800415, "adv/std_step_conf": 0.8264286518096924, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6893210462122239, "calib/avg_num_step_conf": 8.8984375, "calib/ece": 0.27139442231075683, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7768924302788844, "calib/gap": 0.1939937660929666, "calib/mean_conf": 0.8502788844621515, "calib/mu_c": 0.9229299363057324, "calib/mu_w": 0.7289361702127658, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2480876494023903, "calib/std_conf": 0.2912459871179368, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.884874715261959, "calib/step_q_c_n": 1317.0, "calib/step_q_gap": 0.04978626573022127, "calib/step_q_w": 0.8350884495317378, "calib/step_q_w_n": 961.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 726.890625, "completions/mean_terminated_length": 732.6141967773438, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.20906666666666668, "grad_norm": 0.03739454224705696, "kl": 0.1191558837890625, "learning_rate": 1.1111111111111112e-07, "loss": -0.0247, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.020269915461540222, "mask/share_reasoning": 0.8461052179336548, "mask/share_step_conf": 0.1258123368024826, "num_tokens": 60103484.0, "reward": 0.5633987188339233, "reward_std": 0.17114314436912537, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.707119882106781, "rewards/format_reward_step": 0.9765625, "rewards/step_margin_reward": 0.10092748701572418, "step": 196 }, { "adv/mean_abs_final_conf": 0.699467122554779, "adv/mean_abs_reasoning": 0.5718144774436951, "adv/mean_abs_step_conf": 0.6629625558853149, "adv/ratio_final_to_reasoning": 1.22324136611888, "adv/ratio_step_to_reasoning": 1.159401487785162, "adv/std_final_conf": 0.8781381845474243, "adv/std_reasoning": 0.8101165294647217, "adv/std_step_conf": 0.8749804496765137, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7413176535313939, "calib/avg_num_step_conf": 8.55859375, "calib/ece": 0.311280991735537, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6033057851239669, "calib/gap": 0.22983494945327032, "calib/mean_conf": 0.785495867768595, "calib/mu_c": 0.8909160305343512, "calib/mu_w": 0.6610810810810809, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27772727272727254, "calib/std_conf": 0.3244880933257758, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8695282146160962, "calib/step_q_c_n": 1081.0, "calib/step_q_gap": 0.054753439841321416, "calib/step_q_w": 0.8147747747747748, "calib/step_q_w_n": 1110.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2821.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 797.8359375, "completions/mean_terminated_length": 816.9840087890625, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 0.21013333333333334, "grad_norm": 0.068642258644104, "kl": 0.1166534423828125, "learning_rate": 8.333333333333334e-08, "loss": -0.0939, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.018231216818094254, "mask/share_reasoning": 0.8450701236724854, "mask/share_step_conf": 0.1132611557841301, "num_tokens": 60412786.0, "reward": 0.5338296294212341, "reward_std": 0.2415270358324051, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6625949144363403, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.11365815252065659, "step": 197 }, { "adv/mean_abs_final_conf": 0.6309677362442017, "adv/mean_abs_reasoning": 0.47601014375686646, "adv/mean_abs_step_conf": 0.675212025642395, "adv/ratio_final_to_reasoning": 1.325534223418742, "adv/ratio_step_to_reasoning": 1.4184824304653383, "adv/std_final_conf": 0.8275750279426575, "adv/std_reasoning": 0.7208956480026245, "adv/std_step_conf": 0.8589403033256531, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7340441621294616, "calib/avg_num_step_conf": 9.34375, "calib/ece": 0.22042008368200844, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.6443514644351465, "calib/gap": 0.2848831820931639, "calib/mean_conf": 0.7817556485355647, "calib/mu_c": 0.885457894736842, "calib/mu_w": 0.6005747126436781, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1830962343096235, "calib/std_conf": 0.33933392478491603, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8954977375565609, "calib/step_q_c_n": 1326.0, "calib/step_q_gap": 0.0303288820218518, "calib/step_q_w": 0.8651688555347091, "calib/step_q_w_n": 1066.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 740.28125, "completions/mean_terminated_length": 770.3739624023438, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 0.2112, "grad_norm": 0.036617472767829895, "kl": 0.1192779541015625, "learning_rate": 5.555555555555556e-08, "loss": -0.1968, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.01898002065718174, "mask/share_reasoning": 0.8232492208480835, "mask/share_step_conf": 0.11870827525854111, "num_tokens": 60707682.0, "reward": 0.5694743394851685, "reward_std": 0.19897526502609253, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7132657766342163, "rewards/format_reward_step": 0.93359375, "rewards/step_margin_reward": 0.12021413445472717, "step": 198 }, { "adv/mean_abs_final_conf": 0.6924710273742676, "adv/mean_abs_reasoning": 0.6194695234298706, "adv/mean_abs_step_conf": 0.7322503924369812, "adv/ratio_final_to_reasoning": 1.1178451904142164, "adv/ratio_step_to_reasoning": 1.1820603996507641, "adv/std_final_conf": 0.871009349822998, "adv/std_reasoning": 0.8431258797645569, "adv/std_step_conf": 0.9053608775138855, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6342101385204834, "calib/avg_num_step_conf": 9.11328125, "calib/ece": 0.273909465020576, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6748971193415638, "calib/gap": 0.14821175950486287, "calib/mean_conf": 0.8035390946502057, "calib/mu_c": 0.8566025641025641, "calib/mu_w": 0.7083908045977012, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2177366255144032, "calib/std_conf": 0.3196510209909856, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.87454754601227, "calib/step_q_c_n": 1304.0, "calib/step_q_gap": 0.010883794797498347, "calib/step_q_w": 0.8636637512147717, "calib/step_q_w_n": 1029.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 815.87109375, "completions/mean_terminated_length": 825.5454711914062, "completions/min_length": 0.0, "completions/min_terminated_length": 319.0, "epoch": 0.21226666666666666, "grad_norm": 0.0326930470764637, "kl": 0.11248779296875, "learning_rate": 2.777777777777778e-08, "loss": -0.0395, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.018966779112815857, "mask/share_reasoning": 0.8485132455825806, "mask/share_step_conf": 0.12080118805170059, "num_tokens": 61020745.0, "reward": 0.5522454977035522, "reward_std": 0.22606705129146576, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6731410026550293, "rewards/format_reward_step": 0.9453125, "rewards/step_margin_reward": 0.12041248381137848, "step": 199 }, { "adv/mean_abs_final_conf": 0.5759242177009583, "adv/mean_abs_reasoning": 0.3816964626312256, "adv/mean_abs_step_conf": 0.5817010402679443, "adv/ratio_final_to_reasoning": 1.5088539561797956, "adv/ratio_step_to_reasoning": 1.523988554303036, "adv/std_final_conf": 0.8206612467765808, "adv/std_reasoning": 0.6612957715988159, "adv/std_step_conf": 0.8095238208770752, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7613536155202822, "calib/avg_num_step_conf": 9.03125, "calib/ece": 0.20426829268292684, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7195121951219512, "calib/gap": 0.2685890652557319, "calib/mean_conf": 0.8371138211382114, "calib/mu_c": 0.9288271604938272, "calib/mu_w": 0.6602380952380953, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1914227642276423, "calib/std_conf": 0.2875919846699044, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8907153502235471, "calib/step_q_c_n": 1342.0, "calib/step_q_gap": 0.03217926774932023, "calib/step_q_w": 0.8585360824742269, "calib/step_q_w_n": 970.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2288.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 759.50390625, "completions/mean_terminated_length": 777.7320556640625, "completions/min_length": 0.0, "completions/min_terminated_length": 484.0, "epoch": 0.21333333333333335, "grad_norm": 0.04065275192260742, "kl": 0.109375, "learning_rate": 0.0, "loss": -0.0473, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018793947994709015, "mask/share_reasoning": 0.8455758690834045, "mask/share_step_conf": 0.11219269037246704, "num_tokens": 61323226.0, "reward": 0.5978201627731323, "reward_std": 0.16689281165599823, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7508074045181274, "rewards/format_reward_step": 0.9609375, "rewards/step_margin_reward": 0.12608292698860168, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.06923778079100884, "train_runtime": 20771.5822, "train_samples_per_second": 2.465, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 61323226, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }