{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.47760647535324097, "adv/mean_abs_reasoning": 0.4569147527217865, "adv/mean_abs_step_conf": 0.7561872005462646, "adv/ratio_final_to_reasoning": 1.0452857398632815, "adv/ratio_step_to_reasoning": 1.6549853031484494, "adv/std_final_conf": 0.7227410674095154, "adv/std_reasoning": 0.7206857204437256, "adv/std_step_conf": 0.9253036379814148, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5086206896551725, "calib/avg_num_step_conf": 7.875, "calib/ece": 0.2888991935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001713264989126051, "calib/mean_conf": 0.9905120967741936, "calib/mu_c": 0.9905632183908043, "calib/mu_w": 0.9903918918918917, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2888991935483871, "calib/std_conf": 0.0021794159006610276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9119477557027226, "calib/step_q_c_n": 1359.0, "calib/step_q_gap": 0.0056311651395566376, "calib/step_q_w": 0.9063165905631659, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 755.49609375, "completions/mean_terminated_length": 776.7349243164062, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.0010666666666666667, "grad_norm": 0.025163671001791954, "kl": 0.0005849599838256836, "learning_rate": 2.5000000000000004e-07, "loss": -0.13, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.018737709149718285, "mask/share_reasoning": 0.845859944820404, "mask/share_step_conf": 0.10805858671665192, "num_tokens": 300991.0, "reward": 0.8751538991928101, "reward_std": 0.2377150058746338, "rewards/accuracy_reward_step": 0.6796875, "rewards/asymmetric_l2_reward": 0.7354698181152344, "rewards/final_brier_reward_step": 0.6851503849029541, "rewards/format_reward_step": 0.96875, "step": 1 }, { "adv/mean_abs_final_conf": 0.437887966632843, "adv/mean_abs_reasoning": 0.4207462966442108, "adv/mean_abs_step_conf": 0.7180101275444031, "adv/ratio_final_to_reasoning": 1.0407411072310102, "adv/ratio_step_to_reasoning": 1.7065156206272276, "adv/std_final_conf": 0.6832791566848755, "adv/std_reasoning": 0.6817297339439392, "adv/std_step_conf": 0.9190711975097656, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4872611464968153, "calib/avg_num_step_conf": 7.6953125, "calib/ece": 0.36465737051792824, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024481637078155316, "calib/mean_conf": 0.9901553784860557, "calib/mu_c": 0.990063694267516, "calib/mu_w": 0.9903085106382975, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36465737051792824, "calib/std_conf": 0.001222205307190084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9075405636208369, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": -0.003804868168900244, "calib/step_q_w": 0.9113454317897371, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 840.640625, "completions/mean_terminated_length": 850.6087036132812, "completions/min_length": 0.0, "completions/min_terminated_length": 466.0, "epoch": 0.0021333333333333334, "grad_norm": 0.021795878186821938, "kl": 0.0016820430755615234, "learning_rate": 5.000000000000001e-07, "loss": -0.0102, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01788702979683876, "mask/share_reasoning": 0.8706268668174744, "mask/share_step_conf": 0.09976735711097717, "num_tokens": 619483.0, "reward": 0.8191705346107483, "reward_std": 0.21779605746269226, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.6992889642715454, "rewards/final_brier_reward_step": 0.6203019618988037, "rewards/format_reward_step": 0.98046875, "step": 2 }, { "adv/mean_abs_final_conf": 0.42324480414390564, "adv/mean_abs_reasoning": 0.4121120572090149, "adv/mean_abs_step_conf": 0.742616593837738, "adv/ratio_final_to_reasoning": 1.0270138830935598, "adv/ratio_step_to_reasoning": 1.8019773526332377, "adv/std_final_conf": 0.7039074897766113, "adv/std_reasoning": 0.7013108134269714, "adv/std_step_conf": 0.9225370287895203, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.49612068965517236, "calib/avg_num_step_conf": 7.6328125, "calib/ece": 0.3051574803149606, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -7.758620689668483e-05, "calib/mean_conf": 0.9901968503937008, "calib/mu_c": 0.9901724137931033, "calib/mu_w": 0.99025, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3051574803149606, "calib/std_conf": 0.00138915580821869, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9130487804878049, "calib/step_q_c_n": 1312.0, "calib/step_q_gap": 0.01088367145353697, "calib/step_q_w": 0.9021651090342679, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2662.0, "completions/max_terminated_length": 2662.0, "completions/mean_length": 802.8984375, "completions/mean_terminated_length": 806.047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.0032, "grad_norm": 0.024119608104228973, "kl": 0.0005092322826385498, "learning_rate": 7.5e-07, "loss": -0.0327, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.018682915717363358, "mask/share_reasoning": 0.8749101758003235, "mask/share_step_conf": 0.10250066965818405, "num_tokens": 930281.0, "reward": 0.8781776428222656, "reward_std": 0.21866968274116516, "rewards/accuracy_reward_step": 0.6796875, "rewards/asymmetric_l2_reward": 0.7362962961196899, "rewards/final_brier_reward_step": 0.6856839656829834, "rewards/format_reward_step": 0.9921875, "step": 3 }, { "adv/mean_abs_final_conf": 0.5237265825271606, "adv/mean_abs_reasoning": 0.4569150507450104, "adv/mean_abs_step_conf": 0.773900032043457, "adv/ratio_final_to_reasoning": 1.1462230926147268, "adv/ratio_step_to_reasoning": 1.6937503607762436, "adv/std_final_conf": 0.756417453289032, "adv/std_reasoning": 0.7015604376792908, "adv/std_step_conf": 0.9241310358047485, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4732142857142857, "calib/avg_num_step_conf": 7.64453125, "calib/ece": 0.2946245059288537, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000535714285714084, "calib/mean_conf": 0.9902766798418972, "calib/mu_c": 0.9901136363636364, "calib/mu_w": 0.9906493506493504, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2946245059288537, "calib/std_conf": 0.0016401971479246202, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9098583146905294, "calib/step_q_c_n": 1341.0, "calib/step_q_gap": 0.0013031198853343806, "calib/step_q_w": 0.908555194805195, "calib/step_q_w_n": 616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2449.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 782.640625, "completions/mean_terminated_length": 791.9209594726562, "completions/min_length": 0.0, "completions/min_terminated_length": 456.0, "epoch": 0.004266666666666667, "grad_norm": 0.027355490252375603, "kl": 0.0005553364753723145, "learning_rate": 1.0000000000000002e-06, "loss": -0.0374, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.018635034561157227, "mask/share_reasoning": 0.8674567937850952, "mask/share_step_conf": 0.10218945145606995, "num_tokens": 1236805.0, "reward": 0.8771485090255737, "reward_std": 0.25210636854171753, "rewards/accuracy_reward_step": 0.6875, "rewards/asymmetric_l2_reward": 0.7269708514213562, "rewards/final_brier_reward_step": 0.6929511427879333, "rewards/format_reward_step": 0.984375, "step": 4 }, { "adv/mean_abs_final_conf": 0.47414445877075195, "adv/mean_abs_reasoning": 0.4151928126811981, "adv/mean_abs_step_conf": 0.7436368465423584, "adv/ratio_final_to_reasoning": 1.1419861912080336, "adv/ratio_step_to_reasoning": 1.7910638716025966, "adv/std_final_conf": 0.7365800738334656, "adv/std_reasoning": 0.6815617680549622, "adv/std_step_conf": 0.9012373089790344, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49449053668654386, "calib/avg_num_step_conf": 7.6796875, "calib/ece": 0.45881392235609114, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": -0.004985437732261788, "calib/mean_conf": 0.9875957161981259, "calib/mu_c": 0.9852731829573933, "calib/mu_w": 0.9902586206896551, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4561365461847391, "calib/std_conf": 0.04157220198737621, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9131255901794145, "calib/step_q_c_n": 1059.0, "calib/step_q_gap": 0.01992823626541229, "calib/step_q_w": 0.8931973539140022, "calib/step_q_w_n": 907.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 806.9296875, "completions/mean_terminated_length": 816.498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 443.0, "epoch": 0.005333333333333333, "grad_norm": 0.055844664573669434, "kl": 0.0005984306335449219, "learning_rate": 1.25e-06, "loss": -0.1083, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.018338514491915703, "mask/share_reasoning": 0.8661159873008728, "mask/share_step_conf": 0.10382672399282455, "num_tokens": 1550067.0, "reward": 0.7309314012527466, "reward_std": 0.18717044591903687, "rewards/accuracy_reward_step": 0.51953125, "rewards/asymmetric_l2_reward": 0.6368963718414307, "rewards/final_brier_reward_step": 0.5265287160873413, "rewards/format_reward_step": 0.97265625, "step": 5 }, { "adv/mean_abs_final_conf": 0.40251004695892334, "adv/mean_abs_reasoning": 0.374636173248291, "adv/mean_abs_step_conf": 0.7238371968269348, "adv/ratio_final_to_reasoning": 1.0744025155631698, "adv/ratio_step_to_reasoning": 1.9321070641707896, "adv/std_final_conf": 0.6800278425216675, "adv/std_reasoning": 0.6612951159477234, "adv/std_step_conf": 0.9232251048088074, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4968253968253968, "calib/avg_num_step_conf": 8.14453125, "calib/ece": 0.42572580645161295, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -6.349206349198777e-05, "calib/mean_conf": 0.9902419354838711, "calib/mu_c": 0.9902142857142857, "calib/mu_w": 0.9902777777777777, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42572580645161295, "calib/std_conf": 0.0015364966841336825, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9111431143114312, "calib/step_q_c_n": 1111.0, "calib/step_q_gap": 0.0001369541471603064, "calib/step_q_w": 0.9110061601642709, "calib/step_q_w_n": 974.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2736.0, "completions/max_terminated_length": 2736.0, "completions/mean_length": 723.171875, "completions/mean_terminated_length": 737.5776977539062, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.0064, "grad_norm": 0.017985476180911064, "kl": 0.0007612109184265137, "learning_rate": 1.5e-06, "loss": -0.1013, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0195919219404459, "mask/share_reasoning": 0.8470251560211182, "mask/share_step_conf": 0.11385165899991989, "num_tokens": 1841151.0, "reward": 0.7441216111183167, "reward_std": 0.19726765155792236, "rewards/accuracy_reward_step": 0.546875, "rewards/asymmetric_l2_reward": 0.6301347017288208, "rewards/final_brier_reward_step": 0.5549836158752441, "rewards/format_reward_step": 0.96875, "step": 6 }, { "adv/mean_abs_final_conf": 0.5039571523666382, "adv/mean_abs_reasoning": 0.4919060170650482, "adv/mean_abs_step_conf": 0.7596220374107361, "adv/ratio_final_to_reasoning": 1.0244988572684939, "adv/ratio_step_to_reasoning": 1.5442422150942827, "adv/std_final_conf": 0.7577763199806213, "adv/std_reasoning": 0.7576440572738647, "adv/std_step_conf": 0.9329904317855835, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49632380808004245, "calib/avg_num_step_conf": 7.6171875, "calib/ece": 0.3113414634146341, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -7.352383839875909e-05, "calib/mean_conf": 0.9902032520325202, "calib/mu_c": 0.9901796407185628, "calib/mu_w": 0.9902531645569616, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3113414634146341, "calib/std_conf": 0.0014111020290820972, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9102744148506859, "calib/step_q_c_n": 1239.0, "calib/step_q_gap": 0.007883416257155718, "calib/step_q_w": 0.9023909985935302, "calib/step_q_w_n": 711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 847.07421875, "completions/mean_terminated_length": 874.399169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 436.0, "epoch": 0.007466666666666667, "grad_norm": 0.020187566056847572, "kl": 0.0005465447902679443, "learning_rate": 1.75e-06, "loss": -0.1468, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01694045029580593, "mask/share_reasoning": 0.8599110841751099, "mask/share_step_conf": 0.09189847111701965, "num_tokens": 2165426.0, "reward": 0.8446517586708069, "reward_std": 0.24925880134105682, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.7083820104598999, "rewards/final_brier_reward_step": 0.658265233039856, "rewards/format_reward_step": 0.9609375, "step": 7 }, { "adv/mean_abs_final_conf": 0.4072754383087158, "adv/mean_abs_reasoning": 0.39883965253829956, "adv/mean_abs_step_conf": 0.7444837093353271, "adv/ratio_final_to_reasoning": 1.0211508201773047, "adv/ratio_step_to_reasoning": 1.866624104692891, "adv/std_final_conf": 0.6824573874473572, "adv/std_reasoning": 0.6816043853759766, "adv/std_step_conf": 0.9320701360702515, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5149176954732511, "calib/avg_num_step_conf": 7.54296875, "calib/ece": 0.33077235772357716, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.991869918699187, "calib/gap": 0.0022045855379189128, "calib/mean_conf": 0.9893089430894308, "calib/mu_c": 0.9900617283950617, "calib/mu_w": 0.9878571428571428, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33077235772357716, "calib/std_conf": 0.008110647504853151, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9099438652766643, "calib/step_q_c_n": 1247.0, "calib/step_q_gap": 0.003028660598301647, "calib/step_q_w": 0.9069152046783626, "calib/step_q_w_n": 684.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 832.9765625, "completions/mean_terminated_length": 849.5697631835938, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.008533333333333334, "grad_norm": 0.030579403042793274, "kl": 0.0005984306335449219, "learning_rate": 2.0000000000000003e-06, "loss": -0.0817, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017597010359168053, "mask/share_reasoning": 0.8655336499214172, "mask/share_step_conf": 0.09733809530735016, "num_tokens": 2485180.0, "reward": 0.8367857933044434, "reward_std": 0.19032156467437744, "rewards/accuracy_reward_step": 0.6328125, "rewards/asymmetric_l2_reward": 0.7142133712768555, "rewards/final_brier_reward_step": 0.6406081914901733, "rewards/format_reward_step": 0.9609375, "step": 8 }, { "adv/mean_abs_final_conf": 0.4530647397041321, "adv/mean_abs_reasoning": 0.4173707962036133, "adv/mean_abs_step_conf": 0.7631109952926636, "adv/ratio_final_to_reasoning": 1.0855209416307738, "adv/ratio_step_to_reasoning": 1.8283765951855957, "adv/std_final_conf": 0.7208334803581238, "adv/std_reasoning": 0.7015334367752075, "adv/std_step_conf": 0.9326648712158203, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5064102564102564, "calib/avg_num_step_conf": 7.609375, "calib/ece": 0.30553278688524593, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": 0.012948717948718103, "calib/mean_conf": 0.985860655737705, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9770512820512818, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30553278688524593, "calib/std_conf": 0.06327533064648865, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.910533015115354, "calib/step_q_c_n": 1257.0, "calib/step_q_gap": 0.0008658660560197928, "calib/step_q_w": 0.9096671490593342, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3002.0, "completions/max_terminated_length": 3002.0, "completions/mean_length": 802.5078125, "completions/mean_terminated_length": 831.7490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 438.0, "epoch": 0.0096, "grad_norm": 0.0191666167229414, "kl": 0.0006733536720275879, "learning_rate": 2.25e-06, "loss": -0.1468, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01763000525534153, "mask/share_reasoning": 0.8491230607032776, "mask/share_step_conf": 0.09809070080518723, "num_tokens": 2798158.0, "reward": 0.8323462009429932, "reward_std": 0.24119850993156433, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.6868234872817993, "rewards/final_brier_reward_step": 0.6583374738693237, "rewards/format_reward_step": 0.94921875, "step": 9 }, { "adv/mean_abs_final_conf": 0.46466493606567383, "adv/mean_abs_reasoning": 0.4617326855659485, "adv/mean_abs_step_conf": 0.76551353931427, "adv/ratio_final_to_reasoning": 1.006350536991184, "adv/ratio_step_to_reasoning": 1.6579149868413052, "adv/std_final_conf": 0.7218580842018127, "adv/std_reasoning": 0.7206411957740784, "adv/std_step_conf": 0.9229851961135864, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.497684705231875, "calib/avg_num_step_conf": 7.62109375, "calib/ece": 0.35407600000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -3.600801714020996e-05, "calib/mean_conf": 0.9900760000000001, "calib/mu_c": 0.9900628930817608, "calib/mu_w": 0.990098901098901, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35407600000000006, "calib/std_conf": 0.0008474809732377484, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.908442760942761, "calib/step_q_c_n": 1188.0, "calib/step_q_gap": 0.001635421493219713, "calib/step_q_w": 0.9068073394495413, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2895.0, "completions/max_terminated_length": 2895.0, "completions/mean_length": 826.671875, "completions/mean_terminated_length": 839.793701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 455.0, "epoch": 0.010666666666666666, "grad_norm": 0.019903983920812607, "kl": 0.0007127523422241211, "learning_rate": 2.5e-06, "loss": -0.0049, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017885662615299225, "mask/share_reasoning": 0.8675950765609741, "mask/share_step_conf": 0.09889422357082367, "num_tokens": 3116586.0, "reward": 0.8278957009315491, "reward_std": 0.24505752325057983, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.7082241773605347, "rewards/final_brier_reward_step": 0.6280359029769897, "rewards/format_reward_step": 0.9765625, "step": 10 }, { "adv/mean_abs_final_conf": 0.4113199710845947, "adv/mean_abs_reasoning": 0.4019410014152527, "adv/mean_abs_step_conf": 0.7453023195266724, "adv/ratio_final_to_reasoning": 1.023334194909995, "adv/ratio_step_to_reasoning": 1.8542580052854243, "adv/std_final_conf": 0.7009356021881104, "adv/std_reasoning": 0.7014546394348145, "adv/std_step_conf": 0.914635419845581, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.49439056166763173, "calib/avg_num_step_conf": 7.4296875, "calib/ece": 0.3488938775510203, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": -0.0006118268674003557, "calib/mean_conf": 0.9897102040816326, "calib/mu_c": 0.9894904458598724, "calib/mu_w": 0.9901022727272728, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3488938775510203, "calib/std_conf": 0.005806549605893123, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9094141914191419, "calib/step_q_c_n": 1212.0, "calib/step_q_gap": 0.0026721624336346617, "calib/step_q_w": 0.9067420289855073, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2786.0, "completions/max_terminated_length": 2786.0, "completions/mean_length": 799.36328125, "completions/mean_terminated_length": 831.857666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 317.0, "epoch": 0.011733333333333333, "grad_norm": 0.03048000857234001, "kl": 0.0008162856101989746, "learning_rate": 2.7500000000000004e-06, "loss": -0.1851, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017573874443769455, "mask/share_reasoning": 0.8479140996932983, "mask/share_step_conf": 0.09544955939054489, "num_tokens": 3425703.0, "reward": 0.8215123414993286, "reward_std": 0.20594213902950287, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.7136968374252319, "rewards/final_brier_reward_step": 0.6160464286804199, "rewards/format_reward_step": 0.953125, "step": 11 }, { "adv/mean_abs_final_conf": 0.3859899342060089, "adv/mean_abs_reasoning": 0.3481341600418091, "adv/mean_abs_step_conf": 0.7245120406150818, "adv/ratio_final_to_reasoning": 1.1087390394543688, "adv/ratio_step_to_reasoning": 2.0811288398934242, "adv/std_final_conf": 0.6618059873580933, "adv/std_reasoning": 0.6404772996902466, "adv/std_step_conf": 0.9084425568580627, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4823494885124937, "calib/avg_num_step_conf": 8.3125, "calib/ece": 0.2634897959183673, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00028006037229555414, "calib/mean_conf": 0.9900204081632653, "calib/mu_c": 0.9899438202247192, "calib/mu_w": 0.9902238805970147, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2634897959183673, "calib/std_conf": 0.0009409026477730967, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.911249182472204, "calib/step_q_c_n": 1529.0, "calib/step_q_gap": 0.004579733390400964, "calib/step_q_w": 0.906669449081803, "calib/step_q_w_n": 599.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2865.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 755.1953125, "completions/mean_terminated_length": 782.7125854492188, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.0128, "grad_norm": 0.020890548825263977, "kl": 0.0010690093040466309, "learning_rate": 3e-06, "loss": -0.2085, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.018778398633003235, "mask/share_reasoning": 0.8324668407440186, "mask/share_step_conf": 0.11359857022762299, "num_tokens": 3723209.0, "reward": 0.8852379322052002, "reward_std": 0.18036311864852905, "rewards/accuracy_reward_step": 0.6953125, "rewards/asymmetric_l2_reward": 0.7404860258102417, "rewards/final_brier_reward_step": 0.7003023624420166, "rewards/format_reward_step": 0.953125, "step": 12 }, { "adv/mean_abs_final_conf": 0.3926801383495331, "adv/mean_abs_reasoning": 0.38152408599853516, "adv/mean_abs_step_conf": 0.7393319010734558, "adv/ratio_final_to_reasoning": 1.029240755067403, "adv/ratio_step_to_reasoning": 1.9378380768240526, "adv/std_final_conf": 0.661359429359436, "adv/std_reasoning": 0.6612658500671387, "adv/std_step_conf": 0.9282354712486267, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49732035822579507, "calib/avg_num_step_conf": 8.1953125, "calib/ece": 0.33808000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.359283548378446e-05, "calib/mean_conf": 0.9900800000000001, "calib/mu_c": 0.9900613496932515, "calib/mu_w": 0.9901149425287353, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33808000000000005, "calib/std_conf": 0.0008908422980528043, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9087509578544062, "calib/step_q_c_n": 1305.0, "calib/step_q_gap": 0.0046652075391475645, "calib/step_q_w": 0.9040857503152586, "calib/step_q_w_n": 793.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2774.0, "completions/max_terminated_length": 2774.0, "completions/mean_length": 809.140625, "completions/mean_terminated_length": 828.56005859375, "completions/min_length": 0.0, "completions/min_terminated_length": 323.0, "epoch": 0.013866666666666666, "grad_norm": 0.024629024788737297, "kl": 0.0017845630645751953, "learning_rate": 3.2500000000000002e-06, "loss": -0.0634, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0179408211261034, "mask/share_reasoning": 0.8538167476654053, "mask/share_step_conf": 0.10480496287345886, "num_tokens": 4034941.0, "reward": 0.833107590675354, "reward_std": 0.1895129680633545, "rewards/accuracy_reward_step": 0.63671875, "rewards/asymmetric_l2_reward": 0.7010772824287415, "rewards/final_brier_reward_step": 0.6432628631591797, "rewards/format_reward_step": 0.97265625, "step": 13 }, { "adv/mean_abs_final_conf": 0.4264124631881714, "adv/mean_abs_reasoning": 0.42067527770996094, "adv/mean_abs_step_conf": 0.7750456929206848, "adv/ratio_final_to_reasoning": 1.0136380381310783, "adv/ratio_step_to_reasoning": 1.8423846942939404, "adv/std_final_conf": 0.6820985674858093, "adv/std_reasoning": 0.681735098361969, "adv/std_step_conf": 0.9328522682189941, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.88671875, "calib/ece": 0.3588524590163934, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3588524590163934, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9080388978930308, "calib/step_q_c_n": 1234.0, "calib/step_q_gap": 0.012459280058635946, "calib/step_q_w": 0.8955796178343949, "calib/step_q_w_n": 785.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3050.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 856.76953125, "completions/mean_terminated_length": 880.8554077148438, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.014933333333333333, "grad_norm": 0.016446553170681, "kl": 0.0022586584091186523, "learning_rate": 3.5e-06, "loss": -0.0861, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01721174642443657, "mask/share_reasoning": 0.8545247912406921, "mask/share_step_conf": 0.10091973841190338, "num_tokens": 4359674.0, "reward": 0.8058513402938843, "reward_std": 0.21509991586208344, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.6914855241775513, "rewards/final_brier_reward_step": 0.6084984540939331, "rewards/format_reward_step": 0.953125, "step": 14 }, { "adv/mean_abs_final_conf": 0.3723694086074829, "adv/mean_abs_reasoning": 0.3705359399318695, "adv/mean_abs_step_conf": 0.7469418048858643, "adv/ratio_final_to_reasoning": 1.0049481534124611, "adv/ratio_step_to_reasoning": 2.015841715713743, "adv/std_final_conf": 0.6613016724586487, "adv/std_reasoning": 0.6611766219139099, "adv/std_step_conf": 0.9211583137512207, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5081591847265221, "calib/avg_num_step_conf": 7.75390625, "calib/ece": 0.39157480314960624, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00016382868937081518, "calib/mean_conf": 0.99, "calib/mu_c": 0.9900657894736843, "calib/mu_w": 0.9899019607843135, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39157480314960624, "calib/std_conf": 0.0008873565094161146, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9095799011532125, "calib/step_q_c_n": 1214.0, "calib/step_q_gap": 0.0030040256668311116, "calib/step_q_w": 0.9065758754863814, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 766.30859375, "completions/mean_terminated_length": 769.3137817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 419.0, "epoch": 0.016, "grad_norm": 0.01988803967833519, "kl": 0.002725958824157715, "learning_rate": 3.7500000000000005e-06, "loss": 0.0034, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.01912056654691696, "mask/share_reasoning": 0.8694561719894409, "mask/share_step_conf": 0.10751700401306152, "num_tokens": 4663729.0, "reward": 0.7999786138534546, "reward_std": 0.19798709452152252, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.6810728311538696, "rewards/final_brier_reward_step": 0.6016968488693237, "rewards/format_reward_step": 0.9921875, "step": 15 }, { "adv/mean_abs_final_conf": 0.4110031723976135, "adv/mean_abs_reasoning": 0.41789019107818604, "adv/mean_abs_step_conf": 0.7575955390930176, "adv/ratio_final_to_reasoning": 0.9835195493275314, "adv/ratio_step_to_reasoning": 1.8129057711030927, "adv/std_final_conf": 0.7015405893325806, "adv/std_reasoning": 0.7013484835624695, "adv/std_step_conf": 0.9307181239128113, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.49444444444444446, "calib/avg_num_step_conf": 7.8359375, "calib/ece": 0.3529233870967743, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.555555555569747e-05, "calib/mean_conf": 0.9900201612903227, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9900555555555555, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3529233870967743, "calib/std_conf": 0.0003168595493044703, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.909328125, "calib/step_q_c_n": 1280.0, "calib/step_q_gap": 0.012289557506887094, "calib/step_q_w": 0.897038567493113, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 906.58984375, "completions/mean_terminated_length": 917.3399658203125, "completions/min_length": 0.0, "completions/min_terminated_length": 468.0, "epoch": 0.017066666666666667, "grad_norm": 0.028787488117814064, "kl": 0.004323244094848633, "learning_rate": 4.000000000000001e-06, "loss": -0.0373, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.01627296581864357, "mask/share_reasoning": 0.8792263269424438, "mask/share_step_conf": 0.0927819162607193, "num_tokens": 5004664.0, "reward": 0.8178976774215698, "reward_std": 0.20622055232524872, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.6945248246192932, "rewards/final_brier_reward_step": 0.6240831017494202, "rewards/format_reward_step": 0.96875, "step": 16 }, { "adv/mean_abs_final_conf": 0.4864518642425537, "adv/mean_abs_reasoning": 0.4650035500526428, "adv/mean_abs_step_conf": 0.7391393184661865, "adv/ratio_final_to_reasoning": 1.0461250547172871, "adv/ratio_step_to_reasoning": 1.589534786094663, "adv/std_final_conf": 0.7398475408554077, "adv/std_reasoning": 0.7394839525222778, "adv/std_step_conf": 0.9180856347084045, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.4942528735632184, "calib/avg_num_step_conf": 9.015625, "calib/ece": 0.28195121951219515, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.991869918699187, "calib/gap": -0.001034482758620725, "calib/mean_conf": 0.9892682926829268, "calib/mu_c": 0.9889655172413793, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28195121951219515, "calib/std_conf": 0.008081971475990674, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9013578595317726, "calib/step_q_c_n": 1495.0, "calib/step_q_gap": 0.004125387207049358, "calib/step_q_w": 0.8972324723247233, "calib/step_q_w_n": 813.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2895.0, "completions/max_terminated_length": 2895.0, "completions/mean_length": 839.33203125, "completions/mean_terminated_length": 862.9276733398438, "completions/min_length": 0.0, "completions/min_terminated_length": 388.0, "epoch": 0.018133333333333335, "grad_norm": 0.025759395211935043, "kl": 0.005819797515869141, "learning_rate": 4.25e-06, "loss": -0.0926, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01730026863515377, "mask/share_reasoning": 0.8481937646865845, "mask/share_step_conf": 0.10716216266155243, "num_tokens": 5323061.0, "reward": 0.8808417320251465, "reward_std": 0.2449316680431366, "rewards/accuracy_reward_step": 0.6796875, "rewards/asymmetric_l2_reward": 0.7484192848205566, "rewards/final_brier_reward_step": 0.6851390600204468, "rewards/format_reward_step": 0.9609375, "step": 17 }, { "adv/mean_abs_final_conf": 0.4163411855697632, "adv/mean_abs_reasoning": 0.4063420295715332, "adv/mean_abs_step_conf": 0.7388330101966858, "adv/ratio_final_to_reasoning": 1.0246077325763558, "adv/ratio_step_to_reasoning": 1.818253974307672, "adv/std_final_conf": 0.6988867521286011, "adv/std_reasoning": 0.7013848423957825, "adv/std_step_conf": 0.9287502765655518, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.3046875, "calib/ece": 0.46139344262295084, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959016393442623, "calib/gap": 0.008448275862069199, "calib/mean_conf": 0.9859836065573769, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9815517241379309, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46139344262295084, "calib/std_conf": 0.06325410420255871, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9047178988326848, "calib/step_q_c_n": 1028.0, "calib/step_q_gap": 0.07243101358678306, "calib/step_q_w": 0.8322868852459018, "calib/step_q_w_n": 1098.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2954.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 848.72265625, "completions/mean_terminated_length": 876.1007690429688, "completions/min_length": 0.0, "completions/min_terminated_length": 473.0, "epoch": 0.0192, "grad_norm": 0.02178441919386387, "kl": 0.006207942962646484, "learning_rate": 4.5e-06, "loss": -0.1369, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01709761470556259, "mask/share_reasoning": 0.852958083152771, "mask/share_step_conf": 0.0986943170428276, "num_tokens": 5651054.0, "reward": 0.7043896317481995, "reward_std": 0.19021376967430115, "rewards/accuracy_reward_step": 0.5, "rewards/asymmetric_l2_reward": 0.6054363250732422, "rewards/final_brier_reward_step": 0.5127179622650146, "rewards/format_reward_step": 0.953125, "step": 18 }, { "adv/mean_abs_final_conf": 0.40043365955352783, "adv/mean_abs_reasoning": 0.398581862449646, "adv/mean_abs_step_conf": 0.7589485049247742, "adv/ratio_final_to_reasoning": 1.004645964300786, "adv/ratio_step_to_reasoning": 1.9041220296888304, "adv/std_final_conf": 0.6625997424125671, "adv/std_reasoning": 0.6613322496414185, "adv/std_step_conf": 0.9255523085594177, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5065441475654415, "calib/avg_num_step_conf": 8.82421875, "calib/ece": 0.3981376518218622, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": 0.001713685067136983, "calib/mean_conf": 0.9892307692307691, "calib/mu_c": 0.9899315068493151, "calib/mu_w": 0.9882178217821781, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3981376518218622, "calib/std_conf": 0.008087038200582255, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9048542880932556, "calib/step_q_c_n": 1201.0, "calib/step_q_gap": -0.001251572020166014, "calib/step_q_w": 0.9061058601134216, "calib/step_q_w_n": 1058.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2829.0, "completions/max_terminated_length": 2829.0, "completions/mean_length": 788.2421875, "completions/mean_terminated_length": 810.4015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 504.0, "epoch": 0.020266666666666665, "grad_norm": 0.023767001926898956, "kl": 0.010579109191894531, "learning_rate": 4.75e-06, "loss": -0.1017, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0177196953445673, "mask/share_reasoning": 0.8479148149490356, "mask/share_step_conf": 0.1070217490196228, "num_tokens": 5957604.0, "reward": 0.787616491317749, "reward_std": 0.1928047388792038, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.6887671947479248, "rewards/final_brier_reward_step": 0.5794343948364258, "rewards/format_reward_step": 0.96484375, "step": 19 }, { "adv/mean_abs_final_conf": 0.41943609714508057, "adv/mean_abs_reasoning": 0.38557207584381104, "adv/mean_abs_step_conf": 0.7387959957122803, "adv/ratio_final_to_reasoning": 1.0878279922817524, "adv/ratio_step_to_reasoning": 1.9161034784364273, "adv/std_final_conf": 0.700600802898407, "adv/std_reasoning": 0.6614826321601868, "adv/std_step_conf": 0.9205412268638611, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5085093896713615, "calib/avg_num_step_conf": 9.4453125, "calib/ece": 0.3916344537815126, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.9831932773109243, "calib/gap": 0.001138644366196817, "calib/mean_conf": 0.9882731092436975, "calib/mu_c": 0.988732394366197, "calib/mu_w": 0.9875937500000002, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3916344537815126, "calib/std_conf": 0.011876685709483406, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9072966692486446, "calib/step_q_c_n": 1291.0, "calib/step_q_gap": 0.02412275620516613, "calib/step_q_w": 0.8831739130434785, "calib/step_q_w_n": 1127.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2846.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 806.140625, "completions/mean_terminated_length": 849.2674560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 478.0, "epoch": 0.021333333333333333, "grad_norm": 0.028085876256227493, "kl": 0.014958381652832031, "learning_rate": 5e-06, "loss": -0.1511, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.017452768981456757, "mask/share_reasoning": 0.8182758688926697, "mask/share_step_conf": 0.11349007487297058, "num_tokens": 6268848.0, "reward": 0.7662522792816162, "reward_std": 0.20081782341003418, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.67189621925354, "rewards/final_brier_reward_step": 0.5637332201004028, "rewards/format_reward_step": 0.9296875, "step": 20 }, { "adv/mean_abs_final_conf": 0.5309414267539978, "adv/mean_abs_reasoning": 0.5210570096969604, "adv/mean_abs_step_conf": 0.7509444952011108, "adv/ratio_final_to_reasoning": 1.0189699339478917, "adv/ratio_step_to_reasoning": 1.4411944973887785, "adv/std_final_conf": 0.7589125633239746, "adv/std_reasoning": 0.7577093839645386, "adv/std_step_conf": 0.9168322682380676, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5154688418577308, "calib/avg_num_step_conf": 9.46484375, "calib/ece": 0.31445783132530125, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9919678714859438, "calib/gap": 0.000945767195767222, "calib/mean_conf": 0.9891566265060241, "calib/mu_c": 0.9894642857142858, "calib/mu_w": 0.9885185185185186, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31445783132530125, "calib/std_conf": 0.008145980250530215, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.90263933376041, "calib/step_q_c_n": 1561.0, "calib/step_q_gap": 0.010272744433263759, "calib/step_q_w": 0.8923665893271462, "calib/step_q_w_n": 862.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2760.0, "completions/max_terminated_length": 2760.0, "completions/mean_length": 831.56640625, "completions/mean_terminated_length": 854.9437255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 427.0, "epoch": 0.0224, "grad_norm": 0.0251162052154541, "kl": 0.017168045043945312, "learning_rate": 4.9722222222222224e-06, "loss": -0.1365, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017269905656576157, "mask/share_reasoning": 0.8425857424736023, "mask/share_step_conf": 0.11280064284801483, "num_tokens": 6584689.0, "reward": 0.8635594844818115, "reward_std": 0.28787457942962646, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.7380008101463318, "rewards/final_brier_reward_step": 0.6633366942405701, "rewards/format_reward_step": 0.97265625, "step": 21 }, { "adv/mean_abs_final_conf": 0.42734625935554504, "adv/mean_abs_reasoning": 0.3793871998786926, "adv/mean_abs_step_conf": 0.727765679359436, "adv/ratio_final_to_reasoning": 1.1264119071286198, "adv/ratio_step_to_reasoning": 1.9182662978406648, "adv/std_final_conf": 0.7142421007156372, "adv/std_reasoning": 0.6815976500511169, "adv/std_step_conf": 0.9219359159469604, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.502267076203013, "calib/avg_num_step_conf": 9.78515625, "calib/ece": 0.3395102040816327, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9836734693877551, "calib/gap": 0.0008980547023549335, "calib/mean_conf": 0.9884897959183674, "calib/mu_c": 0.9888050314465407, "calib/mu_w": 0.9879069767441858, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3395102040816327, "calib/std_conf": 0.011418070977897708, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9052358168147641, "calib/step_q_c_n": 1463.0, "calib/step_q_gap": -0.0003112081372513842, "calib/step_q_w": 0.9055470249520154, "calib/step_q_w_n": 1042.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2533.0, "completions/max_terminated_length": 2533.0, "completions/mean_length": 796.98828125, "completions/mean_terminated_length": 829.3861694335938, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.023466666666666667, "grad_norm": 0.05403800308704376, "kl": 0.022701263427734375, "learning_rate": 4.944444444444445e-06, "loss": -0.2069, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017237115651369095, "mask/share_reasoning": 0.8285585045814514, "mask/share_step_conf": 0.11514189094305038, "num_tokens": 6890534.0, "reward": 0.8221842646598816, "reward_std": 0.17867985367774963, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.699776291847229, "rewards/final_brier_reward_step": 0.6289671659469604, "rewards/format_reward_step": 0.95703125, "step": 22 }, { "adv/mean_abs_final_conf": 0.5448117852210999, "adv/mean_abs_reasoning": 0.5245258808135986, "adv/mean_abs_step_conf": 0.7671633958816528, "adv/ratio_final_to_reasoning": 1.0386747444683482, "adv/ratio_step_to_reasoning": 1.4625844480575412, "adv/std_final_conf": 0.7926346063613892, "adv/std_reasoning": 0.775558352470398, "adv/std_step_conf": 0.9294148683547974, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.49233573926631446, "calib/avg_num_step_conf": 9.94140625, "calib/ece": 0.36487499999999995, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.9958333333333333, "calib/gap": -0.006642607336855733, "calib/mean_conf": 0.9857083333333334, "calib/mu_c": 0.9832450331125826, "calib/mu_w": 0.9898876404494383, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3607083333333333, "calib/std_conf": 0.06377302144236928, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8959471050750537, "calib/step_q_c_n": 1399.0, "calib/step_q_gap": -0.0044455650820144355, "calib/step_q_w": 0.9003926701570681, "calib/step_q_w_n": 1146.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 843.62109375, "completions/mean_terminated_length": 885.110595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 434.0, "epoch": 0.024533333333333334, "grad_norm": 0.02799358405172825, "kl": 0.025606155395507812, "learning_rate": 4.9166666666666665e-06, "loss": -0.2192, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.016374429687857628, "mask/share_reasoning": 0.8253282308578491, "mask/share_step_conf": 0.11142238974571228, "num_tokens": 7210437.0, "reward": 0.7847557067871094, "reward_std": 0.2819891571998596, "rewards/accuracy_reward_step": 0.58984375, "rewards/asymmetric_l2_reward": 0.671172022819519, "rewards/final_brier_reward_step": 0.5928707122802734, "rewards/format_reward_step": 0.9375, "step": 23 }, { "adv/mean_abs_final_conf": 0.49382612109184265, "adv/mean_abs_reasoning": 0.4582400619983673, "adv/mean_abs_step_conf": 0.7413434386253357, "adv/ratio_final_to_reasoning": 1.0776581142606474, "adv/ratio_step_to_reasoning": 1.6178058186190998, "adv/std_final_conf": 0.7374116778373718, "adv/std_reasoning": 0.7396427989006042, "adv/std_step_conf": 0.9289793372154236, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.0234375, "calib/ece": 0.4468965517241379, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4468965517241379, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9000912863070538, "calib/step_q_c_n": 1205.0, "calib/step_q_gap": 0.007982542736149956, "calib/step_q_w": 0.8921087435709039, "calib/step_q_w_n": 1361.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 908.16796875, "completions/mean_terminated_length": 964.6929931640625, "completions/min_length": 0.0, "completions/min_terminated_length": 507.0, "epoch": 0.0256, "grad_norm": 0.01509635429829359, "kl": 0.02335357666015625, "learning_rate": 4.888888888888889e-06, "loss": -0.2972, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.015077757649123669, "mask/share_reasoning": 0.8210850954055786, "mask/share_step_conf": 0.10524344444274902, "num_tokens": 7547440.0, "reward": 0.6930545568466187, "reward_std": 0.24548444151878357, "rewards/accuracy_reward_step": 0.4921875, "rewards/asymmetric_l2_reward": 0.6069023609161377, "rewards/final_brier_reward_step": 0.5003003478050232, "rewards/format_reward_step": 0.90234375, "step": 24 }, { "adv/mean_abs_final_conf": 0.48379945755004883, "adv/mean_abs_reasoning": 0.4722136855125427, "adv/mean_abs_step_conf": 0.7505365610122681, "adv/ratio_final_to_reasoning": 1.0245350196170846, "adv/ratio_step_to_reasoning": 1.5894002737291117, "adv/std_final_conf": 0.7559794783592224, "adv/std_reasoning": 0.7576834559440613, "adv/std_step_conf": 0.9282037615776062, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.83203125, "calib/ece": 0.3701652892561984, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3701652892561984, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8980571030640669, "calib/step_q_c_n": 1436.0, "calib/step_q_gap": -0.0016283733466638806, "calib/step_q_w": 0.8996854764107308, "calib/step_q_w_n": 1081.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 800.3515625, "completions/mean_terminated_length": 836.2857055664062, "completions/min_length": 0.0, "completions/min_terminated_length": 475.0, "epoch": 0.02666666666666667, "grad_norm": 0.023259948939085007, "kl": 0.031223297119140625, "learning_rate": 4.861111111111111e-06, "loss": -0.2173, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.016756581142544746, "mask/share_reasoning": 0.8256217241287231, "mask/share_step_conf": 0.11465294659137726, "num_tokens": 7855554.0, "reward": 0.7800040245056152, "reward_std": 0.24854359030723572, "rewards/accuracy_reward_step": 0.58984375, "rewards/asymmetric_l2_reward": 0.6599463820457458, "rewards/final_brier_reward_step": 0.5930304527282715, "rewards/format_reward_step": 0.9453125, "step": 25 }, { "adv/mean_abs_final_conf": 0.48266762495040894, "adv/mean_abs_reasoning": 0.4587442874908447, "adv/mean_abs_step_conf": 0.7445145845413208, "adv/ratio_final_to_reasoning": 1.0521496138740292, "adv/ratio_step_to_reasoning": 1.6229402846922192, "adv/std_final_conf": 0.7365694046020508, "adv/std_reasoning": 0.7395188212394714, "adv/std_step_conf": 0.92665696144104, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5117647058823529, "calib/avg_num_step_conf": 10.3671875, "calib/ece": 0.3440000000000001, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004705882352943336, "calib/mean_conf": 0.9898333333333335, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9895294117647055, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3440000000000001, "calib/std_conf": 0.002034425935955619, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8974180041870202, "calib/step_q_c_n": 1433.0, "calib/step_q_gap": 0.006992123761139557, "calib/step_q_w": 0.8904258804258807, "calib/step_q_w_n": 1221.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3021.0, "completions/max_terminated_length": 3021.0, "completions/mean_length": 839.625, "completions/mean_terminated_length": 888.1983032226562, "completions/min_length": 0.0, "completions/min_terminated_length": 581.0, "epoch": 0.027733333333333332, "grad_norm": 0.028885742649435997, "kl": 0.0317230224609375, "learning_rate": 4.833333333333333e-06, "loss": -0.2952, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.015826698392629623, "mask/share_reasoning": 0.8226677179336548, "mask/share_step_conf": 0.10681814700365067, "num_tokens": 8175738.0, "reward": 0.800934374332428, "reward_std": 0.22288797795772552, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.680953860282898, "rewards/final_brier_reward_step": 0.6123210787773132, "rewards/format_reward_step": 0.9375, "step": 26 }, { "adv/mean_abs_final_conf": 0.46593204140663147, "adv/mean_abs_reasoning": 0.4470588266849518, "adv/mean_abs_step_conf": 0.7534698247909546, "adv/ratio_final_to_reasoning": 1.0422164010531434, "adv/ratio_step_to_reasoning": 1.685393017241408, "adv/std_final_conf": 0.7386137843132019, "adv/std_reasoning": 0.7393884658813477, "adv/std_step_conf": 0.9277500510215759, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.34375, "calib/ece": 0.39816326530612245, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39816326530612245, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8990035842293908, "calib/step_q_c_n": 1395.0, "calib/step_q_gap": 0.005883073455248744, "calib/step_q_w": 0.893120510774142, "calib/step_q_w_n": 1253.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2508.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 865.66796875, "completions/mean_terminated_length": 893.5927124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 560.0, "epoch": 0.0288, "grad_norm": 0.017182230949401855, "kl": 0.03142356872558594, "learning_rate": 4.805555555555556e-06, "loss": -0.1524, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01604236289858818, "mask/share_reasoning": 0.8432894945144653, "mask/share_step_conf": 0.10941816121339798, "num_tokens": 8502565.0, "reward": 0.7764205932617188, "reward_std": 0.23888157308101654, "rewards/accuracy_reward_step": 0.56640625, "rewards/asymmetric_l2_reward": 0.6740305423736572, "rewards/final_brier_reward_step": 0.5741230249404907, "rewards/format_reward_step": 0.95703125, "step": 27 }, { "adv/mean_abs_final_conf": 0.3599877953529358, "adv/mean_abs_reasoning": 0.3518277406692505, "adv/mean_abs_step_conf": 0.7386596202850342, "adv/ratio_final_to_reasoning": 1.0231933237218962, "adv/ratio_step_to_reasoning": 2.099492265390864, "adv/std_final_conf": 0.6624004244804382, "adv/std_reasoning": 0.6616162061691284, "adv/std_step_conf": 0.9197353720664978, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 9.81640625, "calib/ece": 0.2569491525423728, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -4.440892098500626e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9900000000000003, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2569491525423728, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8916149068322982, "calib/step_q_c_n": 1610.0, "calib/step_q_gap": 0.009178583465742185, "calib/step_q_w": 0.882436323366556, "calib/step_q_w_n": 903.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2878.0, "completions/max_terminated_length": 2878.0, "completions/mean_length": 891.28125, "completions/mean_terminated_length": 931.2979125976562, "completions/min_length": 0.0, "completions/min_terminated_length": 502.0, "epoch": 0.029866666666666666, "grad_norm": 0.019003966823220253, "kl": 0.029294967651367188, "learning_rate": 4.777777777777778e-06, "loss": -0.2184, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.015266390517354012, "mask/share_reasoning": 0.8397279381752014, "mask/share_step_conf": 0.10203687846660614, "num_tokens": 8837677.0, "reward": 0.8644207715988159, "reward_std": 0.2272474765777588, "rewards/accuracy_reward_step": 0.67578125, "rewards/asymmetric_l2_reward": 0.7333863973617554, "rewards/final_brier_reward_step": 0.6767050623893738, "rewards/format_reward_step": 0.91796875, "step": 28 }, { "adv/mean_abs_final_conf": 0.46612828969955444, "adv/mean_abs_reasoning": 0.4471941590309143, "adv/mean_abs_step_conf": 0.7449893355369568, "adv/ratio_final_to_reasoning": 1.0423398434131408, "adv/ratio_step_to_reasoning": 1.6659191997305494, "adv/std_final_conf": 0.7390708327293396, "adv/std_reasoning": 0.7395082116127014, "adv/std_step_conf": 0.9280605912208557, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.49645390070921985, "calib/avg_num_step_conf": 9.35546875, "calib/ece": 0.40727272727272734, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00014184397163097273, "calib/mean_conf": 0.9899173553719008, "calib/mu_c": 0.9898581560283688, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40727272727272734, "calib/std_conf": 0.0012829896443190113, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8957519379844963, "calib/step_q_c_n": 1290.0, "calib/step_q_gap": 0.0007293135501070802, "calib/step_q_w": 0.8950226244343892, "calib/step_q_w_n": 1105.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 947.33984375, "completions/mean_terminated_length": 985.8495483398438, "completions/min_length": 0.0, "completions/min_terminated_length": 530.0, "epoch": 0.030933333333333334, "grad_norm": 0.020408691838383675, "kl": 0.0307464599609375, "learning_rate": 4.75e-06, "loss": -0.2097, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.014833498746156693, "mask/share_reasoning": 0.8490548133850098, "mask/share_step_conf": 0.09704914689064026, "num_tokens": 9187324.0, "reward": 0.7522528171539307, "reward_std": 0.23636189103126526, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.6475716233253479, "rewards/final_brier_reward_step": 0.5584964752197266, "rewards/format_reward_step": 0.94140625, "step": 29 }, { "adv/mean_abs_final_conf": 0.6084608435630798, "adv/mean_abs_reasoning": 0.5937298536300659, "adv/mean_abs_step_conf": 0.77057945728302, "adv/ratio_final_to_reasoning": 1.0248109301611643, "adv/ratio_step_to_reasoning": 1.2978620707240762, "adv/std_final_conf": 0.8092952370643616, "adv/std_reasoning": 0.810136616230011, "adv/std_step_conf": 0.9337138533592224, "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.41015625, "calib/ece": 0.3611790393013099, "calib/final_conf_rate": 0.89453125, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.9899999999999999, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.89453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3611790393013099, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8935240753663644, "calib/step_q_c_n": 1433.0, "calib/step_q_gap": 0.01962797147026052, "calib/step_q_w": 0.8738961038961038, "calib/step_q_w_n": 1232.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 886.84375, "completions/mean_terminated_length": 957.9408569335938, "completions/min_length": 0.0, "completions/min_terminated_length": 513.0, "epoch": 0.032, "grad_norm": 0.02599792182445526, "kl": 0.031978607177734375, "learning_rate": 4.722222222222222e-06, "loss": -0.2959, "mask/has_final_conf_rate": 0.89453125, "mask/share_final_conf": 0.014466993510723114, "mask/share_reasoning": 0.808133602142334, "mask/share_step_conf": 0.1031806617975235, "num_tokens": 9521340.0, "reward": 0.7366952896118164, "reward_std": 0.29571014642715454, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.6129330992698669, "rewards/final_brier_reward_step": 0.5690511465072632, "rewards/format_reward_step": 0.89453125, "step": 30 }, { "adv/mean_abs_final_conf": 0.5071862936019897, "adv/mean_abs_reasoning": 0.49132299423217773, "adv/mean_abs_step_conf": 0.7129656672477722, "adv/ratio_final_to_reasoning": 1.0322869060801898, "adv/ratio_step_to_reasoning": 1.45111398330129, "adv/std_final_conf": 0.755790114402771, "adv/std_reasoning": 0.757742166519165, "adv/std_step_conf": 0.919549286365509, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.78125, "calib/ece": 0.4204347826086956, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4204347826086956, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8882582123758594, "calib/step_q_c_n": 1309.0, "calib/step_q_gap": 0.02449529025318531, "calib/step_q_w": 0.8637629221226741, "calib/step_q_w_n": 1451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 974.890625, "completions/mean_terminated_length": 1053.04638671875, "completions/min_length": 0.0, "completions/min_terminated_length": 574.0, "epoch": 0.03306666666666667, "grad_norm": 0.023567797616124153, "kl": 0.033023834228515625, "learning_rate": 4.694444444444445e-06, "loss": -0.2323, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.013692351058125496, "mask/share_reasoning": 0.8139607906341553, "mask/share_step_conf": 0.09812812507152557, "num_tokens": 9876824.0, "reward": 0.702761173248291, "reward_std": 0.2406536340713501, "rewards/accuracy_reward_step": 0.51171875, "rewards/asymmetric_l2_reward": 0.6041278839111328, "rewards/final_brier_reward_step": 0.519363284111023, "rewards/format_reward_step": 0.8984375, "step": 31 }, { "adv/mean_abs_final_conf": 0.41383790969848633, "adv/mean_abs_reasoning": 0.4185928702354431, "adv/mean_abs_step_conf": 0.7600575685501099, "adv/ratio_final_to_reasoning": 0.9886406079150791, "adv/ratio_step_to_reasoning": 1.8157441815064967, "adv/std_final_conf": 0.6787320375442505, "adv/std_reasoning": 0.6817558407783508, "adv/std_step_conf": 0.9247509837150574, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.50185785437584, "calib/avg_num_step_conf": 10.3359375, "calib/ece": 0.3854347826086957, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.952881650670648e-06, "calib/mean_conf": 0.9897826086956522, "calib/mu_c": 0.9897841726618704, "calib/mu_w": 0.9897802197802197, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3854347826086957, "calib/std_conf": 0.0023674701547933895, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8834646840148699, "calib/step_q_c_n": 1345.0, "calib/step_q_gap": 0.03677752029465464, "calib/step_q_w": 0.8466871637202152, "calib/step_q_w_n": 1301.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2773.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 847.02734375, "completions/mean_terminated_length": 922.7191162109375, "completions/min_length": 0.0, "completions/min_terminated_length": 507.0, "epoch": 0.034133333333333335, "grad_norm": 0.024691706523299217, "kl": 0.036327362060546875, "learning_rate": 4.666666666666667e-06, "loss": -0.2709, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.015017786994576454, "mask/share_reasoning": 0.7982213497161865, "mask/share_step_conf": 0.10472959280014038, "num_tokens": 10200367.0, "reward": 0.7324390411376953, "reward_std": 0.18825767934322357, "rewards/accuracy_reward_step": 0.54296875, "rewards/asymmetric_l2_reward": 0.6264613270759583, "rewards/final_brier_reward_step": 0.5501354932785034, "rewards/format_reward_step": 0.8984375, "step": 32 }, { "adv/mean_abs_final_conf": 0.5061349868774414, "adv/mean_abs_reasoning": 0.47032085061073303, "adv/mean_abs_step_conf": 0.7482904195785522, "adv/ratio_final_to_reasoning": 1.076148306459733, "adv/ratio_step_to_reasoning": 1.5910211478118885, "adv/std_final_conf": 0.7739526033401489, "adv/std_reasoning": 0.7576538920402527, "adv/std_step_conf": 0.9329660534858704, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4927536231884058, "calib/avg_num_step_conf": 9.43359375, "calib/ece": 0.43089068825910926, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": -0.0007246376811593791, "calib/mean_conf": 0.9895951417004049, "calib/mu_c": 0.9892753623188405, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43089068825910926, "calib/std_conf": 0.005747562120068867, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8886196319018405, "calib/step_q_c_n": 1304.0, "calib/step_q_gap": 0.007719541892839521, "calib/step_q_w": 0.8809000900090009, "calib/step_q_w_n": 1111.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 862.5625, "completions/mean_terminated_length": 883.2640380859375, "completions/min_length": 0.0, "completions/min_terminated_length": 474.0, "epoch": 0.0352, "grad_norm": 0.02115471102297306, "kl": 0.04196929931640625, "learning_rate": 4.638888888888889e-06, "loss": -0.1091, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.016444312408566475, "mask/share_reasoning": 0.84872967004776, "mask/share_step_conf": 0.11138851940631866, "num_tokens": 10528055.0, "reward": 0.7556190490722656, "reward_std": 0.2505682706832886, "rewards/accuracy_reward_step": 0.5390625, "rewards/asymmetric_l2_reward": 0.6630150079727173, "rewards/final_brier_reward_step": 0.5474417805671692, "rewards/format_reward_step": 0.96484375, "step": 33 }, { "adv/mean_abs_final_conf": 0.6398830413818359, "adv/mean_abs_reasoning": 0.6042325496673584, "adv/mean_abs_step_conf": 0.7579690217971802, "adv/ratio_final_to_reasoning": 1.0590012764689751, "adv/ratio_step_to_reasoning": 1.2544326223644466, "adv/std_final_conf": 0.8544065356254578, "adv/std_reasoning": 0.8431804180145264, "adv/std_step_conf": 0.9319725036621094, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.49333333333333335, "calib/avg_num_step_conf": 10.87109375, "calib/ece": 0.3567510548523207, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0005333333333330526, "calib/mean_conf": 0.9896624472573841, "calib/mu_c": 0.9894666666666666, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3567510548523207, "calib/std_conf": 0.0036589845381736407, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8843560872624913, "calib/step_q_c_n": 1421.0, "calib/step_q_gap": 0.011830389758820115, "calib/step_q_w": 0.8725256975036711, "calib/step_q_w_n": 1362.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 819.0859375, "completions/mean_terminated_length": 877.3472290039062, "completions/min_length": 0.0, "completions/min_terminated_length": 521.0, "epoch": 0.03626666666666667, "grad_norm": 0.024255702272057533, "kl": 0.039142608642578125, "learning_rate": 4.611111111111112e-06, "loss": -0.3183, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.015937991440296173, "mask/share_reasoning": 0.8053004741668701, "mask/share_step_conf": 0.11235527694225311, "num_tokens": 10842853.0, "reward": 0.7982138395309448, "reward_std": 0.3219480514526367, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.7014608979225159, "rewards/final_brier_reward_step": 0.5926230549812317, "rewards/format_reward_step": 0.92578125, "step": 34 }, { "adv/mean_abs_final_conf": 0.5378986597061157, "adv/mean_abs_reasoning": 0.5105568170547485, "adv/mean_abs_step_conf": 0.7341263294219971, "adv/ratio_final_to_reasoning": 1.053552987127063, "adv/ratio_step_to_reasoning": 1.4378935015635577, "adv/std_final_conf": 0.792346179485321, "adv/std_reasoning": 0.7756903171539307, "adv/std_step_conf": 0.9303335547447205, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5099009900990099, "calib/avg_num_step_conf": 10.52734375, "calib/ece": 0.4197021276595744, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00019801980198030922, "calib/mean_conf": 0.9899148936170212, "calib/mu_c": 0.99, "calib/mu_w": 0.9898019801980197, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4197021276595744, "calib/std_conf": 0.000918597155103103, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8725444264943457, "calib/step_q_c_n": 1238.0, "calib/step_q_gap": 0.027980253536212474, "calib/step_q_w": 0.8445641729581332, "calib/step_q_w_n": 1457.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 894.0, "completions/mean_terminated_length": 961.6134643554688, "completions/min_length": 0.0, "completions/min_terminated_length": 543.0, "epoch": 0.037333333333333336, "grad_norm": 0.022384105250239372, "kl": 0.038501739501953125, "learning_rate": 4.583333333333333e-06, "loss": -0.335, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.014649630524218082, "mask/share_reasoning": 0.8177850842475891, "mask/share_step_conf": 0.09725277125835419, "num_tokens": 11180973.0, "reward": 0.7150118350982666, "reward_std": 0.280878484249115, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.6095709204673767, "rewards/final_brier_reward_step": 0.5313901901245117, "rewards/format_reward_step": 0.91796875, "step": 35 }, { "adv/mean_abs_final_conf": 0.3458389937877655, "adv/mean_abs_reasoning": 0.3302667737007141, "adv/mean_abs_step_conf": 0.7599014639854431, "adv/ratio_final_to_reasoning": 1.0471504290684803, "adv/ratio_step_to_reasoning": 2.300871672528771, "adv/std_final_conf": 0.6069803833961487, "adv/std_reasoning": 0.5962504148483276, "adv/std_step_conf": 0.9211536049842834, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49728260869565216, "calib/avg_num_step_conf": 11.12109375, "calib/ece": 0.2419105691056911, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00016304347826123777, "calib/mean_conf": 0.9898780487804878, "calib/mu_c": 0.9898369565217391, "calib/mu_w": 0.9900000000000003, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2419105691056911, "calib/std_conf": 0.0019088385173778714, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8667872340425532, "calib/step_q_c_n": 1880.0, "calib/step_q_gap": -0.013274813527250351, "calib/step_q_w": 0.8800620475698036, "calib/step_q_w_n": 967.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 861.90625, "completions/mean_terminated_length": 882.592041015625, "completions/min_length": 0.0, "completions/min_terminated_length": 449.0, "epoch": 0.0384, "grad_norm": 684.32421875, "kl": 3056.043258666992, "learning_rate": 4.555555555555556e-06, "loss": 33.7113, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.016752172261476517, "mask/share_reasoning": 0.8371760249137878, "mask/share_step_conf": 0.12263430655002594, "num_tokens": 11504333.0, "reward": 0.917310357093811, "reward_std": 0.16187608242034912, "rewards/accuracy_reward_step": 0.71875, "rewards/asymmetric_l2_reward": 0.7751913666725159, "rewards/final_brier_reward_step": 0.7234917879104614, "rewards/format_reward_step": 0.9609375, "step": 36 }, { "adv/mean_abs_final_conf": 0.3826274275779724, "adv/mean_abs_reasoning": 0.3682462275028229, "adv/mean_abs_step_conf": 0.7657290101051331, "adv/ratio_final_to_reasoning": 1.0390532176600216, "adv/ratio_step_to_reasoning": 2.0793940383252485, "adv/std_final_conf": 0.6385846734046936, "adv/std_reasoning": 0.6405808329582214, "adv/std_step_conf": 0.9296008348464966, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.5043478260869565, "calib/avg_num_step_conf": 10.8515625, "calib/ece": 0.48317596566523624, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.9957081545064378, "calib/gap": 0.0007826086956519429, "calib/mean_conf": 0.9896137339055796, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9892173913043478, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.48317596566523624, "calib/std_conf": 0.005883429867191, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8636806722689075, "calib/step_q_c_n": 1190.0, "calib/step_q_gap": 0.02601064707999068, "calib/step_q_w": 0.8376700251889169, "calib/step_q_w_n": 1588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 896.26171875, "completions/mean_terminated_length": 964.0462646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 532.0, "epoch": 0.039466666666666664, "grad_norm": 0.023465532809495926, "kl": 0.043636322021484375, "learning_rate": 4.527777777777778e-06, "loss": -0.2311, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.015032488852739334, "mask/share_reasoning": 0.807826042175293, "mask/share_step_conf": 0.106828972697258, "num_tokens": 11840872.0, "reward": 0.6838352680206299, "reward_std": 0.19802510738372803, "rewards/accuracy_reward_step": 0.4609375, "rewards/asymmetric_l2_reward": 0.6229565143585205, "rewards/final_brier_reward_step": 0.4704952836036682, "rewards/format_reward_step": 0.91015625, "step": 37 }, { "adv/mean_abs_final_conf": 0.5782315135002136, "adv/mean_abs_reasoning": 0.5265728235244751, "adv/mean_abs_step_conf": 0.7494425773620605, "adv/ratio_final_to_reasoning": 1.0981036006187612, "adv/ratio_step_to_reasoning": 1.4232458339681604, "adv/std_final_conf": 0.8226618766784668, "adv/std_reasoning": 0.7931339740753174, "adv/std_step_conf": 0.9324886798858643, "calib/answer_extract_rate": 0.890625, "calib/auroc": 0.5159055118110236, "calib/avg_num_step_conf": 11.44140625, "calib/ece": 0.42969162995594723, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.88671875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006488188976377929, "calib/mean_conf": 0.9891629955947138, "calib/mu_c": 0.9894488188976377, "calib/mu_w": 0.9887999999999999, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42969162995594723, "calib/std_conf": 0.006133955886290556, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8472820109976434, "calib/step_q_c_n": 1273.0, "calib/step_q_gap": 0.021545296021798, "calib/step_q_w": 0.8257367149758454, "calib/step_q_w_n": 1656.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3054.0, "completions/max_terminated_length": 3054.0, "completions/mean_length": 911.984375, "completions/mean_terminated_length": 985.0969848632812, "completions/min_length": 0.0, "completions/min_terminated_length": 585.0, "epoch": 0.04053333333333333, "grad_norm": 0.029373837634921074, "kl": 0.053211212158203125, "learning_rate": 4.5e-06, "loss": -0.1981, "mask/has_final_conf_rate": 0.88671875, "mask/share_final_conf": 0.014381938613951206, "mask/share_reasoning": 0.805593729019165, "mask/share_step_conf": 0.10580561310052872, "num_tokens": 12181228.0, "reward": 0.692814826965332, "reward_std": 0.26417291164398193, "rewards/accuracy_reward_step": 0.49609375, "rewards/asymmetric_l2_reward": 0.604360818862915, "rewards/final_brier_reward_step": 0.5047062635421753, "rewards/format_reward_step": 0.88671875, "step": 38 }, { "adv/mean_abs_final_conf": 0.6115273237228394, "adv/mean_abs_reasoning": 0.5294957160949707, "adv/mean_abs_step_conf": 0.7513197064399719, "adv/ratio_final_to_reasoning": 1.1549240251325383, "adv/ratio_step_to_reasoning": 1.4189344381876259, "adv/std_final_conf": 0.8244526386260986, "adv/std_reasoning": 0.7757487297058105, "adv/std_step_conf": 0.9348276853561401, "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.529126213592233, "calib/avg_num_step_conf": 13.20703125, "calib/ece": 0.4389082969432314, "calib/final_conf_rate": 0.89453125, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.9956331877729258, "calib/gap": 0.001941747572815733, "calib/mean_conf": 0.9891266375545851, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9880582524271841, "calib/nonempty_final_conf_rate": 0.89453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4389082969432314, "calib/std_conf": 0.006938698363682519, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8546702870442204, "calib/step_q_c_n": 1289.0, "calib/step_q_gap": 0.03671139603083606, "calib/step_q_w": 0.8179588910133844, "calib/step_q_w_n": 2092.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2726.0, "completions/max_terminated_length": 2726.0, "completions/mean_length": 853.80078125, "completions/mean_terminated_length": 942.125, "completions/min_length": 0.0, "completions/min_terminated_length": 563.0, "epoch": 0.0416, "grad_norm": 0.02601790241897106, "kl": 0.05804443359375, "learning_rate": 4.472222222222223e-06, "loss": -0.3918, "mask/has_final_conf_rate": 0.89453125, "mask/share_final_conf": 0.014618676155805588, "mask/share_reasoning": 0.7844524383544922, "mask/share_step_conf": 0.10717888176441193, "num_tokens": 12505889.0, "reward": 0.7072185277938843, "reward_std": 0.2773115038871765, "rewards/accuracy_reward_step": 0.4921875, "rewards/asymmetric_l2_reward": 0.635445237159729, "rewards/final_brier_reward_step": 0.5016480088233948, "rewards/format_reward_step": 0.89453125, "step": 39 }, { "adv/mean_abs_final_conf": 0.5351214408874512, "adv/mean_abs_reasoning": 0.5108170509338379, "adv/mean_abs_step_conf": 0.7385743260383606, "adv/ratio_final_to_reasoning": 1.0475794414246389, "adv/ratio_step_to_reasoning": 1.4458685838465135, "adv/std_final_conf": 0.7758241295814514, "adv/std_reasoning": 0.7756544947624207, "adv/std_step_conf": 0.9328163862228394, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5075821104122991, "calib/avg_num_step_conf": 10.9765625, "calib/ece": 0.42883817427385895, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004116002795245821, "calib/mean_conf": 0.9890041493775934, "calib/mu_c": 0.989185185185185, "calib/mu_w": 0.9887735849056604, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42883817427385895, "calib/std_conf": 0.005819196965396921, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8614264264264265, "calib/step_q_c_n": 1332.0, "calib/step_q_gap": 0.015364180147671491, "calib/step_q_w": 0.846062246278755, "calib/step_q_w_n": 1478.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 921.484375, "completions/mean_terminated_length": 962.8571166992188, "completions/min_length": 0.0, "completions/min_terminated_length": 472.0, "epoch": 0.042666666666666665, "grad_norm": 0.022954532876610756, "kl": 0.06566619873046875, "learning_rate": 4.444444444444444e-06, "loss": -0.187, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.015071472153067589, "mask/share_reasoning": 0.8308209180831909, "mask/share_step_conf": 0.11113885045051575, "num_tokens": 12848549.0, "reward": 0.7284168004989624, "reward_std": 0.27028942108154297, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.6276761889457703, "rewards/final_brier_reward_step": 0.5361886620521545, "rewards/format_reward_step": 0.9375, "step": 40 }, { "adv/mean_abs_final_conf": 0.5005245804786682, "adv/mean_abs_reasoning": 0.4871232509613037, "adv/mean_abs_step_conf": 0.7367668747901917, "adv/ratio_final_to_reasoning": 1.0275111678428774, "adv/ratio_step_to_reasoning": 1.5124855431068702, "adv/std_final_conf": 0.7579271197319031, "adv/std_reasoning": 0.757828950881958, "adv/std_step_conf": 0.9302553534507751, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5046875000000001, "calib/avg_num_step_conf": 11.92578125, "calib/ece": 0.1963223140495869, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011250000000007088, "calib/mean_conf": 0.989710743801653, "calib/mu_c": 0.9896875000000002, "calib/mu_w": 0.9898000000000002, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1963223140495869, "calib/std_conf": 0.002787036153540043, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8462296650717704, "calib/step_q_c_n": 2090.0, "calib/step_q_gap": 0.03025874087654723, "calib/step_q_w": 0.8159709241952232, "calib/step_q_w_n": 963.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 869.15234375, "completions/mean_terminated_length": 908.1754760742188, "completions/min_length": 0.0, "completions/min_terminated_length": 477.0, "epoch": 0.04373333333333333, "grad_norm": 0.015356858260929585, "kl": 0.07578277587890625, "learning_rate": 4.416666666666667e-06, "loss": -0.1941, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.015964847058057785, "mask/share_reasoning": 0.8180328011512756, "mask/share_step_conf": 0.12303359061479568, "num_tokens": 13178300.0, "reward": 0.9321384429931641, "reward_std": 0.26598113775253296, "rewards/accuracy_reward_step": 0.75, "rewards/asymmetric_l2_reward": 0.7713373899459839, "rewards/final_brier_reward_step": 0.7538769245147705, "rewards/format_reward_step": 0.9453125, "step": 41 }, { "adv/mean_abs_final_conf": 0.3991064131259918, "adv/mean_abs_reasoning": 0.40221142768859863, "adv/mean_abs_step_conf": 0.7366739511489868, "adv/ratio_final_to_reasoning": 0.9922801433553231, "adv/ratio_step_to_reasoning": 1.8315589772833027, "adv/std_final_conf": 0.6996758580207825, "adv/std_reasoning": 0.7014210224151611, "adv/std_step_conf": 0.9316682815551758, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5180704898446834, "calib/avg_num_step_conf": 12.43359375, "calib/ece": 0.38135021097046407, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.9957805907172996, "calib/gap": 0.001980286738351489, "calib/mean_conf": 0.9889451476793248, "calib/mu_c": 0.9897222222222223, "calib/mu_w": 0.9877419354838708, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38135021097046407, "calib/std_conf": 0.007750401356105529, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8366325224071705, "calib/step_q_c_n": 1562.0, "calib/step_q_gap": -0.0004865399000473225, "calib/step_q_w": 0.8371190623072178, "calib/step_q_w_n": 1621.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 860.84765625, "completions/mean_terminated_length": 903.1843872070312, "completions/min_length": 0.0, "completions/min_terminated_length": 518.0, "epoch": 0.0448, "grad_norm": 0.018000788986682892, "kl": 0.0889892578125, "learning_rate": 4.388888888888889e-06, "loss": -0.1293, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.016016989946365356, "mask/share_reasoning": 0.8085422515869141, "mask/share_step_conf": 0.1285657286643982, "num_tokens": 13503045.0, "reward": 0.7686011791229248, "reward_std": 0.19247505068778992, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.6683084964752197, "rewards/final_brier_reward_step": 0.5712375044822693, "rewards/format_reward_step": 0.92578125, "step": 42 }, { "adv/mean_abs_final_conf": 0.5601272583007812, "adv/mean_abs_reasoning": 0.5113959908485413, "adv/mean_abs_step_conf": 0.7792887687683105, "adv/ratio_final_to_reasoning": 1.0952906716601003, "adv/ratio_step_to_reasoning": 1.5238460658936028, "adv/std_final_conf": 0.7849826812744141, "adv/std_reasoning": 0.7578847408294678, "adv/std_step_conf": 0.9340676069259644, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5139602319801161, "calib/avg_num_step_conf": 12.08203125, "calib/ece": 0.2834439834024896, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.983402489626556, "calib/gap": -0.001004971002485311, "calib/mean_conf": 0.9871784232365146, "calib/mu_c": 0.9868823529411763, "calib/mu_w": 0.9878873239436616, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2826141078838174, "calib/std_conf": 0.01650868346897965, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8531187010078388, "calib/step_q_c_n": 1786.0, "calib/step_q_gap": 0.05383790529246024, "calib/step_q_w": 0.7992807957153786, "calib/step_q_w_n": 1307.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2688.0, "completions/max_terminated_length": 2688.0, "completions/mean_length": 865.765625, "completions/mean_terminated_length": 900.9592895507812, "completions/min_length": 0.0, "completions/min_terminated_length": 501.0, "epoch": 0.04586666666666667, "grad_norm": 0.018080631271004677, "kl": 0.095123291015625, "learning_rate": 4.361111111111112e-06, "loss": -0.1282, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.016060657799243927, "mask/share_reasoning": 0.8252454996109009, "mask/share_step_conf": 0.11963135004043579, "num_tokens": 13829905.0, "reward": 0.859089732170105, "reward_std": 0.2762579023838043, "rewards/accuracy_reward_step": 0.6640625, "rewards/asymmetric_l2_reward": 0.7275747060775757, "rewards/final_brier_reward_step": 0.6702921390533447, "rewards/format_reward_step": 0.9375, "step": 43 }, { "adv/mean_abs_final_conf": 0.6635046005249023, "adv/mean_abs_reasoning": 0.5385121703147888, "adv/mean_abs_step_conf": 0.7993447780609131, "adv/ratio_final_to_reasoning": 1.2321069738072008, "adv/ratio_step_to_reasoning": 1.4843578699319901, "adv/std_final_conf": 0.8506072163581848, "adv/std_reasoning": 0.7756995558738708, "adv/std_step_conf": 0.9337899088859558, "calib/answer_extract_rate": 0.890625, "calib/auroc": 0.4961926961926962, "calib/avg_num_step_conf": 13.08984375, "calib/ece": 0.5044933920704846, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.986784140969163, "calib/gap": -0.004595959595959709, "calib/mean_conf": 0.9855506607929516, "calib/mu_c": 0.9831818181818179, "calib/mu_w": 0.9877777777777776, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5027312775330397, "calib/std_conf": 0.027861059256808065, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8404388185654009, "calib/step_q_c_n": 1185.0, "calib/step_q_gap": 0.005138726229297452, "calib/step_q_w": 0.8353000923361035, "calib/step_q_w_n": 2166.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2944.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 913.73828125, "completions/mean_terminated_length": 999.6453247070312, "completions/min_length": 0.0, "completions/min_terminated_length": 517.0, "epoch": 0.046933333333333334, "grad_norm": 0.029077230021357536, "kl": 0.0980377197265625, "learning_rate": 4.333333333333334e-06, "loss": -0.3187, "mask/has_final_conf_rate": 0.88671875, "mask/share_final_conf": 0.014052234590053558, "mask/share_reasoning": 0.7942177057266235, "mask/share_step_conf": 0.10579255223274231, "num_tokens": 14170142.0, "reward": 0.6271952390670776, "reward_std": 0.26996076107025146, "rewards/accuracy_reward_step": 0.43359375, "rewards/asymmetric_l2_reward": 0.5590915679931641, "rewards/final_brier_reward_step": 0.4327988028526306, "rewards/format_reward_step": 0.87890625, "step": 44 }, { "adv/mean_abs_final_conf": 0.5559475421905518, "adv/mean_abs_reasoning": 0.5480928421020508, "adv/mean_abs_step_conf": 0.751115083694458, "adv/ratio_final_to_reasoning": 1.014330966371275, "adv/ratio_step_to_reasoning": 1.3704157872483327, "adv/std_final_conf": 0.8367322683334351, "adv/std_reasoning": 0.8102704882621765, "adv/std_step_conf": 0.9335227608680725, "calib/answer_extract_rate": 0.83984375, "calib/auroc": 0.5272909821102592, "calib/avg_num_step_conf": 15.546875, "calib/ece": 0.373860465116279, "calib/final_conf_rate": 0.83984375, "calib/format_rate": 0.83203125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0013453815261041946, "calib/mean_conf": 0.9878139534883721, "calib/mu_c": 0.9883333333333332, "calib/mu_w": 0.986987951807229, "calib/nonempty_final_conf_rate": 0.83984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.373860465116279, "calib/std_conf": 0.008090349732152157, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8533936955063716, "calib/step_q_c_n": 1491.0, "calib/step_q_gap": 0.029629131424410815, "calib/step_q_w": 0.8237645640819607, "calib/step_q_w_n": 2489.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 871.33203125, "completions/mean_terminated_length": 991.3822631835938, "completions/min_length": 0.0, "completions/min_terminated_length": 479.0, "epoch": 0.048, "grad_norm": 0.035998519510030746, "kl": 0.10224151611328125, "learning_rate": 4.305555555555556e-06, "loss": -0.4043, "mask/has_final_conf_rate": 0.83984375, "mask/share_final_conf": 0.013904090970754623, "mask/share_reasoning": 0.7514083385467529, "mask/share_step_conf": 0.11359383165836334, "num_tokens": 14498251.0, "reward": 0.6943749189376831, "reward_std": 0.2721937298774719, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.599319338798523, "rewards/final_brier_reward_step": 0.5198991894721985, "rewards/format_reward_step": 0.83203125, "step": 45 }, { "adv/mean_abs_final_conf": 0.5597379207611084, "adv/mean_abs_reasoning": 0.5508193373680115, "adv/mean_abs_step_conf": 0.7796428203582764, "adv/ratio_final_to_reasoning": 1.0161914856433922, "adv/ratio_step_to_reasoning": 1.4154238376663675, "adv/std_final_conf": 0.806285560131073, "adv/std_reasoning": 0.7932658195495605, "adv/std_step_conf": 0.9322901368141174, "calib/answer_extract_rate": 0.85546875, "calib/auroc": 0.5066286289645913, "calib/avg_num_step_conf": 14.1875, "calib/ece": 0.44771689497716904, "calib/final_conf_rate": 0.85546875, "calib/format_rate": 0.84765625, "calib/frac_conf_gt_0.9": 0.9908675799086758, "calib/gap": 0.0003582815908710346, "calib/mean_conf": 0.9865296803652969, "calib/mu_c": 0.9866949152542372, "calib/mu_w": 0.9863366336633662, "calib/nonempty_final_conf_rate": 0.85546875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44771689497716904, "calib/std_conf": 0.01245311885851197, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8513843774168601, "calib/step_q_c_n": 1293.0, "calib/step_q_gap": 0.0327140054630336, "calib/step_q_w": 0.8186703719538265, "calib/step_q_w_n": 2339.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 917.60546875, "completions/mean_terminated_length": 1016.9133911132812, "completions/min_length": 0.0, "completions/min_terminated_length": 504.0, "epoch": 0.04906666666666667, "grad_norm": 0.022643912583589554, "kl": 0.1025543212890625, "learning_rate": 4.277777777777778e-06, "loss": -0.3343, "mask/has_final_conf_rate": 0.85546875, "mask/share_final_conf": 0.014070061035454273, "mask/share_reasoning": 0.7752383947372437, "mask/share_step_conf": 0.11303532123565674, "num_tokens": 14837926.0, "reward": 0.6594477295875549, "reward_std": 0.27707844972610474, "rewards/accuracy_reward_step": 0.4609375, "rewards/asymmetric_l2_reward": 0.5859012603759766, "rewards/final_brier_reward_step": 0.47127535939216614, "rewards/format_reward_step": 0.84765625, "step": 46 }, { "adv/mean_abs_final_conf": 0.6609259843826294, "adv/mean_abs_reasoning": 0.5075252056121826, "adv/mean_abs_step_conf": 0.7495334148406982, "adv/ratio_final_to_reasoning": 1.3022525326311882, "adv/ratio_step_to_reasoning": 1.4768397836253326, "adv/std_final_conf": 0.8575954437255859, "adv/std_reasoning": 0.7579585909843445, "adv/std_step_conf": 0.9317649006843567, "calib/answer_extract_rate": 0.859375, "calib/auroc": 0.5688089727612604, "calib/avg_num_step_conf": 13.17578125, "calib/ece": 0.35684931506849316, "calib/final_conf_rate": 0.85546875, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.9863013698630136, "calib/gap": 0.005038276660139229, "calib/mean_conf": 0.9824200913242009, "calib/mu_c": 0.9843065693430658, "calib/mu_w": 0.9792682926829266, "calib/nonempty_final_conf_rate": 0.85546875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35684931506849316, "calib/std_conf": 0.016360291553566133, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8437517730496452, "calib/step_q_c_n": 1410.0, "calib/step_q_gap": 0.007088502545315123, "calib/step_q_w": 0.8366632705043301, "calib/step_q_w_n": 1963.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 917.69140625, "completions/mean_terminated_length": 1012.625, "completions/min_length": 0.0, "completions/min_terminated_length": 492.0, "epoch": 0.050133333333333335, "grad_norm": 0.023026349022984505, "kl": 0.120025634765625, "learning_rate": 4.25e-06, "loss": -0.3556, "mask/has_final_conf_rate": 0.85546875, "mask/share_final_conf": 0.013789718970656395, "mask/share_reasoning": 0.787351131439209, "mask/share_step_conf": 0.10510917007923126, "num_tokens": 15178831.0, "reward": 0.7321069240570068, "reward_std": 0.24354524910449982, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.6390807032585144, "rewards/final_brier_reward_step": 0.547789454460144, "rewards/format_reward_step": 0.8515625, "step": 47 }, { "adv/mean_abs_final_conf": 0.7052232027053833, "adv/mean_abs_reasoning": 0.6574268341064453, "adv/mean_abs_step_conf": 0.7477045059204102, "adv/ratio_final_to_reasoning": 1.0727021869496722, "adv/ratio_step_to_reasoning": 1.1373197246149338, "adv/std_final_conf": 0.8916906714439392, "adv/std_reasoning": 0.8751892447471619, "adv/std_step_conf": 0.9354519844055176, "calib/answer_extract_rate": 0.8515625, "calib/auroc": 0.4789540816326531, "calib/avg_num_step_conf": 14.9296875, "calib/ece": 0.43339449541284414, "calib/final_conf_rate": 0.8515625, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.001341836734693791, "calib/mean_conf": 0.9838532110091744, "calib/mu_c": 0.9832500000000001, "calib/mu_w": 0.9845918367346939, "calib/nonempty_final_conf_rate": 0.8515625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43339449541284414, "calib/std_conf": 0.012371041717481143, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8498358208955226, "calib/step_q_c_n": 1340.0, "calib/step_q_gap": 0.016495772547416077, "calib/step_q_w": 0.8333400483481065, "calib/step_q_w_n": 2482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 853.24609375, "completions/mean_terminated_length": 970.804443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 534.0, "epoch": 0.0512, "grad_norm": 0.040837667882442474, "kl": 0.13056182861328125, "learning_rate": 4.222222222222223e-06, "loss": -0.4883, "mask/has_final_conf_rate": 0.8515625, "mask/share_final_conf": 0.014190906658768654, "mask/share_reasoning": 0.7518985271453857, "mask/share_step_conf": 0.11281684041023254, "num_tokens": 15500950.0, "reward": 0.67741858959198, "reward_std": 0.3460836708545685, "rewards/accuracy_reward_step": 0.46875, "rewards/asymmetric_l2_reward": 0.6105802059173584, "rewards/final_brier_reward_step": 0.4801945090293884, "rewards/format_reward_step": 0.8515625, "step": 48 }, { "adv/mean_abs_final_conf": 0.70012366771698, "adv/mean_abs_reasoning": 0.4757267236709595, "adv/mean_abs_step_conf": 0.7478195428848267, "adv/ratio_final_to_reasoning": 1.4716929549688837, "adv/ratio_step_to_reasoning": 1.5719519330641232, "adv/std_final_conf": 0.8788543939590454, "adv/std_reasoning": 0.7578703761100769, "adv/std_step_conf": 0.9321491718292236, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5525631939522797, "calib/avg_num_step_conf": 11.640625, "calib/ece": 0.3327542372881357, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.9957627118644068, "calib/gap": 0.0031207181667846084, "calib/mean_conf": 0.9810593220338985, "calib/mu_c": 0.982156862745098, "calib/mu_w": 0.9790361445783133, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3327542372881357, "calib/std_conf": 0.015047265854960285, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8497086801426873, "calib/step_q_c_n": 1682.0, "calib/step_q_gap": 0.0004097587251219448, "calib/step_q_w": 0.8492989214175654, "calib/step_q_w_n": 1298.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 868.53515625, "completions/mean_terminated_length": 915.0, "completions/min_length": 0.0, "completions/min_terminated_length": 344.0, "epoch": 0.05226666666666667, "grad_norm": 0.024042055010795593, "kl": 0.1303863525390625, "learning_rate": 4.194444444444445e-06, "loss": -0.1904, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.015961725264787674, "mask/share_reasoning": 0.8114060163497925, "mask/share_step_conf": 0.12185099720954895, "num_tokens": 15827831.0, "reward": 0.7799708843231201, "reward_std": 0.2652823328971863, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.6593862771987915, "rewards/final_brier_reward_step": 0.5989929437637329, "rewards/format_reward_step": 0.91015625, "step": 49 }, { "adv/mean_abs_final_conf": 0.755120575428009, "adv/mean_abs_reasoning": 0.6018161773681641, "adv/mean_abs_step_conf": 0.7432763576507568, "adv/ratio_final_to_reasoning": 1.2547362530702797, "adv/ratio_step_to_reasoning": 1.2350554631170272, "adv/std_final_conf": 0.9192172884941101, "adv/std_reasoning": 0.8434169292449951, "adv/std_step_conf": 0.9341980814933777, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.5903263403263402, "calib/avg_num_step_conf": 12.55078125, "calib/ece": 0.3453303964757709, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.9911894273127754, "calib/gap": 0.0040825840825842175, "calib/mean_conf": 0.9735242290748899, "calib/mu_c": 0.9750349650349649, "calib/mu_w": 0.9709523809523807, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34444933920704845, "calib/std_conf": 0.019754062398247826, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8568041237113403, "calib/step_q_c_n": 1552.0, "calib/step_q_gap": 0.019308639063537636, "calib/step_q_w": 0.8374954846478027, "calib/step_q_w_n": 1661.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2516.0, "completions/max_terminated_length": 2516.0, "completions/mean_length": 892.41796875, "completions/mean_terminated_length": 959.9118041992188, "completions/min_length": 0.0, "completions/min_terminated_length": 499.0, "epoch": 0.05333333333333334, "grad_norm": 0.0485478974878788, "kl": 0.1522216796875, "learning_rate": 4.166666666666667e-06, "loss": -0.3244, "mask/has_final_conf_rate": 0.88671875, "mask/share_final_conf": 0.01499384269118309, "mask/share_reasoning": 0.8000199794769287, "mask/share_step_conf": 0.1146736890077591, "num_tokens": 16161650.0, "reward": 0.7477984428405762, "reward_std": 0.3347986936569214, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.6312395334243774, "rewards/final_brier_reward_step": 0.5760761499404907, "rewards/format_reward_step": 0.87890625, "step": 50 }, { "adv/mean_abs_final_conf": 0.7460015416145325, "adv/mean_abs_reasoning": 0.49163341522216797, "adv/mean_abs_step_conf": 0.7601824402809143, "adv/ratio_final_to_reasoning": 1.5173938925152517, "adv/ratio_step_to_reasoning": 1.5462383490296112, "adv/std_final_conf": 0.9048994183540344, "adv/std_reasoning": 0.7755351662635803, "adv/std_step_conf": 0.9343783855438232, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5319148936170213, "calib/avg_num_step_conf": 12.09375, "calib/ece": 0.3700000000000001, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.9872881355932204, "calib/gap": 0.002463290380581462, "calib/mean_conf": 0.9716949152542372, "calib/mu_c": 0.972676056338028, "calib/mu_w": 0.9702127659574465, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3700000000000001, "calib/std_conf": 0.018124105343005043, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8501683501683501, "calib/step_q_c_n": 1485.0, "calib/step_q_gap": -0.0004834189191730287, "calib/step_q_w": 0.8506517690875232, "calib/step_q_w_n": 1611.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2702.0, "completions/max_terminated_length": 2702.0, "completions/mean_length": 851.8828125, "completions/mean_terminated_length": 901.1652221679688, "completions/min_length": 0.0, "completions/min_terminated_length": 71.0, "epoch": 0.0544, "grad_norm": 0.04129322990775108, "kl": 0.1435089111328125, "learning_rate": 4.138888888888889e-06, "loss": -0.2908, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.0158391110599041, "mask/share_reasoning": 0.8140634894371033, "mask/share_step_conf": 0.11540989577770233, "num_tokens": 16489028.0, "reward": 0.7558123469352722, "reward_std": 0.2396428883075714, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.6501593589782715, "rewards/final_brier_reward_step": 0.5677152276039124, "rewards/format_reward_step": 0.9140625, "step": 51 }, { "adv/mean_abs_final_conf": 0.7636961936950684, "adv/mean_abs_reasoning": 0.515580415725708, "adv/mean_abs_step_conf": 0.7763593792915344, "adv/ratio_final_to_reasoning": 1.481235846827354, "adv/ratio_step_to_reasoning": 1.5057968759320806, "adv/std_final_conf": 0.8879469633102417, "adv/std_reasoning": 0.7578575015068054, "adv/std_step_conf": 0.9332748651504517, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.5637815278533842, "calib/avg_num_step_conf": 11.47265625, "calib/ece": 0.2529184549356223, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.9828326180257511, "calib/gap": 0.004101796407185443, "calib/mean_conf": 0.9679399141630902, "calib/mu_c": 0.9691017964071855, "calib/mu_w": 0.9650000000000001, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25206008583690986, "calib/std_conf": 0.02012953318957994, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8531514084507043, "calib/step_q_c_n": 1704.0, "calib/step_q_gap": 0.034789689067087126, "calib/step_q_w": 0.8183617193836171, "calib/step_q_w_n": 1233.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2878.0, "completions/max_terminated_length": 2878.0, "completions/mean_length": 899.96484375, "completions/mean_terminated_length": 944.225341796875, "completions/min_length": 0.0, "completions/min_terminated_length": 443.0, "epoch": 0.055466666666666664, "grad_norm": 0.03694462776184082, "kl": 0.1541900634765625, "learning_rate": 4.111111111111111e-06, "loss": -0.2186, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.01521845068782568, "mask/share_reasoning": 0.8235276937484741, "mask/share_step_conf": 0.11437886953353882, "num_tokens": 16827371.0, "reward": 0.8517237901687622, "reward_std": 0.2654600739479065, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.7257670760154724, "rewards/final_brier_reward_step": 0.6651804447174072, "rewards/format_reward_step": 0.90625, "step": 52 }, { "adv/mean_abs_final_conf": 0.7440406084060669, "adv/mean_abs_reasoning": 0.5574250221252441, "adv/mean_abs_step_conf": 0.739154577255249, "adv/ratio_final_to_reasoning": 1.3347815022176979, "adv/ratio_step_to_reasoning": 1.3260161419326693, "adv/std_final_conf": 0.8806307911872864, "adv/std_reasoning": 0.8100613355636597, "adv/std_step_conf": 0.9332544803619385, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5653889943074004, "calib/avg_num_step_conf": 10.86328125, "calib/ece": 0.3233749999999999, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.9916666666666667, "calib/gap": 0.001628083491461041, "calib/mean_conf": 0.9658750000000002, "calib/mu_c": 0.9664516129032257, "calib/mu_w": 0.9648235294117646, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.32170833333333326, "calib/std_conf": 0.027780107541188533, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8536102626756261, "calib/step_q_c_n": 1637.0, "calib/step_q_gap": 0.0011190039343673686, "calib/step_q_w": 0.8524912587412588, "calib/step_q_w_n": 1144.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 982.72265625, "completions/mean_terminated_length": 1002.298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 472.0, "epoch": 0.05653333333333333, "grad_norm": 0.06327595561742783, "kl": 0.1549835205078125, "learning_rate": 4.083333333333334e-06, "loss": -0.0358, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.014779354445636272, "mask/share_reasoning": 0.852211058139801, "mask/share_step_conf": 0.11347834765911102, "num_tokens": 17184772.0, "reward": 0.8161087036132812, "reward_std": 0.29630500078201294, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.7033169865608215, "rewards/final_brier_reward_step": 0.6195253729820251, "rewards/format_reward_step": 0.92578125, "step": 53 }, { "adv/mean_abs_final_conf": 0.7146573662757874, "adv/mean_abs_reasoning": 0.5098264813423157, "adv/mean_abs_step_conf": 0.7374184131622314, "adv/ratio_final_to_reasoning": 1.4017658800189725, "adv/ratio_step_to_reasoning": 1.4464105733007275, "adv/std_final_conf": 0.8893719911575317, "adv/std_reasoning": 0.7930194139480591, "adv/std_step_conf": 0.9324125647544861, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.45959719504240065, "calib/avg_num_step_conf": 12.3515625, "calib/ece": 0.27360995850622416, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.9875518672199171, "calib/gap": -0.0023287671232876672, "calib/mean_conf": 0.9707053941908714, "calib/mu_c": 0.9700000000000001, "calib/mu_w": 0.9723287671232878, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27360995850622416, "calib/std_conf": 0.017015989915706308, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8597100685292567, "calib/step_q_c_n": 1897.0, "calib/step_q_gap": 0.015520345209098507, "calib/step_q_w": 0.8441897233201582, "calib/step_q_w_n": 1265.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3008.0, "completions/max_terminated_length": 3008.0, "completions/mean_length": 891.2109375, "completions/mean_terminated_length": 919.9596557617188, "completions/min_length": 0.0, "completions/min_terminated_length": 454.0, "epoch": 0.0576, "grad_norm": 0.025256404653191566, "kl": 0.159423828125, "learning_rate": 4.055555555555556e-06, "loss": -0.0848, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01595216616988182, "mask/share_reasoning": 0.8246407508850098, "mask/share_step_conf": 0.1281571239233017, "num_tokens": 17519154.0, "reward": 0.8532845973968506, "reward_std": 0.26809757947921753, "rewards/accuracy_reward_step": 0.6640625, "rewards/asymmetric_l2_reward": 0.71662437915802, "rewards/final_brier_reward_step": 0.6704136729240417, "rewards/format_reward_step": 0.93359375, "step": 54 }, { "adv/mean_abs_final_conf": 0.7599447965621948, "adv/mean_abs_reasoning": 0.6141306757926941, "adv/mean_abs_step_conf": 0.7744641304016113, "adv/ratio_final_to_reasoning": 1.2374317494909206, "adv/ratio_step_to_reasoning": 1.2610738413318396, "adv/std_final_conf": 0.8942292332649231, "adv/std_reasoning": 0.8267177939414978, "adv/std_step_conf": 0.9326158761978149, "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.5516528925619836, "calib/avg_num_step_conf": 11.890625, "calib/ece": 0.4322869955156952, "calib/final_conf_rate": 0.87109375, "calib/format_rate": 0.87109375, "calib/frac_conf_gt_0.9": 0.9955156950672646, "calib/gap": 0.004671852211959759, "calib/mean_conf": 0.9748878923766816, "calib/mu_c": 0.97702479338843, "calib/mu_w": 0.9723529411764702, "calib/nonempty_final_conf_rate": 0.87109375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4322869955156952, "calib/std_conf": 0.019611952668874544, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8601275318829709, "calib/step_q_c_n": 1333.0, "calib/step_q_gap": -0.006073520133393906, "calib/step_q_w": 0.8662010520163648, "calib/step_q_w_n": 1711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2832.0, "completions/max_terminated_length": 2832.0, "completions/mean_length": 880.0546875, "completions/mean_terminated_length": 938.7250366210938, "completions/min_length": 0.0, "completions/min_terminated_length": 496.0, "epoch": 0.058666666666666666, "grad_norm": 0.04848377779126167, "kl": 0.155914306640625, "learning_rate": 4.027777777777779e-06, "loss": -0.2427, "mask/has_final_conf_rate": 0.87109375, "mask/share_final_conf": 0.015094468370079994, "mask/share_reasoning": 0.8024145364761353, "mask/share_step_conf": 0.1199910044670105, "num_tokens": 17852272.0, "reward": 0.6880007982254028, "reward_std": 0.25061482191085815, "rewards/accuracy_reward_step": 0.48828125, "rewards/asymmetric_l2_reward": 0.610323429107666, "rewards/final_brier_reward_step": 0.49380311369895935, "rewards/format_reward_step": 0.87109375, "step": 55 }, { "adv/mean_abs_final_conf": 0.7145578861236572, "adv/mean_abs_reasoning": 0.6271743774414062, "adv/mean_abs_step_conf": 0.7482906579971313, "adv/ratio_final_to_reasoning": 1.13932888814549, "adv/ratio_step_to_reasoning": 1.1931142038197189, "adv/std_final_conf": 0.906951904296875, "adv/std_reasoning": 0.8591962456703186, "adv/std_step_conf": 0.9356598258018494, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6712661161813703, "calib/avg_num_step_conf": 11.71875, "calib/ece": 0.4738297872340427, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.9957446808510638, "calib/gap": 0.010332464146023512, "calib/mean_conf": 0.9759574468085107, "calib/mu_c": 0.9811016949152542, "calib/mu_w": 0.9707692307692307, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4738297872340427, "calib/std_conf": 0.01582612632112727, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8583504340962904, "calib/step_q_c_n": 1267.0, "calib/step_q_gap": -0.012578590716173621, "calib/step_q_w": 0.870929024812464, "calib/step_q_w_n": 1733.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 957.1796875, "completions/mean_terminated_length": 1000.155029296875, "completions/min_length": 0.0, "completions/min_terminated_length": 449.0, "epoch": 0.05973333333333333, "grad_norm": 0.02709430269896984, "kl": 0.1496734619140625, "learning_rate": 4.000000000000001e-06, "loss": -0.1615, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.014955723658204079, "mask/share_reasoning": 0.830254852771759, "mask/share_step_conf": 0.11182068288326263, "num_tokens": 18204150.0, "reward": 0.6914936304092407, "reward_std": 0.304796040058136, "rewards/accuracy_reward_step": 0.46484375, "rewards/asymmetric_l2_reward": 0.6214755773544312, "rewards/final_brier_reward_step": 0.4865117073059082, "rewards/format_reward_step": 0.91015625, "step": 56 }, { "adv/mean_abs_final_conf": 0.7080997228622437, "adv/mean_abs_reasoning": 0.5576961040496826, "adv/mean_abs_step_conf": 0.7475917935371399, "adv/ratio_final_to_reasoning": 1.2696874116932368, "adv/ratio_step_to_reasoning": 1.3405002977581144, "adv/std_final_conf": 0.9036187529563904, "adv/std_reasoning": 0.8101306557655334, "adv/std_step_conf": 0.9331084489822388, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.5666931637519873, "calib/avg_num_step_conf": 11.23828125, "calib/ece": 0.34583690987124477, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.9957081545064378, "calib/gap": 0.0029936406995230147, "calib/mean_conf": 0.9793133047210301, "calib/mu_c": 0.9804054054054052, "calib/mu_w": 0.9774117647058822, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3449785407725323, "calib/std_conf": 0.018353950675204293, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8543750000000001, "calib/step_q_c_n": 1536.0, "calib/step_q_gap": -0.004916573452647177, "calib/step_q_w": 0.8592915734526473, "calib/step_q_w_n": 1341.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2920.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 880.0859375, "completions/mean_terminated_length": 930.9999389648438, "completions/min_length": 0.0, "completions/min_terminated_length": 517.0, "epoch": 0.0608, "grad_norm": 0.028827087953686714, "kl": 0.15277099609375, "learning_rate": 3.972222222222223e-06, "loss": -0.2121, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.015749987214803696, "mask/share_reasoning": 0.8137336373329163, "mask/share_step_conf": 0.11582887172698975, "num_tokens": 18536244.0, "reward": 0.7834737300872803, "reward_std": 0.30211466550827026, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.676862359046936, "rewards/final_brier_reward_step": 0.5924289226531982, "rewards/format_reward_step": 0.91015625, "step": 57 }, { "adv/mean_abs_final_conf": 0.7426167726516724, "adv/mean_abs_reasoning": 0.5847933888435364, "adv/mean_abs_step_conf": 0.7576659917831421, "adv/ratio_final_to_reasoning": 1.2698788782825352, "adv/ratio_step_to_reasoning": 1.2956131280510397, "adv/std_final_conf": 0.9091824293136597, "adv/std_reasoning": 0.8431410789489746, "adv/std_step_conf": 0.9340181946754456, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5531134226275293, "calib/avg_num_step_conf": 11.6015625, "calib/ece": 0.4662025316455697, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.9957805907172996, "calib/gap": 0.005793673411228273, "calib/mean_conf": 0.9767510548523207, "calib/mu_c": 0.9795867768595041, "calib/mu_w": 0.9737931034482759, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4662025316455697, "calib/std_conf": 0.02817952635581979, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8562822458270105, "calib/step_q_c_n": 1318.0, "calib/step_q_gap": -0.0005579478775900037, "calib/step_q_w": 0.8568401937046005, "calib/step_q_w_n": 1652.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2634.0, "completions/max_terminated_length": 2634.0, "completions/mean_length": 985.74609375, "completions/mean_terminated_length": 1025.8170166015625, "completions/min_length": 0.0, "completions/min_terminated_length": 511.0, "epoch": 0.06186666666666667, "grad_norm": 0.020769745111465454, "kl": 0.148895263671875, "learning_rate": 3.944444444444445e-06, "loss": -0.2007, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.014388788491487503, "mask/share_reasoning": 0.8343918323516846, "mask/share_step_conf": 0.11215688288211823, "num_tokens": 18894915.0, "reward": 0.6976282596588135, "reward_std": 0.2969515323638916, "rewards/accuracy_reward_step": 0.4765625, "rewards/asymmetric_l2_reward": 0.6196170449256897, "rewards/final_brier_reward_step": 0.4951706826686859, "rewards/format_reward_step": 0.92578125, "step": 58 }, { "adv/mean_abs_final_conf": 0.6896352171897888, "adv/mean_abs_reasoning": 0.6210002899169922, "adv/mean_abs_step_conf": 0.7944797873497009, "adv/ratio_final_to_reasoning": 1.1105231807250378, "adv/ratio_step_to_reasoning": 1.2793549379113773, "adv/std_final_conf": 0.8672558069229126, "adv/std_reasoning": 0.8269534707069397, "adv/std_step_conf": 0.9332959055900574, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.5547254247572815, "calib/avg_num_step_conf": 11.2890625, "calib/ece": 0.4279220779220779, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.9913419913419913, "calib/gap": 0.003496662621359614, "calib/mean_conf": 0.982034632034632, "calib/mu_c": 0.9835937500000002, "calib/mu_w": 0.9800970873786405, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4279220779220779, "calib/std_conf": 0.015024327018331942, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8552635599694423, "calib/step_q_c_n": 1309.0, "calib/step_q_gap": 0.010437500513401754, "calib/step_q_w": 0.8448260594560405, "calib/step_q_w_n": 1581.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3043.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 881.99609375, "completions/mean_terminated_length": 940.7958984375, "completions/min_length": 0.0, "completions/min_terminated_length": 230.0, "epoch": 0.06293333333333333, "grad_norm": 0.018716448917984962, "kl": 0.161285400390625, "learning_rate": 3.916666666666667e-06, "loss": -0.2376, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.015686459839344025, "mask/share_reasoning": 0.8076034188270569, "mask/share_step_conf": 0.1142100840806961, "num_tokens": 19226954.0, "reward": 0.7167990207672119, "reward_std": 0.32347404956817627, "rewards/accuracy_reward_step": 0.5, "rewards/asymmetric_l2_reward": 0.6384674906730652, "rewards/final_brier_reward_step": 0.515442967414856, "rewards/format_reward_step": 0.8984375, "step": 59 }, { "adv/mean_abs_final_conf": 0.6956037282943726, "adv/mean_abs_reasoning": 0.5651669502258301, "adv/mean_abs_step_conf": 0.7514952421188354, "adv/ratio_final_to_reasoning": 1.2307933576378138, "adv/ratio_step_to_reasoning": 1.32968716910738, "adv/std_final_conf": 0.8582842946052551, "adv/std_reasoning": 0.8101160526275635, "adv/std_step_conf": 0.9319243431091309, "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.6115804976358422, "calib/avg_num_step_conf": 10.86328125, "calib/ece": 0.40213043478260874, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.9956521739130435, "calib/gap": 0.010303852414541126, "calib/mean_conf": 0.9803913043478262, "calib/mu_c": 0.9847368421052629, "calib/mu_w": 0.9744329896907218, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40213043478260874, "calib/std_conf": 0.027208117603319572, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8575264750378215, "calib/step_q_c_n": 1322.0, "calib/step_q_gap": 0.018479182371611724, "calib/step_q_w": 0.8390472926662098, "calib/step_q_w_n": 1459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 936.8046875, "completions/mean_terminated_length": 990.9999389648438, "completions/min_length": 0.0, "completions/min_terminated_length": 405.0, "epoch": 0.064, "grad_norm": 0.027904795482754707, "kl": 0.146087646484375, "learning_rate": 3.88888888888889e-06, "loss": -0.1497, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.015355521813035011, "mask/share_reasoning": 0.8180770874023438, "mask/share_step_conf": 0.11187991499900818, "num_tokens": 19575632.0, "reward": 0.7286934852600098, "reward_std": 0.2839881181716919, "rewards/accuracy_reward_step": 0.51953125, "rewards/asymmetric_l2_reward": 0.6367565393447876, "rewards/final_brier_reward_step": 0.5378179550170898, "rewards/format_reward_step": 0.89453125, "step": 60 }, { "adv/mean_abs_final_conf": 0.5476254820823669, "adv/mean_abs_reasoning": 0.46652138233184814, "adv/mean_abs_step_conf": 0.7672770023345947, "adv/ratio_final_to_reasoning": 1.1738486226400389, "adv/ratio_step_to_reasoning": 1.6446770317353037, "adv/std_final_conf": 0.7907182574272156, "adv/std_reasoning": 0.7393697500228882, "adv/std_step_conf": 0.9312024712562561, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.561746437705517, "calib/avg_num_step_conf": 11.578125, "calib/ece": 0.33097560975609763, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": 0.0031099744245526306, "calib/mean_conf": 0.9854471544715447, "calib/mu_c": 0.9865217391304347, "calib/mu_w": 0.9834117647058821, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33097560975609763, "calib/std_conf": 0.011634825248779398, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.849314606741573, "calib/step_q_c_n": 1780.0, "calib/step_q_gap": 0.00033656620103239554, "calib/step_q_w": 0.8489780405405406, "calib/step_q_w_n": 1184.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 852.55859375, "completions/mean_terminated_length": 866.09130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 443.0, "epoch": 0.06506666666666666, "grad_norm": 0.021844208240509033, "kl": 0.1577911376953125, "learning_rate": 3.861111111111112e-06, "loss": -0.0096, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.017297610640525818, "mask/share_reasoning": 0.8325221538543701, "mask/share_step_conf": 0.13455525040626526, "num_tokens": 19897951.0, "reward": 0.8288745880126953, "reward_std": 0.22828470170497894, "rewards/accuracy_reward_step": 0.6328125, "rewards/asymmetric_l2_reward": 0.701357364654541, "rewards/final_brier_reward_step": 0.6392042636871338, "rewards/format_reward_step": 0.953125, "step": 61 }, { "adv/mean_abs_final_conf": 0.7341781258583069, "adv/mean_abs_reasoning": 0.661810040473938, "adv/mean_abs_step_conf": 0.7774771451950073, "adv/ratio_final_to_reasoning": 1.1093487269134574, "adv/ratio_step_to_reasoning": 1.174773874144063, "adv/std_final_conf": 0.8844819664955139, "adv/std_reasoning": 0.8593194484710693, "adv/std_step_conf": 0.934329092502594, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.561044003451251, "calib/avg_num_step_conf": 11.546875, "calib/ece": 0.46584745762711866, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0035432844406099173, "calib/mean_conf": 0.9827966101694915, "calib/mu_c": 0.9845081967213115, "calib/mu_w": 0.9809649122807016, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.46584745762711866, "calib/std_conf": 0.013522460488446315, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8506163828061638, "calib/step_q_c_n": 1233.0, "calib/step_q_gap": -0.002302943949494929, "calib/step_q_w": 0.8529193267556587, "calib/step_q_w_n": 1723.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2911.0, "completions/max_terminated_length": 2911.0, "completions/mean_length": 880.30859375, "completions/mean_terminated_length": 927.4032592773438, "completions/min_length": 0.0, "completions/min_terminated_length": 450.0, "epoch": 0.06613333333333334, "grad_norm": 0.0222491268068552, "kl": 0.1569976806640625, "learning_rate": 3.833333333333334e-06, "loss": -0.2313, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.015586305409669876, "mask/share_reasoning": 0.8132479190826416, "mask/share_step_conf": 0.1203845664858818, "num_tokens": 20230390.0, "reward": 0.6956356763839722, "reward_std": 0.3364129066467285, "rewards/accuracy_reward_step": 0.4765625, "rewards/asymmetric_l2_reward": 0.6262305974960327, "rewards/final_brier_reward_step": 0.4884781241416931, "rewards/format_reward_step": 0.90625, "step": 62 }, { "adv/mean_abs_final_conf": 0.627443790435791, "adv/mean_abs_reasoning": 0.4884450137615204, "adv/mean_abs_step_conf": 0.7527927756309509, "adv/ratio_final_to_reasoning": 1.2845740518545568, "adv/ratio_step_to_reasoning": 1.5412027033169722, "adv/std_final_conf": 0.8280969858169556, "adv/std_reasoning": 0.7576963305473328, "adv/std_step_conf": 0.9316011071205139, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6043795620437957, "calib/avg_num_step_conf": 10.62890625, "calib/ece": 0.42854251012145755, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.007569343065693679, "calib/mean_conf": 0.9831983805668015, "calib/mu_c": 0.9865693430656934, "calib/mu_w": 0.9789999999999998, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42854251012145755, "calib/std_conf": 0.016710596816404612, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8481534090909091, "calib/step_q_c_n": 1408.0, "calib/step_q_gap": 0.020643888908121633, "calib/step_q_w": 0.8275095201827875, "calib/step_q_w_n": 1313.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 946.09765625, "completions/mean_terminated_length": 957.3162231445312, "completions/min_length": 0.0, "completions/min_terminated_length": 478.0, "epoch": 0.0672, "grad_norm": 0.025082435458898544, "kl": 0.1472930908203125, "learning_rate": 3.8055555555555556e-06, "loss": -0.0259, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.015975866466760635, "mask/share_reasoning": 0.8547379374504089, "mask/share_step_conf": 0.11756742745637894, "num_tokens": 20581231.0, "reward": 0.7615793347358704, "reward_std": 0.25341373682022095, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.6730742454528809, "rewards/final_brier_reward_step": 0.552428126335144, "rewards/format_reward_step": 0.953125, "step": 63 }, { "adv/mean_abs_final_conf": 0.7004935145378113, "adv/mean_abs_reasoning": 0.603089451789856, "adv/mean_abs_step_conf": 0.7326570749282837, "adv/ratio_final_to_reasoning": 1.1615084834577662, "adv/ratio_step_to_reasoning": 1.2148398098389805, "adv/std_final_conf": 0.896488606929779, "adv/std_reasoning": 0.8430898189544678, "adv/std_step_conf": 0.932219922542572, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5474950396825398, "calib/avg_num_step_conf": 11.20703125, "calib/ece": 0.28637500000000016, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0027579365079367646, "calib/mean_conf": 0.9863750000000001, "calib/mu_c": 0.9872023809523811, "calib/mu_w": 0.9844444444444443, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28637500000000016, "calib/std_conf": 0.009692404672388246, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.844783362218371, "calib/step_q_c_n": 1731.0, "calib/step_q_gap": 0.024844873641921117, "calib/step_q_w": 0.8199384885764499, "calib/step_q_w_n": 1138.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 880.98046875, "completions/mean_terminated_length": 913.0809936523438, "completions/min_length": 0.0, "completions/min_terminated_length": 515.0, "epoch": 0.06826666666666667, "grad_norm": 0.02380029298365116, "kl": 0.1593780517578125, "learning_rate": 3.777777777777778e-06, "loss": -0.097, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.0161864273250103, "mask/share_reasoning": 0.8278210163116455, "mask/share_step_conf": 0.12083625793457031, "num_tokens": 20910538.0, "reward": 0.8677514791488647, "reward_std": 0.2944144904613495, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.7520149946212769, "rewards/final_brier_reward_step": 0.66473788022995, "rewards/format_reward_step": 0.9375, "step": 64 }, { "adv/mean_abs_final_conf": 0.5200560688972473, "adv/mean_abs_reasoning": 0.3561217188835144, "adv/mean_abs_step_conf": 0.756298303604126, "adv/ratio_final_to_reasoning": 1.4603323563855846, "adv/ratio_step_to_reasoning": 2.1237073267398983, "adv/std_final_conf": 0.7643717527389526, "adv/std_reasoning": 0.6403516530990601, "adv/std_step_conf": 0.9288119673728943, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5231881214024071, "calib/avg_num_step_conf": 10.87109375, "calib/ece": 0.40035856573705186, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.004525771847200155, "calib/mean_conf": 0.9860159362549801, "calib/mu_c": 0.9878911564625849, "calib/mu_w": 0.9833653846153847, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40035856573705186, "calib/std_conf": 0.024835389241660396, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8443627140139505, "calib/step_q_c_n": 1577.0, "calib/step_q_gap": 0.00232291301892551, "calib/step_q_w": 0.842039800995025, "calib/step_q_w_n": 1206.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 820.27734375, "completions/mean_terminated_length": 823.494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 473.0, "epoch": 0.06933333333333333, "grad_norm": 0.01668195053935051, "kl": 0.1640777587890625, "learning_rate": 3.7500000000000005e-06, "loss": 0.0244, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.018098723143339157, "mask/share_reasoning": 0.8402106761932373, "mask/share_step_conf": 0.13778437674045563, "num_tokens": 21225553.0, "reward": 0.7908812761306763, "reward_std": 0.18446239829063416, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.6852684617042542, "rewards/final_brier_reward_step": 0.5863378643989563, "rewards/format_reward_step": 0.9765625, "step": 65 }, { "adv/mean_abs_final_conf": 0.6290441751480103, "adv/mean_abs_reasoning": 0.5717591047286987, "adv/mean_abs_step_conf": 0.7339279651641846, "adv/ratio_final_to_reasoning": 1.1001909194720974, "adv/ratio_step_to_reasoning": 1.2836314438970506, "adv/std_final_conf": 0.8466580510139465, "adv/std_reasoning": 0.8266621828079224, "adv/std_step_conf": 0.9322919249534607, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5569719413189005, "calib/avg_num_step_conf": 10.75, "calib/ece": 0.4869198312236286, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.003078621279020144, "calib/mean_conf": 0.9848101265822784, "calib/mu_c": 0.9863559322033897, "calib/mu_w": 0.9832773109243695, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4869198312236286, "calib/std_conf": 0.011421435117915504, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8389644012944985, "calib/step_q_c_n": 1236.0, "calib/step_q_gap": 0.003054111057031461, "calib/step_q_w": 0.835910290237467, "calib/step_q_w_n": 1516.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 975.1328125, "completions/mean_terminated_length": 1002.546142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 490.0, "epoch": 0.0704, "grad_norm": 0.0274101160466671, "kl": 0.1490325927734375, "learning_rate": 3.7222222222222225e-06, "loss": -0.0766, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01519216038286686, "mask/share_reasoning": 0.8440859317779541, "mask/share_step_conf": 0.113378144800663, "num_tokens": 21581539.0, "reward": 0.6924327611923218, "reward_std": 0.2524971663951874, "rewards/accuracy_reward_step": 0.4609375, "rewards/asymmetric_l2_reward": 0.6322306394577026, "rewards/final_brier_reward_step": 0.4760722517967224, "rewards/format_reward_step": 0.921875, "step": 66 }, { "adv/mean_abs_final_conf": 0.5527406334877014, "adv/mean_abs_reasoning": 0.4371800720691681, "adv/mean_abs_step_conf": 0.7184213995933533, "adv/ratio_final_to_reasoning": 1.2643317223303125, "adv/ratio_step_to_reasoning": 1.6433077477504254, "adv/std_final_conf": 0.795620858669281, "adv/std_reasoning": 0.7393509149551392, "adv/std_step_conf": 0.9301301836967468, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5891039426523297, "calib/avg_num_step_conf": 11.0078125, "calib/ece": 0.36938271604938266, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.005051612903225977, "calib/mean_conf": 0.9866666666666666, "calib/mu_c": 0.9885999999999998, "calib/mu_w": 0.9835483870967738, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36938271604938266, "calib/std_conf": 0.009428090415820642, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8425621414913959, "calib/step_q_c_n": 1569.0, "calib/step_q_gap": 0.021385199938153243, "calib/step_q_w": 0.8211769415532426, "calib/step_q_w_n": 1249.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2512.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 868.9375, "completions/mean_terminated_length": 900.5992431640625, "completions/min_length": 0.0, "completions/min_terminated_length": 487.0, "epoch": 0.07146666666666666, "grad_norm": 0.020727286115288734, "kl": 0.1535186767578125, "learning_rate": 3.694444444444445e-06, "loss": -0.172, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01617474853992462, "mask/share_reasoning": 0.8293904066085815, "mask/share_step_conf": 0.11927864700555801, "num_tokens": 21908995.0, "reward": 0.7993296384811401, "reward_std": 0.21487489342689514, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.6986775398254395, "rewards/final_brier_reward_step": 0.5937316417694092, "rewards/format_reward_step": 0.9453125, "step": 67 }, { "adv/mean_abs_final_conf": 0.4896950125694275, "adv/mean_abs_reasoning": 0.4402713477611542, "adv/mean_abs_step_conf": 0.7603869438171387, "adv/ratio_final_to_reasoning": 1.1122572819230687, "adv/ratio_step_to_reasoning": 1.7270870513918752, "adv/std_final_conf": 0.7536525130271912, "adv/std_reasoning": 0.7206746935844421, "adv/std_step_conf": 0.9291406273841858, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5731757877280265, "calib/avg_num_step_conf": 10.5078125, "calib/ece": 0.4336363636363636, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.9958677685950413, "calib/gap": 0.0027736318407960425, "calib/mean_conf": 0.985702479338843, "calib/mu_c": 0.9869402985074627, "calib/mu_w": 0.9841666666666666, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4328099173553719, "calib/std_conf": 0.015604960340907914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.838406827880512, "calib/step_q_c_n": 1406.0, "calib/step_q_gap": 0.012269756229421502, "calib/step_q_w": 0.8261370716510905, "calib/step_q_w_n": 1284.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 862.625, "completions/mean_terminated_length": 890.4515991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.07253333333333334, "grad_norm": 0.015150300227105618, "kl": 0.15191650390625, "learning_rate": 3.6666666666666666e-06, "loss": -0.1352, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.016995739191770554, "mask/share_reasoning": 0.8243101835250854, "mask/share_step_conf": 0.1274440884590149, "num_tokens": 22233915.0, "reward": 0.7585927844047546, "reward_std": 0.20454856753349304, "rewards/accuracy_reward_step": 0.5234375, "rewards/asymmetric_l2_reward": 0.6879211068153381, "rewards/final_brier_reward_step": 0.5362957119941711, "rewards/format_reward_step": 0.94140625, "step": 68 }, { "adv/mean_abs_final_conf": 0.7056964039802551, "adv/mean_abs_reasoning": 0.6312321424484253, "adv/mean_abs_step_conf": 0.7472624778747559, "adv/ratio_final_to_reasoning": 1.1179665237625538, "adv/ratio_step_to_reasoning": 1.1838156323540048, "adv/std_final_conf": 0.8818315863609314, "adv/std_reasoning": 0.8590497970581055, "adv/std_step_conf": 0.9349477887153625, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5457418727424284, "calib/avg_num_step_conf": 10.9765625, "calib/ece": 0.47625000000000006, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0028480133370382132, "calib/mean_conf": 0.9845833333333334, "calib/mu_c": 0.985983606557377, "calib/mu_w": 0.9831355932203388, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.47625000000000006, "calib/std_conf": 0.011430648372783693, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8321756647864625, "calib/step_q_c_n": 1241.0, "calib/step_q_gap": 0.006898418132542883, "calib/step_q_w": 0.8252772466539197, "calib/step_q_w_n": 1569.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 934.42578125, "completions/mean_terminated_length": 972.4105224609375, "completions/min_length": 0.0, "completions/min_terminated_length": 532.0, "epoch": 0.0736, "grad_norm": 0.01587671972811222, "kl": 0.1492156982421875, "learning_rate": 3.638888888888889e-06, "loss": -0.1755, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.015048105269670486, "mask/share_reasoning": 0.8365316390991211, "mask/share_step_conf": 0.10935773700475693, "num_tokens": 22577624.0, "reward": 0.7187528610229492, "reward_std": 0.2709349989891052, "rewards/accuracy_reward_step": 0.4765625, "rewards/asymmetric_l2_reward": 0.6646471619606018, "rewards/final_brier_reward_step": 0.49160856008529663, "rewards/format_reward_step": 0.9296875, "step": 69 }, { "adv/mean_abs_final_conf": 0.5599315762519836, "adv/mean_abs_reasoning": 0.4661853313446045, "adv/mean_abs_step_conf": 0.7805700898170471, "adv/ratio_final_to_reasoning": 1.2010922236377315, "adv/ratio_step_to_reasoning": 1.6743772000840782, "adv/std_final_conf": 0.8068824410438538, "adv/std_reasoning": 0.7208148837089539, "adv/std_step_conf": 0.932051420211792, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5884371121979707, "calib/avg_num_step_conf": 10.9375, "calib/ece": 0.4188983050847458, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.9915254237288136, "calib/gap": 0.007489597780860113, "calib/mean_conf": 0.9810169491525423, "calib/mu_c": 0.9842857142857142, "calib/mu_w": 0.9767961165048541, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41817796610169494, "calib/std_conf": 0.027888696164273755, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.839237958303379, "calib/step_q_c_n": 1391.0, "calib/step_q_gap": 0.02184973970863091, "calib/step_q_w": 0.8173882185947481, "calib/step_q_w_n": 1409.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 915.4296875, "completions/mean_terminated_length": 960.4507446289062, "completions/min_length": 0.0, "completions/min_terminated_length": 390.0, "epoch": 0.07466666666666667, "grad_norm": 0.03501075878739357, "kl": 0.1421356201171875, "learning_rate": 3.6111111111111115e-06, "loss": -0.1519, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.015805684030056, "mask/share_reasoning": 0.8204933404922485, "mask/share_step_conf": 0.11682601273059845, "num_tokens": 22918966.0, "reward": 0.7455442547798157, "reward_std": 0.22053103148937225, "rewards/accuracy_reward_step": 0.51953125, "rewards/asymmetric_l2_reward": 0.6656532883644104, "rewards/final_brier_reward_step": 0.5371538996696472, "rewards/format_reward_step": 0.921875, "step": 70 }, { "adv/mean_abs_final_conf": 0.6588914394378662, "adv/mean_abs_reasoning": 0.5920547246932983, "adv/mean_abs_step_conf": 0.7465313673019409, "adv/ratio_final_to_reasoning": 1.1128894204487452, "adv/ratio_step_to_reasoning": 1.2609161554933388, "adv/std_final_conf": 0.860008180141449, "adv/std_reasoning": 0.8267221450805664, "adv/std_step_conf": 0.9302997589111328, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5749336870026525, "calib/avg_num_step_conf": 11.23046875, "calib/ece": 0.45276422764227625, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": 0.006875331564986875, "calib/mean_conf": 0.9812195121951219, "calib/mu_c": 0.9844615384615385, "calib/mu_w": 0.9775862068965516, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45276422764227625, "calib/std_conf": 0.027735257296859942, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8306344628695026, "calib/step_q_c_n": 1387.0, "calib/step_q_gap": -7.790272189522884e-05, "calib/step_q_w": 0.8307123655913978, "calib/step_q_w_n": 1488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 918.62890625, "completions/mean_terminated_length": 940.676025390625, "completions/min_length": 0.0, "completions/min_terminated_length": 501.0, "epoch": 0.07573333333333333, "grad_norm": 0.031742144376039505, "kl": 0.1444549560546875, "learning_rate": 3.5833333333333335e-06, "loss": -0.0708, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.01613161340355873, "mask/share_reasoning": 0.8374041318893433, "mask/share_step_conf": 0.1230267882347107, "num_tokens": 23258543.0, "reward": 0.7496272325515747, "reward_std": 0.3023836016654968, "rewards/accuracy_reward_step": 0.5078125, "rewards/asymmetric_l2_reward": 0.678457498550415, "rewards/final_brier_reward_step": 0.527046799659729, "rewards/format_reward_step": 0.9609375, "step": 71 }, { "adv/mean_abs_final_conf": 0.664090633392334, "adv/mean_abs_reasoning": 0.5109577178955078, "adv/mean_abs_step_conf": 0.7358330488204956, "adv/ratio_final_to_reasoning": 1.2996978226056315, "adv/ratio_step_to_reasoning": 1.4401055567791137, "adv/std_final_conf": 0.8751429319381714, "adv/std_reasoning": 0.7929494976997375, "adv/std_step_conf": 0.9357478022575378, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6445384211906635, "calib/avg_num_step_conf": 11.00390625, "calib/ece": 0.4796761133603239, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008731313926042317, "calib/mean_conf": 0.9817004048582996, "calib/mu_c": 0.986048387096774, "calib/mu_w": 0.9773170731707317, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4796761133603239, "calib/std_conf": 0.01339010299197453, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8339120370370371, "calib/step_q_c_n": 1296.0, "calib/step_q_gap": 0.011755561034407225, "calib/step_q_w": 0.8221564760026299, "calib/step_q_w_n": 1521.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 843.8828125, "completions/mean_terminated_length": 864.1360473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.0768, "grad_norm": 0.027199726551771164, "kl": 0.15850830078125, "learning_rate": 3.555555555555556e-06, "loss": -0.1048, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.016982506960630417, "mask/share_reasoning": 0.8334957361221313, "mask/share_step_conf": 0.12608429789543152, "num_tokens": 23578985.0, "reward": 0.7423896789550781, "reward_std": 0.2794683873653412, "rewards/accuracy_reward_step": 0.484375, "rewards/asymmetric_l2_reward": 0.6939469575881958, "rewards/final_brier_reward_step": 0.5017699003219604, "rewards/format_reward_step": 0.9609375, "step": 72 }, { "adv/mean_abs_final_conf": 0.7219201922416687, "adv/mean_abs_reasoning": 0.4985666275024414, "adv/mean_abs_step_conf": 0.7653075456619263, "adv/ratio_final_to_reasoning": 1.447991406601184, "adv/ratio_step_to_reasoning": 1.5350155895827156, "adv/std_final_conf": 0.8606438636779785, "adv/std_reasoning": 0.7395066022872925, "adv/std_step_conf": 0.9299985766410828, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5828644501278772, "calib/avg_num_step_conf": 11.06640625, "calib/ece": 0.3224796747967481, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.004689075630252448, "calib/mean_conf": 0.9769512195121952, "calib/mu_c": 0.9785714285714286, "calib/mu_w": 0.9738823529411762, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3224796747967481, "calib/std_conf": 0.014707640654936523, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8299045346062053, "calib/step_q_c_n": 1676.0, "calib/step_q_gap": 0.0015899278646321902, "calib/step_q_w": 0.8283146067415731, "calib/step_q_w_n": 1157.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2846.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 861.546875, "completions/mean_terminated_length": 882.2240600585938, "completions/min_length": 0.0, "completions/min_terminated_length": 453.0, "epoch": 0.07786666666666667, "grad_norm": 0.03334764018654823, "kl": 0.1400146484375, "learning_rate": 3.5277777777777784e-06, "loss": -0.0773, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.016731340438127518, "mask/share_reasoning": 0.8360512852668762, "mask/share_step_conf": 0.12377987056970596, "num_tokens": 23906573.0, "reward": 0.8440332412719727, "reward_std": 0.2482633739709854, "rewards/accuracy_reward_step": 0.62890625, "rewards/asymmetric_l2_reward": 0.7245661020278931, "rewards/final_brier_reward_step": 0.6455316543579102, "rewards/format_reward_step": 0.9609375, "step": 73 }, { "adv/mean_abs_final_conf": 0.7695039510726929, "adv/mean_abs_reasoning": 0.5811711549758911, "adv/mean_abs_step_conf": 0.7675015926361084, "adv/ratio_final_to_reasoning": 1.3240573701642409, "adv/ratio_step_to_reasoning": 1.3206119850665108, "adv/std_final_conf": 0.8966401815414429, "adv/std_reasoning": 0.8101295828819275, "adv/std_step_conf": 0.9333306550979614, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5982724440668366, "calib/avg_num_step_conf": 11.66796875, "calib/ece": 0.4176569037656903, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.005846785613140892, "calib/mean_conf": 0.9699581589958158, "calib/mu_c": 0.9725757575757575, "calib/mu_w": 0.9667289719626166, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4176569037656903, "calib/std_conf": 0.014067913815529444, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8376606145251396, "calib/step_q_c_n": 1432.0, "calib/step_q_gap": 0.02402074314250291, "calib/step_q_w": 0.8136398713826367, "calib/step_q_w_n": 1555.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2779.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 911.06640625, "completions/mean_terminated_length": 955.8729248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 436.0, "epoch": 0.07893333333333333, "grad_norm": 0.01728581264615059, "kl": 0.1635284423828125, "learning_rate": 3.5e-06, "loss": -0.1305, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.015495438128709793, "mask/share_reasoning": 0.814774751663208, "mask/share_step_conf": 0.12285478413105011, "num_tokens": 24243734.0, "reward": 0.7644367218017578, "reward_std": 0.29110610485076904, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.6974785327911377, "rewards/final_brier_reward_step": 0.5423324108123779, "rewards/format_reward_step": 0.9296875, "step": 74 }, { "adv/mean_abs_final_conf": 0.7328017950057983, "adv/mean_abs_reasoning": 0.49293094873428345, "adv/mean_abs_step_conf": 0.751336932182312, "adv/ratio_final_to_reasoning": 1.4866215985980185, "adv/ratio_step_to_reasoning": 1.5242234923807216, "adv/std_final_conf": 0.8824196457862854, "adv/std_reasoning": 0.7577341794967651, "adv/std_step_conf": 0.929182767868042, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5827731092436974, "calib/avg_num_step_conf": 10.87109375, "calib/ece": 0.2505349794238683, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.9958847736625515, "calib/gap": 0.0043621848739494595, "calib/mean_conf": 0.9694650205761317, "calib/mu_c": 0.970685714285714, "calib/mu_w": 0.9663235294117646, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.24991769547325102, "calib/std_conf": 0.01597715095289499, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8350551267916207, "calib/step_q_c_n": 1814.0, "calib/step_q_gap": 0.006159357957771339, "calib/step_q_w": 0.8288957688338494, "calib/step_q_w_n": 969.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 850.83984375, "completions/mean_terminated_length": 874.759033203125, "completions/min_length": 0.0, "completions/min_terminated_length": 435.0, "epoch": 0.08, "grad_norm": 0.06992490589618683, "kl": 0.1568603515625, "learning_rate": 3.4722222222222224e-06, "loss": -0.1116, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01692769303917885, "mask/share_reasoning": 0.8271828889846802, "mask/share_step_conf": 0.12854570150375366, "num_tokens": 24566301.0, "reward": 0.8857764005661011, "reward_std": 0.25138625502586365, "rewards/accuracy_reward_step": 0.68359375, "rewards/asymmetric_l2_reward": 0.754860520362854, "rewards/final_brier_reward_step": 0.6924734115600586, "rewards/format_reward_step": 0.9375, "step": 75 }, { "adv/mean_abs_final_conf": 0.7342768311500549, "adv/mean_abs_reasoning": 0.4722954034805298, "adv/mean_abs_step_conf": 0.749976634979248, "adv/ratio_final_to_reasoning": 1.5546982370331819, "adv/ratio_step_to_reasoning": 1.587939728933156, "adv/std_final_conf": 0.8730382323265076, "adv/std_reasoning": 0.7395159602165222, "adv/std_step_conf": 0.9327054619789124, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6478126380910296, "calib/avg_num_step_conf": 10.921875, "calib/ece": 0.36020920502092035, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008645603181617445, "calib/mean_conf": 0.9710878661087865, "calib/mu_c": 0.9744520547945205, "calib/mu_w": 0.965806451612903, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36020920502092035, "calib/std_conf": 0.014422952826999605, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8340337224383917, "calib/step_q_c_n": 1542.0, "calib/step_q_gap": 0.018419687350672453, "calib/step_q_w": 0.8156140350877192, "calib/step_q_w_n": 1254.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2819.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 876.48828125, "completions/mean_terminated_length": 915.8407592773438, "completions/min_length": 0.0, "completions/min_terminated_length": 432.0, "epoch": 0.08106666666666666, "grad_norm": 0.02016083337366581, "kl": 0.153961181640625, "learning_rate": 3.444444444444445e-06, "loss": -0.2091, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.015996340662240982, "mask/share_reasoning": 0.8220898509025574, "mask/share_step_conf": 0.11894506961107254, "num_tokens": 24893738.0, "reward": 0.7962677478790283, "reward_std": 0.24296647310256958, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.6975725889205933, "rewards/final_brier_reward_step": 0.5941816568374634, "rewards/format_reward_step": 0.93359375, "step": 76 }, { "adv/mean_abs_final_conf": 0.7706581354141235, "adv/mean_abs_reasoning": 0.5369069576263428, "adv/mean_abs_step_conf": 0.7580994963645935, "adv/ratio_final_to_reasoning": 1.4353662668503888, "adv/ratio_step_to_reasoning": 1.411975549201559, "adv/std_final_conf": 0.8862397074699402, "adv/std_reasoning": 0.7756330370903015, "adv/std_step_conf": 0.9303173422813416, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.617000469410108, "calib/avg_num_step_conf": 12.078125, "calib/ece": 0.28868312757201664, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006833046471601212, "calib/mean_conf": 0.9718106995884775, "calib/mu_c": 0.9739759036144577, "calib/mu_w": 0.9671428571428565, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28868312757201664, "calib/std_conf": 0.014544283475479397, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.834141914191419, "calib/step_q_c_n": 1818.0, "calib/step_q_gap": -0.0021767671272623357, "calib/step_q_w": 0.8363186813186814, "calib/step_q_w_n": 1274.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 891.25, "completions/mean_terminated_length": 923.7247314453125, "completions/min_length": 0.0, "completions/min_terminated_length": 479.0, "epoch": 0.08213333333333334, "grad_norm": 0.05896781384944916, "kl": 0.1526947021484375, "learning_rate": 3.416666666666667e-06, "loss": -0.1115, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.015972012653946877, "mask/share_reasoning": 0.8222805261611938, "mask/share_step_conf": 0.12659120559692383, "num_tokens": 25226562.0, "reward": 0.8617645502090454, "reward_std": 0.26767075061798096, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.7367497682571411, "rewards/final_brier_reward_step": 0.667248010635376, "rewards/format_reward_step": 0.94921875, "step": 77 }, { "adv/mean_abs_final_conf": 0.7285124063491821, "adv/mean_abs_reasoning": 0.47509482502937317, "adv/mean_abs_step_conf": 0.7568221688270569, "adv/ratio_final_to_reasoning": 1.5334042131571128, "adv/ratio_step_to_reasoning": 1.5929918175393, "adv/std_final_conf": 0.8647627830505371, "adv/std_reasoning": 0.7206560969352722, "adv/std_step_conf": 0.9324377775192261, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5533693415637861, "calib/avg_num_step_conf": 10.16796875, "calib/ece": 0.4031349206349205, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0031250000000002665, "calib/mean_conf": 0.9745634920634919, "calib/mu_c": 0.9759027777777778, "calib/mu_w": 0.9727777777777775, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4031349206349205, "calib/std_conf": 0.014993647348839642, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8269742198100407, "calib/step_q_c_n": 1474.0, "calib/step_q_gap": 0.0038032720686768107, "calib/step_q_w": 0.8231709477413639, "calib/step_q_w_n": 1129.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 942.19921875, "completions/mean_terminated_length": 949.6181030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 500.0, "epoch": 0.0832, "grad_norm": 0.021009007468819618, "kl": 0.1553192138671875, "learning_rate": 3.3888888888888893e-06, "loss": -0.0389, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.015865862369537354, "mask/share_reasoning": 0.8624191284179688, "mask/share_step_conf": 0.11390247195959091, "num_tokens": 25575789.0, "reward": 0.8056474328041077, "reward_std": 0.2189893126487732, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.7173092365264893, "rewards/final_brier_reward_step": 0.5846105217933655, "rewards/format_reward_step": 0.984375, "step": 78 }, { "adv/mean_abs_final_conf": 0.6996117234230042, "adv/mean_abs_reasoning": 0.5063235759735107, "adv/mean_abs_step_conf": 0.7326298356056213, "adv/ratio_final_to_reasoning": 1.381748266566212, "adv/ratio_step_to_reasoning": 1.4469597513743864, "adv/std_final_conf": 0.8834125399589539, "adv/std_reasoning": 0.7755327224731445, "adv/std_step_conf": 0.9304167032241821, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5155382907880133, "calib/avg_num_step_conf": 10.75390625, "calib/ece": 0.3303688524590165, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9918032786885246, "calib/gap": -0.0016374398816130231, "calib/mean_conf": 0.9779918032786885, "calib/mu_c": 0.9774213836477985, "calib/mu_w": 0.9790588235294115, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.328360655737705, "calib/std_conf": 0.025099472890304626, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8302879424115178, "calib/step_q_c_n": 1667.0, "calib/step_q_gap": 0.013160870588313278, "calib/step_q_w": 0.8171270718232045, "calib/step_q_w_n": 1086.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2940.0, "completions/max_terminated_length": 2940.0, "completions/mean_length": 880.53125, "completions/mean_terminated_length": 912.6154174804688, "completions/min_length": 0.0, "completions/min_terminated_length": 497.0, "epoch": 0.08426666666666667, "grad_norm": 0.019594348967075348, "kl": 0.1408843994140625, "learning_rate": 3.3611111111111117e-06, "loss": -0.1907, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01564885675907135, "mask/share_reasoning": 0.8325054049491882, "mask/share_step_conf": 0.11668950319290161, "num_tokens": 25907581.0, "reward": 0.8450396060943604, "reward_std": 0.25687703490257263, "rewards/accuracy_reward_step": 0.62890625, "rewards/asymmetric_l2_reward": 0.7405940294265747, "rewards/final_brier_reward_step": 0.6338601112365723, "rewards/format_reward_step": 0.94921875, "step": 79 }, { "adv/mean_abs_final_conf": 0.6465312242507935, "adv/mean_abs_reasoning": 0.5463018417358398, "adv/mean_abs_step_conf": 0.7440537214279175, "adv/ratio_final_to_reasoning": 1.1834688717074082, "adv/ratio_step_to_reasoning": 1.3619828171615411, "adv/std_final_conf": 0.8512998819351196, "adv/std_reasoning": 0.7928522825241089, "adv/std_step_conf": 0.9313137531280518, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5054459817486017, "calib/avg_num_step_conf": 11.00390625, "calib/ece": 0.2996015936254981, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004430379746838131, "calib/mean_conf": 0.9848605577689243, "calib/mu_c": 0.9849999999999999, "calib/mu_w": 0.9845569620253161, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2996015936254981, "calib/std_conf": 0.011054035970751991, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8219946666666665, "calib/step_q_c_n": 1875.0, "calib/step_q_gap": 0.022281290870488246, "calib/step_q_w": 0.7997133757961783, "calib/step_q_w_n": 942.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 858.70703125, "completions/mean_terminated_length": 868.8893432617188, "completions/min_length": 0.0, "completions/min_terminated_length": 484.0, "epoch": 0.08533333333333333, "grad_norm": 0.016964834183454514, "kl": 0.1446990966796875, "learning_rate": 3.3333333333333333e-06, "loss": -0.0618, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.01695704460144043, "mask/share_reasoning": 0.8412275910377502, "mask/share_step_conf": 0.13009659945964813, "num_tokens": 26229570.0, "reward": 0.8783482313156128, "reward_std": 0.2699282169342041, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.7498530149459839, "rewards/final_brier_reward_step": 0.6771558523178101, "rewards/format_reward_step": 0.9765625, "step": 80 }, { "adv/mean_abs_final_conf": 0.6169295907020569, "adv/mean_abs_reasoning": 0.5010291934013367, "adv/mean_abs_step_conf": 0.7242743968963623, "adv/ratio_final_to_reasoning": 1.231324639017354, "adv/ratio_step_to_reasoning": 1.4455732449031184, "adv/std_final_conf": 0.8448789119720459, "adv/std_reasoning": 0.7930719256401062, "adv/std_step_conf": 0.9312515258789062, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6767689061992859, "calib/avg_num_step_conf": 11.25390625, "calib/ece": 0.313135593220339, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.9957627118644068, "calib/gap": 0.010243427458617727, "calib/mean_conf": 0.9826271186440678, "calib/mu_c": 0.9860126582278479, "calib/mu_w": 0.9757692307692302, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.313135593220339, "calib/std_conf": 0.013711801554401222, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8287856257744732, "calib/step_q_c_n": 1614.0, "calib/step_q_gap": 0.01665460762135562, "calib/step_q_w": 0.8121310181531176, "calib/step_q_w_n": 1267.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2737.0, "completions/max_terminated_length": 2737.0, "completions/mean_length": 894.08203125, "completions/mean_terminated_length": 938.05322265625, "completions/min_length": 0.0, "completions/min_terminated_length": 503.0, "epoch": 0.0864, "grad_norm": 0.01762450486421585, "kl": 0.13702392578125, "learning_rate": 3.3055555555555558e-06, "loss": -0.1868, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.016055608168244362, "mask/share_reasoning": 0.8190659284591675, "mask/share_step_conf": 0.1180034726858139, "num_tokens": 26564703.0, "reward": 0.8258570432662964, "reward_std": 0.27940231561660767, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.7132589221000671, "rewards/final_brier_reward_step": 0.631423830986023, "rewards/format_reward_step": 0.91796875, "step": 81 }, { "adv/mean_abs_final_conf": 0.5887628793716431, "adv/mean_abs_reasoning": 0.4778270125389099, "adv/mean_abs_step_conf": 0.7516727447509766, "adv/ratio_final_to_reasoning": 1.2321674244477745, "adv/ratio_step_to_reasoning": 1.5731064276944098, "adv/std_final_conf": 0.7913881540298462, "adv/std_reasoning": 0.7394364476203918, "adv/std_step_conf": 0.929599940776825, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5940373363053776, "calib/avg_num_step_conf": 11.39453125, "calib/ece": 0.38155102040816335, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": 0.005399136249651604, "calib/mean_conf": 0.9856326530612246, "calib/mu_c": 0.9877702702702702, "calib/mu_w": 0.9823711340206186, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38155102040816335, "calib/std_conf": 0.011397625741929896, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.833109296482412, "calib/step_q_c_n": 1592.0, "calib/step_q_gap": 0.018347032331468727, "calib/step_q_w": 0.8147622641509433, "calib/step_q_w_n": 1325.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2572.0, "completions/max_terminated_length": 2572.0, "completions/mean_length": 838.4921875, "completions/mean_terminated_length": 858.6160278320312, "completions/min_length": 0.0, "completions/min_terminated_length": 517.0, "epoch": 0.08746666666666666, "grad_norm": 0.020330313593149185, "kl": 0.1495513916015625, "learning_rate": 3.277777777777778e-06, "loss": -0.1235, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.017101511359214783, "mask/share_reasoning": 0.8278982043266296, "mask/share_step_conf": 0.13156276941299438, "num_tokens": 26884909.0, "reward": 0.8035461902618408, "reward_std": 0.219723179936409, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.7088985443115234, "rewards/final_brier_reward_step": 0.5911625027656555, "rewards/format_reward_step": 0.95703125, "step": 82 }, { "adv/mean_abs_final_conf": 0.6281593441963196, "adv/mean_abs_reasoning": 0.469433069229126, "adv/mean_abs_step_conf": 0.7515369653701782, "adv/ratio_final_to_reasoning": 1.3381233350855408, "adv/ratio_step_to_reasoning": 1.60094593805312, "adv/std_final_conf": 0.841602087020874, "adv/std_reasoning": 0.7394808530807495, "adv/std_step_conf": 0.9299801588058472, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5899823633156966, "calib/avg_num_step_conf": 10.578125, "calib/ece": 0.4223333333333332, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.9958333333333333, "calib/gap": 0.0019894179894178743, "calib/mean_conf": 0.9808333333333334, "calib/mu_c": 0.9817037037037035, "calib/mu_w": 0.9797142857142856, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4203333333333332, "calib/std_conf": 0.03284010082539673, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8315280898876406, "calib/step_q_c_n": 1335.0, "calib/step_q_gap": 0.01302117073250586, "calib/step_q_w": 0.8185069191551347, "calib/step_q_w_n": 1373.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2912.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 923.578125, "completions/mean_terminated_length": 949.5421142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.08853333333333334, "grad_norm": 0.019025607034564018, "kl": 0.1397857666015625, "learning_rate": 3.2500000000000002e-06, "loss": -0.1334, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.015944354236125946, "mask/share_reasoning": 0.8402930498123169, "mask/share_step_conf": 0.11641887575387955, "num_tokens": 27228609.0, "reward": 0.7492298483848572, "reward_std": 0.22660614550113678, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.6692612171173096, "rewards/final_brier_reward_step": 0.5385733842849731, "rewards/format_reward_step": 0.92578125, "step": 83 }, { "adv/mean_abs_final_conf": 0.6215704679489136, "adv/mean_abs_reasoning": 0.4951096475124359, "adv/mean_abs_step_conf": 0.7436130046844482, "adv/ratio_final_to_reasoning": 1.255419826844116, "adv/ratio_step_to_reasoning": 1.5019158047526644, "adv/std_final_conf": 0.8289499282836914, "adv/std_reasoning": 0.7576406002044678, "adv/std_step_conf": 0.9340832233428955, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.576388888888889, "calib/avg_num_step_conf": 11.8359375, "calib/ece": 0.3599586776859504, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.987603305785124, "calib/gap": 0.0008771929824562541, "calib/mean_conf": 0.9788842975206613, "calib/mu_c": 0.9792105263157895, "calib/mu_w": 0.9783333333333333, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35537190082644626, "calib/std_conf": 0.05506565428974546, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.829752504419564, "calib/step_q_c_n": 1697.0, "calib/step_q_gap": 0.025296390391057022, "calib/step_q_w": 0.804456114028507, "calib/step_q_w_n": 1333.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 848.2734375, "completions/mean_terminated_length": 886.359130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 441.0, "epoch": 0.0896, "grad_norm": 0.017854604870080948, "kl": 0.14837646484375, "learning_rate": 3.2222222222222227e-06, "loss": -0.1635, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.016548432409763336, "mask/share_reasoning": 0.8113114833831787, "mask/share_step_conf": 0.12917135655879974, "num_tokens": 27551687.0, "reward": 0.8223565816879272, "reward_std": 0.24511127173900604, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.7312041521072388, "rewards/final_brier_reward_step": 0.605696439743042, "rewards/format_reward_step": 0.9453125, "step": 84 }, { "adv/mean_abs_final_conf": 0.6229391694068909, "adv/mean_abs_reasoning": 0.5074774026870728, "adv/mean_abs_step_conf": 0.7801123261451721, "adv/ratio_final_to_reasoning": 1.2275210011489233, "adv/ratio_step_to_reasoning": 1.5372355931801263, "adv/std_final_conf": 0.8200711011886597, "adv/std_reasoning": 0.739483654499054, "adv/std_step_conf": 0.9316701292991638, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6454227812718379, "calib/avg_num_step_conf": 10.38671875, "calib/ece": 0.42203319502074677, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.9875518672199171, "calib/gap": 0.013357092941998605, "calib/mean_conf": 0.9821991701244813, "calib/mu_c": 0.988074074074074, "calib/mu_w": 0.9747169811320754, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42203319502074677, "calib/std_conf": 0.025599406748347184, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8299927272727274, "calib/step_q_c_n": 1375.0, "calib/step_q_gap": 0.008754409515718042, "calib/step_q_w": 0.8212383177570094, "calib/step_q_w_n": 1284.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2992.0, "completions/max_terminated_length": 2992.0, "completions/mean_length": 896.77734375, "completions/mean_terminated_length": 929.4534912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 476.0, "epoch": 0.09066666666666667, "grad_norm": 0.030835958197712898, "kl": 0.145050048828125, "learning_rate": 3.1944444444444443e-06, "loss": -0.0587, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.015947245061397552, "mask/share_reasoning": 0.8292930126190186, "mask/share_step_conf": 0.11960351467132568, "num_tokens": 27889086.0, "reward": 0.7618950605392456, "reward_std": 0.23239392042160034, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.6826740503311157, "rewards/final_brier_reward_step": 0.5473659634590149, "rewards/format_reward_step": 0.94140625, "step": 85 }, { "adv/mean_abs_final_conf": 0.5596708059310913, "adv/mean_abs_reasoning": 0.398525208234787, "adv/mean_abs_step_conf": 0.7483077049255371, "adv/ratio_final_to_reasoning": 1.4043548422195844, "adv/ratio_step_to_reasoning": 1.8776922750760585, "adv/std_final_conf": 0.7832596898078918, "adv/std_reasoning": 0.6817492246627808, "adv/std_step_conf": 0.9316436648368835, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5768398268398268, "calib/avg_num_step_conf": 11.32421875, "calib/ece": 0.43963114754098376, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9877049180327869, "calib/gap": 0.007843614718614389, "calib/mean_conf": 0.9789754098360656, "calib/mu_c": 0.9825757575757574, "calib/mu_w": 0.974732142857143, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4388114754098362, "calib/std_conf": 0.03275867126219898, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.82452216066482, "calib/step_q_c_n": 1444.0, "calib/step_q_gap": 0.017264428706057156, "calib/step_q_w": 0.8072577319587628, "calib/step_q_w_n": 1455.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2710.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 880.98828125, "completions/mean_terminated_length": 920.5428466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 461.0, "epoch": 0.09173333333333333, "grad_norm": 0.021164577454328537, "kl": 0.142791748046875, "learning_rate": 3.1666666666666667e-06, "loss": -0.1413, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.016112584620714188, "mask/share_reasoning": 0.8145743608474731, "mask/share_step_conf": 0.12634429335594177, "num_tokens": 28220131.0, "reward": 0.7546988725662231, "reward_std": 0.1984507292509079, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.6793574690818787, "rewards/final_brier_reward_step": 0.5362902283668518, "rewards/format_reward_step": 0.953125, "step": 86 }, { "adv/mean_abs_final_conf": 0.6093852519989014, "adv/mean_abs_reasoning": 0.5044052600860596, "adv/mean_abs_step_conf": 0.754356324672699, "adv/ratio_final_to_reasoning": 1.208126283010868, "adv/ratio_step_to_reasoning": 1.4955361975091095, "adv/std_final_conf": 0.8310155272483826, "adv/std_reasoning": 0.7755122780799866, "adv/std_step_conf": 0.9313125014305115, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6357083678541839, "calib/avg_num_step_conf": 10.44140625, "calib/ece": 0.2630290456431535, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.9377593360995851, "calib/gap": 0.03550207125103577, "calib/mean_conf": 0.9684232365145228, "calib/mu_c": 0.9788823529411764, "calib/mu_w": 0.9433802816901407, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2630290456431535, "calib/std_conf": 0.07947965709286663, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8273033067274801, "calib/step_q_c_n": 1754.0, "calib/step_q_gap": 0.01316837745653332, "calib/step_q_w": 0.8141349292709468, "calib/step_q_w_n": 919.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2499.0, "completions/max_terminated_length": 2499.0, "completions/mean_length": 812.0078125, "completions/mean_terminated_length": 855.4485473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.0928, "grad_norm": 0.026130422949790955, "kl": 0.150726318359375, "learning_rate": 3.138888888888889e-06, "loss": -0.19, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.016727639362215996, "mask/share_reasoning": 0.8078656792640686, "mask/share_step_conf": 0.12462540715932846, "num_tokens": 28533501.0, "reward": 0.8941546678543091, "reward_std": 0.24176597595214844, "rewards/accuracy_reward_step": 0.6640625, "rewards/asymmetric_l2_reward": 0.7786323428153992, "rewards/final_brier_reward_step": 0.6885831952095032, "rewards/format_reward_step": 0.94140625, "step": 87 }, { "adv/mean_abs_final_conf": 0.6701643466949463, "adv/mean_abs_reasoning": 0.46769726276397705, "adv/mean_abs_step_conf": 0.7661531567573547, "adv/ratio_final_to_reasoning": 1.4329020074533643, "adv/ratio_step_to_reasoning": 1.638139065064388, "adv/std_final_conf": 0.8791421055793762, "adv/std_reasoning": 0.7394110560417175, "adv/std_step_conf": 0.9323065876960754, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5340203274985884, "calib/avg_num_step_conf": 10.29296875, "calib/ece": 0.3519105691056911, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9390243902439024, "calib/gap": -0.000447487295313187, "calib/mean_conf": 0.968089430894309, "calib/mu_c": 0.967922077922078, "calib/mu_w": 0.9683695652173911, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34699186991869924, "calib/std_conf": 0.06487643140392849, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8267061923583663, "calib/step_q_c_n": 1518.0, "calib/step_q_gap": 0.013993569260783545, "calib/step_q_w": 0.8127126230975827, "calib/step_q_w_n": 1117.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 887.2734375, "completions/mean_terminated_length": 912.2168579101562, "completions/min_length": 0.0, "completions/min_terminated_length": 413.0, "epoch": 0.09386666666666667, "grad_norm": 0.030685633420944214, "kl": 0.137176513671875, "learning_rate": 3.1111111111111116e-06, "loss": -0.0492, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.016321662813425064, "mask/share_reasoning": 0.8399770259857178, "mask/share_step_conf": 0.11635757982730865, "num_tokens": 28870491.0, "reward": 0.8365731835365295, "reward_std": 0.23255041241645813, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.7413724660873413, "rewards/final_brier_reward_step": 0.619273841381073, "rewards/format_reward_step": 0.9609375, "step": 88 }, { "adv/mean_abs_final_conf": 0.691696047782898, "adv/mean_abs_reasoning": 0.44166111946105957, "adv/mean_abs_step_conf": 0.7274096012115479, "adv/ratio_final_to_reasoning": 1.5661239291947306, "adv/ratio_step_to_reasoning": 1.6469858204842098, "adv/std_final_conf": 0.8953777551651001, "adv/std_reasoning": 0.7394091486930847, "adv/std_step_conf": 0.9307010769844055, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5642153781688665, "calib/avg_num_step_conf": 10.90625, "calib/ece": 0.4304583333333334, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.9291666666666667, "calib/gap": 0.0010580347789652667, "calib/mean_conf": 0.9635416666666666, "calib/mu_c": 0.9640310077519381, "calib/mu_w": 0.9629729729729728, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4282500000000001, "calib/std_conf": 0.07483007370406335, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8260374531835207, "calib/step_q_c_n": 1335.0, "calib/step_q_gap": 0.013703891069588026, "calib/step_q_w": 0.8123335621139327, "calib/step_q_w_n": 1457.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 898.11328125, "completions/mean_terminated_length": 927.0846557617188, "completions/min_length": 0.0, "completions/min_terminated_length": 491.0, "epoch": 0.09493333333333333, "grad_norm": 0.02352975867688656, "kl": 0.142669677734375, "learning_rate": 3.0833333333333336e-06, "loss": -0.0897, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.01606300100684166, "mask/share_reasoning": 0.8333423137664795, "mask/share_step_conf": 0.11934471130371094, "num_tokens": 29209296.0, "reward": 0.7457454204559326, "reward_std": 0.2067442238330841, "rewards/accuracy_reward_step": 0.50390625, "rewards/asymmetric_l2_reward": 0.6747016906738281, "rewards/final_brier_reward_step": 0.5292890071868896, "rewards/format_reward_step": 0.93359375, "step": 89 }, { "adv/mean_abs_final_conf": 0.6971824169158936, "adv/mean_abs_reasoning": 0.46878379583358765, "adv/mean_abs_step_conf": 0.748717188835144, "adv/ratio_final_to_reasoning": 1.4872152645041181, "adv/ratio_step_to_reasoning": 1.597148185345829, "adv/std_final_conf": 0.8981074094772339, "adv/std_reasoning": 0.7394995093345642, "adv/std_step_conf": 0.9299719929695129, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5588279042094303, "calib/avg_num_step_conf": 10.8828125, "calib/ece": 0.31391836734693884, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.926530612244898, "calib/gap": -0.0013595121225642437, "calib/mean_conf": 0.9640408163265306, "calib/mu_c": 0.9635802469135802, "calib/mu_w": 0.9649397590361445, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3083673469387756, "calib/std_conf": 0.07823824399757069, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8285641605426795, "calib/step_q_c_n": 1769.0, "calib/step_q_gap": 0.015279991417802341, "calib/step_q_w": 0.8132841691248771, "calib/step_q_w_n": 1017.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 905.1875, "completions/mean_terminated_length": 923.2191772460938, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.096, "grad_norm": 0.058208346366882324, "kl": 0.141021728515625, "learning_rate": 3.055555555555556e-06, "loss": -0.073, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.016423799097537994, "mask/share_reasoning": 0.8371864557266235, "mask/share_step_conf": 0.12685847282409668, "num_tokens": 29544344.0, "reward": 0.8589211702346802, "reward_std": 0.24496112763881683, "rewards/accuracy_reward_step": 0.6328125, "rewards/asymmetric_l2_reward": 0.7514224052429199, "rewards/final_brier_reward_step": 0.6484510898590088, "rewards/format_reward_step": 0.95703125, "step": 90 }, { "adv/mean_abs_final_conf": 0.6069288849830627, "adv/mean_abs_reasoning": 0.4890816807746887, "adv/mean_abs_step_conf": 0.7374441623687744, "adv/ratio_final_to_reasoning": 1.2409560792007341, "adv/ratio_step_to_reasoning": 1.5078139119843703, "adv/std_final_conf": 0.8355866074562073, "adv/std_reasoning": 0.7576850652694702, "adv/std_step_conf": 0.9319037795066833, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5358258928571429, "calib/avg_num_step_conf": 11.01953125, "calib/ece": 0.3284836065573772, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9467213114754098, "calib/gap": -0.0008928571428568066, "calib/mean_conf": 0.9690573770491804, "calib/mu_c": 0.96875, "calib/mu_w": 0.9696428571428568, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32090163934426247, "calib/std_conf": 0.06612467102156572, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8274085952533674, "calib/step_q_c_n": 1559.0, "calib/step_q_gap": 0.01514235119631513, "calib/step_q_w": 0.8122662440570523, "calib/step_q_w_n": 1262.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 904.19140625, "completions/mean_terminated_length": 937.1376953125, "completions/min_length": 0.0, "completions/min_terminated_length": 541.0, "epoch": 0.09706666666666666, "grad_norm": 0.037474095821380615, "kl": 0.14019775390625, "learning_rate": 3.0277777777777776e-06, "loss": -0.1753, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.015775000676512718, "mask/share_reasoning": 0.8341672420501709, "mask/share_step_conf": 0.11490151286125183, "num_tokens": 29883529.0, "reward": 0.8479245901107788, "reward_std": 0.23241955041885376, "rewards/accuracy_reward_step": 0.625, "rewards/asymmetric_l2_reward": 0.7403823137283325, "rewards/final_brier_reward_step": 0.6398417949676514, "rewards/format_reward_step": 0.953125, "step": 91 }, { "adv/mean_abs_final_conf": 0.6295256018638611, "adv/mean_abs_reasoning": 0.48855841159820557, "adv/mean_abs_step_conf": 0.7306718230247498, "adv/ratio_final_to_reasoning": 1.2885370242721113, "adv/ratio_step_to_reasoning": 1.4955669694326341, "adv/std_final_conf": 0.8479897975921631, "adv/std_reasoning": 0.7754259705543518, "adv/std_step_conf": 0.929085910320282, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5820178448867536, "calib/avg_num_step_conf": 10.65234375, "calib/ece": 0.3611646586345382, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9919678714859438, "calib/gap": 0.007409059711736665, "calib/mean_conf": 0.9836546184738956, "calib/mu_c": 0.9864516129032257, "calib/mu_w": 0.979042553191489, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3611646586345382, "calib/std_conf": 0.0217298396792282, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8437311557788943, "calib/step_q_c_n": 1592.0, "calib/step_q_gap": 0.010224547849378984, "calib/step_q_w": 0.8335066079295154, "calib/step_q_w_n": 1135.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2505.0, "completions/max_terminated_length": 2505.0, "completions/mean_length": 826.0078125, "completions/mean_terminated_length": 839.1190795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 471.0, "epoch": 0.09813333333333334, "grad_norm": 0.015464968979358673, "kl": 0.1487884521484375, "learning_rate": 3e-06, "loss": -0.0939, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01752406731247902, "mask/share_reasoning": 0.833724856376648, "mask/share_step_conf": 0.13312605023384094, "num_tokens": 30201707.0, "reward": 0.844549298286438, "reward_std": 0.23641173541545868, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.753333330154419, "rewards/final_brier_reward_step": 0.6201401948928833, "rewards/format_reward_step": 0.97265625, "step": 92 }, { "adv/mean_abs_final_conf": 0.5908430218696594, "adv/mean_abs_reasoning": 0.513970673084259, "adv/mean_abs_step_conf": 0.7497128844261169, "adv/ratio_final_to_reasoning": 1.149565632459341, "adv/ratio_step_to_reasoning": 1.458668604430687, "adv/std_final_conf": 0.8375146985054016, "adv/std_reasoning": 0.79283207654953, "adv/std_step_conf": 0.9285351634025574, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6064102564102565, "calib/avg_num_step_conf": 11.14453125, "calib/ece": 0.34967479674796753, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": 0.008128205128205113, "calib/mean_conf": 0.9838211382113821, "calib/mu_c": 0.9867948717948718, "calib/mu_w": 0.9786666666666667, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34967479674796753, "calib/std_conf": 0.0176699793735646, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8417461263408821, "calib/step_q_c_n": 1678.0, "calib/step_q_gap": 0.014946126340882104, "calib/step_q_w": 0.8268, "calib/step_q_w_n": 1175.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2862.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 850.3984375, "completions/mean_terminated_length": 877.8306274414062, "completions/min_length": 0.0, "completions/min_terminated_length": 438.0, "epoch": 0.0992, "grad_norm": 0.02208016626536846, "kl": 0.137359619140625, "learning_rate": 2.9722222222222225e-06, "loss": -0.1543, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.016855405643582344, "mask/share_reasoning": 0.8227949738502502, "mask/share_step_conf": 0.12909963726997375, "num_tokens": 30525185.0, "reward": 0.8347985744476318, "reward_std": 0.2563871741294861, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.7325701117515564, "rewards/final_brier_reward_step": 0.6237456798553467, "rewards/format_reward_step": 0.95703125, "step": 93 }, { "adv/mean_abs_final_conf": 0.5753787755966187, "adv/mean_abs_reasoning": 0.5310263633728027, "adv/mean_abs_step_conf": 0.7260737419128418, "adv/ratio_final_to_reasoning": 1.0835220532971517, "adv/ratio_step_to_reasoning": 1.3673026275027096, "adv/std_final_conf": 0.8271228075027466, "adv/std_reasoning": 0.7928115725517273, "adv/std_step_conf": 0.931684136390686, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5608802961744138, "calib/avg_num_step_conf": 10.5078125, "calib/ece": 0.4028163265306123, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00356026326614578, "calib/mean_conf": 0.9864897959183674, "calib/mu_c": 0.9879720279720278, "calib/mu_w": 0.984411764705882, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4028163265306123, "calib/std_conf": 0.009472026374109198, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8471638655462185, "calib/step_q_c_n": 1428.0, "calib/step_q_gap": 0.01842377045905519, "calib/step_q_w": 0.8287400950871633, "calib/step_q_w_n": 1262.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 820.51953125, "completions/mean_terminated_length": 843.5863037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 470.0, "epoch": 0.10026666666666667, "grad_norm": 0.023399489000439644, "kl": 0.1503753662109375, "learning_rate": 2.944444444444445e-06, "loss": -0.0828, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.0174710713326931, "mask/share_reasoning": 0.8291428685188293, "mask/share_step_conf": 0.12604230642318726, "num_tokens": 30843918.0, "reward": 0.7747670412063599, "reward_std": 0.24693280458450317, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.6803410053253174, "rewards/final_brier_reward_step": 0.5668492317199707, "rewards/format_reward_step": 0.953125, "step": 94 }, { "adv/mean_abs_final_conf": 0.4851650297641754, "adv/mean_abs_reasoning": 0.40888628363609314, "adv/mean_abs_step_conf": 0.7229888439178467, "adv/ratio_final_to_reasoning": 1.1865524699184333, "adv/ratio_step_to_reasoning": 1.7681905039429089, "adv/std_final_conf": 0.7402144074440002, "adv/std_reasoning": 0.7013313174247742, "adv/std_step_conf": 0.92943274974823, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5654044117647059, "calib/avg_num_step_conf": 10.4921875, "calib/ece": 0.30435999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0036544117647059338, "calib/mean_conf": 0.98436, "calib/mu_c": 0.9855294117647059, "calib/mu_w": 0.9818749999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30435999999999996, "calib/std_conf": 0.011618536913054079, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8448533640023004, "calib/step_q_c_n": 1739.0, "calib/step_q_gap": 0.004853364002300409, "calib/step_q_w": 0.84, "calib/step_q_w_n": 947.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 849.15234375, "completions/mean_terminated_length": 866.0677490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 387.0, "epoch": 0.10133333333333333, "grad_norm": 0.013517041690647602, "kl": 0.136505126953125, "learning_rate": 2.916666666666667e-06, "loss": -0.0705, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017546728253364563, "mask/share_reasoning": 0.8325512409210205, "mask/share_step_conf": 0.13037078082561493, "num_tokens": 31167429.0, "reward": 0.871839165687561, "reward_std": 0.1962461769580841, "rewards/accuracy_reward_step": 0.6640625, "rewards/asymmetric_l2_reward": 0.7405334711074829, "rewards/final_brier_reward_step": 0.6750198602676392, "rewards/format_reward_step": 0.9765625, "step": 95 }, { "adv/mean_abs_final_conf": 0.5261593461036682, "adv/mean_abs_reasoning": 0.468733549118042, "adv/mean_abs_step_conf": 0.7362979650497437, "adv/ratio_final_to_reasoning": 1.1225126665110217, "adv/ratio_step_to_reasoning": 1.570824120516111, "adv/std_final_conf": 0.7908560633659363, "adv/std_reasoning": 0.7574588060379028, "adv/std_step_conf": 0.9261574149131775, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6119724025974026, "calib/avg_num_step_conf": 10.1875, "calib/ece": 0.27158536585365856, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": 0.006435064935065293, "calib/mean_conf": 0.9870325203252033, "calib/mu_c": 0.9888636363636365, "calib/mu_w": 0.9824285714285712, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27158536585365856, "calib/std_conf": 0.010346386949860982, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.852012910798122, "calib/step_q_c_n": 1704.0, "calib/step_q_gap": 0.012189901948564508, "calib/step_q_w": 0.8398230088495575, "calib/step_q_w_n": 904.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 757.72265625, "completions/mean_terminated_length": 782.165283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 462.0, "epoch": 0.1024, "grad_norm": 0.015436707064509392, "kl": 0.1490325927734375, "learning_rate": 2.888888888888889e-06, "loss": -0.1416, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.018313266336917877, "mask/share_reasoning": 0.8223634958267212, "mask/share_step_conf": 0.12807327508926392, "num_tokens": 31467222.0, "reward": 0.9001051187515259, "reward_std": 0.2061901092529297, "rewards/accuracy_reward_step": 0.6875, "rewards/asymmetric_l2_reward": 0.7736778259277344, "rewards/final_brier_reward_step": 0.6968449354171753, "rewards/format_reward_step": 0.9609375, "step": 96 }, { "adv/mean_abs_final_conf": 0.6138774156570435, "adv/mean_abs_reasoning": 0.5735644102096558, "adv/mean_abs_step_conf": 0.7456457614898682, "adv/ratio_final_to_reasoning": 1.0702850538314468, "adv/ratio_step_to_reasoning": 1.3000209709966337, "adv/std_final_conf": 0.8496063351631165, "adv/std_reasoning": 0.8264009356498718, "adv/std_step_conf": 0.9341704845428467, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5817634746206175, "calib/avg_num_step_conf": 9.87109375, "calib/ece": 0.3716929133858268, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.004733124018838186, "calib/mean_conf": 0.9858661417322835, "calib/mu_c": 0.9876923076923075, "calib/mu_w": 0.9829591836734693, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3716929133858268, "calib/std_conf": 0.010302405654948981, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8512392880685564, "calib/step_q_c_n": 1517.0, "calib/step_q_gap": 0.008981862325982082, "calib/step_q_w": 0.8422574257425743, "calib/step_q_w_n": 1010.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2489.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 841.16796875, "completions/mean_terminated_length": 844.4667358398438, "completions/min_length": 0.0, "completions/min_terminated_length": 430.0, "epoch": 0.10346666666666667, "grad_norm": 0.014179172925651073, "kl": 0.142333984375, "learning_rate": 2.861111111111111e-06, "loss": 0.023, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.017768729478120804, "mask/share_reasoning": 0.8533096313476562, "mask/share_step_conf": 0.12501543760299683, "num_tokens": 31787633.0, "reward": 0.8328101634979248, "reward_std": 0.26197588443756104, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.7231894731521606, "rewards/final_brier_reward_step": 0.622118353843689, "rewards/format_reward_step": 0.9921875, "step": 97 }, { "adv/mean_abs_final_conf": 0.6074684858322144, "adv/mean_abs_reasoning": 0.4974590241909027, "adv/mean_abs_step_conf": 0.7617883682250977, "adv/ratio_final_to_reasoning": 1.2211427600901152, "adv/ratio_step_to_reasoning": 1.5313590289453811, "adv/std_final_conf": 0.8122832179069519, "adv/std_reasoning": 0.7577930092811584, "adv/std_step_conf": 0.9296541810035706, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5782942547648431, "calib/avg_num_step_conf": 10.1015625, "calib/ece": 0.4033061224489797, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": 0.0021644042232277227, "calib/mean_conf": 0.9837142857142857, "calib/mu_c": 0.9846153846153843, "calib/mu_w": 0.9824509803921566, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40167346938775517, "calib/std_conf": 0.02682673050126972, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8541364902506964, "calib/step_q_c_n": 1436.0, "calib/step_q_gap": 0.012719098946348595, "calib/step_q_w": 0.8414173913043478, "calib/step_q_w_n": 1150.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2472.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 837.97265625, "completions/mean_terminated_length": 858.0840454101562, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.10453333333333334, "grad_norm": 0.018236543983221054, "kl": 0.1473541259765625, "learning_rate": 2.8333333333333335e-06, "loss": -0.1734, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.0168771892786026, "mask/share_reasoning": 0.8379881381988525, "mask/share_step_conf": 0.12169711291790009, "num_tokens": 32108338.0, "reward": 0.7924503087997437, "reward_std": 0.26125431060791016, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.7101399898529053, "rewards/final_brier_reward_step": 0.5716354846954346, "rewards/format_reward_step": 0.95703125, "step": 98 }, { "adv/mean_abs_final_conf": 0.6645784378051758, "adv/mean_abs_reasoning": 0.5807794332504272, "adv/mean_abs_step_conf": 0.7467468976974487, "adv/ratio_final_to_reasoning": 1.1442871419977008, "adv/ratio_step_to_reasoning": 1.2857667729694857, "adv/std_final_conf": 0.8672755360603333, "adv/std_reasoning": 0.8265784382820129, "adv/std_step_conf": 0.9330686330795288, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.609092894583576, "calib/avg_num_step_conf": 10.4765625, "calib/ece": 0.5567932489451476, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0064130751310423895, "calib/mean_conf": 0.9829535864978902, "calib/mu_c": 0.9866336633663365, "calib/mu_w": 0.9802205882352941, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5567932489451476, "calib/std_conf": 0.012849746660799886, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8542406311637082, "calib/step_q_c_n": 1014.0, "calib/step_q_gap": 0.01385693811814459, "calib/step_q_w": 0.8403836930455636, "calib/step_q_w_n": 1668.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2788.0, "completions/max_terminated_length": 2788.0, "completions/mean_length": 880.34375, "completions/mean_terminated_length": 935.136962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 392.0, "epoch": 0.1056, "grad_norm": 0.02690640278160572, "kl": 0.131011962890625, "learning_rate": 2.805555555555556e-06, "loss": -0.2087, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.015637118369340897, "mask/share_reasoning": 0.8111535310745239, "mask/share_step_conf": 0.11461564898490906, "num_tokens": 32439506.0, "reward": 0.6306707262992859, "reward_std": 0.251910537481308, "rewards/accuracy_reward_step": 0.39453125, "rewards/asymmetric_l2_reward": 0.5821539163589478, "rewards/final_brier_reward_step": 0.4151250123977661, "rewards/format_reward_step": 0.92578125, "step": 99 }, { "adv/mean_abs_final_conf": 0.5684931874275208, "adv/mean_abs_reasoning": 0.4415876865386963, "adv/mean_abs_step_conf": 0.7473946809768677, "adv/ratio_final_to_reasoning": 1.2873846005162641, "adv/ratio_step_to_reasoning": 1.6925170328801131, "adv/std_final_conf": 0.7858427166938782, "adv/std_reasoning": 0.7207019925117493, "adv/std_step_conf": 0.9307321310043335, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5973707664884135, "calib/avg_num_step_conf": 9.8359375, "calib/ece": 0.3480497925311204, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.995850622406639, "calib/gap": 0.007260101010101105, "calib/mean_conf": 0.9829045643153527, "calib/mu_c": 0.9855555555555557, "calib/mu_w": 0.9782954545454546, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3480497925311204, "calib/std_conf": 0.016867411307882017, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8569694915254237, "calib/step_q_c_n": 1475.0, "calib/step_q_gap": 0.016451754229162785, "calib/step_q_w": 0.8405177372962609, "calib/step_q_w_n": 1043.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 877.6484375, "completions/mean_terminated_length": 913.3251953125, "completions/min_length": 0.0, "completions/min_terminated_length": 479.0, "epoch": 0.10666666666666667, "grad_norm": 0.020915953442454338, "kl": 0.13641357421875, "learning_rate": 2.7777777777777783e-06, "loss": -0.1353, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.016081880778074265, "mask/share_reasoning": 0.8332370519638062, "mask/share_step_conf": 0.11161854863166809, "num_tokens": 32771592.0, "reward": 0.8156791925430298, "reward_std": 0.21008773148059845, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.7115105986595154, "rewards/final_brier_reward_step": 0.6120351552963257, "rewards/format_reward_step": 0.94140625, "step": 100 }, { "adv/mean_abs_final_conf": 0.6653465032577515, "adv/mean_abs_reasoning": 0.47705355286598206, "adv/mean_abs_step_conf": 0.7555596828460693, "adv/ratio_final_to_reasoning": 1.3946998177889396, "adv/ratio_step_to_reasoning": 1.5838047496070689, "adv/std_final_conf": 0.8447847962379456, "adv/std_reasoning": 0.7393490672111511, "adv/std_step_conf": 0.9337489604949951, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5492322198275863, "calib/avg_num_step_conf": 9.953125, "calib/ece": 0.456844262295082, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0027343750000000666, "calib/mean_conf": 0.9814344262295083, "calib/mu_c": 0.982734375, "calib/mu_w": 0.98, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.456844262295082, "calib/std_conf": 0.013519559422032574, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8568569131832798, "calib/step_q_c_n": 1244.0, "calib/step_q_gap": 0.010031759809046692, "calib/step_q_w": 0.8468251533742331, "calib/step_q_w_n": 1304.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 938.17578125, "completions/mean_terminated_length": 956.8645629882812, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.10773333333333333, "grad_norm": 0.014655107632279396, "kl": 0.13433837890625, "learning_rate": 2.7500000000000004e-06, "loss": -0.0587, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.015749523416161537, "mask/share_reasoning": 0.8516314029693604, "mask/share_step_conf": 0.11308782547712326, "num_tokens": 33118757.0, "reward": 0.7222472429275513, "reward_std": 0.224051371216774, "rewards/accuracy_reward_step": 0.5, "rewards/asymmetric_l2_reward": 0.6362471580505371, "rewards/final_brier_reward_step": 0.5176222324371338, "rewards/format_reward_step": 0.953125, "step": 101 }, { "adv/mean_abs_final_conf": 0.5480314493179321, "adv/mean_abs_reasoning": 0.4374270439147949, "adv/mean_abs_step_conf": 0.7339659929275513, "adv/ratio_final_to_reasoning": 1.2528522343138013, "adv/ratio_step_to_reasoning": 1.677916359168955, "adv/std_final_conf": 0.782140851020813, "adv/std_reasoning": 0.7205737829208374, "adv/std_step_conf": 0.9318551421165466, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5557624890446977, "calib/avg_num_step_conf": 10.27734375, "calib/ece": 0.32514170040485824, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.003519573473561355, "calib/mean_conf": 0.9850607287449392, "calib/mu_c": 0.9862576687116564, "calib/mu_w": 0.9827380952380951, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32514170040485824, "calib/std_conf": 0.010830697314911472, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8625914445133293, "calib/step_q_c_n": 1613.0, "calib/step_q_gap": 0.014035452371875423, "calib/step_q_w": 0.8485559921414538, "calib/step_q_w_n": 1018.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 792.71875, "completions/mean_terminated_length": 811.7440185546875, "completions/min_length": 0.0, "completions/min_terminated_length": 454.0, "epoch": 0.1088, "grad_norm": 0.0197947658598423, "kl": 0.147491455078125, "learning_rate": 2.7222222222222224e-06, "loss": -0.1023, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018000053241848946, "mask/share_reasoning": 0.8290439248085022, "mask/share_step_conf": 0.12951849400997162, "num_tokens": 33428389.0, "reward": 0.8536818027496338, "reward_std": 0.2171253263950348, "rewards/accuracy_reward_step": 0.640625, "rewards/asymmetric_l2_reward": 0.7385513782501221, "rewards/final_brier_reward_step": 0.6477183103561401, "rewards/format_reward_step": 0.96484375, "step": 102 }, { "adv/mean_abs_final_conf": 0.5411339998245239, "adv/mean_abs_reasoning": 0.37070244550704956, "adv/mean_abs_step_conf": 0.7217574715614319, "adv/ratio_final_to_reasoning": 1.4597529808155347, "adv/ratio_step_to_reasoning": 1.9469994878890173, "adv/std_final_conf": 0.790514349937439, "adv/std_reasoning": 0.6816906929016113, "adv/std_step_conf": 0.9258291125297546, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6147619751626257, "calib/avg_num_step_conf": 9.95703125, "calib/ece": 0.3512448132780084, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006833234772324115, "calib/mean_conf": 0.9819502074688797, "calib/mu_c": 0.9844736842105264, "calib/mu_w": 0.9776404494382023, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3512448132780084, "calib/std_conf": 0.013292652712665354, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8647315436241612, "calib/step_q_c_n": 1490.0, "calib/step_q_gap": 0.012096982717645477, "calib/step_q_w": 0.8526345609065157, "calib/step_q_w_n": 1059.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2746.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 918.1015625, "completions/mean_terminated_length": 951.5546875, "completions/min_length": 0.0, "completions/min_terminated_length": 389.0, "epoch": 0.10986666666666667, "grad_norm": 0.01717713288962841, "kl": 0.1207427978515625, "learning_rate": 2.6944444444444444e-06, "loss": -0.163, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01587964966893196, "mask/share_reasoning": 0.838042140007019, "mask/share_step_conf": 0.1109219565987587, "num_tokens": 33767975.0, "reward": 0.8112150430679321, "reward_std": 0.18265941739082336, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.7065750360488892, "rewards/final_brier_reward_step": 0.6088237762451172, "rewards/format_reward_step": 0.94140625, "step": 103 }, { "adv/mean_abs_final_conf": 0.5487852096557617, "adv/mean_abs_reasoning": 0.40357086062431335, "adv/mean_abs_step_conf": 0.7396882772445679, "adv/ratio_final_to_reasoning": 1.3598236721224264, "adv/ratio_step_to_reasoning": 1.8328584876031184, "adv/std_final_conf": 0.7952535152435303, "adv/std_reasoning": 0.7012667059898376, "adv/std_step_conf": 0.9316074252128601, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6060175619834711, "calib/avg_num_step_conf": 10.015625, "calib/ece": 0.46823293172690783, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006382360537190213, "calib/mean_conf": 0.9822891566265062, "calib/mu_c": 0.9853906250000002, "calib/mu_w": 0.97900826446281, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46823293172690783, "calib/std_conf": 0.013048439369190952, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8659394409937887, "calib/step_q_c_n": 1288.0, "calib/step_q_gap": 0.005853234097237148, "calib/step_q_w": 0.8600862068965516, "calib/step_q_w_n": 1276.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2665.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 884.9296875, "completions/mean_terminated_length": 895.4229736328125, "completions/min_length": 0.0, "completions/min_terminated_length": 400.0, "epoch": 0.11093333333333333, "grad_norm": 0.0153735326603055, "kl": 0.131439208984375, "learning_rate": 2.666666666666667e-06, "loss": -0.023, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.01663128286600113, "mask/share_reasoning": 0.8520343899726868, "mask/share_step_conf": 0.11961561441421509, "num_tokens": 34101197.0, "reward": 0.7398614883422852, "reward_std": 0.18474718928337097, "rewards/accuracy_reward_step": 0.5, "rewards/asymmetric_l2_reward": 0.6658186316490173, "rewards/final_brier_reward_step": 0.5193730592727661, "rewards/format_reward_step": 0.97265625, "step": 104 }, { "adv/mean_abs_final_conf": 0.7172216176986694, "adv/mean_abs_reasoning": 0.5644861459732056, "adv/mean_abs_step_conf": 0.7502240538597107, "adv/ratio_final_to_reasoning": 1.270574349459258, "adv/ratio_step_to_reasoning": 1.3290389130211204, "adv/std_final_conf": 0.8941024541854858, "adv/std_reasoning": 0.8100602030754089, "adv/std_step_conf": 0.9334349632263184, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6729426263179515, "calib/avg_num_step_conf": 10.48046875, "calib/ece": 0.42987654320987667, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.009874709023688921, "calib/mean_conf": 0.9813168724279837, "calib/mu_c": 0.9857462686567162, "calib/mu_w": 0.9758715596330273, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42987654320987667, "calib/std_conf": 0.013391631934015873, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8656762917933131, "calib/step_q_c_n": 1316.0, "calib/step_q_gap": 0.018456101595800245, "calib/step_q_w": 0.8472201901975128, "calib/step_q_w_n": 1367.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2609.0, "completions/max_terminated_length": 2609.0, "completions/mean_length": 896.37109375, "completions/mean_terminated_length": 925.2862548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 530.0, "epoch": 0.112, "grad_norm": 0.016667574644088745, "kl": 0.126190185546875, "learning_rate": 2.6388888888888893e-06, "loss": -0.151, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.016032157465815544, "mask/share_reasoning": 0.8382542133331299, "mask/share_step_conf": 0.11446364969015121, "num_tokens": 34436428.0, "reward": 0.7515039443969727, "reward_std": 0.2816525399684906, "rewards/accuracy_reward_step": 0.5234375, "rewards/asymmetric_l2_reward": 0.6649938821792603, "rewards/final_brier_reward_step": 0.543482780456543, "rewards/format_reward_step": 0.94921875, "step": 105 }, { "adv/mean_abs_final_conf": 0.5788710117340088, "adv/mean_abs_reasoning": 0.46618354320526123, "adv/mean_abs_step_conf": 0.7207705974578857, "adv/ratio_final_to_reasoning": 1.2417233945110138, "adv/ratio_step_to_reasoning": 1.5461090550348524, "adv/std_final_conf": 0.8257135152816772, "adv/std_reasoning": 0.757561445236206, "adv/std_step_conf": 0.9304320812225342, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6588401697312588, "calib/avg_num_step_conf": 10.21484375, "calib/ece": 0.4016597510373445, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.995850622406639, "calib/gap": 0.009882602545968955, "calib/mean_conf": 0.9825726141078839, "calib/mu_c": 0.9867142857142857, "calib/mu_w": 0.9768316831683167, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4016597510373445, "calib/std_conf": 0.016994626836127357, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8627635542168675, "calib/step_q_c_n": 1328.0, "calib/step_q_gap": 0.011737913191226412, "calib/step_q_w": 0.851025641025641, "calib/step_q_w_n": 1287.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 841.17578125, "completions/mean_terminated_length": 875.369873046875, "completions/min_length": 0.0, "completions/min_terminated_length": 518.0, "epoch": 0.11306666666666666, "grad_norm": 0.03570663556456566, "kl": 0.138427734375, "learning_rate": 2.6111111111111113e-06, "loss": -0.1133, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.016544461250305176, "mask/share_reasoning": 0.8300613164901733, "mask/share_step_conf": 0.11433173716068268, "num_tokens": 34756353.0, "reward": 0.7651780843734741, "reward_std": 0.2139662802219391, "rewards/accuracy_reward_step": 0.546875, "rewards/asymmetric_l2_reward": 0.6681015491485596, "rewards/final_brier_reward_step": 0.5645984411239624, "rewards/format_reward_step": 0.94140625, "step": 106 }, { "adv/mean_abs_final_conf": 0.6399675011634827, "adv/mean_abs_reasoning": 0.5369926691055298, "adv/mean_abs_step_conf": 0.7458997368812561, "adv/ratio_final_to_reasoning": 1.1917620816490444, "adv/ratio_step_to_reasoning": 1.3890315078671436, "adv/std_final_conf": 0.8324251770973206, "adv/std_reasoning": 0.7928897142410278, "adv/std_step_conf": 0.9346669912338257, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5739526823387583, "calib/avg_num_step_conf": 11.05859375, "calib/ece": 0.3011336032388664, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.004510247136830126, "calib/mean_conf": 0.9812955465587044, "calib/mu_c": 0.9827380952380954, "calib/mu_w": 0.9782278481012653, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3011336032388664, "calib/std_conf": 0.013525418430851373, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8619087136929461, "calib/step_q_c_n": 1687.0, "calib/step_q_gap": -0.00026785973362730697, "calib/step_q_w": 0.8621765734265734, "calib/step_q_w_n": 1144.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2013.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 829.03125, "completions/mean_terminated_length": 852.3373413085938, "completions/min_length": 0.0, "completions/min_terminated_length": 446.0, "epoch": 0.11413333333333334, "grad_norm": 0.01998072676360607, "kl": 0.144256591796875, "learning_rate": 2.5833333333333337e-06, "loss": -0.1303, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.017042508348822594, "mask/share_reasoning": 0.8300793170928955, "mask/share_step_conf": 0.12553441524505615, "num_tokens": 35073201.0, "reward": 0.8738058805465698, "reward_std": 0.2717360854148865, "rewards/accuracy_reward_step": 0.65625, "rewards/asymmetric_l2_reward": 0.7542195320129395, "rewards/final_brier_reward_step": 0.6691734194755554, "rewards/format_reward_step": 0.96484375, "step": 107 }, { "adv/mean_abs_final_conf": 0.5755144357681274, "adv/mean_abs_reasoning": 0.48384425044059753, "adv/mean_abs_step_conf": 0.7267765998840332, "adv/ratio_final_to_reasoning": 1.1894621776409524, "adv/ratio_step_to_reasoning": 1.5020879120134567, "adv/std_final_conf": 0.7753496766090393, "adv/std_reasoning": 0.7394925355911255, "adv/std_step_conf": 0.9305362701416016, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6644811397118342, "calib/avg_num_step_conf": 9.90625, "calib/ece": 0.2712653061224489, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.009406669904484688, "calib/mean_conf": 0.981469387755102, "calib/mu_c": 0.9841954022988506, "calib/mu_w": 0.9747887323943659, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2712653061224489, "calib/std_conf": 0.013381531654525757, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8594519740718917, "calib/step_q_c_n": 1697.0, "calib/step_q_gap": 0.0017642505915579365, "calib/step_q_w": 0.8576877234803337, "calib/step_q_w_n": 839.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2823.0, "completions/max_terminated_length": 2823.0, "completions/mean_length": 869.64453125, "completions/mean_terminated_length": 890.5160522460938, "completions/min_length": 0.0, "completions/min_terminated_length": 288.0, "epoch": 0.1152, "grad_norm": 0.03150971978902817, "kl": 0.1255035400390625, "learning_rate": 2.5555555555555557e-06, "loss": -0.1344, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01689765229821205, "mask/share_reasoning": 0.8402672410011292, "mask/share_step_conf": 0.11939756572246552, "num_tokens": 35399062.0, "reward": 0.8926793336868286, "reward_std": 0.23910677433013916, "rewards/accuracy_reward_step": 0.6796875, "rewards/asymmetric_l2_reward": 0.7657020092010498, "rewards/final_brier_reward_step": 0.6930941343307495, "rewards/format_reward_step": 0.953125, "step": 108 }, { "adv/mean_abs_final_conf": 0.6754946112632751, "adv/mean_abs_reasoning": 0.5071865320205688, "adv/mean_abs_step_conf": 0.7378177642822266, "adv/ratio_final_to_reasoning": 1.3318465073829693, "adv/ratio_step_to_reasoning": 1.4547266492721154, "adv/std_final_conf": 0.8642860651016235, "adv/std_reasoning": 0.7753763794898987, "adv/std_step_conf": 0.9282304048538208, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.6707766439909297, "calib/avg_num_step_conf": 10.73046875, "calib/ece": 0.44756302521008395, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.009970238095237782, "calib/mean_conf": 0.9769747899159663, "calib/mu_c": 0.9816666666666665, "calib/mu_w": 0.9716964285714287, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.44756302521008395, "calib/std_conf": 0.014755977102516214, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8645178849144635, "calib/step_q_c_n": 1286.0, "calib/step_q_gap": 0.017775927351150522, "calib/step_q_w": 0.8467419575633129, "calib/step_q_w_n": 1461.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 915.2734375, "completions/mean_terminated_length": 956.3673095703125, "completions/min_length": 0.0, "completions/min_terminated_length": 417.0, "epoch": 0.11626666666666667, "grad_norm": 0.01444100122898817, "kl": 0.1240234375, "learning_rate": 2.5277777777777778e-06, "loss": -0.2686, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.015557501465082169, "mask/share_reasoning": 0.8258721828460693, "mask/share_step_conf": 0.11560158431529999, "num_tokens": 35737972.0, "reward": 0.7213988304138184, "reward_std": 0.21585328876972198, "rewards/accuracy_reward_step": 0.4921875, "rewards/asymmetric_l2_reward": 0.6432522535324097, "rewards/final_brier_reward_step": 0.5159515142440796, "rewards/format_reward_step": 0.92578125, "step": 109 }, { "adv/mean_abs_final_conf": 0.6226210594177246, "adv/mean_abs_reasoning": 0.4714178144931793, "adv/mean_abs_step_conf": 0.7386190891265869, "adv/ratio_final_to_reasoning": 1.3207414744966812, "adv/ratio_step_to_reasoning": 1.5668035157319529, "adv/std_final_conf": 0.8406306505203247, "adv/std_reasoning": 0.7575274109840393, "adv/std_step_conf": 0.928333580493927, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5543624161073825, "calib/avg_num_step_conf": 10.32421875, "calib/ece": 0.38096385542168676, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.0036053691275168553, "calib/mean_conf": 0.9793574297188755, "calib/mu_c": 0.9808053691275167, "calib/mu_w": 0.9771999999999998, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38096385542168676, "calib/std_conf": 0.014848253687567507, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8625860904794058, "calib/step_q_c_n": 1481.0, "calib/step_q_gap": 0.004290049171316146, "calib/step_q_w": 0.8582960413080897, "calib/step_q_w_n": 1162.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 815.44140625, "completions/mean_terminated_length": 831.685302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 307.0, "epoch": 0.11733333333333333, "grad_norm": 0.01637006178498268, "kl": 0.1345367431640625, "learning_rate": 2.5e-06, "loss": -0.0706, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017808744683861732, "mask/share_reasoning": 0.8331002593040466, "mask/share_step_conf": 0.129559725522995, "num_tokens": 36051645.0, "reward": 0.8081911206245422, "reward_std": 0.22839653491973877, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.7062299251556396, "rewards/final_brier_reward_step": 0.5992147922515869, "rewards/format_reward_step": 0.97265625, "step": 110 }, { "adv/mean_abs_final_conf": 0.6984493136405945, "adv/mean_abs_reasoning": 0.542428731918335, "adv/mean_abs_step_conf": 0.7579163312911987, "adv/ratio_final_to_reasoning": 1.2876333286595687, "adv/ratio_step_to_reasoning": 1.3972643532557312, "adv/std_final_conf": 0.8653133511543274, "adv/std_reasoning": 0.7929964661598206, "adv/std_step_conf": 0.9306600689888, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6711769759450171, "calib/avg_num_step_conf": 10.12109375, "calib/ece": 0.3814107883817428, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.010099513172966867, "calib/mean_conf": 0.9789211618257262, "calib/mu_c": 0.9829861111111112, "calib/mu_w": 0.9728865979381444, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3814107883817428, "calib/std_conf": 0.014478414620961383, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8633705357142857, "calib/step_q_c_n": 1344.0, "calib/step_q_gap": 0.015615924647725876, "calib/step_q_w": 0.8477546110665598, "calib/step_q_w_n": 1247.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 873.1015625, "completions/mean_terminated_length": 904.9149780273438, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 0.1184, "grad_norm": 0.029058046638965607, "kl": 0.1292877197265625, "learning_rate": 2.4722222222222226e-06, "loss": -0.1011, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.016590723767876625, "mask/share_reasoning": 0.8334500789642334, "mask/share_step_conf": 0.11480294167995453, "num_tokens": 36382567.0, "reward": 0.7858656644821167, "reward_std": 0.26266515254974365, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.6896809339523315, "rewards/final_brier_reward_step": 0.5820503830909729, "rewards/format_reward_step": 0.9375, "step": 111 }, { "adv/mean_abs_final_conf": 0.6206170916557312, "adv/mean_abs_reasoning": 0.44810646772384644, "adv/mean_abs_step_conf": 0.7446877956390381, "adv/ratio_final_to_reasoning": 1.3849768667882685, "adv/ratio_step_to_reasoning": 1.6618546021477314, "adv/std_final_conf": 0.8321802020072937, "adv/std_reasoning": 0.7207315564155579, "adv/std_step_conf": 0.9297779202461243, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.758238510418206, "calib/avg_num_step_conf": 10.796875, "calib/ece": 0.40736170212765965, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.015019949756169715, "calib/mean_conf": 0.9775744680851065, "calib/mu_c": 0.9840298507462686, "calib/mu_w": 0.9690099009900989, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40736170212765965, "calib/std_conf": 0.014603620479184115, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8601647564469914, "calib/step_q_c_n": 1396.0, "calib/step_q_gap": 0.0030083236984532746, "calib/step_q_w": 0.8571564327485381, "calib/step_q_w_n": 1368.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2912.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 902.65234375, "completions/mean_terminated_length": 950.9423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 476.0, "epoch": 0.11946666666666667, "grad_norm": 0.03636123239994049, "kl": 0.1224365234375, "learning_rate": 2.4444444444444447e-06, "loss": -0.1538, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.015544177033007145, "mask/share_reasoning": 0.8192787170410156, "mask/share_step_conf": 0.11439579725265503, "num_tokens": 36721566.0, "reward": 0.7470136880874634, "reward_std": 0.20874658226966858, "rewards/accuracy_reward_step": 0.52734375, "rewards/asymmetric_l2_reward": 0.6588191986083984, "rewards/final_brier_reward_step": 0.5469269156455994, "rewards/format_reward_step": 0.9140625, "step": 112 }, { "adv/mean_abs_final_conf": 0.6355844736099243, "adv/mean_abs_reasoning": 0.5024422407150269, "adv/mean_abs_step_conf": 0.7353067994117737, "adv/ratio_final_to_reasoning": 1.2649901264380605, "adv/ratio_step_to_reasoning": 1.4634653295976003, "adv/std_final_conf": 0.8575465083122253, "adv/std_reasoning": 0.7753759026527405, "adv/std_step_conf": 0.9332528710365295, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.658067940552017, "calib/avg_num_step_conf": 9.56640625, "calib/ece": 0.34550607287449386, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.009125265392781134, "calib/mean_conf": 0.9811336032388663, "calib/mu_c": 0.9844585987261145, "calib/mu_w": 0.9753333333333334, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34550607287449386, "calib/std_conf": 0.013237566329213, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8643951093951093, "calib/step_q_c_n": 1554.0, "calib/step_q_gap": 0.008707958557120477, "calib/step_q_w": 0.8556871508379889, "calib/step_q_w_n": 895.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 800.5546875, "completions/mean_terminated_length": 810.0474853515625, "completions/min_length": 0.0, "completions/min_terminated_length": 472.0, "epoch": 0.12053333333333334, "grad_norm": 0.02966303937137127, "kl": 0.1484527587890625, "learning_rate": 2.4166666666666667e-06, "loss": -0.0225, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.018074627965688705, "mask/share_reasoning": 0.8422780632972717, "mask/share_step_conf": 0.12792859971523285, "num_tokens": 37031708.0, "reward": 0.829046368598938, "reward_std": 0.25141140818595886, "rewards/accuracy_reward_step": 0.62109375, "rewards/asymmetric_l2_reward": 0.7154799699783325, "rewards/final_brier_reward_step": 0.626206636428833, "rewards/format_reward_step": 0.9609375, "step": 113 }, { "adv/mean_abs_final_conf": 0.5766456127166748, "adv/mean_abs_reasoning": 0.3910996913909912, "adv/mean_abs_step_conf": 0.7413148283958435, "adv/ratio_final_to_reasoning": 1.4744210374233948, "adv/ratio_step_to_reasoning": 1.8954625756907957, "adv/std_final_conf": 0.7828994393348694, "adv/std_reasoning": 0.681779682636261, "adv/std_step_conf": 0.9287707805633545, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6706652929055049, "calib/avg_num_step_conf": 11.3125, "calib/ece": 0.3340163934426229, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00994701206947346, "calib/mean_conf": 0.9815573770491803, "calib/mu_c": 0.9850632911392404, "calib/mu_w": 0.975116279069767, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3340163934426229, "calib/std_conf": 0.013338007556571024, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.86858125, "calib/step_q_c_n": 1600.0, "calib/step_q_gap": 0.016721682098765234, "calib/step_q_w": 0.8518595679012347, "calib/step_q_w_n": 1296.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 825.6171875, "completions/mean_terminated_length": 855.700439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 372.0, "epoch": 0.1216, "grad_norm": 0.02331947349011898, "kl": 0.1299896240234375, "learning_rate": 2.388888888888889e-06, "loss": -0.1173, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01743999309837818, "mask/share_reasoning": 0.8186057209968567, "mask/share_step_conf": 0.12879803776741028, "num_tokens": 37348090.0, "reward": 0.8373966217041016, "reward_std": 0.20491275191307068, "rewards/accuracy_reward_step": 0.6171875, "rewards/asymmetric_l2_reward": 0.727318286895752, "rewards/final_brier_reward_step": 0.6334124803543091, "rewards/format_reward_step": 0.953125, "step": 114 }, { "adv/mean_abs_final_conf": 0.59438157081604, "adv/mean_abs_reasoning": 0.5020387768745422, "adv/mean_abs_step_conf": 0.7483762502670288, "adv/ratio_final_to_reasoning": 1.183935580666459, "adv/ratio_step_to_reasoning": 1.4906741963759613, "adv/std_final_conf": 0.8215389251708984, "adv/std_reasoning": 0.7576356530189514, "adv/std_step_conf": 0.9327325820922852, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5949225252913305, "calib/avg_num_step_conf": 10.0546875, "calib/ece": 0.43605577689243025, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.005359200922013141, "calib/mean_conf": 0.9818725099601593, "calib/mu_c": 0.9843065693430658, "calib/mu_w": 0.9789473684210527, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43605577689243025, "calib/std_conf": 0.014144941280175828, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8640315315315314, "calib/step_q_c_n": 1332.0, "calib/step_q_gap": -0.0008074378726554698, "calib/step_q_w": 0.8648389694041869, "calib/step_q_w_n": 1242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 865.6328125, "completions/mean_terminated_length": 879.373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 443.0, "epoch": 0.12266666666666666, "grad_norm": 0.02993854694068432, "kl": 0.129150390625, "learning_rate": 2.361111111111111e-06, "loss": -0.073, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.016907256096601486, "mask/share_reasoning": 0.8484659194946289, "mask/share_step_conf": 0.1190018355846405, "num_tokens": 37674956.0, "reward": 0.7629421949386597, "reward_std": 0.25496959686279297, "rewards/accuracy_reward_step": 0.53515625, "rewards/asymmetric_l2_reward": 0.6740585565567017, "rewards/final_brier_reward_step": 0.549481987953186, "rewards/format_reward_step": 0.9765625, "step": 115 }, { "adv/mean_abs_final_conf": 0.6308901309967041, "adv/mean_abs_reasoning": 0.4708402454853058, "adv/mean_abs_step_conf": 0.7694852352142334, "adv/ratio_final_to_reasoning": 1.3399239700642651, "adv/ratio_step_to_reasoning": 1.6342809319987237, "adv/std_final_conf": 0.8313889503479004, "adv/std_reasoning": 0.7394453883171082, "adv/std_step_conf": 0.9321046471595764, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5725085910652922, "calib/avg_num_step_conf": 10.48046875, "calib/ece": 0.3777732793522268, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0042591065292093555, "calib/mean_conf": 0.9850607287449393, "calib/mu_c": 0.9867333333333331, "calib/mu_w": 0.9824742268041238, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3777732793522268, "calib/std_conf": 0.011198266397935598, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8641393168117886, "calib/step_q_c_n": 1493.0, "calib/step_q_gap": 0.019441837820191932, "calib/step_q_w": 0.8446974789915966, "calib/step_q_w_n": 1190.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2885.0, "completions/max_terminated_length": 2885.0, "completions/mean_length": 854.0390625, "completions/mean_terminated_length": 881.5886840820312, "completions/min_length": 0.0, "completions/min_terminated_length": 394.0, "epoch": 0.12373333333333333, "grad_norm": 0.028440121561288834, "kl": 0.1218414306640625, "learning_rate": 2.3333333333333336e-06, "loss": -0.1257, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.016650334000587463, "mask/share_reasoning": 0.8338371515274048, "mask/share_step_conf": 0.11826249212026596, "num_tokens": 37998110.0, "reward": 0.8119787573814392, "reward_std": 0.24483713507652283, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.714918851852417, "rewards/final_brier_reward_step": 0.5988823771476746, "rewards/format_reward_step": 0.96484375, "step": 116 }, { "adv/mean_abs_final_conf": 0.6085239052772522, "adv/mean_abs_reasoning": 0.5667084455490112, "adv/mean_abs_step_conf": 0.741919994354248, "adv/ratio_final_to_reasoning": 1.0737865476624957, "adv/ratio_step_to_reasoning": 1.309174056221267, "adv/std_final_conf": 0.837278425693512, "adv/std_reasoning": 0.8099818229675293, "adv/std_step_conf": 0.9349244236946106, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5788172334372037, "calib/avg_num_step_conf": 9.9375, "calib/ece": 0.48271604938271606, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.9958847736625515, "calib/gap": 0.004240617802465763, "calib/mean_conf": 0.9847736625514404, "calib/mu_c": 0.9868852459016394, "calib/mu_w": 0.9826446280991736, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.48271604938271606, "calib/std_conf": 0.012316563747766475, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8623914893617022, "calib/step_q_c_n": 1175.0, "calib/step_q_gap": 0.001441891114806526, "calib/step_q_w": 0.8609495982468957, "calib/step_q_w_n": 1369.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 851.1015625, "completions/mean_terminated_length": 882.1134033203125, "completions/min_length": 0.0, "completions/min_terminated_length": 447.0, "epoch": 0.1248, "grad_norm": 0.01481497474014759, "kl": 0.1201629638671875, "learning_rate": 2.305555555555556e-06, "loss": -0.1201, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.016754884272813797, "mask/share_reasoning": 0.831017017364502, "mask/share_step_conf": 0.11707185953855515, "num_tokens": 38322592.0, "reward": 0.7045153379440308, "reward_std": 0.2681896686553955, "rewards/accuracy_reward_step": 0.4765625, "rewards/asymmetric_l2_reward": 0.6370803117752075, "rewards/final_brier_reward_step": 0.4883566200733185, "rewards/format_reward_step": 0.94140625, "step": 117 }, { "adv/mean_abs_final_conf": 0.44347405433654785, "adv/mean_abs_reasoning": 0.3869093060493469, "adv/mean_abs_step_conf": 0.7286380529403687, "adv/ratio_final_to_reasoning": 1.1461964015928492, "adv/ratio_step_to_reasoning": 1.8832270031970675, "adv/std_final_conf": 0.7195388674736023, "adv/std_reasoning": 0.6816406846046448, "adv/std_step_conf": 0.9258517026901245, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6229621921609434, "calib/avg_num_step_conf": 10.890625, "calib/ece": 0.3604435483870969, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00698924731182804, "calib/mean_conf": 0.9854435483870969, "calib/mu_c": 0.9880645161290321, "calib/mu_w": 0.9810752688172041, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3604435483870969, "calib/std_conf": 0.01069202477833652, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8498784194528877, "calib/step_q_c_n": 1645.0, "calib/step_q_gap": 0.003736687169422992, "calib/step_q_w": 0.8461417322834647, "calib/step_q_w_n": 1143.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2624.0, "completions/max_terminated_length": 2624.0, "completions/mean_length": 849.3671875, "completions/mean_terminated_length": 866.286865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 408.0, "epoch": 0.12586666666666665, "grad_norm": 0.025349227711558342, "kl": 0.115478515625, "learning_rate": 2.277777777777778e-06, "loss": -0.1366, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.017265360802412033, "mask/share_reasoning": 0.8359560966491699, "mask/share_step_conf": 0.12724733352661133, "num_tokens": 38644038.0, "reward": 0.8097965717315674, "reward_std": 0.17407071590423584, "rewards/accuracy_reward_step": 0.60546875, "rewards/asymmetric_l2_reward": 0.6867055892944336, "rewards/final_brier_reward_step": 0.6188250184059143, "rewards/format_reward_step": 0.96484375, "step": 118 }, { "adv/mean_abs_final_conf": 0.6102824211120605, "adv/mean_abs_reasoning": 0.5631183385848999, "adv/mean_abs_step_conf": 0.7842050790786743, "adv/ratio_final_to_reasoning": 1.0837551883777798, "adv/ratio_step_to_reasoning": 1.3926115087094464, "adv/std_final_conf": 0.8209497332572937, "adv/std_reasoning": 0.7930397987365723, "adv/std_step_conf": 0.932422935962677, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5962677317173943, "calib/avg_num_step_conf": 10.0546875, "calib/ece": 0.4205349794238683, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.005656245696185058, "calib/mean_conf": 0.984320987654321, "calib/mu_c": 0.9867883211678832, "calib/mu_w": 0.9811320754716981, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4205349794238683, "calib/std_conf": 0.011646890286489645, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8619156804733727, "calib/step_q_c_n": 1352.0, "calib/step_q_gap": 0.01773401107893724, "calib/step_q_w": 0.8441816693944355, "calib/step_q_w_n": 1222.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3039.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 907.3125, "completions/mean_terminated_length": 932.8192138671875, "completions/min_length": 0.0, "completions/min_terminated_length": 466.0, "epoch": 0.12693333333333334, "grad_norm": 0.017647327855229378, "kl": 0.115753173828125, "learning_rate": 2.25e-06, "loss": -0.1018, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.016389839351177216, "mask/share_reasoning": 0.8423788547515869, "mask/share_step_conf": 0.11388753354549408, "num_tokens": 38981374.0, "reward": 0.7642459273338318, "reward_std": 0.28795790672302246, "rewards/accuracy_reward_step": 0.5390625, "rewards/asymmetric_l2_reward": 0.680416464805603, "rewards/final_brier_reward_step": 0.5504191517829895, "rewards/format_reward_step": 0.94921875, "step": 119 }, { "adv/mean_abs_final_conf": 0.5616532564163208, "adv/mean_abs_reasoning": 0.5108141899108887, "adv/mean_abs_step_conf": 0.7378266453742981, "adv/ratio_final_to_reasoning": 1.099525556473482, "adv/ratio_step_to_reasoning": 1.4444129782358075, "adv/std_final_conf": 0.8124690055847168, "adv/std_reasoning": 0.792822539806366, "adv/std_step_conf": 0.9319746494293213, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6360431813174707, "calib/avg_num_step_conf": 9.78515625, "calib/ece": 0.35239669421487607, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.007853418520966327, "calib/mean_conf": 0.9846280991735538, "calib/mu_c": 0.9875163398692809, "calib/mu_w": 0.9796629213483146, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35239669421487607, "calib/std_conf": 0.011430083715451561, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8604258241758244, "calib/step_q_c_n": 1456.0, "calib/step_q_gap": 0.009243746006138975, "calib/step_q_w": 0.8511820781696854, "calib/step_q_w_n": 1049.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 829.47265625, "completions/mean_terminated_length": 866.7142333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 481.0, "epoch": 0.128, "grad_norm": 0.013983098790049553, "kl": 0.118865966796875, "learning_rate": 2.222222222222222e-06, "loss": -0.1756, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.01693129353225231, "mask/share_reasoning": 0.8256010413169861, "mask/share_step_conf": 0.11449891328811646, "num_tokens": 39300407.0, "reward": 0.8072173595428467, "reward_std": 0.2549954056739807, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.699937105178833, "rewards/final_brier_reward_step": 0.6074663996696472, "rewards/format_reward_step": 0.9375, "step": 120 }, { "adv/mean_abs_final_conf": 0.6494415998458862, "adv/mean_abs_reasoning": 0.5642135143280029, "adv/mean_abs_step_conf": 0.763487696647644, "adv/ratio_final_to_reasoning": 1.1510564411407138, "adv/ratio_step_to_reasoning": 1.3531893108887747, "adv/std_final_conf": 0.8462623357772827, "adv/std_reasoning": 0.8100326657295227, "adv/std_step_conf": 0.9326791763305664, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5847723704866562, "calib/avg_num_step_conf": 9.84765625, "calib/ece": 0.3912448132780084, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00501070358213207, "calib/mean_conf": 0.9846058091286308, "calib/mu_c": 0.9866433566433565, "calib/mu_w": 0.9816326530612244, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3912448132780084, "calib/std_conf": 0.011412221713071413, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8529175946547883, "calib/step_q_c_n": 1347.0, "calib/step_q_gap": 0.007457628726338594, "calib/step_q_w": 0.8454599659284497, "calib/step_q_w_n": 1174.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2620.0, "completions/max_terminated_length": 2620.0, "completions/mean_length": 899.78515625, "completions/mean_terminated_length": 932.5708618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 443.0, "epoch": 0.12906666666666666, "grad_norm": 0.02037104405462742, "kl": 0.1131134033203125, "learning_rate": 2.1944444444444445e-06, "loss": -0.0718, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.015997782349586487, "mask/share_reasoning": 0.8389323949813843, "mask/share_step_conf": 0.10991354286670685, "num_tokens": 39635808.0, "reward": 0.7773492932319641, "reward_std": 0.271402508020401, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.6823880672454834, "rewards/final_brier_reward_step": 0.5723105072975159, "rewards/format_reward_step": 0.94140625, "step": 121 }, { "adv/mean_abs_final_conf": 0.5551102161407471, "adv/mean_abs_reasoning": 0.4751325845718384, "adv/mean_abs_step_conf": 0.7243285775184631, "adv/ratio_final_to_reasoning": 1.1683269768605322, "adv/ratio_step_to_reasoning": 1.5244767482558275, "adv/std_final_conf": 0.7889186143875122, "adv/std_reasoning": 0.7576159238815308, "adv/std_step_conf": 0.931524932384491, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6446941612604262, "calib/avg_num_step_conf": 9.8359375, "calib/ece": 0.30397540983606564, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008775100401606672, "calib/mean_conf": 0.9843032786885246, "calib/mu_c": 0.9871084337349396, "calib/mu_w": 0.9783333333333329, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30397540983606564, "calib/std_conf": 0.011870468484313796, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8596106304079111, "calib/step_q_c_n": 1618.0, "calib/step_q_gap": 0.013721741519022279, "calib/step_q_w": 0.8458888888888888, "calib/step_q_w_n": 900.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2888.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 868.0546875, "completions/mean_terminated_length": 896.056396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 461.0, "epoch": 0.13013333333333332, "grad_norm": 0.043400537222623825, "kl": 0.1139068603515625, "learning_rate": 2.166666666666667e-06, "loss": -0.0989, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01688627153635025, "mask/share_reasoning": 0.8362627029418945, "mask/share_step_conf": 0.11560102552175522, "num_tokens": 39965374.0, "reward": 0.8607096672058105, "reward_std": 0.22507581114768982, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.739835262298584, "rewards/final_brier_reward_step": 0.6612714529037476, "rewards/format_reward_step": 0.953125, "step": 122 }, { "adv/mean_abs_final_conf": 0.6799473762512207, "adv/mean_abs_reasoning": 0.5501300096511841, "adv/mean_abs_step_conf": 0.7506182193756104, "adv/ratio_final_to_reasoning": 1.2359757953985253, "adv/ratio_step_to_reasoning": 1.3644378714252436, "adv/std_final_conf": 0.8617650270462036, "adv/std_reasoning": 0.7928981184959412, "adv/std_step_conf": 0.9324036240577698, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5671607378129118, "calib/avg_num_step_conf": 9.46484375, "calib/ece": 0.44789473684210523, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0038300395256917197, "calib/mean_conf": 0.9823076923076923, "calib/mu_c": 0.9840909090909091, "calib/mu_w": 0.9802608695652174, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44789473684210523, "calib/std_conf": 0.01294407393907125, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8543739837398375, "calib/step_q_c_n": 1230.0, "calib/step_q_gap": 0.011959901593986744, "calib/step_q_w": 0.8424140821458508, "calib/step_q_w_n": 1193.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2405.0, "completions/max_terminated_length": 2405.0, "completions/mean_length": 937.48046875, "completions/mean_terminated_length": 967.7217407226562, "completions/min_length": 0.0, "completions/min_terminated_length": 509.0, "epoch": 0.1312, "grad_norm": 0.021335626021027565, "kl": 0.1084747314453125, "learning_rate": 2.138888888888889e-06, "loss": -0.1323, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.015504155308008194, "mask/share_reasoning": 0.8473955392837524, "mask/share_step_conf": 0.10585027933120728, "num_tokens": 40310657.0, "reward": 0.7545461654663086, "reward_std": 0.26759061217308044, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.6801028251647949, "rewards/final_brier_reward_step": 0.5328956842422485, "rewards/format_reward_step": 0.96484375, "step": 123 }, { "adv/mean_abs_final_conf": 0.6220813989639282, "adv/mean_abs_reasoning": 0.49169105291366577, "adv/mean_abs_step_conf": 0.7388211488723755, "adv/ratio_final_to_reasoning": 1.2651875507548787, "adv/ratio_step_to_reasoning": 1.5026125541521749, "adv/std_final_conf": 0.838358461856842, "adv/std_reasoning": 0.757723331451416, "adv/std_step_conf": 0.9337208867073059, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6167235009946007, "calib/avg_num_step_conf": 10.2421875, "calib/ece": 0.35746938775510195, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006618357487922943, "calib/mean_conf": 0.9819591836734693, "calib/mu_c": 0.9844444444444445, "calib/mu_w": 0.9778260869565215, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35746938775510195, "calib/std_conf": 0.013133473643551283, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8522455902306649, "calib/step_q_c_n": 1474.0, "calib/step_q_gap": 0.009092280126135277, "calib/step_q_w": 0.8431533101045297, "calib/step_q_w_n": 1148.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2648.0, "completions/max_terminated_length": 2648.0, "completions/mean_length": 884.62109375, "completions/mean_terminated_length": 916.854248046875, "completions/min_length": 0.0, "completions/min_terminated_length": 427.0, "epoch": 0.13226666666666667, "grad_norm": 0.02123633213341236, "kl": 0.1136016845703125, "learning_rate": 2.1111111111111114e-06, "loss": -0.1747, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.016168804839253426, "mask/share_reasoning": 0.8370611667633057, "mask/share_step_conf": 0.11161378026008606, "num_tokens": 40643936.0, "reward": 0.8192969560623169, "reward_std": 0.2530980706214905, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.7145392298698425, "rewards/final_brier_reward_step": 0.6131171584129333, "rewards/format_reward_step": 0.95703125, "step": 124 }, { "adv/mean_abs_final_conf": 0.5496960282325745, "adv/mean_abs_reasoning": 0.509536862373352, "adv/mean_abs_step_conf": 0.7545234560966492, "adv/ratio_final_to_reasoning": 1.078815035426812, "adv/ratio_step_to_reasoning": 1.4808024930368797, "adv/std_final_conf": 0.7946847677230835, "adv/std_reasoning": 0.7754969596862793, "adv/std_step_conf": 0.9305669069290161, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5919524336283186, "calib/avg_num_step_conf": 10.23828125, "calib/ece": 0.4519502074688797, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.005281388274336329, "calib/mean_conf": 0.9830705394190872, "calib/mu_c": 0.985546875, "calib/mu_w": 0.9802654867256637, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4519502074688797, "calib/std_conf": 0.012544989714182483, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8530210016155089, "calib/step_q_c_n": 1238.0, "calib/step_q_gap": 0.0013941035677863534, "calib/step_q_w": 0.8516268980477225, "calib/step_q_w_n": 1383.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2821.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 891.6171875, "completions/mean_terminated_length": 931.64892578125, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.13333333333333333, "grad_norm": 0.027528828009963036, "kl": 0.1133575439453125, "learning_rate": 2.0833333333333334e-06, "loss": -0.1541, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01594950258731842, "mask/share_reasoning": 0.8337675333023071, "mask/share_step_conf": 0.10731419920921326, "num_tokens": 40976998.0, "reward": 0.7393513321876526, "reward_std": 0.24845367670059204, "rewards/accuracy_reward_step": 0.5, "rewards/asymmetric_l2_reward": 0.6734174489974976, "rewards/final_brier_reward_step": 0.5170038938522339, "rewards/format_reward_step": 0.94140625, "step": 125 }, { "adv/mean_abs_final_conf": 0.6131834983825684, "adv/mean_abs_reasoning": 0.505977988243103, "adv/mean_abs_step_conf": 0.7746909856796265, "adv/ratio_final_to_reasoning": 1.211877814115418, "adv/ratio_step_to_reasoning": 1.5310764572379325, "adv/std_final_conf": 0.8193503618240356, "adv/std_reasoning": 0.7578449845314026, "adv/std_step_conf": 0.9281861186027527, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6112836812829588, "calib/avg_num_step_conf": 10.14453125, "calib/ece": 0.4433474576271186, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006677020876977524, "calib/mean_conf": 0.9814830508474576, "calib/mu_c": 0.9845669291338581, "calib/mu_w": 0.9778899082568806, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4433474576271186, "calib/std_conf": 0.013526642292501095, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8479055118110236, "calib/step_q_c_n": 1270.0, "calib/step_q_gap": 0.015275519346818589, "calib/step_q_w": 0.832629992464205, "calib/step_q_w_n": 1327.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 887.1015625, "completions/mean_terminated_length": 934.5596313476562, "completions/min_length": 0.0, "completions/min_terminated_length": 425.0, "epoch": 0.1344, "grad_norm": 0.027923274785280228, "kl": 0.1035614013671875, "learning_rate": 2.0555555555555555e-06, "loss": -0.2056, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.016019921749830246, "mask/share_reasoning": 0.8219481706619263, "mask/share_step_conf": 0.1112506240606308, "num_tokens": 41309560.0, "reward": 0.731331467628479, "reward_std": 0.26831701397895813, "rewards/accuracy_reward_step": 0.5, "rewards/asymmetric_l2_reward": 0.6638506650924683, "rewards/final_brier_reward_step": 0.5144370794296265, "rewards/format_reward_step": 0.921875, "step": 126 }, { "adv/mean_abs_final_conf": 0.5383285284042358, "adv/mean_abs_reasoning": 0.45178020000457764, "adv/mean_abs_step_conf": 0.7675524950027466, "adv/ratio_final_to_reasoning": 1.1915717607783194, "adv/ratio_step_to_reasoning": 1.6989511603097467, "adv/std_final_conf": 0.7692522406578064, "adv/std_reasoning": 0.7207222580909729, "adv/std_step_conf": 0.9278998374938965, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5820910973084885, "calib/avg_num_step_conf": 9.23046875, "calib/ece": 0.46024896265560167, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.004875086266390438, "calib/mean_conf": 0.9830705394190872, "calib/mu_c": 0.9853968253968253, "calib/mu_w": 0.9805217391304348, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46024896265560167, "calib/std_conf": 0.012610968161338316, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.850793388429752, "calib/step_q_c_n": 1210.0, "calib/step_q_gap": 0.016578297362969674, "calib/step_q_w": 0.8342150910667824, "calib/step_q_w_n": 1153.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 838.109375, "completions/mean_terminated_length": 868.6477661132812, "completions/min_length": 0.0, "completions/min_terminated_length": 488.0, "epoch": 0.13546666666666668, "grad_norm": 0.06415653973817825, "kl": 0.127960205078125, "learning_rate": 2.027777777777778e-06, "loss": -0.0919, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.016996802762150764, "mask/share_reasoning": 0.8339184522628784, "mask/share_step_conf": 0.11392848193645477, "num_tokens": 41627788.0, "reward": 0.7279948592185974, "reward_std": 0.21570391952991486, "rewards/accuracy_reward_step": 0.4921875, "rewards/asymmetric_l2_reward": 0.6600029468536377, "rewards/final_brier_reward_step": 0.5092679262161255, "rewards/format_reward_step": 0.94140625, "step": 127 }, { "adv/mean_abs_final_conf": 0.6560871005058289, "adv/mean_abs_reasoning": 0.5164062976837158, "adv/mean_abs_step_conf": 0.7905282974243164, "adv/ratio_final_to_reasoning": 1.2704862497778127, "adv/ratio_step_to_reasoning": 1.5308262137199817, "adv/std_final_conf": 0.8383203148841858, "adv/std_reasoning": 0.7577394843101501, "adv/std_step_conf": 0.92868971824646, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6346334586466165, "calib/avg_num_step_conf": 9.8359375, "calib/ece": 0.4346530612244899, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008078007518796837, "calib/mean_conf": 0.9775102040816327, "calib/mu_c": 0.981203007518797, "calib/mu_w": 0.9731250000000001, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4346530612244899, "calib/std_conf": 0.01478847103214376, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8522678018575852, "calib/step_q_c_n": 1292.0, "calib/step_q_gap": 0.0211095636846651, "calib/step_q_w": 0.8311582381729201, "calib/step_q_w_n": 1226.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 950.8515625, "completions/mean_terminated_length": 969.7928466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 499.0, "epoch": 0.13653333333333334, "grad_norm": 0.04046202450990677, "kl": 0.11383056640625, "learning_rate": 2.0000000000000003e-06, "loss": -0.0728, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.015950171276926994, "mask/share_reasoning": 0.8544788360595703, "mask/share_step_conf": 0.11003974080085754, "num_tokens": 41977870.0, "reward": 0.7633705139160156, "reward_std": 0.2553948760032654, "rewards/accuracy_reward_step": 0.51953125, "rewards/asymmetric_l2_reward": 0.6937562823295593, "rewards/final_brier_reward_step": 0.5384535193443298, "rewards/format_reward_step": 0.953125, "step": 128 }, { "adv/mean_abs_final_conf": 0.5023715496063232, "adv/mean_abs_reasoning": 0.3759290277957916, "adv/mean_abs_step_conf": 0.763197124004364, "adv/ratio_final_to_reasoning": 1.3363467901159696, "adv/ratio_step_to_reasoning": 2.030162790245983, "adv/std_final_conf": 0.7428443431854248, "adv/std_reasoning": 0.6614742875099182, "adv/std_step_conf": 0.9317966103553772, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.62, "calib/avg_num_step_conf": 9.74609375, "calib/ece": 0.3892244897959183, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.007200000000000317, "calib/mean_conf": 0.9810612244897958, "calib/mu_c": 0.9840000000000001, "calib/mu_w": 0.9767999999999998, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3892244897959183, "calib/std_conf": 0.013720843912977747, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.847810650887574, "calib/step_q_c_n": 1352.0, "calib/step_q_gap": 0.004477317554240634, "calib/step_q_w": 0.8433333333333334, "calib/step_q_w_n": 1143.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2828.0, "completions/max_terminated_length": 2828.0, "completions/mean_length": 838.765625, "completions/mean_terminated_length": 862.3453369140625, "completions/min_length": 0.0, "completions/min_terminated_length": 492.0, "epoch": 0.1376, "grad_norm": 0.03307565301656723, "kl": 0.1181793212890625, "learning_rate": 1.9722222222222224e-06, "loss": -0.1346, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01715748943388462, "mask/share_reasoning": 0.8388442993164062, "mask/share_step_conf": 0.1166544258594513, "num_tokens": 42294978.0, "reward": 0.8008184432983398, "reward_std": 0.20455628633499146, "rewards/accuracy_reward_step": 0.56640625, "rewards/asymmetric_l2_reward": 0.7129415273666382, "rewards/final_brier_reward_step": 0.5840077996253967, "rewards/format_reward_step": 0.95703125, "step": 129 }, { "adv/mean_abs_final_conf": 0.49385571479797363, "adv/mean_abs_reasoning": 0.31653401255607605, "adv/mean_abs_step_conf": 0.7410389184951782, "adv/ratio_final_to_reasoning": 1.5601979414786709, "adv/ratio_step_to_reasoning": 2.34110360687984, "adv/std_final_conf": 0.7291826009750366, "adv/std_reasoning": 0.6186844110488892, "adv/std_step_conf": 0.9263778924942017, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6813129697745083, "calib/avg_num_step_conf": 9.4375, "calib/ece": 0.28510288065843614, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.010738045738046265, "calib/mean_conf": 0.9805761316872428, "calib/mu_c": 0.983846153846154, "calib/mu_w": 0.9731081081081078, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.28510288065843614, "calib/std_conf": 0.013746580804025008, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8522900262467191, "calib/step_q_c_n": 1524.0, "calib/step_q_gap": 0.007346080058378224, "calib/step_q_w": 0.8449439461883409, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 793.53125, "completions/mean_terminated_length": 825.78857421875, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 0.13866666666666666, "grad_norm": 0.03069177456200123, "kl": 0.114990234375, "learning_rate": 1.944444444444445e-06, "loss": -0.1845, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01768389716744423, "mask/share_reasoning": 0.8277462124824524, "mask/share_step_conf": 0.11550737917423248, "num_tokens": 42603410.0, "reward": 0.8770005106925964, "reward_std": 0.16171851754188538, "rewards/accuracy_reward_step": 0.66015625, "rewards/asymmetric_l2_reward": 0.7616474628448486, "rewards/final_brier_reward_step": 0.6712597608566284, "rewards/format_reward_step": 0.9453125, "step": 130 }, { "adv/mean_abs_final_conf": 0.5914018154144287, "adv/mean_abs_reasoning": 0.5124457478523254, "adv/mean_abs_step_conf": 0.7483264803886414, "adv/ratio_final_to_reasoning": 1.1540769298857692, "adv/ratio_step_to_reasoning": 1.4603038146474991, "adv/std_final_conf": 0.8130688667297363, "adv/std_reasoning": 0.7754942178726196, "adv/std_step_conf": 0.9339196085929871, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6212319790301443, "calib/avg_num_step_conf": 10.33984375, "calib/ece": 0.5120425531914893, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.9957446808510638, "calib/gap": 0.007698412698412649, "calib/mean_conf": 0.9758723404255318, "calib/mu_c": 0.9799999999999998, "calib/mu_w": 0.9723015873015871, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.5120425531914893, "calib/std_conf": 0.01575020029764783, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8409661835748792, "calib/step_q_c_n": 1035.0, "calib/step_q_gap": 0.018280079356516876, "calib/step_q_w": 0.8226861042183623, "calib/step_q_w_n": 1612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2675.0, "completions/max_terminated_length": 2675.0, "completions/mean_length": 808.34375, "completions/mean_terminated_length": 865.8410034179688, "completions/min_length": 0.0, "completions/min_terminated_length": 464.0, "epoch": 0.13973333333333332, "grad_norm": 0.033274900168180466, "kl": 0.125213623046875, "learning_rate": 1.916666666666667e-06, "loss": -0.2252, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.01633717119693756, "mask/share_reasoning": 0.8041282892227173, "mask/share_step_conf": 0.11312822997570038, "num_tokens": 42916554.0, "reward": 0.6535139083862305, "reward_std": 0.24080276489257812, "rewards/accuracy_reward_step": 0.4296875, "rewards/asymmetric_l2_reward": 0.5873867273330688, "rewards/final_brier_reward_step": 0.4516722857952118, "rewards/format_reward_step": 0.91015625, "step": 131 }, { "adv/mean_abs_final_conf": 0.5862385034561157, "adv/mean_abs_reasoning": 0.5381482243537903, "adv/mean_abs_step_conf": 0.7538403272628784, "adv/ratio_final_to_reasoning": 1.0893625156156046, "adv/ratio_step_to_reasoning": 1.4008042638588871, "adv/std_final_conf": 0.7897200584411621, "adv/std_reasoning": 0.7754783630371094, "adv/std_step_conf": 0.9292224645614624, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7671005261700359, "calib/avg_num_step_conf": 9.6875, "calib/ece": 0.34542168674698803, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.015805178620880822, "calib/mean_conf": 0.9759437751004016, "calib/mu_c": 0.9817834394904458, "calib/mu_w": 0.965978260869565, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34542168674698803, "calib/std_conf": 0.014916529853053997, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.846875420875421, "calib/step_q_c_n": 1485.0, "calib/step_q_gap": 0.02651361183019485, "calib/step_q_w": 0.8203618090452262, "calib/step_q_w_n": 995.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 872.87109375, "completions/mean_terminated_length": 883.2213745117188, "completions/min_length": 0.0, "completions/min_terminated_length": 469.0, "epoch": 0.1408, "grad_norm": 0.04500338062644005, "kl": 0.3972930908203125, "learning_rate": 1.888888888888889e-06, "loss": -0.0107, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.017480727285146713, "mask/share_reasoning": 0.8506555557250977, "mask/share_step_conf": 0.12014497816562653, "num_tokens": 43245601.0, "reward": 0.8479660749435425, "reward_std": 0.24857370555400848, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.7464767694473267, "rewards/final_brier_reward_step": 0.6330491900444031, "rewards/format_reward_step": 0.96875, "step": 132 }, { "adv/mean_abs_final_conf": 0.6809636950492859, "adv/mean_abs_reasoning": 0.560234010219574, "adv/mean_abs_step_conf": 0.7415635585784912, "adv/ratio_final_to_reasoning": 1.2154986713184264, "adv/ratio_step_to_reasoning": 1.3236675122380526, "adv/std_final_conf": 0.894681453704834, "adv/std_reasoning": 0.8429984450340271, "adv/std_step_conf": 0.9343292713165283, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6300944669365721, "calib/avg_num_step_conf": 9.78125, "calib/ece": 0.5029508196721313, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.007924426450742206, "calib/mean_conf": 0.9701639344262296, "calib/mu_c": 0.9743859649122807, "calib/mu_w": 0.9664615384615385, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.5029508196721313, "calib/std_conf": 0.014256641049997975, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.833780260707635, "calib/step_q_c_n": 1074.0, "calib/step_q_gap": 0.026066973994348097, "calib/step_q_w": 0.8077132867132869, "calib/step_q_w_n": 1430.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 946.0859375, "completions/mean_terminated_length": 976.6047973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 505.0, "epoch": 0.14186666666666667, "grad_norm": 0.032102812081575394, "kl": 0.1101837158203125, "learning_rate": 1.8611111111111113e-06, "loss": -0.0748, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.015190929174423218, "mask/share_reasoning": 0.8482433557510376, "mask/share_step_conf": 0.10531570762395859, "num_tokens": 43594143.0, "reward": 0.6962805986404419, "reward_std": 0.29506897926330566, "rewards/accuracy_reward_step": 0.4453125, "rewards/asymmetric_l2_reward": 0.6364873647689819, "rewards/final_brier_reward_step": 0.4779488146305084, "rewards/format_reward_step": 0.9453125, "step": 133 }, { "adv/mean_abs_final_conf": 0.6505323648452759, "adv/mean_abs_reasoning": 0.5010393261909485, "adv/mean_abs_step_conf": 0.7333165407180786, "adv/ratio_final_to_reasoning": 1.2983658783649148, "adv/ratio_step_to_reasoning": 1.4635907849648675, "adv/std_final_conf": 0.8285301923751831, "adv/std_reasoning": 0.7577097415924072, "adv/std_step_conf": 0.9305702447891235, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6528799628819513, "calib/avg_num_step_conf": 9.2890625, "calib/ece": 0.40048387096774196, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00881818784383892, "calib/mean_conf": 0.9690322580645162, "calib/mu_c": 0.972836879432624, "calib/mu_w": 0.9640186915887851, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40048387096774196, "calib/std_conf": 0.013703034443715784, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.827359375, "calib/step_q_c_n": 1280.0, "calib/step_q_gap": 0.019189976092896188, "calib/step_q_w": 0.8081693989071038, "calib/step_q_w_n": 1098.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2677.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 957.24609375, "completions/mean_terminated_length": 972.4405517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 494.0, "epoch": 0.14293333333333333, "grad_norm": 0.05993098393082619, "kl": 0.1142578125, "learning_rate": 1.8333333333333333e-06, "loss": -0.0637, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.015441222116351128, "mask/share_reasoning": 0.8652579188346863, "mask/share_step_conf": 0.10367587953805923, "num_tokens": 43948150.0, "reward": 0.7980170845985413, "reward_std": 0.24089252948760986, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.7133153676986694, "rewards/final_brier_reward_step": 0.5795937776565552, "rewards/format_reward_step": 0.96484375, "step": 134 }, { "adv/mean_abs_final_conf": 0.6484160423278809, "adv/mean_abs_reasoning": 0.49199366569519043, "adv/mean_abs_step_conf": 0.7641174793243408, "adv/ratio_final_to_reasoning": 1.3179357531192288, "adv/ratio_step_to_reasoning": 1.553104303171541, "adv/std_final_conf": 0.84324711561203, "adv/std_reasoning": 0.7575876712799072, "adv/std_step_conf": 0.9321054220199585, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.604211882799783, "calib/avg_num_step_conf": 9.78515625, "calib/ece": 0.36265060240963853, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006247965274009837, "calib/mean_conf": 0.9730923694779117, "calib/mu_c": 0.9755263157894737, "calib/mu_w": 0.9692783505154638, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36265060240963853, "calib/std_conf": 0.014851186246889227, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8211809392265192, "calib/step_q_c_n": 1448.0, "calib/step_q_gap": 0.0074723299549960265, "calib/step_q_w": 0.8137086092715232, "calib/step_q_w_n": 1057.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 891.28125, "completions/mean_terminated_length": 905.4286499023438, "completions/min_length": 0.0, "completions/min_terminated_length": 480.0, "epoch": 0.144, "grad_norm": 0.02977120503783226, "kl": 0.1153411865234375, "learning_rate": 1.8055555555555557e-06, "loss": -0.0757, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.016633452847599983, "mask/share_reasoning": 0.8540046215057373, "mask/share_step_conf": 0.11373695731163025, "num_tokens": 44282198.0, "reward": 0.8326214551925659, "reward_std": 0.22683849930763245, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.7369366884231567, "rewards/final_brier_reward_step": 0.6158062219619751, "rewards/format_reward_step": 0.96875, "step": 135 }, { "adv/mean_abs_final_conf": 0.5686486959457397, "adv/mean_abs_reasoning": 0.4569851756095886, "adv/mean_abs_step_conf": 0.7533724904060364, "adv/ratio_final_to_reasoning": 1.2443482333692755, "adv/ratio_step_to_reasoning": 1.6485709616314934, "adv/std_final_conf": 0.7785744667053223, "adv/std_reasoning": 0.7207177877426147, "adv/std_step_conf": 0.9300398826599121, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6370212468549064, "calib/avg_num_step_conf": 10.1875, "calib/ece": 0.37372950819672135, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008268101761252145, "calib/mean_conf": 0.9720901639344263, "calib/mu_c": 0.9754109589041093, "calib/mu_w": 0.9671428571428572, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37372950819672135, "calib/std_conf": 0.014687177466768042, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8182135661517799, "calib/step_q_c_n": 1489.0, "calib/step_q_gap": 0.022136711817552945, "calib/step_q_w": 0.7960768543342269, "calib/step_q_w_n": 1119.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2614.0, "completions/max_terminated_length": 2614.0, "completions/mean_length": 882.46875, "completions/mean_terminated_length": 910.9354858398438, "completions/min_length": 0.0, "completions/min_terminated_length": 463.0, "epoch": 0.14506666666666668, "grad_norm": 0.024910585954785347, "kl": 0.11865234375, "learning_rate": 1.777777777777778e-06, "loss": -0.2104, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01636427454650402, "mask/share_reasoning": 0.8356093168258667, "mask/share_step_conf": 0.11677637696266174, "num_tokens": 44616598.0, "reward": 0.7921877503395081, "reward_std": 0.21893033385276794, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.6898484230041504, "rewards/final_brier_reward_step": 0.590620756149292, "rewards/format_reward_step": 0.94921875, "step": 136 }, { "adv/mean_abs_final_conf": 0.5874014496803284, "adv/mean_abs_reasoning": 0.35178643465042114, "adv/mean_abs_step_conf": 0.7439720034599304, "adv/ratio_final_to_reasoning": 1.669767199136157, "adv/ratio_step_to_reasoning": 2.1148399431582225, "adv/std_final_conf": 0.8005043268203735, "adv/std_reasoning": 0.6611693501472473, "adv/std_step_conf": 0.9287843704223633, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6661753144654087, "calib/avg_num_step_conf": 10.35546875, "calib/ece": 0.39668000000000014, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.009730083857442229, "calib/mean_conf": 0.9726800000000001, "calib/mu_c": 0.9768055555555555, "calib/mu_w": 0.9670754716981133, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39668000000000014, "calib/std_conf": 0.01468392318149344, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8195276162790698, "calib/step_q_c_n": 1376.0, "calib/step_q_gap": 0.02033545941632464, "calib/step_q_w": 0.7991921568627451, "calib/step_q_w_n": 1275.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 914.8203125, "completions/mean_terminated_length": 925.6680297851562, "completions/min_length": 0.0, "completions/min_terminated_length": 489.0, "epoch": 0.14613333333333334, "grad_norm": 0.032471757382154465, "kl": 0.114349365234375, "learning_rate": 1.75e-06, "loss": -0.0626, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.016321057453751564, "mask/share_reasoning": 0.8556821346282959, "mask/share_step_conf": 0.11627800762653351, "num_tokens": 44957776.0, "reward": 0.8128588199615479, "reward_std": 0.1617228239774704, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.7290788888931274, "rewards/final_brier_reward_step": 0.5888261795043945, "rewards/format_reward_step": 0.9765625, "step": 137 }, { "adv/mean_abs_final_conf": 0.5582438111305237, "adv/mean_abs_reasoning": 0.46346306800842285, "adv/mean_abs_step_conf": 0.7503811120986938, "adv/ratio_final_to_reasoning": 1.2045054928095766, "adv/ratio_step_to_reasoning": 1.6190742345947975, "adv/std_final_conf": 0.7708762288093567, "adv/std_reasoning": 0.7208356261253357, "adv/std_step_conf": 0.9279402494430542, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6776187883972316, "calib/avg_num_step_conf": 9.95703125, "calib/ece": 0.2922131147540983, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.010649350649350797, "calib/mean_conf": 0.976639344262295, "calib/mu_c": 0.9799999999999998, "calib/mu_w": 0.969350649350649, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2922131147540983, "calib/std_conf": 0.014855073413732612, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8179331306990881, "calib/step_q_c_n": 1645.0, "calib/step_q_gap": 0.0156101218495307, "calib/step_q_w": 0.8023230088495574, "calib/step_q_w_n": 904.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2815.0, "completions/max_terminated_length": 2815.0, "completions/mean_length": 896.2890625, "completions/mean_terminated_length": 925.2015991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 446.0, "epoch": 0.1472, "grad_norm": 0.049276694655418396, "kl": 0.121612548828125, "learning_rate": 1.7222222222222224e-06, "loss": -0.0866, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.016306929290294647, "mask/share_reasoning": 0.8377681970596313, "mask/share_step_conf": 0.11467483639717102, "num_tokens": 45291562.0, "reward": 0.8790077567100525, "reward_std": 0.24240699410438538, "rewards/accuracy_reward_step": 0.65234375, "rewards/asymmetric_l2_reward": 0.7668709754943848, "rewards/final_brier_reward_step": 0.6700507402420044, "rewards/format_reward_step": 0.953125, "step": 138 }, { "adv/mean_abs_final_conf": 0.5593092441558838, "adv/mean_abs_reasoning": 0.3993344306945801, "adv/mean_abs_step_conf": 0.7353893518447876, "adv/ratio_final_to_reasoning": 1.4006036073149326, "adv/ratio_step_to_reasoning": 1.8415375567934182, "adv/std_final_conf": 0.7827629446983337, "adv/std_reasoning": 0.6816564798355103, "adv/std_step_conf": 0.9297609925270081, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6579830053667263, "calib/avg_num_step_conf": 9.88671875, "calib/ece": 0.2951600000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.009251639833035341, "calib/mean_conf": 0.98316, "calib/mu_c": 0.9860465116279068, "calib/mu_w": 0.9767948717948715, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2951600000000001, "calib/std_conf": 0.01277553912756719, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8246811070998796, "calib/step_q_c_n": 1662.0, "calib/step_q_gap": 0.0137835236706505, "calib/step_q_w": 0.8108975834292291, "calib/step_q_w_n": 869.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 792.83203125, "completions/mean_terminated_length": 811.8600463867188, "completions/min_length": 0.0, "completions/min_terminated_length": 524.0, "epoch": 0.14826666666666666, "grad_norm": 0.02060694247484207, "kl": 0.12457275390625, "learning_rate": 1.6944444444444446e-06, "loss": -0.0802, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017769603058695793, "mask/share_reasoning": 0.8357161283493042, "mask/share_step_conf": 0.12307681888341904, "num_tokens": 45597623.0, "reward": 0.9000256061553955, "reward_std": 0.20213943719863892, "rewards/accuracy_reward_step": 0.671875, "rewards/asymmetric_l2_reward": 0.7916465997695923, "rewards/final_brier_reward_step": 0.6810609102249146, "rewards/format_reward_step": 0.96484375, "step": 139 }, { "adv/mean_abs_final_conf": 0.48904263973236084, "adv/mean_abs_reasoning": 0.42571309208869934, "adv/mean_abs_step_conf": 0.7449794411659241, "adv/ratio_final_to_reasoning": 1.1487610994835613, "adv/ratio_step_to_reasoning": 1.749956613997448, "adv/std_final_conf": 0.7239442467689514, "adv/std_reasoning": 0.7015735507011414, "adv/std_step_conf": 0.9283146858215332, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6101296958855098, "calib/avg_num_step_conf": 10.92578125, "calib/ece": 0.3378925619834712, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0064355992844369325, "calib/mean_conf": 0.9825206611570249, "calib/mu_c": 0.9848076923076923, "calib/mu_w": 0.9783720930232553, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3378925619834712, "calib/std_conf": 0.012946552030568872, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8200129032258064, "calib/step_q_c_n": 1550.0, "calib/step_q_gap": 0.039202959360529754, "calib/step_q_w": 0.7808099438652767, "calib/step_q_w_n": 1247.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 814.66015625, "completions/mean_terminated_length": 851.2366943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 471.0, "epoch": 0.14933333333333335, "grad_norm": 0.02468651905655861, "kl": 0.1251983642578125, "learning_rate": 1.6666666666666667e-06, "loss": -0.1502, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.016852054744958878, "mask/share_reasoning": 0.8168226480484009, "mask/share_step_conf": 0.12335656583309174, "num_tokens": 45911192.0, "reward": 0.8422061204910278, "reward_std": 0.21512775123119354, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.7500156760215759, "rewards/final_brier_reward_step": 0.6234589219093323, "rewards/format_reward_step": 0.9453125, "step": 140 }, { "adv/mean_abs_final_conf": 0.4663376808166504, "adv/mean_abs_reasoning": 0.4102482199668884, "adv/mean_abs_step_conf": 0.749131441116333, "adv/ratio_final_to_reasoning": 1.1367207902920067, "adv/ratio_step_to_reasoning": 1.8260443425611845, "adv/std_final_conf": 0.7231074571609497, "adv/std_reasoning": 0.7014703154563904, "adv/std_step_conf": 0.9257126450538635, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6944444444444445, "calib/avg_num_step_conf": 10.15625, "calib/ece": 0.3129795918367346, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.011467178560674585, "calib/mean_conf": 0.9823673469387755, "calib/mu_c": 0.9861585365853658, "calib/mu_w": 0.9746913580246912, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3129795918367346, "calib/std_conf": 0.013034794687800158, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8134662576687115, "calib/step_q_c_n": 1630.0, "calib/step_q_gap": 0.02948687622541246, "calib/step_q_w": 0.783979381443299, "calib/step_q_w_n": 970.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2181.0, "completions/max_terminated_length": 2181.0, "completions/mean_length": 861.04296875, "completions/mean_terminated_length": 885.2489624023438, "completions/min_length": 0.0, "completions/min_terminated_length": 459.0, "epoch": 0.1504, "grad_norm": 0.02938557043671608, "kl": 0.1165771484375, "learning_rate": 1.638888888888889e-06, "loss": -0.1667, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.01663772389292717, "mask/share_reasoning": 0.835965633392334, "mask/share_step_conf": 0.12005292624235153, "num_tokens": 46238715.0, "reward": 0.8724101781845093, "reward_std": 0.194613516330719, "rewards/accuracy_reward_step": 0.640625, "rewards/asymmetric_l2_reward": 0.7691086530685425, "rewards/final_brier_reward_step": 0.6561804413795471, "rewards/format_reward_step": 0.95703125, "step": 141 }, { "adv/mean_abs_final_conf": 0.5036216974258423, "adv/mean_abs_reasoning": 0.4271680414676666, "adv/mean_abs_step_conf": 0.750043511390686, "adv/ratio_final_to_reasoning": 1.1789779396780145, "adv/ratio_step_to_reasoning": 1.7558511840297832, "adv/std_final_conf": 0.758115828037262, "adv/std_reasoning": 0.7014580965042114, "adv/std_step_conf": 0.9286013245582581, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6115348588251037, "calib/avg_num_step_conf": 10.296875, "calib/ece": 0.4116599190283402, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006710825638966633, "calib/mean_conf": 0.98251012145749, "calib/mu_c": 0.9853900709219856, "calib/mu_w": 0.978679245283019, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4116599190283402, "calib/std_conf": 0.013108659050052051, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.815952380952381, "calib/step_q_c_n": 1428.0, "calib/step_q_gap": 0.025207347839798278, "calib/step_q_w": 0.7907450331125827, "calib/step_q_w_n": 1208.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 893.21875, "completions/mean_terminated_length": 914.6560668945312, "completions/min_length": 0.0, "completions/min_terminated_length": 413.0, "epoch": 0.15146666666666667, "grad_norm": 0.022861601784825325, "kl": 0.1132965087890625, "learning_rate": 1.6111111111111113e-06, "loss": -0.1392, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.016534026712179184, "mask/share_reasoning": 0.8400761485099792, "mask/share_step_conf": 0.11995226889848709, "num_tokens": 46572539.0, "reward": 0.7890394926071167, "reward_std": 0.20801588892936707, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.7069774866104126, "rewards/final_brier_reward_step": 0.5679764747619629, "rewards/format_reward_step": 0.96484375, "step": 142 }, { "adv/mean_abs_final_conf": 0.5079202651977539, "adv/mean_abs_reasoning": 0.4386137127876282, "adv/mean_abs_step_conf": 0.723576545715332, "adv/ratio_final_to_reasoning": 1.158012735100426, "adv/ratio_step_to_reasoning": 1.6496897489059574, "adv/std_final_conf": 0.7746670246124268, "adv/std_reasoning": 0.7205743789672852, "adv/std_step_conf": 0.9297850131988525, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.658318615330272, "calib/avg_num_step_conf": 10.03125, "calib/ece": 0.3740573770491803, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00978664782762284, "calib/mean_conf": 0.9847131147540983, "calib/mu_c": 0.9885234899328856, "calib/mu_w": 0.9787368421052628, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3740573770491803, "calib/std_conf": 0.012024551052044528, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8181613756613758, "calib/step_q_c_n": 1512.0, "calib/step_q_gap": 0.020670845358345402, "calib/step_q_w": 0.7974905303030304, "calib/step_q_w_n": 1056.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2830.0, "completions/max_terminated_length": 2830.0, "completions/mean_length": 829.546875, "completions/mean_terminated_length": 856.306396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 380.0, "epoch": 0.15253333333333333, "grad_norm": 0.027216384187340736, "kl": 0.1242828369140625, "learning_rate": 1.5833333333333333e-06, "loss": -0.1348, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01705748960375786, "mask/share_reasoning": 0.8287699818611145, "mask/share_step_conf": 0.12292252480983734, "num_tokens": 46892239.0, "reward": 0.804224967956543, "reward_std": 0.20171010494232178, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.7086536884307861, "rewards/final_brier_reward_step": 0.5935460329055786, "rewards/format_reward_step": 0.94921875, "step": 143 }, { "adv/mean_abs_final_conf": 0.5042335987091064, "adv/mean_abs_reasoning": 0.4892060458660126, "adv/mean_abs_step_conf": 0.736696720123291, "adv/ratio_final_to_reasoning": 1.0307182484151263, "adv/ratio_step_to_reasoning": 1.5059027302476615, "adv/std_final_conf": 0.7585073709487915, "adv/std_reasoning": 0.7576208710670471, "adv/std_step_conf": 0.9292923212051392, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6311817970745298, "calib/avg_num_step_conf": 10.15625, "calib/ece": 0.2748400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.007880968965250479, "calib/mean_conf": 0.98284, "calib/mu_c": 0.9851412429378531, "calib/mu_w": 0.9772602739726026, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2748400000000001, "calib/std_conf": 0.012693872537567102, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8190409356725146, "calib/step_q_c_n": 1710.0, "calib/step_q_gap": 0.026872396346671934, "calib/step_q_w": 0.7921685393258426, "calib/step_q_w_n": 890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2123.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 852.80078125, "completions/mean_terminated_length": 869.7888793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.1536, "grad_norm": 0.02283472567796707, "kl": 0.125030517578125, "learning_rate": 1.5555555555555558e-06, "loss": -0.0987, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.01706160604953766, "mask/share_reasoning": 0.8418867588043213, "mask/share_step_conf": 0.12152038514614105, "num_tokens": 47214684.0, "reward": 0.9079037308692932, "reward_std": 0.23519548773765564, "rewards/accuracy_reward_step": 0.69140625, "rewards/asymmetric_l2_reward": 0.7887454032897949, "rewards/final_brier_reward_step": 0.6958121061325073, "rewards/format_reward_step": 0.96484375, "step": 144 }, { "adv/mean_abs_final_conf": 0.6386491656303406, "adv/mean_abs_reasoning": 0.5374621152877808, "adv/mean_abs_step_conf": 0.7449643611907959, "adv/ratio_final_to_reasoning": 1.1882682471272972, "adv/ratio_step_to_reasoning": 1.3860779020525182, "adv/std_final_conf": 0.8432608842849731, "adv/std_reasoning": 0.7929659485816956, "adv/std_step_conf": 0.9301500916481018, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5916508538899431, "calib/avg_num_step_conf": 11.1796875, "calib/ece": 0.3404166666666667, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0053055028462999365, "calib/mean_conf": 0.9862500000000001, "calib/mu_c": 0.9881290322580644, "calib/mu_w": 0.9828235294117644, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3404166666666667, "calib/std_conf": 0.009837216747298675, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8118131868131868, "calib/step_q_c_n": 1638.0, "calib/step_q_gap": 0.02967266393736978, "calib/step_q_w": 0.782140522875817, "calib/step_q_w_n": 1224.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2888.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 894.8984375, "completions/mean_terminated_length": 916.3760375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 463.0, "epoch": 0.15466666666666667, "grad_norm": 0.0357648991048336, "kl": 0.1115264892578125, "learning_rate": 1.527777777777778e-06, "loss": -0.0304, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.016461670398712158, "mask/share_reasoning": 0.8312902450561523, "mask/share_step_conf": 0.12881061434745789, "num_tokens": 47546482.0, "reward": 0.8309597373008728, "reward_std": 0.2730830907821655, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.7351561188697815, "rewards/final_brier_reward_step": 0.6166070103645325, "rewards/format_reward_step": 0.9375, "step": 145 }, { "adv/mean_abs_final_conf": 0.6367698907852173, "adv/mean_abs_reasoning": 0.4902743697166443, "adv/mean_abs_step_conf": 0.7329921722412109, "adv/ratio_final_to_reasoning": 1.2988031398688873, "adv/ratio_step_to_reasoning": 1.4950652481891848, "adv/std_final_conf": 0.8359977602958679, "adv/std_reasoning": 0.7575299739837646, "adv/std_step_conf": 0.930633008480072, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5765886287625417, "calib/avg_num_step_conf": 10.46484375, "calib/ece": 0.514, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.004595317725752279, "calib/mean_conf": 0.9833877551020408, "calib/mu_c": 0.9858260869565216, "calib/mu_w": 0.9812307692307694, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.514, "calib/std_conf": 0.0124356569729222, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8136788445199661, "calib/step_q_c_n": 1177.0, "calib/step_q_gap": 0.015303345185745165, "calib/step_q_w": 0.7983754993342209, "calib/step_q_w_n": 1502.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2712.0, "completions/max_terminated_length": 2712.0, "completions/mean_length": 875.3671875, "completions/mean_terminated_length": 907.26318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 492.0, "epoch": 0.15573333333333333, "grad_norm": 0.03923913091421127, "kl": 0.11285400390625, "learning_rate": 1.5e-06, "loss": -0.1544, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.016228225082159042, "mask/share_reasoning": 0.8297832012176514, "mask/share_step_conf": 0.11883234977722168, "num_tokens": 47877792.0, "reward": 0.7035650014877319, "reward_std": 0.21938246488571167, "rewards/accuracy_reward_step": 0.44921875, "rewards/asymmetric_l2_reward": 0.6580109596252441, "rewards/final_brier_reward_step": 0.4678691327571869, "rewards/format_reward_step": 0.95703125, "step": 146 }, { "adv/mean_abs_final_conf": 0.4933481216430664, "adv/mean_abs_reasoning": 0.43553024530410767, "adv/mean_abs_step_conf": 0.7302189469337463, "adv/ratio_final_to_reasoning": 1.1327528385510577, "adv/ratio_step_to_reasoning": 1.6766205213230903, "adv/std_final_conf": 0.7655453681945801, "adv/std_reasoning": 0.7393311262130737, "adv/std_step_conf": 0.9292852878570557, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6033057851239669, "calib/avg_num_step_conf": 11.26171875, "calib/ece": 0.48541322314049595, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006198347107438051, "calib/mean_conf": 0.985413223140496, "calib/mu_c": 0.9885123966942149, "calib/mu_w": 0.9823140495867768, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.48541322314049595, "calib/std_conf": 0.010834723351670466, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8105288461538462, "calib/step_q_c_n": 1248.0, "calib/step_q_gap": 0.023158815572806524, "calib/step_q_w": 0.7873700305810397, "calib/step_q_w_n": 1635.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 849.50390625, "completions/mean_terminated_length": 887.6448364257812, "completions/min_length": 0.0, "completions/min_terminated_length": 355.0, "epoch": 0.1568, "grad_norm": 0.05791052430868149, "kl": 0.151153564453125, "learning_rate": 1.4722222222222225e-06, "loss": -0.1968, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.016502130776643753, "mask/share_reasoning": 0.8159295916557312, "mask/share_step_conf": 0.12459948658943176, "num_tokens": 48198945.0, "reward": 0.7110254168510437, "reward_std": 0.19756188988685608, "rewards/accuracy_reward_step": 0.47265625, "rewards/asymmetric_l2_reward": 0.649394154548645, "rewards/final_brier_reward_step": 0.4890628457069397, "rewards/format_reward_step": 0.9453125, "step": 147 }, { "adv/mean_abs_final_conf": 0.4634344279766083, "adv/mean_abs_reasoning": 0.41140034794807434, "adv/mean_abs_step_conf": 0.7451192140579224, "adv/ratio_final_to_reasoning": 1.1264803986872212, "adv/ratio_step_to_reasoning": 1.8111778897959732, "adv/std_final_conf": 0.7121722102165222, "adv/std_reasoning": 0.6816737651824951, "adv/std_step_conf": 0.9280598163604736, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6173570019723866, "calib/avg_num_step_conf": 10.6171875, "calib/ece": 0.30992, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006931112572138187, "calib/mean_conf": 0.98592, "calib/mu_c": 0.9881656804733728, "calib/mu_w": 0.9812345679012346, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30992, "calib/std_conf": 0.010322480322093142, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8252042529378847, "calib/step_q_c_n": 1787.0, "calib/step_q_gap": 0.027298774957218663, "calib/step_q_w": 0.797905477980666, "calib/step_q_w_n": 931.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2847.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 833.15234375, "completions/mean_terminated_length": 843.0316772460938, "completions/min_length": 0.0, "completions/min_terminated_length": 417.0, "epoch": 0.15786666666666666, "grad_norm": 0.020612573251128197, "kl": 0.1186370849609375, "learning_rate": 1.4444444444444445e-06, "loss": -0.0327, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.017827773466706276, "mask/share_reasoning": 0.8354048728942871, "mask/share_step_conf": 0.13504862785339355, "num_tokens": 48517344.0, "reward": 0.8763802647590637, "reward_std": 0.20133787393569946, "rewards/accuracy_reward_step": 0.66015625, "rewards/asymmetric_l2_reward": 0.7545421123504639, "rewards/final_brier_reward_step": 0.6716558933258057, "rewards/format_reward_step": 0.97265625, "step": 148 }, { "adv/mean_abs_final_conf": 0.6034526824951172, "adv/mean_abs_reasoning": 0.5531489849090576, "adv/mean_abs_step_conf": 0.7516754269599915, "adv/ratio_final_to_reasoning": 1.0909405945928472, "adv/ratio_step_to_reasoning": 1.358902298417077, "adv/std_final_conf": 0.8103293776512146, "adv/std_reasoning": 0.7930197715759277, "adv/std_step_conf": 0.9299349188804626, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.6671369737644904, "calib/avg_num_step_conf": 10.43359375, "calib/ece": 0.3564135021097047, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00992830994508831, "calib/mean_conf": 0.9851054852320675, "calib/mu_c": 0.9887919463087247, "calib/mu_w": 0.9788636363636364, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3564135021097047, "calib/std_conf": 0.01108508765073446, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8246044624746451, "calib/step_q_c_n": 1479.0, "calib/step_q_gap": 0.05184439536055108, "calib/step_q_w": 0.7727600671140941, "calib/step_q_w_n": 1192.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 881.70703125, "completions/mean_terminated_length": 928.8765258789062, "completions/min_length": 0.0, "completions/min_terminated_length": 409.0, "epoch": 0.15893333333333334, "grad_norm": 0.01654718443751335, "kl": 0.10540771484375, "learning_rate": 1.4166666666666667e-06, "loss": -0.2621, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.016193099319934845, "mask/share_reasoning": 0.8126864433288574, "mask/share_step_conf": 0.12033917754888535, "num_tokens": 48847517.0, "reward": 0.8093559145927429, "reward_std": 0.2620517313480377, "rewards/accuracy_reward_step": 0.58203125, "rewards/asymmetric_l2_reward": 0.7209059596061707, "rewards/final_brier_reward_step": 0.5962433815002441, "rewards/format_reward_step": 0.92578125, "step": 149 }, { "adv/mean_abs_final_conf": 0.5678569078445435, "adv/mean_abs_reasoning": 0.5295990705490112, "adv/mean_abs_step_conf": 0.7428033351898193, "adv/ratio_final_to_reasoning": 1.0722392455407297, "adv/ratio_step_to_reasoning": 1.402576734924759, "adv/std_final_conf": 0.8097455501556396, "adv/std_reasoning": 0.7755541205406189, "adv/std_step_conf": 0.932079553604126, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.551062271062271, "calib/avg_num_step_conf": 12.6640625, "calib/ece": 0.36302904564315364, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.995850622406639, "calib/gap": 0.002730402930402831, "calib/mean_conf": 0.985435684647303, "calib/mu_c": 0.9864666666666665, "calib/mu_w": 0.9837362637362637, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36302904564315364, "calib/std_conf": 0.011733424454417551, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8237485101311085, "calib/step_q_c_n": 1678.0, "calib/step_q_gap": 0.010666668698883441, "calib/step_q_w": 0.8130818414322251, "calib/step_q_w_n": 1564.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2614.0, "completions/max_terminated_length": 2614.0, "completions/mean_length": 816.42578125, "completions/mean_terminated_length": 860.1028442382812, "completions/min_length": 0.0, "completions/min_terminated_length": 463.0, "epoch": 0.16, "grad_norm": 0.025528794154524803, "kl": 0.11785888671875, "learning_rate": 1.3888888888888892e-06, "loss": -0.2231, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.0170246921479702, "mask/share_reasoning": 0.7955192923545837, "mask/share_step_conf": 0.13667477667331696, "num_tokens": 49161482.0, "reward": 0.8199504613876343, "reward_std": 0.24832351505756378, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.7381203174591064, "rewards/final_brier_reward_step": 0.5970929861068726, "rewards/format_reward_step": 0.9375, "step": 150 }, { "adv/mean_abs_final_conf": 0.6715290546417236, "adv/mean_abs_reasoning": 0.6039446592330933, "adv/mean_abs_step_conf": 0.7560272812843323, "adv/ratio_final_to_reasoning": 1.1119049475401455, "adv/ratio_step_to_reasoning": 1.2518154929035352, "adv/std_final_conf": 0.859036386013031, "adv/std_reasoning": 0.826852023601532, "adv/std_step_conf": 0.9320445656776428, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6452629291612342, "calib/avg_num_step_conf": 12.24609375, "calib/ece": 0.4825106382978723, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008428943937418554, "calib/mean_conf": 0.9803829787234042, "calib/mu_c": 0.9846153846153844, "calib/mu_w": 0.9761864406779659, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4825106382978723, "calib/std_conf": 0.01393992153506622, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8167237354085604, "calib/step_q_c_n": 1285.0, "calib/step_q_gap": 0.034350762435587434, "calib/step_q_w": 0.7823729729729729, "calib/step_q_w_n": 1850.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 897.87890625, "completions/mean_terminated_length": 965.7857666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 541.0, "epoch": 0.16106666666666666, "grad_norm": 0.025068072602152824, "kl": 0.107177734375, "learning_rate": 1.3611111111111112e-06, "loss": -0.2567, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.015033187344670296, "mask/share_reasoning": 0.7970659732818604, "mask/share_step_conf": 0.11758832633495331, "num_tokens": 49498363.0, "reward": 0.7052844762802124, "reward_std": 0.29788458347320557, "rewards/accuracy_reward_step": 0.45703125, "rewards/asymmetric_l2_reward": 0.657116174697876, "rewards/final_brier_reward_step": 0.4784526824951172, "rewards/format_reward_step": 0.91796875, "step": 151 }, { "adv/mean_abs_final_conf": 0.628909707069397, "adv/mean_abs_reasoning": 0.5587596297264099, "adv/mean_abs_step_conf": 0.7436010241508484, "adv/ratio_final_to_reasoning": 1.1255460731429994, "adv/ratio_step_to_reasoning": 1.3308066377575343, "adv/std_final_conf": 0.8389648199081421, "adv/std_reasoning": 0.810052752494812, "adv/std_step_conf": 0.9334518313407898, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.60296992481203, "calib/avg_num_step_conf": 12.109375, "calib/ece": 0.41248927038626615, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006315037593985107, "calib/mean_conf": 0.983304721030043, "calib/mu_c": 0.9860150375939849, "calib/mu_w": 0.9796999999999998, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.41248927038626615, "calib/std_conf": 0.012593915829054145, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8207415107415107, "calib/step_q_c_n": 1443.0, "calib/step_q_gap": 0.020783755762633138, "calib/step_q_w": 0.7999577549788776, "calib/step_q_w_n": 1657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2641.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 852.71875, "completions/mean_terminated_length": 917.2101440429688, "completions/min_length": 0.0, "completions/min_terminated_length": 487.0, "epoch": 0.16213333333333332, "grad_norm": 0.020201267674565315, "kl": 0.1133880615234375, "learning_rate": 1.3333333333333334e-06, "loss": -0.3514, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.01569315977394581, "mask/share_reasoning": 0.7901881337165833, "mask/share_step_conf": 0.12380620837211609, "num_tokens": 49822051.0, "reward": 0.7507580518722534, "reward_std": 0.28133276104927063, "rewards/accuracy_reward_step": 0.51953125, "rewards/asymmetric_l2_reward": 0.6852730512619019, "rewards/final_brier_reward_step": 0.5310866832733154, "rewards/format_reward_step": 0.90625, "step": 152 }, { "adv/mean_abs_final_conf": 0.5813193917274475, "adv/mean_abs_reasoning": 0.5272849798202515, "adv/mean_abs_step_conf": 0.7468937635421753, "adv/ratio_final_to_reasoning": 1.1024766757543825, "adv/ratio_step_to_reasoning": 1.4164897391857951, "adv/std_final_conf": 0.7978730797767639, "adv/std_reasoning": 0.7755938768386841, "adv/std_step_conf": 0.9315870404243469, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5637419871794871, "calib/avg_num_step_conf": 10.6875, "calib/ece": 0.3234322033898306, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0037051282051281387, "calib/mean_conf": 0.984449152542373, "calib/mu_c": 0.9857051282051281, "calib/mu_w": 0.982, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3234322033898306, "calib/std_conf": 0.011613184688269178, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8166891064871481, "calib/step_q_c_n": 1634.0, "calib/step_q_gap": 0.02629890685012448, "calib/step_q_w": 0.7903901996370236, "calib/step_q_w_n": 1102.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2913.0, "completions/max_terminated_length": 2913.0, "completions/mean_length": 850.31640625, "completions/mean_terminated_length": 903.24072265625, "completions/min_length": 0.0, "completions/min_terminated_length": 473.0, "epoch": 0.1632, "grad_norm": 0.019481275230646133, "kl": 0.1083831787109375, "learning_rate": 1.3055555555555556e-06, "loss": -0.2613, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.01606718823313713, "mask/share_reasoning": 0.8041765689849854, "mask/share_step_conf": 0.12116247415542603, "num_tokens": 50147052.0, "reward": 0.8226182460784912, "reward_std": 0.2654617428779602, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.7187087535858154, "rewards/final_brier_reward_step": 0.6202777624130249, "rewards/format_reward_step": 0.921875, "step": 153 }, { "adv/mean_abs_final_conf": 0.5330643653869629, "adv/mean_abs_reasoning": 0.41719967126846313, "adv/mean_abs_step_conf": 0.7645653486251831, "adv/ratio_final_to_reasoning": 1.2777200033888383, "adv/ratio_step_to_reasoning": 1.8326125385012448, "adv/std_final_conf": 0.7777272462844849, "adv/std_reasoning": 0.7014922499656677, "adv/std_step_conf": 0.9263600707054138, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5758741258741258, "calib/avg_num_step_conf": 11.234375, "calib/ece": 0.4437083333333335, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.004552447552447769, "calib/mean_conf": 0.9853750000000001, "calib/mu_c": 0.9874615384615385, "calib/mu_w": 0.9829090909090907, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4437083333333335, "calib/std_conf": 0.01083325320483188, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8275321725965178, "calib/step_q_c_n": 1321.0, "calib/step_q_gap": 0.028303876776582082, "calib/step_q_w": 0.7992282958199357, "calib/step_q_w_n": 1555.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 813.80859375, "completions/mean_terminated_length": 860.8883666992188, "completions/min_length": 0.0, "completions/min_terminated_length": 420.0, "epoch": 0.16426666666666667, "grad_norm": 0.049363043159246445, "kl": 0.12261962890625, "learning_rate": 1.2777777777777779e-06, "loss": -0.1861, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.016554001718759537, "mask/share_reasoning": 0.802830696105957, "mask/share_step_conf": 0.12592774629592896, "num_tokens": 50459827.0, "reward": 0.7538589239120483, "reward_std": 0.19900038838386536, "rewards/accuracy_reward_step": 0.5078125, "rewards/asymmetric_l2_reward": 0.6964658498764038, "rewards/final_brier_reward_step": 0.5221894383430481, "rewards/format_reward_step": 0.9375, "step": 154 }, { "adv/mean_abs_final_conf": 0.597705602645874, "adv/mean_abs_reasoning": 0.48494482040405273, "adv/mean_abs_step_conf": 0.7656376957893372, "adv/ratio_final_to_reasoning": 1.2325229129117614, "adv/ratio_step_to_reasoning": 1.5788140497127343, "adv/std_final_conf": 0.7969999313354492, "adv/std_reasoning": 0.7393460869789124, "adv/std_step_conf": 0.9281103014945984, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.600145137880987, "calib/avg_num_step_conf": 12.203125, "calib/ece": 0.43076271186440684, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.005834542815675081, "calib/mean_conf": 0.9816101694915255, "calib/mu_c": 0.9842307692307694, "calib/mu_w": 0.9783962264150943, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43076271186440684, "calib/std_conf": 0.013401893300540396, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8178236529041288, "calib/step_q_c_n": 1429.0, "calib/step_q_gap": 0.010714508361356012, "calib/step_q_w": 0.8071091445427728, "calib/step_q_w_n": 1695.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 804.8515625, "completions/mean_terminated_length": 865.7227172851562, "completions/min_length": 0.0, "completions/min_terminated_length": 514.0, "epoch": 0.16533333333333333, "grad_norm": 0.020702464506030083, "kl": 0.127410888671875, "learning_rate": 1.25e-06, "loss": -0.2912, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.015847964212298393, "mask/share_reasoning": 0.784995436668396, "mask/share_step_conf": 0.12884411215782166, "num_tokens": 50773085.0, "reward": 0.7318023443222046, "reward_std": 0.19392001628875732, "rewards/accuracy_reward_step": 0.5078125, "rewards/asymmetric_l2_reward": 0.6524415016174316, "rewards/final_brier_reward_step": 0.5252257585525513, "rewards/format_reward_step": 0.921875, "step": 155 }, { "adv/mean_abs_final_conf": 0.6328127384185791, "adv/mean_abs_reasoning": 0.5709925889968872, "adv/mean_abs_step_conf": 0.7423877120018005, "adv/ratio_final_to_reasoning": 1.108267866541485, "adv/ratio_step_to_reasoning": 1.3001704861108936, "adv/std_final_conf": 0.8355315327644348, "adv/std_reasoning": 0.8098950386047363, "adv/std_step_conf": 0.9297937750816345, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5610687022900763, "calib/avg_num_step_conf": 11.86328125, "calib/ece": 0.43983402489626555, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.003583622484386262, "calib/mean_conf": 0.983402489626556, "calib/mu_c": 0.9850381679389315, "calib/mu_w": 0.9814545454545452, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43983402489626555, "calib/std_conf": 0.012325118316657337, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8224016282225237, "calib/step_q_c_n": 1474.0, "calib/step_q_gap": 0.011454731229561421, "calib/step_q_w": 0.8109468969929623, "calib/step_q_w_n": 1563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 858.23828125, "completions/mean_terminated_length": 893.1259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 485.0, "epoch": 0.1664, "grad_norm": 0.02893920987844467, "kl": 0.11334228515625, "learning_rate": 1.2222222222222223e-06, "loss": -0.1956, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.0159677155315876, "mask/share_reasoning": 0.8142074346542358, "mask/share_step_conf": 0.13076233863830566, "num_tokens": 51097554.0, "reward": 0.764479398727417, "reward_std": 0.25560951232910156, "rewards/accuracy_reward_step": 0.51171875, "rewards/asymmetric_l2_reward": 0.7110797762870789, "rewards/final_brier_reward_step": 0.5272538661956787, "rewards/format_reward_step": 0.94140625, "step": 156 }, { "adv/mean_abs_final_conf": 0.5657306909561157, "adv/mean_abs_reasoning": 0.46107548475265503, "adv/mean_abs_step_conf": 0.726364254951477, "adv/ratio_final_to_reasoning": 1.2269806347643992, "adv/ratio_step_to_reasoning": 1.5753694979925832, "adv/std_final_conf": 0.7988712787628174, "adv/std_reasoning": 0.7394241690635681, "adv/std_step_conf": 0.9278371930122375, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6322088353413655, "calib/avg_num_step_conf": 13.40625, "calib/ece": 0.2919502074688797, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.007859437751004172, "calib/mean_conf": 0.980746887966805, "calib/mu_c": 0.9831927710843372, "calib/mu_w": 0.975333333333333, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2919502074688797, "calib/std_conf": 0.013915208576745691, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8242296511627907, "calib/step_q_c_n": 2064.0, "calib/step_q_gap": 0.06155421256629934, "calib/step_q_w": 0.7626754385964913, "calib/step_q_w_n": 1368.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 860.27734375, "completions/mean_terminated_length": 913.8216552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 480.0, "epoch": 0.16746666666666668, "grad_norm": 0.020823344588279724, "kl": 0.1176910400390625, "learning_rate": 1.1944444444444446e-06, "loss": -0.2683, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.015636589378118515, "mask/share_reasoning": 0.7919765710830688, "mask/share_step_conf": 0.13379308581352234, "num_tokens": 51421513.0, "reward": 0.8676108121871948, "reward_std": 0.2390524446964264, "rewards/accuracy_reward_step": 0.6484375, "rewards/asymmetric_l2_reward": 0.7548935413360596, "rewards/final_brier_reward_step": 0.662359356880188, "rewards/format_reward_step": 0.94140625, "step": 157 }, { "adv/mean_abs_final_conf": 0.5855216979980469, "adv/mean_abs_reasoning": 0.5074455738067627, "adv/mean_abs_step_conf": 0.7488413453102112, "adv/ratio_final_to_reasoning": 1.1538610803234948, "adv/ratio_step_to_reasoning": 1.4757077092870121, "adv/std_final_conf": 0.8118085861206055, "adv/std_reasoning": 0.7754456400871277, "adv/std_step_conf": 0.9327535033226013, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6261183261183261, "calib/avg_num_step_conf": 11.73828125, "calib/ece": 0.35143442622950816, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00743578643578624, "calib/mean_conf": 0.9825819672131147, "calib/mu_c": 0.9853246753246752, "calib/mu_w": 0.9778888888888889, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35143442622950816, "calib/std_conf": 0.012847363806859777, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8337210648148149, "calib/step_q_c_n": 1728.0, "calib/step_q_gap": 0.029805638033295834, "calib/step_q_w": 0.8039154267815191, "calib/step_q_w_n": 1277.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 843.34765625, "completions/mean_terminated_length": 870.5523681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 497.0, "epoch": 0.16853333333333334, "grad_norm": 0.041330933570861816, "kl": 0.1220550537109375, "learning_rate": 1.1666666666666668e-06, "loss": -0.1525, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01676858961582184, "mask/share_reasoning": 0.8167569637298584, "mask/share_step_conf": 0.13522447645664215, "num_tokens": 51742650.0, "reward": 0.8231799602508545, "reward_std": 0.2605646252632141, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.7187595367431641, "rewards/final_brier_reward_step": 0.616662859916687, "rewards/format_reward_step": 0.953125, "step": 158 }, { "adv/mean_abs_final_conf": 0.6384714841842651, "adv/mean_abs_reasoning": 0.5273313522338867, "adv/mean_abs_step_conf": 0.7569658756256104, "adv/ratio_final_to_reasoning": 1.2107595755108536, "adv/ratio_step_to_reasoning": 1.4354653339289298, "adv/std_final_conf": 0.8230856657028198, "adv/std_reasoning": 0.775598406791687, "adv/std_step_conf": 0.9249182343482971, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6248127978216473, "calib/avg_num_step_conf": 12.60546875, "calib/ece": 0.44563786008230455, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.007439754935330223, "calib/mean_conf": 0.9806172839506173, "calib/mu_c": 0.9840769230769232, "calib/mu_w": 0.976637168141593, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44563786008230455, "calib/std_conf": 0.013908490968402642, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8364338983050849, "calib/step_q_c_n": 1475.0, "calib/step_q_gap": 0.006091432551660114, "calib/step_q_w": 0.8303424657534247, "calib/step_q_w_n": 1752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2692.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 860.046875, "completions/mean_terminated_length": 895.0081176757812, "completions/min_length": 0.0, "completions/min_terminated_length": 490.0, "epoch": 0.1696, "grad_norm": 0.026793481782078743, "kl": 0.1242218017578125, "learning_rate": 1.138888888888889e-06, "loss": -0.1202, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01638055592775345, "mask/share_reasoning": 0.8136330246925354, "mask/share_step_conf": 0.13092389702796936, "num_tokens": 52067606.0, "reward": 0.7726041078567505, "reward_std": 0.25494319200515747, "rewards/accuracy_reward_step": 0.5078125, "rewards/asymmetric_l2_reward": 0.7267637848854065, "rewards/final_brier_reward_step": 0.5278195142745972, "rewards/format_reward_step": 0.9453125, "step": 159 }, { "adv/mean_abs_final_conf": 0.5665127038955688, "adv/mean_abs_reasoning": 0.4910869002342224, "adv/mean_abs_step_conf": 0.7735710144042969, "adv/ratio_final_to_reasoning": 1.1535895248384194, "adv/ratio_step_to_reasoning": 1.5752222550333648, "adv/std_final_conf": 0.7866808176040649, "adv/std_reasoning": 0.7395008206367493, "adv/std_step_conf": 0.9210416674613953, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5971209641422407, "calib/avg_num_step_conf": 12.1171875, "calib/ece": 0.37843881856540085, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.9957805907172996, "calib/gap": 0.00635694093140915, "calib/mean_conf": 0.9818143459915611, "calib/mu_c": 0.9843356643356642, "calib/mu_w": 0.9779787234042551, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37843881856540085, "calib/std_conf": 0.01418976739886446, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8283772198407838, "calib/step_q_c_n": 1633.0, "calib/step_q_gap": 0.019091991794493723, "calib/step_q_w": 0.8092852280462901, "calib/step_q_w_n": 1469.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 889.9140625, "completions/mean_terminated_length": 929.8693237304688, "completions/min_length": 0.0, "completions/min_terminated_length": 557.0, "epoch": 0.17066666666666666, "grad_norm": 0.0199897438287735, "kl": 0.1261749267578125, "learning_rate": 1.111111111111111e-06, "loss": -0.1778, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01571989618241787, "mask/share_reasoning": 0.8111594319343567, "mask/share_step_conf": 0.1301519125699997, "num_tokens": 52400264.0, "reward": 0.7833555936813354, "reward_std": 0.2323831170797348, "rewards/accuracy_reward_step": 0.55859375, "rewards/asymmetric_l2_reward": 0.6977380514144897, "rewards/final_brier_reward_step": 0.5736604928970337, "rewards/format_reward_step": 0.91796875, "step": 160 }, { "adv/mean_abs_final_conf": 0.5213401317596436, "adv/mean_abs_reasoning": 0.38532155752182007, "adv/mean_abs_step_conf": 0.7431133985519409, "adv/ratio_final_to_reasoning": 1.3530001672177945, "adv/ratio_step_to_reasoning": 1.9285539156730407, "adv/std_final_conf": 0.7485088109970093, "adv/std_reasoning": 0.6817570328712463, "adv/std_step_conf": 0.9272958040237427, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6418430335097002, "calib/avg_num_step_conf": 13.37890625, "calib/ece": 0.24082304526748965, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00853968253968218, "calib/mean_conf": 0.9815637860082304, "calib/mu_c": 0.9837777777777776, "calib/mu_w": 0.9752380952380955, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24082304526748965, "calib/std_conf": 0.013487650397239634, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.830957345971564, "calib/step_q_c_n": 2110.0, "calib/step_q_gap": 0.004637954336582872, "calib/step_q_w": 0.8263193916349811, "calib/step_q_w_n": 1315.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2860.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 834.53125, "completions/mean_terminated_length": 875.57373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 494.0, "epoch": 0.17173333333333332, "grad_norm": 0.04618752375245094, "kl": 0.12261962890625, "learning_rate": 1.0833333333333335e-06, "loss": -0.2442, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01637279987335205, "mask/share_reasoning": 0.7970735430717468, "mask/share_step_conf": 0.13967867195606232, "num_tokens": 52717824.0, "reward": 0.9100635051727295, "reward_std": 0.20310011506080627, "rewards/accuracy_reward_step": 0.703125, "rewards/asymmetric_l2_reward": 0.7748411893844604, "rewards/final_brier_reward_step": 0.7148171663284302, "rewards/format_reward_step": 0.94921875, "step": 161 }, { "adv/mean_abs_final_conf": 0.4991297721862793, "adv/mean_abs_reasoning": 0.36160850524902344, "adv/mean_abs_step_conf": 0.7166227102279663, "adv/ratio_final_to_reasoning": 1.3803042930158713, "adv/ratio_step_to_reasoning": 1.981763979070295, "adv/std_final_conf": 0.7472751140594482, "adv/std_reasoning": 0.6614494919776917, "adv/std_step_conf": 0.9230762720108032, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6346153846153846, "calib/avg_num_step_conf": 12.59765625, "calib/ece": 0.25020746887966816, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.995850622406639, "calib/gap": 0.008897727272727418, "calib/mean_conf": 0.9804979253112034, "calib/mu_c": 0.9828977272727275, "calib/mu_w": 0.9740000000000001, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25020746887966816, "calib/std_conf": 0.014737009746776298, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8359530791788857, "calib/step_q_c_n": 2046.0, "calib/step_q_gap": 0.03344247697362701, "calib/step_q_w": 0.8025106022052587, "calib/step_q_w_n": 1179.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2886.0, "completions/max_terminated_length": 2886.0, "completions/mean_length": 840.6953125, "completions/mean_terminated_length": 882.0409545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 500.0, "epoch": 0.1728, "grad_norm": 0.03857972472906113, "kl": 0.1229400634765625, "learning_rate": 1.0555555555555557e-06, "loss": -0.1944, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.016243983060121536, "mask/share_reasoning": 0.8029369711875916, "mask/share_step_conf": 0.1339440643787384, "num_tokens": 53037186.0, "reward": 0.9082566499710083, "reward_std": 0.2010715901851654, "rewards/accuracy_reward_step": 0.69140625, "rewards/asymmetric_l2_reward": 0.789810061454773, "rewards/final_brier_reward_step": 0.7001405954360962, "rewards/format_reward_step": 0.94140625, "step": 162 }, { "adv/mean_abs_final_conf": 0.5536430478096008, "adv/mean_abs_reasoning": 0.4788112938404083, "adv/mean_abs_step_conf": 0.7330542206764221, "adv/ratio_final_to_reasoning": 1.156286526512331, "adv/ratio_step_to_reasoning": 1.530987740069379, "adv/std_final_conf": 0.7959635257720947, "adv/std_reasoning": 0.7577126622200012, "adv/std_step_conf": 0.9270132184028625, "calib/answer_extract_rate": 0.90234375, "calib/auroc": 0.6336287313432836, "calib/avg_num_step_conf": 12.34765625, "calib/ece": 0.398, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008017723880596739, "calib/mean_conf": 0.9806086956521739, "calib/mu_c": 0.9839552238805968, "calib/mu_w": 0.9759375, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.398, "calib/std_conf": 0.013911956479276587, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.838726322664925, "calib/step_q_c_n": 1531.0, "calib/step_q_gap": 0.020057611008483023, "calib/step_q_w": 0.818668711656442, "calib/step_q_w_n": 1630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3050.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 901.875, "completions/mean_terminated_length": 958.0083618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 461.0, "epoch": 0.17386666666666667, "grad_norm": 0.023081474006175995, "kl": 0.1114044189453125, "learning_rate": 1.0277777777777777e-06, "loss": -0.2066, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.015411942265927792, "mask/share_reasoning": 0.7985349893569946, "mask/share_step_conf": 0.12745928764343262, "num_tokens": 53372898.0, "reward": 0.745888888835907, "reward_std": 0.23261308670043945, "rewards/accuracy_reward_step": 0.5234375, "rewards/asymmetric_l2_reward": 0.6664301156997681, "rewards/final_brier_reward_step": 0.5409726500511169, "rewards/format_reward_step": 0.8984375, "step": 163 }, { "adv/mean_abs_final_conf": 0.6406596302986145, "adv/mean_abs_reasoning": 0.5433213710784912, "adv/mean_abs_step_conf": 0.7142074704170227, "adv/ratio_final_to_reasoning": 1.1791541146760105, "adv/ratio_step_to_reasoning": 1.3145212178923187, "adv/std_final_conf": 0.8660091161727905, "adv/std_reasoning": 0.8098387718200684, "adv/std_step_conf": 0.9305307269096375, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.6472543123912011, "calib/avg_num_step_conf": 12.48046875, "calib/ece": 0.36359307359307347, "calib/final_conf_rate": 0.90234375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008951574616236524, "calib/mean_conf": 0.9783116883116881, "calib/mu_c": 0.9817605633802815, "calib/mu_w": 0.972808988764045, "calib/nonempty_final_conf_rate": 0.90234375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36359307359307347, "calib/std_conf": 0.014747743013404825, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8337029702970297, "calib/step_q_c_n": 1515.0, "calib/step_q_gap": 0.028054160773219983, "calib/step_q_w": 0.8056488095238097, "calib/step_q_w_n": 1680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 896.9609375, "completions/mean_terminated_length": 968.869140625, "completions/min_length": 0.0, "completions/min_terminated_length": 565.0, "epoch": 0.17493333333333333, "grad_norm": 0.0218285471200943, "kl": 0.1168670654296875, "learning_rate": 1.0000000000000002e-06, "loss": -0.3059, "mask/has_final_conf_rate": 0.90234375, "mask/share_final_conf": 0.014433878473937511, "mask/share_reasoning": 0.7988092303276062, "mask/share_step_conf": 0.1125381588935852, "num_tokens": 53708656.0, "reward": 0.7782033681869507, "reward_std": 0.24336914718151093, "rewards/accuracy_reward_step": 0.5546875, "rewards/asymmetric_l2_reward": 0.6931148171424866, "rewards/final_brier_reward_step": 0.5726667642593384, "rewards/format_reward_step": 0.8984375, "step": 164 }, { "adv/mean_abs_final_conf": 0.5370346307754517, "adv/mean_abs_reasoning": 0.49270907044410706, "adv/mean_abs_step_conf": 0.7504063844680786, "adv/ratio_final_to_reasoning": 1.0899629476912034, "adv/ratio_step_to_reasoning": 1.5230212502312859, "adv/std_final_conf": 0.7757647633552551, "adv/std_reasoning": 0.7753999829292297, "adv/std_step_conf": 0.927631139755249, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6621621621621621, "calib/avg_num_step_conf": 11.73046875, "calib/ece": 0.433155737704918, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.009493327914380734, "calib/mean_conf": 0.9782377049180327, "calib/mu_c": 0.9825563909774436, "calib/mu_w": 0.9730630630630629, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.433155737704918, "calib/std_conf": 0.014590336617139838, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8378965053763441, "calib/step_q_c_n": 1488.0, "calib/step_q_gap": 0.023467462472053557, "calib/step_q_w": 0.8144290429042905, "calib/step_q_w_n": 1515.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2974.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 913.09375, "completions/mean_terminated_length": 946.3643798828125, "completions/min_length": 0.0, "completions/min_terminated_length": 522.0, "epoch": 0.176, "grad_norm": 0.05142681300640106, "kl": 0.124298095703125, "learning_rate": 9.722222222222224e-07, "loss": -0.1896, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.015462111681699753, "mask/share_reasoning": 0.8250299692153931, "mask/share_step_conf": 0.12435169517993927, "num_tokens": 54047984.0, "reward": 0.7622162699699402, "reward_std": 0.23104625940322876, "rewards/accuracy_reward_step": 0.51953125, "rewards/asymmetric_l2_reward": 0.6923520565032959, "rewards/final_brier_reward_step": 0.5383304357528687, "rewards/format_reward_step": 0.94921875, "step": 165 }, { "adv/mean_abs_final_conf": 0.5480644106864929, "adv/mean_abs_reasoning": 0.49569880962371826, "adv/mean_abs_step_conf": 0.7524327039718628, "adv/ratio_final_to_reasoning": 1.1056399572605895, "adv/ratio_step_to_reasoning": 1.5179231609271557, "adv/std_final_conf": 0.8023747801780701, "adv/std_reasoning": 0.775547981262207, "adv/std_step_conf": 0.9304532408714294, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.8121466768525591, "calib/avg_num_step_conf": 12.640625, "calib/ece": 0.33221757322175716, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.018593582887700855, "calib/mean_conf": 0.9765690376569036, "calib/mu_c": 0.9831818181818182, "calib/mu_w": 0.9645882352941173, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33221757322175716, "calib/std_conf": 0.014861510039826178, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8385502283105023, "calib/step_q_c_n": 1752.0, "calib/step_q_gap": 0.027559662272766294, "calib/step_q_w": 0.810990566037736, "calib/step_q_w_n": 1484.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2825.0, "completions/max_terminated_length": 2825.0, "completions/mean_length": 921.578125, "completions/mean_terminated_length": 966.901611328125, "completions/min_length": 0.0, "completions/min_terminated_length": 519.0, "epoch": 0.17706666666666668, "grad_norm": 0.016354931518435478, "kl": 0.12017822265625, "learning_rate": 9.444444444444445e-07, "loss": -0.1826, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.015144339762628078, "mask/share_reasoning": 0.8095753192901611, "mask/share_step_conf": 0.12840533256530762, "num_tokens": 54390092.0, "reward": 0.8301776647567749, "reward_std": 0.25855541229248047, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.728964626789093, "rewards/final_brier_reward_step": 0.6243593692779541, "rewards/format_reward_step": 0.93359375, "step": 166 }, { "adv/mean_abs_final_conf": 0.5595499277114868, "adv/mean_abs_reasoning": 0.46411240100860596, "adv/mean_abs_step_conf": 0.7970150709152222, "adv/ratio_final_to_reasoning": 1.2056345111560836, "adv/ratio_step_to_reasoning": 1.7172888920510514, "adv/std_final_conf": 0.7713521122932434, "adv/std_reasoning": 0.7207490801811218, "adv/std_step_conf": 0.9261698126792908, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5107586512866017, "calib/avg_num_step_conf": 12.05859375, "calib/ece": 0.3246530612244899, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000559006211180324, "calib/mean_conf": 0.981795918367347, "calib/mu_c": 0.9819875776397515, "calib/mu_w": 0.9814285714285712, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3246530612244899, "calib/std_conf": 0.013249535366265032, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.830916890080429, "calib/step_q_c_n": 1865.0, "calib/step_q_gap": -0.014811424158523545, "calib/step_q_w": 0.8457283142389526, "calib/step_q_w_n": 1222.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2656.0, "completions/max_terminated_length": 2656.0, "completions/mean_length": 838.125, "completions/mean_terminated_length": 872.195068359375, "completions/min_length": 0.0, "completions/min_terminated_length": 522.0, "epoch": 0.17813333333333334, "grad_norm": 0.033019956201314926, "kl": 0.1278839111328125, "learning_rate": 9.166666666666666e-07, "loss": -0.1369, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.016273878514766693, "mask/share_reasoning": 0.8090370893478394, "mask/share_step_conf": 0.13562653958797455, "num_tokens": 54710260.0, "reward": 0.8514053821563721, "reward_std": 0.24220512807369232, "rewards/accuracy_reward_step": 0.62890625, "rewards/asymmetric_l2_reward": 0.7450146675109863, "rewards/final_brier_reward_step": 0.640608549118042, "rewards/format_reward_step": 0.95703125, "step": 167 }, { "adv/mean_abs_final_conf": 0.6799944043159485, "adv/mean_abs_reasoning": 0.6203447580337524, "adv/mean_abs_step_conf": 0.7689940929412842, "adv/ratio_final_to_reasoning": 1.096155638473132, "adv/ratio_step_to_reasoning": 1.239623746283746, "adv/std_final_conf": 0.8511113524436951, "adv/std_reasoning": 0.8269323110580444, "adv/std_step_conf": 0.9306549429893494, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.5853057199211045, "calib/avg_num_step_conf": 12.35546875, "calib/ece": 0.3105982905982907, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.005128205128205332, "calib/mean_conf": 0.9772649572649573, "calib/mu_c": 0.9789743589743589, "calib/mu_w": 0.9738461538461536, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3105982905982907, "calib/std_conf": 0.014741297976223637, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8346118721461188, "calib/step_q_c_n": 1752.0, "calib/step_q_gap": 0.04859486293279491, "calib/step_q_w": 0.7860170092133238, "calib/step_q_w_n": 1411.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 932.375, "completions/mean_terminated_length": 982.255126953125, "completions/min_length": 0.0, "completions/min_terminated_length": 532.0, "epoch": 0.1792, "grad_norm": 0.03438134863972664, "kl": 0.1192779541015625, "learning_rate": 8.88888888888889e-07, "loss": -0.1497, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.014473002403974533, "mask/share_reasoning": 0.8152675628662109, "mask/share_step_conf": 0.11947821080684662, "num_tokens": 55053620.0, "reward": 0.8218560218811035, "reward_std": 0.31332412362098694, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.714383065700531, "rewards/final_brier_reward_step": 0.6246414184570312, "rewards/format_reward_step": 0.9140625, "step": 168 }, { "adv/mean_abs_final_conf": 0.5080597400665283, "adv/mean_abs_reasoning": 0.4036991000175476, "adv/mean_abs_step_conf": 0.7368338108062744, "adv/ratio_final_to_reasoning": 1.2585109554230973, "adv/ratio_step_to_reasoning": 1.8252054829308424, "adv/std_final_conf": 0.759669303894043, "adv/std_reasoning": 0.6817628741264343, "adv/std_step_conf": 0.9267999529838562, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6489621064363333, "calib/avg_num_step_conf": 11.41015625, "calib/ece": 0.3732653061224489, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00891961549178033, "calib/mean_conf": 0.9773469387755102, "calib/mu_c": 0.9808783783783783, "calib/mu_w": 0.971958762886598, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3732653061224489, "calib/std_conf": 0.014760055058257152, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8386280487804878, "calib/step_q_c_n": 1640.0, "calib/step_q_gap": 0.016090968374554926, "calib/step_q_w": 0.8225370804059329, "calib/step_q_w_n": 1281.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2865.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 910.59375, "completions/mean_terminated_length": 936.1927490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 516.0, "epoch": 0.18026666666666666, "grad_norm": 11.406038284301758, "kl": 19.868133544921875, "learning_rate": 8.611111111111112e-07, "loss": 0.1358, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.015651900321245193, "mask/share_reasoning": 0.8308191299438477, "mask/share_step_conf": 0.12618522346019745, "num_tokens": 55390916.0, "reward": 0.8216732740402222, "reward_std": 0.18951186537742615, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.738727331161499, "rewards/final_brier_reward_step": 0.5983691215515137, "rewards/format_reward_step": 0.953125, "step": 169 }, { "adv/mean_abs_final_conf": 0.5418843626976013, "adv/mean_abs_reasoning": 0.4897979497909546, "adv/mean_abs_step_conf": 0.7558741569519043, "adv/ratio_final_to_reasoning": 1.1063426519626658, "adv/ratio_step_to_reasoning": 1.5432366698850226, "adv/std_final_conf": 0.785415768623352, "adv/std_reasoning": 0.7576139569282532, "adv/std_step_conf": 0.9276391267776489, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.696646458445246, "calib/avg_num_step_conf": 12.20703125, "calib/ece": 0.3245, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.9958333333333333, "calib/gap": 0.011222469495817733, "calib/mean_conf": 0.9786666666666667, "calib/mu_c": 0.9825477707006369, "calib/mu_w": 0.9713253012048192, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3245, "calib/std_conf": 0.015244306769705509, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.839802036199095, "calib/step_q_c_n": 1768.0, "calib/step_q_gap": 0.04934514600012663, "calib/step_q_w": 0.7904568901989684, "calib/step_q_w_n": 1357.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3046.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 904.96875, "completions/mean_terminated_length": 937.943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 545.0, "epoch": 0.18133333333333335, "grad_norm": 0.01993340626358986, "kl": 0.12060546875, "learning_rate": 8.333333333333333e-07, "loss": -0.1043, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.015275785699486732, "mask/share_reasoning": 0.8242030143737793, "mask/share_step_conf": 0.12536495923995972, "num_tokens": 55726740.0, "reward": 0.8266474008560181, "reward_std": 0.2241785079240799, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.7176440954208374, "rewards/final_brier_reward_step": 0.6270570158958435, "rewards/format_reward_step": 0.9296875, "step": 170 }, { "adv/mean_abs_final_conf": 0.5801984071731567, "adv/mean_abs_reasoning": 0.5012568831443787, "adv/mean_abs_step_conf": 0.7429395914077759, "adv/ratio_final_to_reasoning": 1.1574871621384604, "adv/ratio_step_to_reasoning": 1.4821533955749882, "adv/std_final_conf": 0.8193222880363464, "adv/std_reasoning": 0.7755275368690491, "adv/std_step_conf": 0.9323210120201111, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.6916470933646506, "calib/avg_num_step_conf": 12.890625, "calib/ece": 0.422468085106383, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.010886670581327085, "calib/mean_conf": 0.9799148936170213, "calib/mu_c": 0.9847328244274809, "calib/mu_w": 0.9738461538461538, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.422468085106383, "calib/std_conf": 0.01402100216048866, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8441294196130753, "calib/step_q_c_n": 1499.0, "calib/step_q_gap": 0.035167731661936985, "calib/step_q_w": 0.8089616879511383, "calib/step_q_w_n": 1801.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2960.0, "completions/max_terminated_length": 2960.0, "completions/mean_length": 834.046875, "completions/mean_terminated_length": 897.1260986328125, "completions/min_length": 0.0, "completions/min_terminated_length": 570.0, "epoch": 0.1824, "grad_norm": 0.019295768812298775, "kl": 0.1244049072265625, "learning_rate": 8.055555555555557e-07, "loss": -0.2481, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.015632905066013336, "mask/share_reasoning": 0.7890222668647766, "mask/share_step_conf": 0.12503235042095184, "num_tokens": 56047152.0, "reward": 0.7329943180084229, "reward_std": 0.2551029324531555, "rewards/accuracy_reward_step": 0.51171875, "rewards/asymmetric_l2_reward": 0.6523202061653137, "rewards/final_brier_reward_step": 0.5285121202468872, "rewards/format_reward_step": 0.9140625, "step": 171 }, { "adv/mean_abs_final_conf": 0.5102229714393616, "adv/mean_abs_reasoning": 0.43718355894088745, "adv/mean_abs_step_conf": 0.7247519493103027, "adv/ratio_final_to_reasoning": 1.1670680678738652, "adv/ratio_step_to_reasoning": 1.6577749425574764, "adv/std_final_conf": 0.7555689811706543, "adv/std_reasoning": 0.7207115888595581, "adv/std_step_conf": 0.9293190240859985, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6016486060942854, "calib/avg_num_step_conf": 12.12890625, "calib/ece": 0.2252066115702479, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0061165138464384095, "calib/mean_conf": 0.981404958677686, "calib/mu_c": 0.9828961748633879, "calib/mu_w": 0.9767796610169495, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2252066115702479, "calib/std_conf": 0.01362458795337016, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8448167539267016, "calib/step_q_c_n": 2101.0, "calib/step_q_gap": 0.019657391376900812, "calib/step_q_w": 0.8251593625498008, "calib/step_q_w_n": 1004.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2751.0, "completions/max_terminated_length": 2751.0, "completions/mean_length": 829.60546875, "completions/mean_terminated_length": 870.4057006835938, "completions/min_length": 0.0, "completions/min_terminated_length": 521.0, "epoch": 0.18346666666666667, "grad_norm": 0.018698470667004585, "kl": 0.1277313232421875, "learning_rate": 7.777777777777779e-07, "loss": -0.1593, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.016454197466373444, "mask/share_reasoning": 0.8024311065673828, "mask/share_step_conf": 0.13423970341682434, "num_tokens": 56362883.0, "reward": 0.91644287109375, "reward_std": 0.2156335860490799, "rewards/accuracy_reward_step": 0.71484375, "rewards/asymmetric_l2_reward": 0.776668906211853, "rewards/final_brier_reward_step": 0.7249667644500732, "rewards/format_reward_step": 0.94140625, "step": 172 }, { "adv/mean_abs_final_conf": 0.6531821489334106, "adv/mean_abs_reasoning": 0.4964807629585266, "adv/mean_abs_step_conf": 0.7733575701713562, "adv/ratio_final_to_reasoning": 1.3156242853018134, "adv/ratio_step_to_reasoning": 1.5576788223634728, "adv/std_final_conf": 0.8502808213233948, "adv/std_reasoning": 0.7576800584793091, "adv/std_step_conf": 0.9313940405845642, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6137486645299145, "calib/avg_num_step_conf": 12.765625, "calib/ece": 0.4017741935483873, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.006816239316239203, "calib/mean_conf": 0.9824193548387099, "calib/mu_c": 0.9852777777777777, "calib/mu_w": 0.9784615384615385, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4017741935483873, "calib/std_conf": 0.013036608983064408, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8396873190503764, "calib/step_q_c_n": 1727.0, "calib/step_q_gap": 0.0056444897187734044, "calib/step_q_w": 0.834042829331603, "calib/step_q_w_n": 1541.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2124.0, "completions/max_terminated_length": 2124.0, "completions/mean_length": 893.078125, "completions/mean_terminated_length": 914.5120239257812, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.18453333333333333, "grad_norm": 0.017594095319509506, "kl": 0.1277618408203125, "learning_rate": 7.5e-07, "loss": -0.152, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.015863744542002678, "mask/share_reasoning": 0.8241561055183411, "mask/share_step_conf": 0.13654263317584991, "num_tokens": 56694671.0, "reward": 0.800077497959137, "reward_std": 0.2479168325662613, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.7143690586090088, "rewards/final_brier_reward_step": 0.579535961151123, "rewards/format_reward_step": 0.96875, "step": 173 }, { "adv/mean_abs_final_conf": 0.7132623195648193, "adv/mean_abs_reasoning": 0.6211596727371216, "adv/mean_abs_step_conf": 0.7582811117172241, "adv/ratio_final_to_reasoning": 1.1482753161064854, "adv/ratio_step_to_reasoning": 1.220750710322003, "adv/std_final_conf": 0.8836770057678223, "adv/std_reasoning": 0.8431268930435181, "adv/std_step_conf": 0.9293094277381897, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5396544245228455, "calib/avg_num_step_conf": 12.86328125, "calib/ece": 0.41666666666666663, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.002333718912666205, "calib/mean_conf": 0.9778481012658228, "calib/mu_c": 0.9788721804511278, "calib/mu_w": 0.9765384615384616, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41666666666666663, "calib/std_conf": 0.01472712868075848, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8338524077548467, "calib/step_q_c_n": 1599.0, "calib/step_q_gap": 0.007913800907148882, "calib/step_q_w": 0.8259386068476978, "calib/step_q_w_n": 1694.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 937.640625, "completions/mean_terminated_length": 991.88427734375, "completions/min_length": 0.0, "completions/min_terminated_length": 579.0, "epoch": 0.1856, "grad_norm": 0.03041265159845352, "kl": 0.1407012939453125, "learning_rate": 7.222222222222222e-07, "loss": -0.2731, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.014524579979479313, "mask/share_reasoning": 0.8076785206794739, "mask/share_step_conf": 0.12310938537120819, "num_tokens": 57038939.0, "reward": 0.7551385164260864, "reward_std": 0.3065335750579834, "rewards/accuracy_reward_step": 0.5234375, "rewards/asymmetric_l2_reward": 0.6824946403503418, "rewards/final_brier_reward_step": 0.53793865442276, "rewards/format_reward_step": 0.92578125, "step": 174 }, { "adv/mean_abs_final_conf": 0.6171407699584961, "adv/mean_abs_reasoning": 0.5069169998168945, "adv/mean_abs_step_conf": 0.7473276257514954, "adv/ratio_final_to_reasoning": 1.217439482561082, "adv/ratio_step_to_reasoning": 1.474260334574379, "adv/std_final_conf": 0.8295073509216309, "adv/std_reasoning": 0.7755605578422546, "adv/std_step_conf": 0.929857075214386, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.6424601433377212, "calib/avg_num_step_conf": 12.9609375, "calib/ece": 0.5252340425531915, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008462776071376243, "calib/mean_conf": 0.9762978723404255, "calib/mu_c": 0.980943396226415, "calib/mu_w": 0.9724806201550388, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5252340425531915, "calib/std_conf": 0.014915242658373733, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8433551020408164, "calib/step_q_c_n": 1225.0, "calib/step_q_gap": 0.01862027165381208, "calib/step_q_w": 0.8247348303870043, "calib/step_q_w_n": 2093.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2957.0, "completions/max_terminated_length": 2957.0, "completions/mean_length": 930.4140625, "completions/mean_terminated_length": 988.32373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 501.0, "epoch": 0.18666666666666668, "grad_norm": 0.018979249522089958, "kl": 0.115966796875, "learning_rate": 6.944444444444446e-07, "loss": -0.2135, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.014532214030623436, "mask/share_reasoning": 0.8057027459144592, "mask/share_step_conf": 0.12117130309343338, "num_tokens": 57382949.0, "reward": 0.6719663143157959, "reward_std": 0.24395278096199036, "rewards/accuracy_reward_step": 0.4140625, "rewards/asymmetric_l2_reward": 0.63644939661026, "rewards/final_brier_reward_step": 0.44107693433761597, "rewards/format_reward_step": 0.91796875, "step": 175 }, { "adv/mean_abs_final_conf": 0.5018015503883362, "adv/mean_abs_reasoning": 0.41027840971946716, "adv/mean_abs_step_conf": 0.7303222417831421, "adv/ratio_final_to_reasoning": 1.2230756932382796, "adv/ratio_step_to_reasoning": 1.7800650106899576, "adv/std_final_conf": 0.7439115047454834, "adv/std_reasoning": 0.6816770434379578, "adv/std_step_conf": 0.9248234033584595, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.629388908512893, "calib/avg_num_step_conf": 11.5703125, "calib/ece": 0.36909836065573776, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00769904627340201, "calib/mean_conf": 0.9797540983606557, "calib/mu_c": 0.9827516778523488, "calib/mu_w": 0.9750526315789468, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36909836065573776, "calib/std_conf": 0.014139997609043935, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.840911664779162, "calib/step_q_c_n": 1766.0, "calib/step_q_gap": 0.011555477488192079, "calib/step_q_w": 0.8293561872909699, "calib/step_q_w_n": 1196.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2610.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 879.80078125, "completions/mean_terminated_length": 908.181396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.18773333333333334, "grad_norm": 0.01840066723525524, "kl": 0.121307373046875, "learning_rate": 6.666666666666667e-07, "loss": -0.1031, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.01617918536067009, "mask/share_reasoning": 0.818038821220398, "mask/share_step_conf": 0.13453197479248047, "num_tokens": 57712242.0, "reward": 0.829666256904602, "reward_std": 0.19717296957969666, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.7515543699264526, "rewards/final_brier_reward_step": 0.5999656319618225, "rewards/format_reward_step": 0.953125, "step": 176 }, { "adv/mean_abs_final_conf": 0.5797469019889832, "adv/mean_abs_reasoning": 0.4305408000946045, "adv/mean_abs_step_conf": 0.7243553400039673, "adv/ratio_final_to_reasoning": 1.346555081101705, "adv/ratio_step_to_reasoning": 1.6824313510933266, "adv/std_final_conf": 0.8125232458114624, "adv/std_reasoning": 0.7207408547401428, "adv/std_step_conf": 0.9289398193359375, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7121413782904467, "calib/avg_num_step_conf": 13.609375, "calib/ece": 0.3640585774058577, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.012709257616089986, "calib/mean_conf": 0.9791213389121338, "calib/mu_c": 0.9840136054421768, "calib/mu_w": 0.9713043478260868, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3640585774058577, "calib/std_conf": 0.014480614825990485, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8476618911174786, "calib/step_q_c_n": 1745.0, "calib/step_q_gap": 0.02403911940960035, "calib/step_q_w": 0.8236227717078782, "calib/step_q_w_n": 1739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 909.4765625, "completions/mean_terminated_length": 954.2048950195312, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.1888, "grad_norm": 0.04756280779838562, "kl": 0.1220703125, "learning_rate": 6.388888888888889e-07, "loss": -0.2128, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.015417687594890594, "mask/share_reasoning": 0.8047745227813721, "mask/share_step_conf": 0.13293278217315674, "num_tokens": 58048900.0, "reward": 0.795788049697876, "reward_std": 0.21256600320339203, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.6957724690437317, "rewards/final_brier_reward_step": 0.5942409634590149, "rewards/format_reward_step": 0.93359375, "step": 177 }, { "adv/mean_abs_final_conf": 0.6708763837814331, "adv/mean_abs_reasoning": 0.6096030473709106, "adv/mean_abs_step_conf": 0.7516494989395142, "adv/ratio_final_to_reasoning": 1.1005135008343239, "adv/ratio_step_to_reasoning": 1.2330146677927873, "adv/std_final_conf": 0.8560056090354919, "adv/std_reasoning": 0.8266636729240417, "adv/std_step_conf": 0.9325826168060303, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7006932654216185, "calib/avg_num_step_conf": 11.84765625, "calib/ece": 0.35873469387755097, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.011963780418789205, "calib/mean_conf": 0.9791428571428571, "calib/mu_c": 0.9836842105263158, "calib/mu_w": 0.9717204301075266, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35873469387755097, "calib/std_conf": 0.014444828182588057, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8468819729888433, "calib/step_q_c_n": 1703.0, "calib/step_q_gap": 0.027904529379820686, "calib/step_q_w": 0.8189774436090226, "calib/step_q_w_n": 1330.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2968.0, "completions/max_terminated_length": 2968.0, "completions/mean_length": 843.47265625, "completions/mean_terminated_length": 877.7601318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 503.0, "epoch": 0.18986666666666666, "grad_norm": 0.02961582876741886, "kl": 0.1240234375, "learning_rate": 6.111111111111112e-07, "loss": -0.1428, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.016271937638521194, "mask/share_reasoning": 0.8166067600250244, "mask/share_step_conf": 0.12805885076522827, "num_tokens": 58370901.0, "reward": 0.8209068775177002, "reward_std": 0.28923019766807556, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.7179765701293945, "rewards/final_brier_reward_step": 0.6136808395385742, "rewards/format_reward_step": 0.95703125, "step": 178 }, { "adv/mean_abs_final_conf": 0.6773865222930908, "adv/mean_abs_reasoning": 0.6010725498199463, "adv/mean_abs_step_conf": 0.7447128891944885, "adv/ratio_final_to_reasoning": 1.1269629972222233, "adv/ratio_step_to_reasoning": 1.238973380863209, "adv/std_final_conf": 0.8693088889122009, "adv/std_reasoning": 0.8430590629577637, "adv/std_step_conf": 0.9291117787361145, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.6356553266990665, "calib/avg_num_step_conf": 12.76953125, "calib/ece": 0.3469915254237288, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0081238910745971, "calib/mean_conf": 0.9783474576271186, "calib/mu_c": 0.9813422818791945, "calib/mu_w": 0.9732183908045974, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3469915254237288, "calib/std_conf": 0.014592706048143606, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8434924199887703, "calib/step_q_c_n": 1781.0, "calib/step_q_gap": 0.018707473752211046, "calib/step_q_w": 0.8247849462365593, "calib/step_q_w_n": 1488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2946.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 896.1953125, "completions/mean_terminated_length": 951.9751586914062, "completions/min_length": 0.0, "completions/min_terminated_length": 502.0, "epoch": 0.19093333333333334, "grad_norm": 0.02130270004272461, "kl": 0.1211700439453125, "learning_rate": 5.833333333333334e-07, "loss": -0.2417, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.015163568779826164, "mask/share_reasoning": 0.7996172904968262, "mask/share_step_conf": 0.1266254037618637, "num_tokens": 58706591.0, "reward": 0.8096022605895996, "reward_std": 0.2978968620300293, "rewards/accuracy_reward_step": 0.5859375, "rewards/asymmetric_l2_reward": 0.7180361747741699, "rewards/final_brier_reward_step": 0.5996058583259583, "rewards/format_reward_step": 0.921875, "step": 179 }, { "adv/mean_abs_final_conf": 0.5014601349830627, "adv/mean_abs_reasoning": 0.4010167121887207, "adv/mean_abs_step_conf": 0.7332862615585327, "adv/ratio_final_to_reasoning": 1.2504719123702575, "adv/ratio_step_to_reasoning": 1.828567835879728, "adv/std_final_conf": 0.7621612548828125, "adv/std_reasoning": 0.701465368270874, "adv/std_step_conf": 0.9304397702217102, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5817487755690004, "calib/avg_num_step_conf": 13.08984375, "calib/ece": 0.34236734693877546, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.004942379717660539, "calib/mean_conf": 0.9791020408163265, "calib/mu_c": 0.9808974358974358, "calib/mu_w": 0.9759550561797753, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34236734693877546, "calib/std_conf": 0.014314606352210409, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.832957992998833, "calib/step_q_c_n": 1714.0, "calib/step_q_gap": -0.008196558009108568, "calib/step_q_w": 0.8411545510079416, "calib/step_q_w_n": 1637.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 940.8671875, "completions/mean_terminated_length": 971.2177124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 523.0, "epoch": 0.192, "grad_norm": 0.014338175766170025, "kl": 0.1213226318359375, "learning_rate": 5.555555555555555e-07, "loss": -0.1398, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.015053339302539825, "mask/share_reasoning": 0.8310614228248596, "mask/share_step_conf": 0.12263523042201996, "num_tokens": 59051309.0, "reward": 0.8462561368942261, "reward_std": 0.19401851296424866, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.7546102404594421, "rewards/final_brier_reward_step": 0.6254019737243652, "rewards/format_reward_step": 0.953125, "step": 180 }, { "adv/mean_abs_final_conf": 0.693315863609314, "adv/mean_abs_reasoning": 0.5807839632034302, "adv/mean_abs_step_conf": 0.7269697189331055, "adv/ratio_final_to_reasoning": 1.1937586220273568, "adv/ratio_step_to_reasoning": 1.2517041877729518, "adv/std_final_conf": 0.9083855152130127, "adv/std_reasoning": 0.8429928421974182, "adv/std_step_conf": 0.9324417114257812, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6643928571428572, "calib/avg_num_step_conf": 11.55078125, "calib/ece": 0.39537500000000003, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.009785714285714286, "calib/mean_conf": 0.9787083333333334, "calib/mu_c": 0.9827857142857142, "calib/mu_w": 0.9729999999999999, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39537500000000003, "calib/std_conf": 0.014476933280989542, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8407907845579078, "calib/step_q_c_n": 1606.0, "calib/step_q_gap": 0.017674574343251814, "calib/step_q_w": 0.823116210214656, "calib/step_q_w_n": 1351.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2875.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 864.91796875, "completions/mean_terminated_length": 911.1892700195312, "completions/min_length": 0.0, "completions/min_terminated_length": 553.0, "epoch": 0.19306666666666666, "grad_norm": 0.031241774559020996, "kl": 0.1353912353515625, "learning_rate": 5.277777777777779e-07, "loss": -0.1777, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.01560254767537117, "mask/share_reasoning": 0.8068452477455139, "mask/share_step_conf": 0.12677091360092163, "num_tokens": 59378992.0, "reward": 0.7764517664909363, "reward_std": 0.28926050662994385, "rewards/accuracy_reward_step": 0.546875, "rewards/asymmetric_l2_reward": 0.6886812448501587, "rewards/final_brier_reward_step": 0.5673472881317139, "rewards/format_reward_step": 0.9375, "step": 181 }, { "adv/mean_abs_final_conf": 0.6113361120223999, "adv/mean_abs_reasoning": 0.4344509243965149, "adv/mean_abs_step_conf": 0.7469823360443115, "adv/ratio_final_to_reasoning": 1.407146532998156, "adv/ratio_step_to_reasoning": 1.7193710361692205, "adv/std_final_conf": 0.8014238476753235, "adv/std_reasoning": 0.7014479041099548, "adv/std_step_conf": 0.9269827604293823, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6751508295625943, "calib/avg_num_step_conf": 11.79296875, "calib/ece": 0.33224066390041485, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.010199849170437703, "calib/mean_conf": 0.9795435684647302, "calib/mu_c": 0.9831410256410257, "calib/mu_w": 0.972941176470588, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33224066390041485, "calib/std_conf": 0.014149438434543887, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8405216881594373, "calib/step_q_c_n": 1706.0, "calib/step_q_gap": -0.0003389363645535104, "calib/step_q_w": 0.8408606245239908, "calib/step_q_w_n": 1313.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 863.76953125, "completions/mean_terminated_length": 906.2499389648438, "completions/min_length": 0.0, "completions/min_terminated_length": 561.0, "epoch": 0.19413333333333332, "grad_norm": 0.022411292418837547, "kl": 0.121917724609375, "learning_rate": 5.000000000000001e-07, "loss": -0.2028, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.01571817323565483, "mask/share_reasoning": 0.8114913105964661, "mask/share_step_conf": 0.1259155571460724, "num_tokens": 59706277.0, "reward": 0.8176075220108032, "reward_std": 0.19320210814476013, "rewards/accuracy_reward_step": 0.609375, "rewards/asymmetric_l2_reward": 0.6993848085403442, "rewards/final_brier_reward_step": 0.6264550685882568, "rewards/format_reward_step": 0.9375, "step": 182 }, { "adv/mean_abs_final_conf": 0.6075547337532043, "adv/mean_abs_reasoning": 0.5296026468276978, "adv/mean_abs_step_conf": 0.7353948354721069, "adv/ratio_final_to_reasoning": 1.147189760837558, "adv/ratio_step_to_reasoning": 1.3885784745924092, "adv/std_final_conf": 0.82039874792099, "adv/std_reasoning": 0.7929397225379944, "adv/std_step_conf": 0.932217538356781, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7287499999999999, "calib/avg_num_step_conf": 12.2890625, "calib/ece": 0.39350000000000007, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.013600000000000168, "calib/mean_conf": 0.9768333333333334, "calib/mu_c": 0.9825, "calib/mu_w": 0.9688999999999999, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39350000000000007, "calib/std_conf": 0.014887541398393801, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8456774193548389, "calib/step_q_c_n": 1550.0, "calib/step_q_gap": 0.03122879780095411, "calib/step_q_w": 0.8144486215538848, "calib/step_q_w_n": 1596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 941.71484375, "completions/mean_terminated_length": 988.0286254882812, "completions/min_length": 0.0, "completions/min_terminated_length": 546.0, "epoch": 0.1952, "grad_norm": 0.041146308183670044, "kl": 0.1220550537109375, "learning_rate": 4.7222222222222226e-07, "loss": -0.2376, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.014763625338673592, "mask/share_reasoning": 0.8222149610519409, "mask/share_step_conf": 0.11614643037319183, "num_tokens": 60054036.0, "reward": 0.7858456373214722, "reward_std": 0.25782373547554016, "rewards/accuracy_reward_step": 0.546875, "rewards/asymmetric_l2_reward": 0.7043553590774536, "rewards/final_brier_reward_step": 0.5704609155654907, "rewards/format_reward_step": 0.9375, "step": 183 }, { "adv/mean_abs_final_conf": 0.6665743589401245, "adv/mean_abs_reasoning": 0.5123068690299988, "adv/mean_abs_step_conf": 0.7528473138809204, "adv/ratio_final_to_reasoning": 1.3011232119573481, "adv/ratio_step_to_reasoning": 1.4695241453747452, "adv/std_final_conf": 0.8536959290504456, "adv/std_reasoning": 0.7754993438720703, "adv/std_step_conf": 0.9273887276649475, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5960636952154178, "calib/avg_num_step_conf": 11.93359375, "calib/ece": 0.3506250000000002, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.005740754520425551, "calib/mean_conf": 0.9797916666666668, "calib/mu_c": 0.9819205298013244, "calib/mu_w": 0.9761797752808988, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3506250000000002, "calib/std_conf": 0.014214075086179752, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8426749226006193, "calib/step_q_c_n": 1615.0, "calib/step_q_gap": 0.015758255933952636, "calib/step_q_w": 0.8269166666666666, "calib/step_q_w_n": 1440.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 904.36328125, "completions/mean_terminated_length": 944.96728515625, "completions/min_length": 0.0, "completions/min_terminated_length": 442.0, "epoch": 0.19626666666666667, "grad_norm": 0.03529198095202446, "kl": 0.120391845703125, "learning_rate": 4.444444444444445e-07, "loss": -0.1466, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.015524040907621384, "mask/share_reasoning": 0.8197634816169739, "mask/share_step_conf": 0.12174372375011444, "num_tokens": 60390833.0, "reward": 0.8256651759147644, "reward_std": 0.25622591376304626, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.7392463088035583, "rewards/final_brier_reward_step": 0.6058340072631836, "rewards/format_reward_step": 0.9375, "step": 184 }, { "adv/mean_abs_final_conf": 0.5692657232284546, "adv/mean_abs_reasoning": 0.5020483136177063, "adv/mean_abs_step_conf": 0.7326884865760803, "adv/ratio_final_to_reasoning": 1.1338863368076806, "adv/ratio_step_to_reasoning": 1.4593983620747686, "adv/std_final_conf": 0.798140823841095, "adv/std_reasoning": 0.7755592465400696, "adv/std_step_conf": 0.9297898411750793, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.7315218222120309, "calib/avg_num_step_conf": 12.76953125, "calib/ece": 0.35533898305084755, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.013738439195903118, "calib/mean_conf": 0.9782203389830509, "calib/mu_c": 0.9834013605442177, "calib/mu_w": 0.9696629213483146, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35533898305084755, "calib/std_conf": 0.014592275407900104, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8490630524454921, "calib/step_q_c_n": 1697.0, "calib/step_q_gap": 0.049800965931497077, "calib/step_q_w": 0.799262086513995, "calib/step_q_w_n": 1572.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2965.0, "completions/max_terminated_length": 2965.0, "completions/mean_length": 917.31640625, "completions/mean_terminated_length": 970.38427734375, "completions/min_length": 0.0, "completions/min_terminated_length": 508.0, "epoch": 0.19733333333333333, "grad_norm": 0.01986088417470455, "kl": 0.1129608154296875, "learning_rate": 4.1666666666666667e-07, "loss": -0.1787, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.014929613098502159, "mask/share_reasoning": 0.8069747686386108, "mask/share_step_conf": 0.12340810149908066, "num_tokens": 60732586.0, "reward": 0.7960665225982666, "reward_std": 0.26516661047935486, "rewards/accuracy_reward_step": 0.57421875, "rewards/asymmetric_l2_reward": 0.698235273361206, "rewards/final_brier_reward_step": 0.5946788787841797, "rewards/format_reward_step": 0.921875, "step": 185 }, { "adv/mean_abs_final_conf": 0.617626965045929, "adv/mean_abs_reasoning": 0.5716798305511475, "adv/mean_abs_step_conf": 0.7495993375778198, "adv/ratio_final_to_reasoning": 1.080372145455061, "adv/ratio_step_to_reasoning": 1.3112222917767502, "adv/std_final_conf": 0.8382736444473267, "adv/std_reasoning": 0.8267985582351685, "adv/std_step_conf": 0.9307035207748413, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.7566045066045066, "calib/avg_num_step_conf": 13.01171875, "calib/ece": 0.3665665236051502, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.015153846153846184, "calib/mean_conf": 0.9803004291845493, "calib/mu_c": 0.986153846153846, "calib/mu_w": 0.9709999999999999, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3665665236051502, "calib/std_conf": 0.013940241861726809, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8402585722315907, "calib/step_q_c_n": 1779.0, "calib/step_q_gap": 0.023061407283136992, "calib/step_q_w": 0.8171971649484537, "calib/step_q_w_n": 1552.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 869.046875, "completions/mean_terminated_length": 938.7172241210938, "completions/min_length": 0.0, "completions/min_terminated_length": 522.0, "epoch": 0.1984, "grad_norm": 0.05723670870065689, "kl": 0.1216888427734375, "learning_rate": 3.8888888888888895e-07, "loss": -0.27, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.01483369804918766, "mask/share_reasoning": 0.7815354466438293, "mask/share_step_conf": 0.12941214442253113, "num_tokens": 61060102.0, "reward": 0.7855908870697021, "reward_std": 0.3000542223453522, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.702877402305603, "rewards/final_brier_reward_step": 0.5745543241500854, "rewards/format_reward_step": 0.90625, "step": 186 }, { "adv/mean_abs_final_conf": 0.6157701015472412, "adv/mean_abs_reasoning": 0.5407514572143555, "adv/mean_abs_step_conf": 0.7796497941017151, "adv/ratio_final_to_reasoning": 1.138730360005573, "adv/ratio_step_to_reasoning": 1.4417895387985236, "adv/std_final_conf": 0.8065946102142334, "adv/std_reasoning": 0.7578535676002502, "adv/std_step_conf": 0.9338079690933228, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.6219548872180451, "calib/avg_num_step_conf": 11.609375, "calib/ece": 0.38578723404255333, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.007165413533834819, "calib/mean_conf": 0.9815319148936171, "calib/mu_c": 0.9844285714285714, "calib/mu_w": 0.9772631578947366, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.38578723404255333, "calib/std_conf": 0.013471563531853206, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8369925280199253, "calib/step_q_c_n": 1606.0, "calib/step_q_gap": 0.013112586585078967, "calib/step_q_w": 0.8238799414348463, "calib/step_q_w_n": 1366.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3036.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 948.04296875, "completions/mean_terminated_length": 990.6080932617188, "completions/min_length": 0.0, "completions/min_terminated_length": 361.0, "epoch": 0.19946666666666665, "grad_norm": 0.036436568945646286, "kl": 0.1116943359375, "learning_rate": 3.611111111111111e-07, "loss": -0.0918, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.014702420681715012, "mask/share_reasoning": 0.8228905200958252, "mask/share_step_conf": 0.11943836510181427, "num_tokens": 61404345.0, "reward": 0.7795661091804504, "reward_std": 0.2637999355792999, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.703906774520874, "rewards/final_brier_reward_step": 0.5630378723144531, "rewards/format_reward_step": 0.91015625, "step": 187 }, { "adv/mean_abs_final_conf": 0.5346688628196716, "adv/mean_abs_reasoning": 0.4271719455718994, "adv/mean_abs_step_conf": 0.7676730155944824, "adv/ratio_final_to_reasoning": 1.2516478864356482, "adv/ratio_step_to_reasoning": 1.7971054128255517, "adv/std_final_conf": 0.7731443643569946, "adv/std_reasoning": 0.7208114266395569, "adv/std_step_conf": 0.9298895001411438, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6706921413121846, "calib/avg_num_step_conf": 11.53125, "calib/ece": 0.374813278008299, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.010062725306417053, "calib/mean_conf": 0.9806224066390044, "calib/mu_c": 0.9845890410958904, "calib/mu_w": 0.9745263157894734, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.374813278008299, "calib/std_conf": 0.013756422717855155, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8384236760124611, "calib/step_q_c_n": 1605.0, "calib/step_q_gap": 0.019559533473485602, "calib/step_q_w": 0.8188641425389755, "calib/step_q_w_n": 1347.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2856.0, "completions/max_terminated_length": 2856.0, "completions/mean_length": 942.8203125, "completions/mean_terminated_length": 985.1510009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 474.0, "epoch": 0.20053333333333334, "grad_norm": 0.023577481508255005, "kl": 0.1107635498046875, "learning_rate": 3.3333333333333335e-07, "loss": -0.1837, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.015136613510549068, "mask/share_reasoning": 0.8198684453964233, "mask/share_step_conf": 0.12202617526054382, "num_tokens": 61749779.0, "reward": 0.7894806861877441, "reward_std": 0.23784969747066498, "rewards/accuracy_reward_step": 0.5703125, "rewards/asymmetric_l2_reward": 0.6890180110931396, "rewards/final_brier_reward_step": 0.5883808135986328, "rewards/format_reward_step": 0.9375, "step": 188 }, { "adv/mean_abs_final_conf": 0.6499653458595276, "adv/mean_abs_reasoning": 0.5608536601066589, "adv/mean_abs_step_conf": 0.753533124923706, "adv/ratio_final_to_reasoning": 1.158885805855171, "adv/ratio_step_to_reasoning": 1.3435467725759422, "adv/std_final_conf": 0.8551369905471802, "adv/std_reasoning": 0.8101358413696289, "adv/std_step_conf": 0.9269534945487976, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.6467896962706651, "calib/avg_num_step_conf": 12.296875, "calib/ece": 0.3350840336134453, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008692810457516909, "calib/mean_conf": 0.9779411764705882, "calib/mu_c": 0.9810457516339871, "calib/mu_w": 0.9723529411764702, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3350840336134453, "calib/std_conf": 0.014594113074491486, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8391402714932126, "calib/step_q_c_n": 1768.0, "calib/step_q_gap": 0.018480851203357518, "calib/step_q_w": 0.8206594202898551, "calib/step_q_w_n": 1380.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 857.04296875, "completions/mean_terminated_length": 906.6239013671875, "completions/min_length": 0.0, "completions/min_terminated_length": 552.0, "epoch": 0.2016, "grad_norm": 0.025642195716500282, "kl": 0.130401611328125, "learning_rate": 3.055555555555556e-07, "loss": -0.2976, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.01532869040966034, "mask/share_reasoning": 0.8045191764831543, "mask/share_step_conf": 0.12546461820602417, "num_tokens": 62076950.0, "reward": 0.8125067949295044, "reward_std": 0.29982349276542664, "rewards/accuracy_reward_step": 0.59765625, "rewards/asymmetric_l2_reward": 0.7050384879112244, "rewards/final_brier_reward_step": 0.6152874827384949, "rewards/format_reward_step": 0.92578125, "step": 189 }, { "adv/mean_abs_final_conf": 0.6154840588569641, "adv/mean_abs_reasoning": 0.5056318044662476, "adv/mean_abs_step_conf": 0.741155743598938, "adv/ratio_final_to_reasoning": 1.217257406318968, "adv/ratio_step_to_reasoning": 1.4658012748650433, "adv/std_final_conf": 0.830213725566864, "adv/std_reasoning": 0.7755312323570251, "adv/std_step_conf": 0.9308255314826965, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6579521463757916, "calib/avg_num_step_conf": 12.359375, "calib/ece": 0.37893004115226336, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.009448275862068867, "calib/mean_conf": 0.9756378600823045, "calib/mu_c": 0.9794482758620688, "calib/mu_w": 0.97, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37893004115226336, "calib/std_conf": 0.014958946796078698, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8391810620601406, "calib/step_q_c_n": 1563.0, "calib/step_q_gap": 0.0027101064074235293, "calib/step_q_w": 0.8364709556527171, "calib/step_q_w_n": 1601.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2540.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 917.296875, "completions/mean_terminated_length": 958.4815673828125, "completions/min_length": 0.0, "completions/min_terminated_length": 524.0, "epoch": 0.20266666666666666, "grad_norm": 0.022940604016184807, "kl": 0.11981201171875, "learning_rate": 2.7777777777777776e-07, "loss": -0.1979, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01512821763753891, "mask/share_reasoning": 0.8211517333984375, "mask/share_step_conf": 0.12075131386518478, "num_tokens": 62417386.0, "reward": 0.8009210824966431, "reward_std": 0.24404773116111755, "rewards/accuracy_reward_step": 0.56640625, "rewards/asymmetric_l2_reward": 0.7101178169250488, "rewards/final_brier_reward_step": 0.5885992050170898, "rewards/format_reward_step": 0.94921875, "step": 190 }, { "adv/mean_abs_final_conf": 0.6485990881919861, "adv/mean_abs_reasoning": 0.517936646938324, "adv/mean_abs_step_conf": 0.7152957320213318, "adv/ratio_final_to_reasoning": 1.2522749491198322, "adv/ratio_step_to_reasoning": 1.3810486982329895, "adv/std_final_conf": 0.8772525191307068, "adv/std_reasoning": 0.7929953336715698, "adv/std_step_conf": 0.9332661628723145, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.6184525847312564, "calib/avg_num_step_conf": 12.9140625, "calib/ece": 0.45429752066115714, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.007215337213282713, "calib/mean_conf": 0.9790909090909092, "calib/mu_c": 0.9825196850393698, "calib/mu_w": 0.9753043478260871, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.45429752066115714, "calib/std_conf": 0.014431370787625051, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8464498269896195, "calib/step_q_c_n": 1445.0, "calib/step_q_gap": 0.010404689966513558, "calib/step_q_w": 0.8360451370231059, "calib/step_q_w_n": 1861.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 893.30078125, "completions/mean_terminated_length": 941.0905151367188, "completions/min_length": 0.0, "completions/min_terminated_length": 548.0, "epoch": 0.20373333333333332, "grad_norm": 0.017885025590658188, "kl": 0.120147705078125, "learning_rate": 2.5000000000000004e-07, "loss": -0.2514, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.01538168266415596, "mask/share_reasoning": 0.8069976568222046, "mask/share_step_conf": 0.12683944404125214, "num_tokens": 62750239.0, "reward": 0.7270830869674683, "reward_std": 0.26644009351730347, "rewards/accuracy_reward_step": 0.49609375, "rewards/asymmetric_l2_reward": 0.6501602530479431, "rewards/final_brier_reward_step": 0.5172871351242065, "rewards/format_reward_step": 0.9375, "step": 191 }, { "adv/mean_abs_final_conf": 0.6051337718963623, "adv/mean_abs_reasoning": 0.5304586887359619, "adv/mean_abs_step_conf": 0.7536708116531372, "adv/ratio_final_to_reasoning": 1.1407745499246036, "adv/ratio_step_to_reasoning": 1.420790775336475, "adv/std_final_conf": 0.8207889199256897, "adv/std_reasoning": 0.7756054997444153, "adv/std_step_conf": 0.9285756945610046, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7128837072018891, "calib/avg_num_step_conf": 12.7109375, "calib/ece": 0.34314049586776874, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.012792207792207688, "calib/mean_conf": 0.9795041322314051, "calib/mu_c": 0.9841558441558441, "calib/mu_w": 0.9713636363636364, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34314049586776874, "calib/std_conf": 0.014307787838866616, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8471263736263737, "calib/step_q_c_n": 1820.0, "calib/step_q_gap": 0.02227281714101792, "calib/step_q_w": 0.8248535564853557, "calib/step_q_w_n": 1434.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2702.0, "completions/max_terminated_length": 2702.0, "completions/mean_length": 880.55859375, "completions/mean_terminated_length": 923.8646850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 474.0, "epoch": 0.2048, "grad_norm": 0.031466081738471985, "kl": 0.1272125244140625, "learning_rate": 2.2222222222222224e-07, "loss": -0.2225, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.015780529007315636, "mask/share_reasoning": 0.806641161441803, "mask/share_step_conf": 0.13070333003997803, "num_tokens": 63080638.0, "reward": 0.8223682641983032, "reward_std": 0.27081209421157837, "rewards/accuracy_reward_step": 0.6015625, "rewards/asymmetric_l2_reward": 0.7147022485733032, "rewards/final_brier_reward_step": 0.620659351348877, "rewards/format_reward_step": 0.9453125, "step": 192 }, { "adv/mean_abs_final_conf": 0.6623778343200684, "adv/mean_abs_reasoning": 0.6136531233787537, "adv/mean_abs_step_conf": 0.7547324895858765, "adv/ratio_final_to_reasoning": 1.0794010640295255, "adv/ratio_step_to_reasoning": 1.2299008362091348, "adv/std_final_conf": 0.8701410889625549, "adv/std_reasoning": 0.843064546585083, "adv/std_step_conf": 0.9320592880249023, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6003455608718766, "calib/avg_num_step_conf": 11.41015625, "calib/ece": 0.44345528455284544, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.005797448165869135, "calib/mean_conf": 0.980040650406504, "calib/mu_c": 0.9827272727272727, "calib/mu_w": 0.9769298245614035, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44345528455284544, "calib/std_conf": 0.014070032950183454, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8425330549756439, "calib/step_q_c_n": 1437.0, "calib/step_q_gap": 0.029487232873218083, "calib/step_q_w": 0.8130458221024258, "calib/step_q_w_n": 1484.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2666.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 908.3984375, "completions/mean_terminated_length": 933.9357299804688, "completions/min_length": 0.0, "completions/min_terminated_length": 545.0, "epoch": 0.20586666666666667, "grad_norm": 0.01949903927743435, "kl": 0.1169586181640625, "learning_rate": 1.9444444444444447e-07, "loss": -0.0695, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.015710387378931046, "mask/share_reasoning": 0.8357419371604919, "mask/share_step_conf": 0.12120389193296432, "num_tokens": 63418900.0, "reward": 0.7550156116485596, "reward_std": 0.296281099319458, "rewards/accuracy_reward_step": 0.515625, "rewards/asymmetric_l2_reward": 0.6791197657585144, "rewards/final_brier_reward_step": 0.5355988144874573, "rewards/format_reward_step": 0.9609375, "step": 193 }, { "adv/mean_abs_final_conf": 0.5542274713516235, "adv/mean_abs_reasoning": 0.5262541770935059, "adv/mean_abs_step_conf": 0.736404299736023, "adv/ratio_final_to_reasoning": 1.0531554816583382, "adv/ratio_step_to_reasoning": 1.3993319802289705, "adv/std_final_conf": 0.7933449149131775, "adv/std_reasoning": 0.7755599021911621, "adv/std_step_conf": 0.9261869788169861, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6697740732616801, "calib/avg_num_step_conf": 12.56640625, "calib/ece": 0.3888235294117647, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.9957983193277311, "calib/gap": 0.009396797543321012, "calib/mean_conf": 0.9799999999999999, "calib/mu_c": 0.9838297872340426, "calib/mu_w": 0.9744329896907216, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.38819327731092435, "calib/std_conf": 0.016218051981789737, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8422549647661757, "calib/step_q_c_n": 1561.0, "calib/step_q_gap": 0.04467646235071665, "calib/step_q_w": 0.797578502415459, "calib/step_q_w_n": 1656.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2677.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 857.75390625, "completions/mean_terminated_length": 911.14111328125, "completions/min_length": 0.0, "completions/min_terminated_length": 24.0, "epoch": 0.20693333333333333, "grad_norm": 0.023266388103365898, "kl": 0.12542724609375, "learning_rate": 1.6666666666666668e-07, "loss": -0.1916, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.01550736278295517, "mask/share_reasoning": 0.8019325733184814, "mask/share_step_conf": 0.12396633625030518, "num_tokens": 63744429.0, "reward": 0.7903183698654175, "reward_std": 0.2568064332008362, "rewards/accuracy_reward_step": 0.55078125, "rewards/asymmetric_l2_reward": 0.7150031328201294, "rewards/final_brier_reward_step": 0.5695398449897766, "rewards/format_reward_step": 0.9296875, "step": 194 }, { "adv/mean_abs_final_conf": 0.6420087218284607, "adv/mean_abs_reasoning": 0.5460288524627686, "adv/mean_abs_step_conf": 0.754043459892273, "adv/ratio_final_to_reasoning": 1.1757780178332913, "adv/ratio_step_to_reasoning": 1.3809590033407402, "adv/std_final_conf": 0.8318362236022949, "adv/std_reasoning": 0.7929378747940063, "adv/std_step_conf": 0.9332217574119568, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6601991758241758, "calib/avg_num_step_conf": 11.66796875, "calib/ece": 0.40479508196721314, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00973351648351617, "calib/mean_conf": 0.9785655737704918, "calib/mu_c": 0.9827142857142857, "calib/mu_w": 0.9729807692307695, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40479508196721314, "calib/std_conf": 0.014654204177662597, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8443458838544992, "calib/step_q_c_n": 1567.0, "calib/step_q_gap": 0.0173599683615413, "calib/step_q_w": 0.8269859154929579, "calib/step_q_w_n": 1420.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 917.6171875, "completions/mean_terminated_length": 947.2177124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 466.0, "epoch": 0.208, "grad_norm": 0.03138196840882301, "kl": 0.125946044921875, "learning_rate": 1.3888888888888888e-07, "loss": -0.0945, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.015517974272370338, "mask/share_reasoning": 0.826676607131958, "mask/share_step_conf": 0.1265554130077362, "num_tokens": 64085323.0, "reward": 0.7782267332077026, "reward_std": 0.27050119638442993, "rewards/accuracy_reward_step": 0.546875, "rewards/asymmetric_l2_reward": 0.6882679462432861, "rewards/final_brier_reward_step": 0.5681855082511902, "rewards/format_reward_step": 0.953125, "step": 195 }, { "adv/mean_abs_final_conf": 0.5235567688941956, "adv/mean_abs_reasoning": 0.41682004928588867, "adv/mean_abs_step_conf": 0.74735426902771, "adv/ratio_final_to_reasoning": 1.2560738615888851, "adv/ratio_step_to_reasoning": 1.7929902131821744, "adv/std_final_conf": 0.7714676260948181, "adv/std_reasoning": 0.7014705538749695, "adv/std_step_conf": 0.9283434748649597, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5894736842105264, "calib/avg_num_step_conf": 12.375, "calib/ece": 0.37510288065843633, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.005441678520626048, "calib/mean_conf": 0.9841563786008232, "calib/mu_c": 0.9862837837837836, "calib/mu_w": 0.9808421052631575, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37510288065843633, "calib/std_conf": 0.01184642079494479, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8543995243757431, "calib/step_q_c_n": 1682.0, "calib/step_q_gap": 0.0170307491402113, "calib/step_q_w": 0.8373687752355318, "calib/step_q_w_n": 1486.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 789.8828125, "completions/mean_terminated_length": 828.7294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 508.0, "epoch": 0.20906666666666668, "grad_norm": 0.021556729450821877, "kl": 0.1281890869140625, "learning_rate": 1.1111111111111112e-07, "loss": -0.1975, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.01695971004664898, "mask/share_reasoning": 0.7979605197906494, "mask/share_step_conf": 0.13820475339889526, "num_tokens": 64390077.0, "reward": 0.7968078255653381, "reward_std": 0.21186324954032898, "rewards/accuracy_reward_step": 0.578125, "rewards/asymmetric_l2_reward": 0.6961746215820312, "rewards/final_brier_reward_step": 0.5919722318649292, "rewards/format_reward_step": 0.94921875, "step": 196 }, { "adv/mean_abs_final_conf": 0.6259146928787231, "adv/mean_abs_reasoning": 0.527590811252594, "adv/mean_abs_step_conf": 0.7711992263793945, "adv/ratio_final_to_reasoning": 1.1863639008281643, "adv/ratio_step_to_reasoning": 1.4617374107567018, "adv/std_final_conf": 0.8267181515693665, "adv/std_reasoning": 0.7756460309028625, "adv/std_step_conf": 0.9291678071022034, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.583591903073286, "calib/avg_num_step_conf": 12.50390625, "calib/ece": 0.37638655462184867, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0048226950354613995, "calib/mean_conf": 0.9814285714285713, "calib/mu_c": 0.9833333333333335, "calib/mu_w": 0.9785106382978721, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37638655462184867, "calib/std_conf": 0.013490470604054103, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.836875, "calib/step_q_c_n": 1600.0, "calib/step_q_gap": -0.0035060118675829255, "calib/step_q_w": 0.840381011867583, "calib/step_q_w_n": 1601.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2836.0, "completions/max_terminated_length": 2836.0, "completions/mean_length": 873.21484375, "completions/mean_terminated_length": 912.4203491210938, "completions/min_length": 0.0, "completions/min_terminated_length": 572.0, "epoch": 0.21013333333333334, "grad_norm": 0.023413734510540962, "kl": 0.128143310546875, "learning_rate": 8.333333333333334e-08, "loss": -0.0998, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.015757042914628983, "mask/share_reasoning": 0.8129766583442688, "mask/share_step_conf": 0.12829750776290894, "num_tokens": 64718676.0, "reward": 0.7949533462524414, "reward_std": 0.271894633769989, "rewards/accuracy_reward_step": 0.5625, "rewards/asymmetric_l2_reward": 0.7136778235435486, "rewards/final_brier_reward_step": 0.5777913928031921, "rewards/format_reward_step": 0.9296875, "step": 197 }, { "adv/mean_abs_final_conf": 0.5809048414230347, "adv/mean_abs_reasoning": 0.4991025924682617, "adv/mean_abs_step_conf": 0.777890682220459, "adv/ratio_final_to_reasoning": 1.1638986657036345, "adv/ratio_step_to_reasoning": 1.5585787250141876, "adv/std_final_conf": 0.7810680270195007, "adv/std_reasoning": 0.7396623492240906, "adv/std_step_conf": 0.9276652932167053, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.6636700798193985, "calib/avg_num_step_conf": 13.34765625, "calib/ece": 0.31288135593220345, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.9957627118644068, "calib/gap": 0.009945980811094435, "calib/mean_conf": 0.978135593220339, "calib/mu_c": 0.9814649681528661, "calib/mu_w": 0.9715189873417717, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31288135593220345, "calib/std_conf": 0.015235393257114744, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8487527839643654, "calib/step_q_c_n": 1796.0, "calib/step_q_gap": 0.010412253427659612, "calib/step_q_w": 0.8383405305367058, "calib/step_q_w_n": 1621.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2949.0, "completions/max_terminated_length": 2949.0, "completions/mean_length": 893.18359375, "completions/mean_terminated_length": 944.8553466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 462.0, "epoch": 0.2112, "grad_norm": 0.027668872848153114, "kl": 0.126190185546875, "learning_rate": 5.555555555555556e-08, "loss": -0.2549, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.015574757941067219, "mask/share_reasoning": 0.7983865141868591, "mask/share_step_conf": 0.13135124742984772, "num_tokens": 65052715.0, "reward": 0.8287535905838013, "reward_std": 0.24659347534179688, "rewards/accuracy_reward_step": 0.61328125, "rewards/asymmetric_l2_reward": 0.7202712893486023, "rewards/final_brier_reward_step": 0.6302046775817871, "rewards/format_reward_step": 0.921875, "step": 198 }, { "adv/mean_abs_final_conf": 0.7182374000549316, "adv/mean_abs_reasoning": 0.603756308555603, "adv/mean_abs_step_conf": 0.7502772212028503, "adv/ratio_final_to_reasoning": 1.1896147334231713, "adv/ratio_step_to_reasoning": 1.242682205669663, "adv/std_final_conf": 0.8792629837989807, "adv/std_reasoning": 0.843132734298706, "adv/std_step_conf": 0.9331202507019043, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.6757726269315673, "calib/avg_num_step_conf": 12.4609375, "calib/ece": 0.33880851063829787, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.010641753390097874, "calib/mean_conf": 0.9813617021276596, "calib/mu_c": 0.9851655629139072, "calib/mu_w": 0.9745238095238093, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33880851063829787, "calib/std_conf": 0.013646643312496537, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.841233993015134, "calib/step_q_c_n": 1718.0, "calib/step_q_gap": -0.0020812243761704208, "calib/step_q_w": 0.8433152173913044, "calib/step_q_w_n": 1472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2642.0, "completions/max_terminated_length": 2642.0, "completions/mean_length": 911.12109375, "completions/mean_terminated_length": 959.8641967773438, "completions/min_length": 0.0, "completions/min_terminated_length": 494.0, "epoch": 0.21226666666666666, "grad_norm": 0.02316947840154171, "kl": 0.1233367919921875, "learning_rate": 2.777777777777778e-08, "loss": -0.2208, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.015151815488934517, "mask/share_reasoning": 0.8085764646530151, "mask/share_step_conf": 0.1254904866218567, "num_tokens": 65390162.0, "reward": 0.8104346394538879, "reward_std": 0.31479138135910034, "rewards/accuracy_reward_step": 0.59375, "rewards/asymmetric_l2_reward": 0.7124529480934143, "rewards/final_brier_reward_step": 0.6060726642608643, "rewards/format_reward_step": 0.91796875, "step": 199 }, { "adv/mean_abs_final_conf": 0.5120427012443542, "adv/mean_abs_reasoning": 0.47003117203712463, "adv/mean_abs_step_conf": 0.7336593866348267, "adv/ratio_final_to_reasoning": 1.0893803043426902, "adv/ratio_step_to_reasoning": 1.560873895778299, "adv/std_final_conf": 0.76967853307724, "adv/std_reasoning": 0.7576777935028076, "adv/std_step_conf": 0.926696240901947, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.77136916679501, "calib/avg_num_step_conf": 12.203125, "calib/ece": 0.34371308016877633, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.016107346373016918, "calib/mean_conf": 0.98084388185654, "calib/mu_c": 0.986688741721854, "calib/mu_w": 0.970581395348837, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34371308016877633, "calib/std_conf": 0.013814812515803535, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8510152905198776, "calib/step_q_c_n": 1635.0, "calib/step_q_gap": 0.01500454505312121, "calib/step_q_w": 0.8360107454667564, "calib/step_q_w_n": 1489.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2971.0, "completions/max_terminated_length": 2971.0, "completions/mean_length": 876.6796875, "completions/mean_terminated_length": 927.3966674804688, "completions/min_length": 0.0, "completions/min_terminated_length": 444.0, "epoch": 0.21333333333333335, "grad_norm": 0.019465776160359383, "kl": 0.119354248046875, "learning_rate": 0.0, "loss": -0.1049, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.01544947735965252, "mask/share_reasoning": 0.8081007599830627, "mask/share_step_conf": 0.1217622458934784, "num_tokens": 65722640.0, "reward": 0.8022987842559814, "reward_std": 0.23661327362060547, "rewards/accuracy_reward_step": 0.58984375, "rewards/asymmetric_l2_reward": 0.6970667243003845, "rewards/final_brier_reward_step": 0.6051871180534363, "rewards/format_reward_step": 0.921875, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 0.0124371408065781, "train_runtime": 23828.9403, "train_samples_per_second": 2.149, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 65722640, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }