{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7502421140670776, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5723537492194897, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9357826709747314, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.043024562299251556, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.0136, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 1.264374852180481, "reward_std": 0.26098379492759705, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420004606246948, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.770934522151947, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.5102895722914849, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9358851313591003, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.04039807990193367, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0157, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 1.198354721069336, "reward_std": 0.24474793672561646, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291916012763977, "step": 2 }, { "adv/mean_abs_final_conf": 0.76524817943573, "adv/mean_abs_reasoning": 0.46839040517807007, "adv/mean_abs_step_conf": 0.7326986789703369, "adv/ratio_final_to_reasoning": 1.6337827824308275, "adv/ratio_step_to_reasoning": 1.5642905381287295, "adv/std_final_conf": 0.9319915175437927, "adv/std_reasoning": 0.7393165230751038, "adv/std_step_conf": 0.9354543089866638, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4948163707819488, "calib/avg_num_step_conf": 4.87109375, "calib/ece": 0.25780876494023897, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.3386454183266932, "calib/gap": -0.00015584767583676218, "calib/mean_conf": 0.8833067729083666, "calib/mu_c": 0.8832484076433122, "calib/mu_w": 0.883404255319149, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.25780876494023897, "calib/std_conf": 0.04902050056904828, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7823380281690141, "calib/step_q_c_n": 710.0, "calib/step_q_gap": 0.008501901539591405, "calib/step_q_w": 0.7738361266294227, "calib/step_q_w_n": 537.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 514.953125, "completions/mean_terminated_length": 516.9725952148438, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.05442668870091438, "kl": 0.000637352466583252, "learning_rate": 7.5e-07, "loss": 0.0535, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032597482204437256, "mask/share_reasoning": 0.8555380702018738, "mask/share_step_conf": 0.10795819014310837, "num_tokens": 695745.0, "reward": 1.2382688522338867, "reward_std": 0.2650030851364136, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6793316602706909, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7396186590194702, "step": 3 }, { "adv/mean_abs_final_conf": 0.770321249961853, "adv/mean_abs_reasoning": 0.45143747329711914, "adv/mean_abs_step_conf": 0.7634415626525879, "adv/ratio_final_to_reasoning": 1.706374183639949, "adv/ratio_step_to_reasoning": 1.6911346704931591, "adv/std_final_conf": 0.9282577633857727, "adv/std_reasoning": 0.7014027237892151, "adv/std_step_conf": 0.9355876445770264, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.46549286291408565, "calib/avg_num_step_conf": 4.9609375, "calib/ece": 0.251984126984127, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2698412698412698, "calib/gap": -0.0028332884460006147, "calib/mean_conf": 0.878968253968254, "calib/mu_c": 0.8779113924050633, "calib/mu_w": 0.8807446808510639, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.251984126984127, "calib/std_conf": 0.0456492731053476, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8028297055057618, "calib/step_q_c_n": 781.0, "calib/step_q_gap": 0.008473877284902787, "calib/step_q_w": 0.794355828220859, "calib/step_q_w_n": 489.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2391.0, "completions/max_terminated_length": 2391.0, "completions/mean_length": 519.29296875, "completions/mean_terminated_length": 519.29296875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.004266666666666667, "grad_norm": 0.05813954770565033, "kl": 0.00032207369804382324, "learning_rate": 1.0000000000000002e-06, "loss": 0.0797, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0328434482216835, "mask/share_reasoning": 0.854226291179657, "mask/share_step_conf": 0.11293025314807892, "num_tokens": 934852.0, "reward": 1.2232000827789307, "reward_std": 0.24379229545593262, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6873449087142944, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7197619676589966, "step": 4 }, { "adv/mean_abs_final_conf": 0.7571590542793274, "adv/mean_abs_reasoning": 0.46423810720443726, "adv/mean_abs_step_conf": 0.7635762691497803, "adv/ratio_final_to_reasoning": 1.6309713539864492, "adv/ratio_step_to_reasoning": 1.6447944649522772, "adv/std_final_conf": 0.9308009743690491, "adv/std_reasoning": 0.7206776738166809, "adv/std_step_conf": 0.9350479245185852, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4254761904761905, "calib/avg_num_step_conf": 4.9453125, "calib/ece": 0.319795918367347, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.3346938775510204, "calib/gap": -0.013380952380952382, "calib/mean_conf": 0.8821632653061224, "calib/mu_c": 0.8764285714285714, "calib/mu_w": 0.8898095238095238, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.3152653061224491, "calib/std_conf": 0.04651764946540997, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7969747899159664, "calib/step_q_c_n": 714.0, "calib/step_q_gap": -0.002463615881135084, "calib/step_q_w": 0.7994384057971015, "calib/step_q_w_n": 552.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 526.4296875, "completions/mean_terminated_length": 526.4296875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.005333333333333333, "grad_norm": 0.04380195587873459, "kl": 0.0003115236759185791, "learning_rate": 1.25e-06, "loss": 0.0614, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03395012021064758, "mask/share_reasoning": 0.8517842292785645, "mask/share_step_conf": 0.11426563560962677, "num_tokens": 1176306.0, "reward": 1.1386687755584717, "reward_std": 0.25449854135513306, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6205625534057617, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6783875226974487, "step": 5 }, { "adv/mean_abs_final_conf": 0.7818187475204468, "adv/mean_abs_reasoning": 0.4163788855075836, "adv/mean_abs_step_conf": 0.7508374452590942, "adv/ratio_final_to_reasoning": 1.8776618477360503, "adv/ratio_step_to_reasoning": 1.8032553316045106, "adv/std_final_conf": 0.931926965713501, "adv/std_reasoning": 0.6816792488098145, "adv/std_step_conf": 0.9356086850166321, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.48602484472049684, "calib/avg_num_step_conf": 5.08203125, "calib/ece": 0.3335294117647059, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3176470588235294, "calib/gap": -0.0015341614906831946, "calib/mean_conf": 0.8825490196078432, "calib/mu_c": 0.8818571428571429, "calib/mu_w": 0.8833913043478261, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3335294117647059, "calib/std_conf": 0.04358643147635229, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7916819571865443, "calib/step_q_c_n": 654.0, "calib/step_q_gap": -0.0013319531689426345, "calib/step_q_w": 0.793013910355487, "calib/step_q_w_n": 647.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1201.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 435.4453125, "completions/mean_terminated_length": 437.1529541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.0064, "grad_norm": 0.045873790979385376, "kl": 0.0004010796546936035, "learning_rate": 1.5e-06, "loss": 0.0061, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0369919016957283, "mask/share_reasoning": 0.8343896865844727, "mask/share_step_conf": 0.12471219152212143, "num_tokens": 1393732.0, "reward": 1.1750853061676025, "reward_std": 0.24491076171398163, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6318714618682861, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7056338787078857, "step": 6 }, { "adv/mean_abs_final_conf": 0.7702935934066772, "adv/mean_abs_reasoning": 0.3941645324230194, "adv/mean_abs_step_conf": 0.7672722339630127, "adv/ratio_final_to_reasoning": 1.9542437993380748, "adv/ratio_step_to_reasoning": 1.9465785753132456, "adv/std_final_conf": 0.926905632019043, "adv/std_reasoning": 0.6613823771476746, "adv/std_step_conf": 0.9354144930839539, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.465325163626236, "calib/avg_num_step_conf": 5.55078125, "calib/ece": 0.22237154150197624, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.31620553359683795, "calib/gap": -0.0049192313048325165, "calib/mean_conf": 0.8824505928853753, "calib/mu_c": 0.8807784431137723, "calib/mu_w": 0.8856976744186048, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22237154150197624, "calib/std_conf": 0.043976306610796094, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7888198103266597, "calib/step_q_c_n": 949.0, "calib/step_q_gap": 0.005324047614795324, "calib/step_q_w": 0.7834957627118644, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 550.9140625, "completions/mean_terminated_length": 550.9140625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.007466666666666667, "grad_norm": 0.06298936903476715, "kl": 0.0002759397029876709, "learning_rate": 1.75e-06, "loss": 0.1126, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030498892068862915, "mask/share_reasoning": 0.8595165014266968, "mask/share_step_conf": 0.10998459905385971, "num_tokens": 1642190.0, "reward": 1.2695424556732178, "reward_std": 0.2169540971517563, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7131187319755554, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7493111491203308, "step": 7 }, { "adv/mean_abs_final_conf": 0.7680623531341553, "adv/mean_abs_reasoning": 0.4634212851524353, "adv/mean_abs_step_conf": 0.7772020101547241, "adv/ratio_final_to_reasoning": 1.6573739224807358, "adv/ratio_step_to_reasoning": 1.6770960572065123, "adv/std_final_conf": 0.9303442239761353, "adv/std_reasoning": 0.7205734252929688, "adv/std_step_conf": 0.9359766244888306, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5117266734279918, "calib/avg_num_step_conf": 5.1328125, "calib/ece": 0.33440476190476187, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.28174603174603174, "calib/gap": 0.012046146044624617, "calib/mean_conf": 0.8740873015873015, "calib/mu_c": 0.8796323529411765, "calib/mu_w": 0.8675862068965519, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.33440476190476187, "calib/std_conf": 0.0790240590089017, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.792394366197183, "calib/step_q_c_n": 639.0, "calib/step_q_gap": 0.014764736567553172, "calib/step_q_w": 0.7776296296296298, "calib/step_q_w_n": 675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2635.0, "completions/max_terminated_length": 2635.0, "completions/mean_length": 520.484375, "completions/mean_terminated_length": 524.5827026367188, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.008533333333333334, "grad_norm": 0.047856494784355164, "kl": 0.00043451786041259766, "learning_rate": 2.0000000000000003e-06, "loss": -0.0253, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03180117905139923, "mask/share_reasoning": 0.8542159199714661, "mask/share_step_conf": 0.1061704009771347, "num_tokens": 1881946.0, "reward": 1.1904280185699463, "reward_std": 0.25926920771598816, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.618283212184906, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7312864065170288, "step": 8 }, { "adv/mean_abs_final_conf": 0.7786675691604614, "adv/mean_abs_reasoning": 0.4911506175994873, "adv/mean_abs_step_conf": 0.766486406326294, "adv/ratio_final_to_reasoning": 1.5853946656246132, "adv/ratio_step_to_reasoning": 1.5605933879764178, "adv/std_final_conf": 0.9314641356468201, "adv/std_reasoning": 0.739389955997467, "adv/std_step_conf": 0.9362146854400635, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4929835265405735, "calib/avg_num_step_conf": 5.22265625, "calib/ece": 0.2778629032258065, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.29435483870967744, "calib/gap": -0.0008697715409126117, "calib/mean_conf": 0.8786693548387097, "calib/mu_c": 0.8783221476510067, "calib/mu_w": 0.8791919191919193, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.2778629032258065, "calib/std_conf": 0.048579037988023516, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.7766941015089164, "calib/step_q_c_n": 729.0, "calib/step_q_gap": 0.05222041729839, "calib/step_q_w": 0.7244736842105264, "calib/step_q_w_n": 608.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 513.75390625, "completions/mean_terminated_length": 517.7991943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.0096, "grad_norm": 0.05688917636871338, "kl": 0.00041407346725463867, "learning_rate": 2.25e-06, "loss": -0.0055, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03355059027671814, "mask/share_reasoning": 0.8492412567138672, "mask/share_step_conf": 0.10939560830593109, "num_tokens": 2121003.0, "reward": 1.1500149965286255, "reward_std": 0.2937062680721283, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6464296579360962, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.6740657091140747, "step": 9 }, { "adv/mean_abs_final_conf": 0.7343442440032959, "adv/mean_abs_reasoning": 0.47787123918533325, "adv/mean_abs_step_conf": 0.7526214718818665, "adv/ratio_final_to_reasoning": 1.5366989761827756, "adv/ratio_step_to_reasoning": 1.5749461573894314, "adv/std_final_conf": 0.9322107434272766, "adv/std_reasoning": 0.7575381994247437, "adv/std_step_conf": 0.9360104203224182, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4807258492569002, "calib/avg_num_step_conf": 4.80078125, "calib/ece": 0.26940711462450595, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.38735177865612647, "calib/gap": -0.0043365180467089814, "calib/mean_conf": 0.8881422924901187, "calib/mu_c": 0.8864968152866243, "calib/mu_w": 0.8908333333333333, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.26849802371541504, "calib/std_conf": 0.05183688606388143, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7852689655172415, "calib/step_q_c_n": 725.0, "calib/step_q_gap": 0.008245155993431896, "calib/step_q_w": 0.7770238095238096, "calib/step_q_w_n": 504.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 513.859375, "completions/mean_terminated_length": 515.87451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.010666666666666666, "grad_norm": 0.03583617880940437, "kl": 0.000704646110534668, "learning_rate": 2.5e-06, "loss": 0.0562, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03214257210493088, "mask/share_reasoning": 0.8575109243392944, "mask/share_step_conf": 0.1064402237534523, "num_tokens": 2359351.0, "reward": 1.233507752418518, "reward_std": 0.26790809631347656, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6795351505279541, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7339744567871094, "step": 10 }, { "adv/mean_abs_final_conf": 0.7628040313720703, "adv/mean_abs_reasoning": 0.41833338141441345, "adv/mean_abs_step_conf": 0.7374853491783142, "adv/ratio_final_to_reasoning": 1.8234357219903856, "adv/ratio_step_to_reasoning": 1.7629129826666625, "adv/std_final_conf": 0.9276480078697205, "adv/std_reasoning": 0.7013552188873291, "adv/std_step_conf": 0.9357235431671143, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.509952229299363, "calib/avg_num_step_conf": 5.4765625, "calib/ece": 0.27031620553359686, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.43873517786561267, "calib/gap": 0.007517914012738824, "calib/mean_conf": 0.8881027667984189, "calib/mu_c": 0.8909554140127388, "calib/mu_w": 0.8834375, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2689328063241107, "calib/std_conf": 0.07276822613907688, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7701674641148325, "calib/step_q_c_n": 836.0, "calib/step_q_gap": 0.025344142560062233, "calib/step_q_w": 0.7448233215547703, "calib/step_q_w_n": 566.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2401.0, "completions/max_terminated_length": 2401.0, "completions/mean_length": 527.68359375, "completions/mean_terminated_length": 529.7529907226562, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.011733333333333333, "grad_norm": 0.0488060787320137, "kl": 0.0011137723922729492, "learning_rate": 2.7500000000000004e-06, "loss": 0.0176, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03208436816930771, "mask/share_reasoning": 0.8486138582229614, "mask/share_step_conf": 0.11539548635482788, "num_tokens": 2598918.0, "reward": 1.2425141334533691, "reward_std": 0.22078053653240204, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6830957531929016, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7408100366592407, "step": 11 }, { "adv/mean_abs_final_conf": 0.7242513298988342, "adv/mean_abs_reasoning": 0.3657684028148651, "adv/mean_abs_step_conf": 0.7559762597084045, "adv/ratio_final_to_reasoning": 1.9800817247339335, "adv/ratio_step_to_reasoning": 2.0668167449418653, "adv/std_final_conf": 0.925606369972229, "adv/std_reasoning": 0.6612412333488464, "adv/std_step_conf": 0.9354330897331238, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.43468169761273207, "calib/avg_num_step_conf": 5.8125, "calib/ece": 0.21607142857142841, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5436507936507936, "calib/gap": -0.009120247568523587, "calib/mean_conf": 0.9040873015873016, "calib/mu_c": 0.9012643678160919, "calib/mu_w": 0.9103846153846155, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2148412698412697, "calib/std_conf": 0.04723674246548246, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7700875273522976, "calib/step_q_c_n": 914.0, "calib/step_q_gap": 0.004669408885398685, "calib/step_q_w": 0.7654181184668989, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 458.40234375, "completions/mean_terminated_length": 465.6785888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.0128, "grad_norm": 0.04862995445728302, "kl": 0.0020551681518554688, "learning_rate": 3e-06, "loss": 0.0078, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03572825714945793, "mask/share_reasoning": 0.8189131617546082, "mask/share_step_conf": 0.129733607172966, "num_tokens": 2820445.0, "reward": 1.3078030347824097, "reward_std": 0.2150072157382965, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7230449318885803, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.779874324798584, "step": 12 }, { "adv/mean_abs_final_conf": 0.7431142330169678, "adv/mean_abs_reasoning": 0.4784717857837677, "adv/mean_abs_step_conf": 0.7514384984970093, "adv/ratio_final_to_reasoning": 1.5530993782626046, "adv/ratio_step_to_reasoning": 1.5704969881684967, "adv/std_final_conf": 0.925552248954773, "adv/std_reasoning": 0.7574694752693176, "adv/std_step_conf": 0.9359784722328186, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.551496590047992, "calib/avg_num_step_conf": 5.12109375, "calib/ece": 0.32392156862745114, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5450980392156862, "calib/gap": 0.014034478403637252, "calib/mean_conf": 0.904313725490196, "calib/mu_c": 0.9102027027027028, "calib/mu_w": 0.8961682242990655, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.32392156862745114, "calib/std_conf": 0.05363108924103207, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7734931506849315, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.018622238464621632, "calib/step_q_w": 0.7548709122203099, "calib/step_q_w_n": 581.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 500.125, "completions/mean_terminated_length": 500.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.013866666666666666, "grad_norm": 0.03821491077542305, "kl": 0.0026273727416992188, "learning_rate": 3.2500000000000002e-06, "loss": 0.0353, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03380978852510452, "mask/share_reasoning": 0.8494916558265686, "mask/share_step_conf": 0.11669855564832687, "num_tokens": 3053069.0, "reward": 1.2348108291625977, "reward_std": 0.24605479836463928, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6470734477043152, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7550240755081177, "step": 13 }, { "adv/mean_abs_final_conf": 0.7724058032035828, "adv/mean_abs_reasoning": 0.5276020169258118, "adv/mean_abs_step_conf": 0.7816826105117798, "adv/ratio_final_to_reasoning": 1.4639932722474673, "adv/ratio_step_to_reasoning": 1.4815762363199898, "adv/std_final_conf": 0.9248756170272827, "adv/std_reasoning": 0.7576701045036316, "adv/std_step_conf": 0.936029314994812, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4311912026442919, "calib/avg_num_step_conf": 5.62109375, "calib/ece": 0.37992063492063477, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7619047619047619, "calib/gap": -0.005934401220442531, "calib/mean_conf": 0.9275396825396826, "calib/mu_c": 0.9248550724637681, "calib/mu_w": 0.9307894736842106, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.37992063492063477, "calib/std_conf": 0.03808275258359961, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.754450402144772, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.047581715276085124, "calib/step_q_w": 0.7068686868686869, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 541.4375, "completions/mean_terminated_length": 543.560791015625, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.014933333333333333, "grad_norm": 0.03073563612997532, "kl": 0.004769325256347656, "learning_rate": 3.5e-06, "loss": -0.0154, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03164754807949066, "mask/share_reasoning": 0.844820499420166, "mask/share_step_conf": 0.11962570250034332, "num_tokens": 3297077.0, "reward": 1.1957473754882812, "reward_std": 0.28079643845558167, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5938761234283447, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7468560934066772, "step": 14 }, { "adv/mean_abs_final_conf": 0.7598576545715332, "adv/mean_abs_reasoning": 0.4144722521305084, "adv/mean_abs_step_conf": 0.7719860672950745, "adv/ratio_final_to_reasoning": 1.8333136914851187, "adv/ratio_step_to_reasoning": 1.8625759947182003, "adv/std_final_conf": 0.9134082794189453, "adv/std_reasoning": 0.6815695762634277, "adv/std_step_conf": 0.935758650302887, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4457282913165265, "calib/avg_num_step_conf": 5.1953125, "calib/ece": 0.3324609375, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.8203125, "calib/gap": -0.0006455309396488085, "calib/mean_conf": 0.9340234374999999, "calib/mu_c": 0.9337662337662338, "calib/mu_w": 0.9344117647058826, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3324609375, "calib/std_conf": 0.04049285524242015, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7113087674714104, "calib/step_q_c_n": 787.0, "calib/step_q_gap": -0.011361582436508644, "calib/step_q_w": 0.722670349907919, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 474.09375, "completions/mean_terminated_length": 475.9529724121094, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.016, "grad_norm": 0.04096554219722748, "kl": 0.009725570678710938, "learning_rate": 3.7500000000000005e-06, "loss": 0.0214, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.033879321068525314, "mask/share_reasoning": 0.8443080186843872, "mask/share_step_conf": 0.11790642887353897, "num_tokens": 3526325.0, "reward": 1.261141061782837, "reward_std": 0.23252829909324646, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6478356122970581, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.7770669460296631, "step": 15 }, { "adv/mean_abs_final_conf": 0.752341091632843, "adv/mean_abs_reasoning": 0.36619657278060913, "adv/mean_abs_step_conf": 0.7721002101898193, "adv/ratio_final_to_reasoning": 2.054473328136732, "adv/ratio_step_to_reasoning": 2.1084310110471454, "adv/std_final_conf": 0.9102146625518799, "adv/std_reasoning": 0.6403167843818665, "adv/std_step_conf": 0.9357337355613708, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5223367697594502, "calib/avg_num_step_conf": 6.50390625, "calib/ece": 0.33652000000000015, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.944, "calib/gap": 0.0029721716865438452, "calib/mean_conf": 0.94852, "calib/mu_c": 0.9496732026143789, "calib/mu_w": 0.9467010309278351, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.33652000000000015, "calib/std_conf": 0.026287061456161265, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6765062560153995, "calib/step_q_c_n": 1039.0, "calib/step_q_gap": -0.021417066668306672, "calib/step_q_w": 0.6979233226837062, "calib/step_q_w_n": 626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 657.9375, "completions/mean_terminated_length": 657.9375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.017066666666666667, "grad_norm": 0.03354340419173241, "kl": 0.009976387023925781, "learning_rate": 4.000000000000001e-06, "loss": 0.0571, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.025233477354049683, "mask/share_reasoning": 0.8622151613235474, "mask/share_step_conf": 0.11255130171775818, "num_tokens": 3803605.0, "reward": 1.2428134679794312, "reward_std": 0.23047450184822083, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6347839832305908, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7679995894432068, "step": 16 }, { "adv/mean_abs_final_conf": 0.7450023889541626, "adv/mean_abs_reasoning": 0.4558400511741638, "adv/mean_abs_step_conf": 0.7488822937011719, "adv/ratio_final_to_reasoning": 1.6343504416410262, "adv/ratio_step_to_reasoning": 1.6428619902357917, "adv/std_final_conf": 0.9170376062393188, "adv/std_reasoning": 0.7205342054367065, "adv/std_step_conf": 0.9359606504440308, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4767057800956106, "calib/avg_num_step_conf": 5.65234375, "calib/ece": 0.1766535433070866, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8818897637795275, "calib/gap": 0.0067961755758364495, "calib/mean_conf": 0.9443700787401575, "calib/mu_c": 0.9459487179487178, "calib/mu_w": 0.9391525423728814, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1766535433070866, "calib/std_conf": 0.04263234200736771, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6660054595086443, "calib/step_q_c_n": 1099.0, "calib/step_q_gap": 0.011436493991402896, "calib/step_q_w": 0.6545689655172414, "calib/step_q_w_n": 348.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2674.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 515.0390625, "completions/mean_terminated_length": 517.058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.018133333333333335, "grad_norm": 0.03775444254279137, "kl": 0.014379501342773438, "learning_rate": 4.25e-06, "loss": -0.0031, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03248380497097969, "mask/share_reasoning": 0.8391643166542053, "mask/share_step_conf": 0.12444561719894409, "num_tokens": 4038983.0, "reward": 1.399499773979187, "reward_std": 0.24108482897281647, "rewards/accuracy_reward_step": 0.76171875, "rewards/final_brier_reward_step": 0.7809871435165405, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8340061902999878, "step": 17 }, { "adv/mean_abs_final_conf": 0.7674014568328857, "adv/mean_abs_reasoning": 0.30643776059150696, "adv/mean_abs_step_conf": 0.7641079425811768, "adv/ratio_final_to_reasoning": 2.5042653207998757, "adv/ratio_step_to_reasoning": 2.493517577945498, "adv/std_final_conf": 0.9057202935218811, "adv/std_reasoning": 0.5961846113204956, "adv/std_step_conf": 0.935936689376831, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.509382951653944, "calib/avg_num_step_conf": 5.4609375, "calib/ece": 0.43446215139442235, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9561752988047809, "calib/gap": -0.005776081424936352, "calib/mean_conf": 0.9503187250996016, "calib/mu_c": 0.9475572519083969, "calib/mu_w": 0.9533333333333333, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.43143426294820725, "calib/std_conf": 0.052063954771246826, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6570231213872832, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.05865201657141916, "calib/step_q_w": 0.5983711048158641, "calib/step_q_w_n": 706.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2666.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 524.515625, "completions/mean_terminated_length": 526.5725708007812, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.0192, "grad_norm": 0.024585934355854988, "kl": 0.017147064208984375, "learning_rate": 4.5e-06, "loss": 0.0464, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03209284693002701, "mask/share_reasoning": 0.853888988494873, "mask/share_step_conf": 0.11011190712451935, "num_tokens": 4283979.0, "reward": 1.1490769386291504, "reward_std": 0.21871541440486908, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5484675765037537, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7267962694168091, "step": 18 }, { "adv/mean_abs_final_conf": 0.7171926498413086, "adv/mean_abs_reasoning": 0.36403369903564453, "adv/mean_abs_step_conf": 0.750006914138794, "adv/ratio_final_to_reasoning": 1.9701270836771745, "adv/ratio_step_to_reasoning": 2.060267816209391, "adv/std_final_conf": 0.898904025554657, "adv/std_reasoning": 0.661307692527771, "adv/std_step_conf": 0.9358205199241638, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.466437908496732, "calib/avg_num_step_conf": 4.7578125, "calib/ece": 0.36015873015873007, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9563492063492064, "calib/gap": 0.004870588235294293, "calib/mean_conf": 0.9514285714285714, "calib/mu_c": 0.9533999999999999, "calib/mu_w": 0.9485294117647056, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3581746031746031, "calib/std_conf": 0.058901508937395146, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6534458509142054, "calib/step_q_c_n": 711.0, "calib/step_q_gap": 0.04295275426726264, "calib/step_q_w": 0.6104930966469427, "calib/step_q_w_n": 507.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 489.40625, "completions/mean_terminated_length": 495.2095031738281, "completions/min_length": 0.0, "completions/min_terminated_length": 218.0, "epoch": 0.020266666666666665, "grad_norm": 0.03137790784239769, "kl": 0.01927947998046875, "learning_rate": 4.75e-06, "loss": -0.0737, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031197164207696915, "mask/share_reasoning": 0.8504468202590942, "mask/share_step_conf": 0.10663723945617676, "num_tokens": 4514027.0, "reward": 1.2547439336776733, "reward_std": 0.250137060880661, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6209847927093506, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7876108884811401, "step": 19 }, { "adv/mean_abs_final_conf": 0.7319923639297485, "adv/mean_abs_reasoning": 0.4361271858215332, "adv/mean_abs_step_conf": 0.7728710174560547, "adv/ratio_final_to_reasoning": 1.6783919639196392, "adv/ratio_step_to_reasoning": 1.7721230012300122, "adv/std_final_conf": 0.9048722982406616, "adv/std_reasoning": 0.7013739943504333, "adv/std_step_conf": 0.9360346794128418, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4546394417162057, "calib/avg_num_step_conf": 5.5859375, "calib/ece": 0.3806746031746032, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9682539682539683, "calib/gap": -0.003025329542517441, "calib/mean_conf": 0.9600396825396825, "calib/mu_c": 0.9587671232876711, "calib/mu_w": 0.9617924528301885, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3806746031746032, "calib/std_conf": 0.022422671631048382, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6118475452196384, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.03363108180500418, "calib/step_q_w": 0.5782164634146342, "calib/step_q_w_n": 656.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2933.0, "completions/max_terminated_length": 2933.0, "completions/mean_length": 490.09375, "completions/mean_terminated_length": 492.0157165527344, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.021333333333333333, "grad_norm": 0.027856621891260147, "kl": 0.028520584106445312, "learning_rate": 5e-06, "loss": 0.0259, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03477778285741806, "mask/share_reasoning": 0.830623209476471, "mask/share_step_conf": 0.1306927353143692, "num_tokens": 4744363.0, "reward": 1.2527803182601929, "reward_std": 0.24482667446136475, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5998862981796265, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7973682880401611, "step": 20 }, { "adv/mean_abs_final_conf": 0.7427388429641724, "adv/mean_abs_reasoning": 0.5501335263252258, "adv/mean_abs_step_conf": 0.7632744312286377, "adv/ratio_final_to_reasoning": 1.3501064876476605, "adv/ratio_step_to_reasoning": 1.3874348584554508, "adv/std_final_conf": 0.9050422310829163, "adv/std_reasoning": 0.7927542924880981, "adv/std_step_conf": 0.9361280798912048, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5476284067085955, "calib/avg_num_step_conf": 5.7578125, "calib/ece": 0.339686274509804, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9921568627450981, "calib/gap": 0.006108490566037461, "calib/mean_conf": 0.9617254901960784, "calib/mu_c": 0.9640251572327042, "calib/mu_w": 0.9579166666666667, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3389411764705883, "calib/std_conf": 0.03377206086912587, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5569716775599128, "calib/step_q_c_n": 918.0, "calib/step_q_gap": -0.0242873152458426, "calib/step_q_w": 0.5812589928057554, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 513.17578125, "completions/mean_terminated_length": 513.17578125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.0224, "grad_norm": 0.025678621605038643, "kl": 0.031463623046875, "learning_rate": 4.9722222222222224e-06, "loss": 0.0157, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03263770043849945, "mask/share_reasoning": 0.8395789265632629, "mask/share_step_conf": 0.12778335809707642, "num_tokens": 4978696.0, "reward": 1.2974814176559448, "reward_std": 0.27095523476600647, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6500609517097473, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8107321858406067, "step": 21 }, { "adv/mean_abs_final_conf": 0.6854065656661987, "adv/mean_abs_reasoning": 0.4051262438297272, "adv/mean_abs_step_conf": 0.7591018676757812, "adv/ratio_final_to_reasoning": 1.691834523448627, "adv/ratio_step_to_reasoning": 1.8737415293066735, "adv/std_final_conf": 0.8820250630378723, "adv/std_reasoning": 0.701339602470398, "adv/std_step_conf": 0.9360891580581665, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5248234316576652, "calib/avg_num_step_conf": 5.74609375, "calib/ece": 0.30999999999999994, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": -0.001087799473757145, "calib/mean_conf": 0.9629644268774703, "calib/mu_c": 0.9625903614457832, "calib/mu_w": 0.9636781609195404, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30841897233201576, "calib/std_conf": 0.02663445121398129, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5901408450704226, "calib/step_q_c_n": 923.0, "calib/step_q_gap": 0.055067852369692805, "calib/step_q_w": 0.5350729927007298, "calib/step_q_w_n": 548.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 482.75390625, "completions/mean_terminated_length": 484.6470947265625, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.023466666666666667, "grad_norm": 0.023623965680599213, "kl": 0.038116455078125, "learning_rate": 4.944444444444445e-06, "loss": 0.0431, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03269077092409134, "mask/share_reasoning": 0.8367648720741272, "mask/share_step_conf": 0.12663814425468445, "num_tokens": 5204097.0, "reward": 1.3156535625457764, "reward_std": 0.23802658915519714, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6710683107376099, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8164474368095398, "step": 22 }, { "adv/mean_abs_final_conf": 0.7111557722091675, "adv/mean_abs_reasoning": 0.46014660596847534, "adv/mean_abs_step_conf": 0.7735565900802612, "adv/ratio_final_to_reasoning": 1.5454982455263155, "adv/ratio_step_to_reasoning": 1.681108977109912, "adv/std_final_conf": 0.8991236090660095, "adv/std_reasoning": 0.7392363548278809, "adv/std_step_conf": 0.9362433552742004, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5132321585350559, "calib/avg_num_step_conf": 5.48828125, "calib/ece": 0.43806324110671935, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": -0.0014429951084912185, "calib/mean_conf": 0.9661264822134387, "calib/mu_c": 0.9654477611940296, "calib/mu_w": 0.9668907563025209, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.43727272727272726, "calib/std_conf": 0.017079672934401477, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5716534391534391, "calib/step_q_c_n": 756.0, "calib/step_q_gap": 0.02909565795159008, "calib/step_q_w": 0.542557781201849, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 536.703125, "completions/mean_terminated_length": 538.807861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.024533333333333334, "grad_norm": 0.034179698675870895, "kl": 0.041839599609375, "learning_rate": 4.9166666666666665e-06, "loss": -0.0161, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0333084911108017, "mask/share_reasoning": 0.8375416994094849, "mask/share_step_conf": 0.12524355947971344, "num_tokens": 5445429.0, "reward": 1.217017650604248, "reward_std": 0.2587108016014099, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.552796483039856, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7894474267959595, "step": 23 }, { "adv/mean_abs_final_conf": 0.7689262628555298, "adv/mean_abs_reasoning": 0.6364258527755737, "adv/mean_abs_step_conf": 0.7571092844009399, "adv/ratio_final_to_reasoning": 1.208194575223644, "adv/ratio_step_to_reasoning": 1.1896268529932323, "adv/std_final_conf": 0.9153218269348145, "adv/std_reasoning": 0.8430981636047363, "adv/std_step_conf": 0.9364861845970154, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5713598901098901, "calib/avg_num_step_conf": 6.26953125, "calib/ece": 0.4228512396694215, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9710743801652892, "calib/gap": 0.011046703296703209, "calib/mean_conf": 0.9600413223140495, "calib/mu_c": 0.9651538461538461, "calib/mu_w": 0.9541071428571429, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.4228512396694215, "calib/std_conf": 0.06517299251701879, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5316510538641686, "calib/step_q_c_n": 854.0, "calib/step_q_gap": 0.02945398329159865, "calib/step_q_w": 0.5021970705725699, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 638.265625, "completions/mean_terminated_length": 638.265625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.0256, "grad_norm": 0.02951071783900261, "kl": 0.03293609619140625, "learning_rate": 4.888888888888889e-06, "loss": 0.0053, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03055085614323616, "mask/share_reasoning": 0.8465626835823059, "mask/share_step_conf": 0.12288644909858704, "num_tokens": 5713337.0, "reward": 1.1778736114501953, "reward_std": 0.3360109329223633, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.542444109916687, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7613389492034912, "step": 24 }, { "adv/mean_abs_final_conf": 0.7344235777854919, "adv/mean_abs_reasoning": 0.40606987476348877, "adv/mean_abs_step_conf": 0.7403815984725952, "adv/ratio_final_to_reasoning": 1.8086137963650946, "adv/ratio_step_to_reasoning": 1.8232861989671676, "adv/std_final_conf": 0.885766863822937, "adv/std_reasoning": 0.6613461375236511, "adv/std_step_conf": 0.9363404512405396, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5586995785671282, "calib/avg_num_step_conf": 5.77734375, "calib/ece": 0.35924, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.98, "calib/gap": 0.01439494280553899, "calib/mean_conf": 0.96324, "calib/mu_c": 0.9689403973509934, "calib/mu_w": 0.9545454545454544, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.35924, "calib/std_conf": 0.06453760454184831, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5418234610917537, "calib/step_q_c_n": 861.0, "calib/step_q_gap": 0.02732507921473104, "calib/step_q_w": 0.5144983818770227, "calib/step_q_w_n": 618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 509.11328125, "completions/mean_terminated_length": 509.11328125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.02666666666666667, "grad_norm": 0.022077206522226334, "kl": 0.03806877136230469, "learning_rate": 4.861111111111111e-06, "loss": 0.0451, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032237708568573, "mask/share_reasoning": 0.8392555117607117, "mask/share_step_conf": 0.12850674986839294, "num_tokens": 5946894.0, "reward": 1.240429162979126, "reward_std": 0.2546301782131195, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6195351481437683, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7740210294723511, "step": 25 }, { "adv/mean_abs_final_conf": 0.6930798292160034, "adv/mean_abs_reasoning": 0.37017562985420227, "adv/mean_abs_step_conf": 0.7637919187545776, "adv/ratio_final_to_reasoning": 1.8722999930843112, "adv/ratio_step_to_reasoning": 2.0633230746589275, "adv/std_final_conf": 0.8817503452301025, "adv/std_reasoning": 0.6814820766448975, "adv/std_step_conf": 0.9362680912017822, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5613811599061348, "calib/avg_num_step_conf": 5.5625, "calib/ece": 0.3476190476190478, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9880952380952381, "calib/gap": 0.0003473013744553777, "calib/mean_conf": 0.9661111111111111, "calib/mu_c": 0.9662420382165604, "calib/mu_w": 0.965894736842105, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34535714285714303, "calib/std_conf": 0.0395906077971728, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5219230769230769, "calib/step_q_c_n": 806.0, "calib/step_q_gap": 0.019932785660941033, "calib/step_q_w": 0.5019902912621359, "calib/step_q_w_n": 618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 496.5, "completions/mean_terminated_length": 500.4094543457031, "completions/min_length": 0.0, "completions/min_terminated_length": 246.0, "epoch": 0.027733333333333332, "grad_norm": 0.023952292278409004, "kl": 0.040111541748046875, "learning_rate": 4.833333333333333e-06, "loss": -0.025, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030580170452594757, "mask/share_reasoning": 0.846382737159729, "mask/share_step_conf": 0.11522462964057922, "num_tokens": 6179238.0, "reward": 1.2827597856521606, "reward_std": 0.21078261733055115, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6359202861785889, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8050339221954346, "step": 26 }, { "adv/mean_abs_final_conf": 0.7596620321273804, "adv/mean_abs_reasoning": 0.5038591027259827, "adv/mean_abs_step_conf": 0.7605082988739014, "adv/ratio_final_to_reasoning": 1.5076874229669577, "adv/ratio_step_to_reasoning": 1.5093669931919322, "adv/std_final_conf": 0.9158267974853516, "adv/std_reasoning": 0.7576535940170288, "adv/std_step_conf": 0.9363695383071899, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5430107526881721, "calib/avg_num_step_conf": 5.9375, "calib/ece": 0.4544400000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.948, "calib/gap": 0.028728878648233702, "calib/mean_conf": 0.9504400000000001, "calib/mu_c": 0.9649193548387099, "calib/mu_w": 0.9361904761904762, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.4544400000000002, "calib/std_conf": 0.10114646014567193, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5171448863636363, "calib/step_q_c_n": 704.0, "calib/step_q_gap": 0.0358703765597147, "calib/step_q_w": 0.4812745098039216, "calib/step_q_w_n": 816.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2659.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 518.4296875, "completions/mean_terminated_length": 520.4627685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.0288, "grad_norm": 0.03194215148687363, "kl": 0.047698974609375, "learning_rate": 4.805555555555556e-06, "loss": 0.0456, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03263816609978676, "mask/share_reasoning": 0.8364834785461426, "mask/share_step_conf": 0.12697207927703857, "num_tokens": 6417172.0, "reward": 1.1969051361083984, "reward_std": 0.28641411662101746, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.5305890440940857, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7862981557846069, "step": 27 }, { "adv/mean_abs_final_conf": 0.7467812299728394, "adv/mean_abs_reasoning": 0.3947819173336029, "adv/mean_abs_step_conf": 0.752916157245636, "adv/ratio_final_to_reasoning": 1.8916297762994705, "adv/ratio_step_to_reasoning": 1.9071698175309246, "adv/std_final_conf": 0.9094687104225159, "adv/std_reasoning": 0.6613571643829346, "adv/std_step_conf": 0.9364560842514038, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5295658682634731, "calib/avg_num_step_conf": 5.39453125, "calib/ece": 0.31478431372549026, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8823529411764706, "calib/gap": -0.0004994556341860745, "calib/mean_conf": 0.9226274509803921, "calib/mu_c": 0.9224550898203594, "calib/mu_w": 0.9229545454545455, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.29125490196078435, "calib/std_conf": 0.16148015853158199, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4966629464285714, "calib/step_q_c_n": 896.0, "calib/step_q_gap": 0.023838204160530174, "calib/step_q_w": 0.4728247422680412, "calib/step_q_w_n": 485.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2617.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 542.46484375, "completions/mean_terminated_length": 542.46484375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.029866666666666666, "grad_norm": 0.024938073009252548, "kl": 0.06166839599609375, "learning_rate": 4.777777777777778e-06, "loss": -0.0126, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03137122094631195, "mask/share_reasoning": 0.851985514163971, "mask/share_step_conf": 0.11664323508739471, "num_tokens": 6662987.0, "reward": 1.2914047241210938, "reward_std": 0.2632555365562439, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6635933518409729, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.796326756477356, "step": 28 }, { "adv/mean_abs_final_conf": 0.7430602312088013, "adv/mean_abs_reasoning": 0.4121718406677246, "adv/mean_abs_step_conf": 0.7778118848800659, "adv/ratio_final_to_reasoning": 1.8027923256596872, "adv/ratio_step_to_reasoning": 1.887105833382501, "adv/std_final_conf": 0.9240677952766418, "adv/std_reasoning": 0.7014405131340027, "adv/std_step_conf": 0.9362191557884216, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5382687487950646, "calib/avg_num_step_conf": 6.16796875, "calib/ece": 0.4668799999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.888, "calib/gap": 0.0321161879056614, "calib/mean_conf": 0.9348800000000002, "calib/mu_c": 0.9519658119658118, "calib/mu_w": 0.9198496240601504, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.4668799999999999, "calib/std_conf": 0.12150467316115869, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.495695067264574, "calib/step_q_c_n": 669.0, "calib/step_q_gap": -0.008019218449711651, "calib/step_q_w": 0.5037142857142857, "calib/step_q_w_n": 910.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2744.0, "completions/max_terminated_length": 2744.0, "completions/mean_length": 578.66796875, "completions/mean_terminated_length": 580.9373168945312, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.030933333333333334, "grad_norm": 0.028172692283988, "kl": 0.0406341552734375, "learning_rate": 4.75e-06, "loss": -0.0376, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.029275210574269295, "mask/share_reasoning": 0.8464499115943909, "mask/share_step_conf": 0.12036862969398499, "num_tokens": 6918254.0, "reward": 1.1937874555587769, "reward_std": 0.2643427848815918, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.5176992416381836, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7923596501350403, "step": 29 }, { "adv/mean_abs_final_conf": 0.7707754969596863, "adv/mean_abs_reasoning": 0.5646791458129883, "adv/mean_abs_step_conf": 0.766981840133667, "adv/ratio_final_to_reasoning": 1.3649795687956103, "adv/ratio_step_to_reasoning": 1.3582613167508009, "adv/std_final_conf": 0.9269782304763794, "adv/std_reasoning": 0.809898316860199, "adv/std_step_conf": 0.9363453984260559, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5436838624338624, "calib/avg_num_step_conf": 5.96484375, "calib/ece": 0.37963562753036434, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7773279352226721, "calib/gap": 0.021731481481481518, "calib/mean_conf": 0.9056275303643725, "calib/mu_c": 0.9154814814814815, "calib/mu_w": 0.8937499999999999, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3693522267206477, "calib/std_conf": 0.15803291991383311, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4759517766497463, "calib/step_q_c_n": 788.0, "calib/step_q_gap": 0.03442606623026051, "calib/step_q_w": 0.4415257104194858, "calib/step_q_w_n": 739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2352.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 601.52734375, "completions/mean_terminated_length": 606.2637939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.032, "grad_norm": 0.01841893419623375, "kl": 0.03585052490234375, "learning_rate": 4.722222222222222e-06, "loss": 0.0245, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.028865158557891846, "mask/share_reasoning": 0.8505867719650269, "mask/share_step_conf": 0.11273553967475891, "num_tokens": 7179229.0, "reward": 1.2352664470672607, "reward_std": 0.2913099527359009, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5876230597496033, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7922360897064209, "step": 30 }, { "adv/mean_abs_final_conf": 0.7803065776824951, "adv/mean_abs_reasoning": 0.41063570976257324, "adv/mean_abs_step_conf": 0.7643026113510132, "adv/ratio_final_to_reasoning": 1.9002404299754228, "adv/ratio_step_to_reasoning": 1.861266794826314, "adv/std_final_conf": 0.9310461282730103, "adv/std_reasoning": 0.6613556146621704, "adv/std_step_conf": 0.9363279938697815, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.582306338028169, "calib/avg_num_step_conf": 6.140625, "calib/ece": 0.4614960629921261, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7322834645669292, "calib/gap": 0.050871478873239395, "calib/mean_conf": 0.8896850393700788, "calib/mu_c": 0.918125, "calib/mu_w": 0.8672535211267606, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4551181102362206, "calib/std_conf": 0.18142670374368175, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.460623259762309, "calib/step_q_c_n": 589.0, "calib/step_q_gap": 0.028609017646337553, "calib/step_q_w": 0.43201424211597145, "calib/step_q_w_n": 983.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2282.0, "completions/max_terminated_length": 2282.0, "completions/mean_length": 585.14453125, "completions/mean_terminated_length": 585.14453125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.03306666666666667, "grad_norm": 0.02274276316165924, "kl": 0.040309906005859375, "learning_rate": 4.694444444444445e-06, "loss": -0.0472, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.02964765392243862, "mask/share_reasoning": 0.8560850024223328, "mask/share_step_conf": 0.11426734179258347, "num_tokens": 7434938.0, "reward": 1.216827630996704, "reward_std": 0.22792679071426392, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.5400328040122986, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8038425445556641, "step": 31 }, { "adv/mean_abs_final_conf": 0.7516148090362549, "adv/mean_abs_reasoning": 0.4341471195220947, "adv/mean_abs_step_conf": 0.7321542501449585, "adv/ratio_final_to_reasoning": 1.7312444911845224, "adv/ratio_step_to_reasoning": 1.6864196886781313, "adv/std_final_conf": 0.9313368797302246, "adv/std_reasoning": 0.7205541729927063, "adv/std_step_conf": 0.9363393783569336, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5853931864754098, "calib/avg_num_step_conf": 5.7578125, "calib/ece": 0.3932000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.66, "calib/gap": 0.04466572745901631, "calib/mean_conf": 0.86, "calib/mu_c": 0.881796875, "calib/mu_w": 0.8371311475409837, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37060000000000004, "calib/std_conf": 0.2149083525598761, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47861654135338344, "calib/step_q_c_n": 665.0, "calib/step_q_gap": 0.06373384666858739, "calib/step_q_w": 0.41488269468479605, "calib/step_q_w_n": 809.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2363.0, "completions/max_terminated_length": 2363.0, "completions/mean_length": 539.64453125, "completions/mean_terminated_length": 541.7608032226562, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.034133333333333335, "grad_norm": 0.023047886788845062, "kl": 0.046970367431640625, "learning_rate": 4.666666666666667e-06, "loss": 0.001, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03149145841598511, "mask/share_reasoning": 0.8450222015380859, "mask/share_step_conf": 0.11958011984825134, "num_tokens": 7679791.0, "reward": 1.2390613555908203, "reward_std": 0.2708419859409332, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5909906625747681, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7959098219871521, "step": 32 }, { "adv/mean_abs_final_conf": 0.753533124923706, "adv/mean_abs_reasoning": 0.33846065402030945, "adv/mean_abs_step_conf": 0.7617200613021851, "adv/ratio_final_to_reasoning": 2.22635368682615, "adv/ratio_step_to_reasoning": 2.2505424256979594, "adv/std_final_conf": 0.9347721338272095, "adv/std_reasoning": 0.6401256918907166, "adv/std_step_conf": 0.9361996054649353, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5481166312101113, "calib/avg_num_step_conf": 6.00390625, "calib/ece": 0.38264822134387355, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6245059288537549, "calib/gap": 0.04977850081341506, "calib/mean_conf": 0.8376679841897233, "calib/mu_c": 0.8634426229508196, "calib/mu_w": 0.8136641221374046, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3690513833992095, "calib/std_conf": 0.22231199909894053, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4628590785907859, "calib/step_q_c_n": 738.0, "calib/step_q_gap": 0.010327288853614447, "calib/step_q_w": 0.45253178973717145, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2650.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 510.97265625, "completions/mean_terminated_length": 512.9765014648438, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.0352, "grad_norm": 0.022563675418496132, "kl": 0.04727935791015625, "learning_rate": 4.638888888888889e-06, "loss": -0.0337, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032099347561597824, "mask/share_reasoning": 0.8392776250839233, "mask/share_step_conf": 0.12471682578325272, "num_tokens": 7917472.0, "reward": 1.2496271133422852, "reward_std": 0.22125095129013062, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.5923793315887451, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8069530725479126, "step": 33 }, { "adv/mean_abs_final_conf": 0.8050199747085571, "adv/mean_abs_reasoning": 0.5667650699615479, "adv/mean_abs_step_conf": 0.7681470513343811, "adv/ratio_final_to_reasoning": 1.4203768322616876, "adv/ratio_step_to_reasoning": 1.3553182650908533, "adv/std_final_conf": 0.9358447194099426, "adv/std_reasoning": 0.7754738926887512, "adv/std_step_conf": 0.9362109899520874, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.516640866873065, "calib/avg_num_step_conf": 5.51171875, "calib/ece": 0.30118110236220474, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.46062992125984253, "calib/gap": 5.15995872034658e-05, "calib/mean_conf": 0.7778740157480314, "calib/mu_c": 0.7778947368421052, "calib/mu_w": 0.7778431372549017, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2403149606299213, "calib/std_conf": 0.26099787286447235, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4270541813898704, "calib/step_q_c_n": 849.0, "calib/step_q_gap": -0.006041904019382316, "calib/step_q_w": 0.4330960854092527, "calib/step_q_w_n": 562.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 461.2421875, "completions/mean_terminated_length": 461.2421875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.03626666666666667, "grad_norm": 0.022051773965358734, "kl": 0.0632781982421875, "learning_rate": 4.611111111111112e-06, "loss": 0.0386, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.035803139209747314, "mask/share_reasoning": 0.8305374979972839, "mask/share_step_conf": 0.13365936279296875, "num_tokens": 8140662.0, "reward": 1.3087449073791504, "reward_std": 0.23226818442344666, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6542390584945679, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8230316042900085, "step": 34 }, { "adv/mean_abs_final_conf": 0.8097102642059326, "adv/mean_abs_reasoning": 0.5452890396118164, "adv/mean_abs_step_conf": 0.7554726004600525, "adv/ratio_final_to_reasoning": 1.484919382906273, "adv/ratio_step_to_reasoning": 1.3854534853623004, "adv/std_final_conf": 0.9360859990119934, "adv/std_reasoning": 0.7927421927452087, "adv/std_step_conf": 0.9362389445304871, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.558961526431029, "calib/avg_num_step_conf": 5.421875, "calib/ece": 0.24011811023622043, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3464566929133858, "calib/gap": 0.07475195495777298, "calib/mean_conf": 0.6883858267716535, "calib/mu_c": 0.7222302158273382, "calib/mu_w": 0.6474782608695652, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19062992125984246, "calib/std_conf": 0.29686444180297783, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4001253481894151, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.0017671392341912506, "calib/step_q_w": 0.39835820895522384, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1972.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 530.20703125, "completions/mean_terminated_length": 532.2863159179688, "completions/min_length": 0.0, "completions/min_terminated_length": 215.0, "epoch": 0.037333333333333336, "grad_norm": 0.022399406880140305, "kl": 0.05455780029296875, "learning_rate": 4.583333333333333e-06, "loss": 0.0168, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031053714454174042, "mask/share_reasoning": 0.8559054136276245, "mask/share_step_conf": 0.10913458466529846, "num_tokens": 8385651.0, "reward": 1.3202464580535889, "reward_std": 0.2486499547958374, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6759027242660522, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8287794589996338, "step": 35 }, { "adv/mean_abs_final_conf": 0.7830088138580322, "adv/mean_abs_reasoning": 0.43669629096984863, "adv/mean_abs_step_conf": 0.7589302062988281, "adv/ratio_final_to_reasoning": 1.7930283129244495, "adv/ratio_step_to_reasoning": 1.7378902042271476, "adv/std_final_conf": 0.9363185167312622, "adv/std_reasoning": 0.7013518810272217, "adv/std_step_conf": 0.9361932277679443, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4782952182952183, "calib/avg_num_step_conf": 5.81640625, "calib/ece": 0.24372, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.368, "calib/gap": -0.019987525987525845, "calib/mean_conf": 0.7364400000000001, "calib/mu_c": 0.7312432432432433, "calib/mu_w": 0.7512307692307691, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12007999999999999, "calib/std_conf": 0.25492847310569294, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4292969092721835, "calib/step_q_c_n": 1003.0, "calib/step_q_gap": 0.02199238252321234, "calib/step_q_w": 0.40730452674897116, "calib/step_q_w_n": 486.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2789.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 494.734375, "completions/mean_terminated_length": 494.734375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.0384, "grad_norm": 0.043217163532972336, "kl": 0.057544708251953125, "learning_rate": 4.555555555555556e-06, "loss": 0.0324, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03700346499681473, "mask/share_reasoning": 0.8279244899749756, "mask/share_step_conf": 0.13507205247879028, "num_tokens": 8615015.0, "reward": 1.3251851797103882, "reward_std": 0.23759011924266815, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7176832556724548, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7964216470718384, "step": 36 }, { "adv/mean_abs_final_conf": 0.7665172219276428, "adv/mean_abs_reasoning": 0.471565842628479, "adv/mean_abs_step_conf": 0.7417849898338318, "adv/ratio_final_to_reasoning": 1.6254723150750763, "adv/ratio_step_to_reasoning": 1.5730252761717598, "adv/std_final_conf": 0.9358230233192444, "adv/std_reasoning": 0.7393534183502197, "adv/std_step_conf": 0.936323881149292, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6534814323607426, "calib/avg_num_step_conf": 5.5078125, "calib/ece": 0.2753252032520326, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.34552845528455284, "calib/gap": 0.13435013262599493, "calib/mean_conf": 0.6929674796747969, "calib/mu_c": 0.7639655172413795, "calib/mu_w": 0.6296153846153846, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2483739837398375, "calib/std_conf": 0.288497183623495, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.42760956175298803, "calib/step_q_c_n": 502.0, "calib/step_q_gap": 0.058395574968847064, "calib/step_q_w": 0.36921398678414097, "calib/step_q_w_n": 908.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2479.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 514.9140625, "completions/mean_terminated_length": 521.019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.039466666666666664, "grad_norm": 0.02293313853442669, "kl": 0.0543670654296875, "learning_rate": 4.527777777777778e-06, "loss": -0.0394, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03345978260040283, "mask/share_reasoning": 0.8411606550216675, "mask/share_step_conf": 0.1136607900261879, "num_tokens": 8853929.0, "reward": 1.2604620456695557, "reward_std": 0.26069581508636475, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6551094055175781, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7918916940689087, "step": 37 }, { "adv/mean_abs_final_conf": 0.7530003190040588, "adv/mean_abs_reasoning": 0.4760221540927887, "adv/mean_abs_step_conf": 0.7484251856803894, "adv/ratio_final_to_reasoning": 1.5818598200311496, "adv/ratio_step_to_reasoning": 1.572248642727041, "adv/std_final_conf": 0.936092734336853, "adv/std_reasoning": 0.7574803829193115, "adv/std_step_conf": 0.9363313913345337, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6419795866935484, "calib/avg_num_step_conf": 4.96484375, "calib/ece": 0.23492063492063486, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2857142857142857, "calib/gap": 0.14496471774193553, "calib/mean_conf": 0.6747619047619049, "calib/mu_c": 0.74609375, "calib/mu_w": 0.6011290322580645, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.20087301587301587, "calib/std_conf": 0.2782419704219843, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4147571214392804, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.01899553203530685, "calib/step_q_w": 0.39576158940397355, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2604.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 508.8359375, "completions/mean_terminated_length": 508.8359375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.04053333333333333, "grad_norm": 0.02247217856347561, "kl": 0.05515289306640625, "learning_rate": 4.5e-06, "loss": -0.0057, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034040916711091995, "mask/share_reasoning": 0.8516597151756287, "mask/share_step_conf": 0.11429935693740845, "num_tokens": 9091079.0, "reward": 1.311846375465393, "reward_std": 0.26997506618499756, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6984202861785889, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8149799108505249, "step": 38 }, { "adv/mean_abs_final_conf": 0.7924951314926147, "adv/mean_abs_reasoning": 0.3936646282672882, "adv/mean_abs_step_conf": 0.7448976635932922, "adv/ratio_final_to_reasoning": 2.013122527621483, "adv/ratio_step_to_reasoning": 1.8922138543967069, "adv/std_final_conf": 0.9342702031135559, "adv/std_reasoning": 0.6613290309906006, "adv/std_step_conf": 0.9362964034080505, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6302943969610636, "calib/avg_num_step_conf": 5.484375, "calib/ece": 0.21848095238095233, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3412698412698413, "calib/gap": 0.12048888888888876, "calib/mean_conf": 0.7262809523809524, "calib/mu_c": 0.7822222222222222, "calib/mu_w": 0.6617333333333334, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2045238095238095, "calib/std_conf": 0.256919250705781, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41668141592920355, "calib/step_q_c_n": 678.0, "calib/step_q_gap": 0.0358411955435286, "calib/step_q_w": 0.38084022038567494, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2567.0, "completions/max_terminated_length": 2567.0, "completions/mean_length": 511.55859375, "completions/mean_terminated_length": 511.55859375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.0416, "grad_norm": 0.022071518003940582, "kl": 0.049930572509765625, "learning_rate": 4.472222222222223e-06, "loss": 0.0026, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034677714109420776, "mask/share_reasoning": 0.8489075899124146, "mask/share_step_conf": 0.11641469597816467, "num_tokens": 9328126.0, "reward": 1.3234676122665405, "reward_std": 0.250893771648407, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6959884166717529, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8246921300888062, "step": 39 }, { "adv/mean_abs_final_conf": 0.8059003949165344, "adv/mean_abs_reasoning": 0.45146048069000244, "adv/mean_abs_step_conf": 0.7521008253097534, "adv/ratio_final_to_reasoning": 1.7850962141466151, "adv/ratio_step_to_reasoning": 1.6659283757467735, "adv/std_final_conf": 0.9359130263328552, "adv/std_reasoning": 0.7205228209495544, "adv/std_step_conf": 0.9361854791641235, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5600571677126702, "calib/avg_num_step_conf": 4.86328125, "calib/ece": 0.25244055118110237, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.25984251968503935, "calib/gap": 0.052298098552165384, "calib/mean_conf": 0.6772444881889764, "calib/mu_c": 0.7046289256198347, "calib/mu_w": 0.6523308270676693, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22665354330708662, "calib/std_conf": 0.25833764016861277, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.39845762144053604, "calib/step_q_c_n": 597.0, "calib/step_q_gap": 0.013673670823252104, "calib/step_q_w": 0.38478395061728393, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 517.79296875, "completions/mean_terminated_length": 519.8235473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.042666666666666665, "grad_norm": 0.03780148923397064, "kl": 0.05272674560546875, "learning_rate": 4.444444444444444e-06, "loss": -0.0956, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034237541258335114, "mask/share_reasoning": 0.8539890050888062, "mask/share_step_conf": 0.10786722600460052, "num_tokens": 9567441.0, "reward": 1.304337978363037, "reward_std": 0.23489737510681152, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.6606898307800293, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8278993964195251, "step": 40 }, { "adv/mean_abs_final_conf": 0.7408613562583923, "adv/mean_abs_reasoning": 0.461469441652298, "adv/mean_abs_step_conf": 0.7448447942733765, "adv/ratio_final_to_reasoning": 1.6054396876328962, "adv/ratio_step_to_reasoning": 1.614071760865571, "adv/std_final_conf": 0.934531033039093, "adv/std_reasoning": 0.7207053899765015, "adv/std_step_conf": 0.9361456036567688, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6257507198683667, "calib/avg_num_step_conf": 4.78515625, "calib/ece": 0.12253968253968252, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.30158730158730157, "calib/gap": 0.11344138214726429, "calib/mean_conf": 0.7158730158730158, "calib/mu_c": 0.7451336898395721, "calib/mu_w": 0.6316923076923078, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.04817460317460316, "calib/std_conf": 0.24756198467451312, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42238202247191015, "calib/step_q_c_n": 890.0, "calib/step_q_gap": 0.027307395606238516, "calib/step_q_w": 0.39507462686567163, "calib/step_q_w_n": 335.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 477.79296875, "completions/mean_terminated_length": 479.66668701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.04373333333333333, "grad_norm": 0.047497883439064026, "kl": 0.048786163330078125, "learning_rate": 4.416666666666667e-06, "loss": -0.044, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03716282173991203, "mask/share_reasoning": 0.8401378393173218, "mask/share_step_conf": 0.11879312247037888, "num_tokens": 9797004.0, "reward": 1.3697147369384766, "reward_std": 0.2405359447002411, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7777038812637329, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8093783855438232, "step": 41 }, { "adv/mean_abs_final_conf": 0.783008873462677, "adv/mean_abs_reasoning": 0.37281715869903564, "adv/mean_abs_step_conf": 0.7627483010292053, "adv/ratio_final_to_reasoning": 2.100249023395345, "adv/ratio_step_to_reasoning": 2.0459044956269024, "adv/std_final_conf": 0.9346907138824463, "adv/std_reasoning": 0.6403069496154785, "adv/std_step_conf": 0.9358947277069092, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.643806485911749, "calib/avg_num_step_conf": 5.17578125, "calib/ece": 0.15653386454183274, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": 0.11976076555023918, "calib/mean_conf": 0.7443426294820717, "calib/mu_c": 0.791578947368421, "calib/mu_w": 0.6718181818181819, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14764940239043833, "calib/std_conf": 0.21724613369410745, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4350208044382801, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.06530226139192252, "calib/step_q_w": 0.3697185430463576, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 434.33984375, "completions/mean_terminated_length": 436.04315185546875, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.0448, "grad_norm": 0.037157829850912094, "kl": 0.049747467041015625, "learning_rate": 4.388888888888889e-06, "loss": -0.0278, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03924307972192764, "mask/share_reasoning": 0.8291383981704712, "mask/share_step_conf": 0.12771227955818176, "num_tokens": 10012563.0, "reward": 1.360295295715332, "reward_std": 0.20391181111335754, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7372199296951294, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8342635035514832, "step": 42 }, { "adv/mean_abs_final_conf": 0.7574180960655212, "adv/mean_abs_reasoning": 0.5579841136932373, "adv/mean_abs_step_conf": 0.748997151851654, "adv/ratio_final_to_reasoning": 1.357418746301309, "adv/ratio_step_to_reasoning": 1.3423270187642473, "adv/std_final_conf": 0.9346247911453247, "adv/std_reasoning": 0.7927505970001221, "adv/std_step_conf": 0.9355341792106628, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6606515490258703, "calib/avg_num_step_conf": 4.8125, "calib/ece": 0.15019531249999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.26953125, "calib/gap": 0.13881251996167354, "calib/mean_conf": 0.7194921875, "calib/mu_c": 0.7742580645161291, "calib/mu_w": 0.6354455445544556, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.132109375, "calib/std_conf": 0.23772057178852832, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.417127659574468, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.018440159574467985, "calib/step_q_w": 0.39868750000000003, "calib/step_q_w_n": 480.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 463.96875, "completions/mean_terminated_length": 465.78826904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.04586666666666667, "grad_norm": 0.030185841023921967, "kl": 0.04340362548828125, "learning_rate": 4.361111111111112e-06, "loss": -0.012, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035996466875076294, "mask/share_reasoning": 0.8457590341567993, "mask/share_step_conf": 0.11433827131986618, "num_tokens": 10236563.0, "reward": 1.3812288045883179, "reward_std": 0.18326275050640106, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7540620565414429, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8440415263175964, "step": 43 }, { "adv/mean_abs_final_conf": 0.744002103805542, "adv/mean_abs_reasoning": 0.3944712281227112, "adv/mean_abs_step_conf": 0.7345870733261108, "adv/ratio_final_to_reasoning": 1.8860744479293166, "adv/ratio_step_to_reasoning": 1.8622069772287606, "adv/std_final_conf": 0.9311710000038147, "adv/std_reasoning": 0.68148273229599, "adv/std_step_conf": 0.9358228445053101, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7231182795698924, "calib/avg_num_step_conf": 5.07421875, "calib/ece": 0.27812500000000007, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.390625, "calib/gap": 0.16265884652981422, "calib/mean_conf": 0.7625, "calib/mu_c": 0.8463709677419354, "calib/mu_w": 0.6837121212121212, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.27812500000000007, "calib/std_conf": 0.23051911525945087, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44964102564102565, "calib/step_q_c_n": 585.0, "calib/step_q_gap": 0.05307239819004528, "calib/step_q_w": 0.39656862745098037, "calib/step_q_w_n": 714.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1384.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 497.2109375, "completions/mean_terminated_length": 499.16082763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.046933333333333334, "grad_norm": 0.03155337646603584, "kl": 0.036449432373046875, "learning_rate": 4.333333333333334e-06, "loss": -0.0888, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03214806690812111, "mask/share_reasoning": 0.8558369874954224, "mask/share_step_conf": 0.10810869187116623, "num_tokens": 10470169.0, "reward": 1.3513096570968628, "reward_std": 0.18334747850894928, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6977843642234802, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8543705940246582, "step": 44 }, { "adv/mean_abs_final_conf": 0.7419617772102356, "adv/mean_abs_reasoning": 0.3817136585712433, "adv/mean_abs_step_conf": 0.7646172642707825, "adv/ratio_final_to_reasoning": 1.9437653344326304, "adv/ratio_step_to_reasoning": 2.0031173815806063, "adv/std_final_conf": 0.9219281673431396, "adv/std_reasoning": 0.6612144112586975, "adv/std_step_conf": 0.9353498816490173, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6971271001667309, "calib/avg_num_step_conf": 5.1953125, "calib/ece": 0.25143426294820725, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4541832669322709, "calib/gap": 0.1532518917532385, "calib/mean_conf": 0.7931075697211155, "calib/mu_c": 0.8621014492753624, "calib/mu_w": 0.7088495575221239, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2473705179282869, "calib/std_conf": 0.2116015207133383, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.42065073041168666, "calib/step_q_c_n": 753.0, "calib/step_q_gap": 0.01766979453647005, "calib/step_q_w": 0.4029809358752166, "calib/step_q_w_n": 577.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2299.0, "completions/max_terminated_length": 2299.0, "completions/mean_length": 483.76953125, "completions/mean_terminated_length": 485.66668701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.048, "grad_norm": 0.03212614730000496, "kl": 0.038379669189453125, "learning_rate": 4.305555555555556e-06, "loss": -0.0542, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036979176104068756, "mask/share_reasoning": 0.8345528244972229, "mask/share_step_conf": 0.12456175684928894, "num_tokens": 10699062.0, "reward": 1.3494054079055786, "reward_std": 0.16485074162483215, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7102246284484863, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8423399329185486, "step": 45 }, { "adv/mean_abs_final_conf": 0.7200635671615601, "adv/mean_abs_reasoning": 0.43889564275741577, "adv/mean_abs_step_conf": 0.7514091730117798, "adv/ratio_final_to_reasoning": 1.6406259188122085, "adv/ratio_step_to_reasoning": 1.7120451875324152, "adv/std_final_conf": 0.9301496744155884, "adv/std_reasoning": 0.7014629244804382, "adv/std_step_conf": 0.9356130957603455, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.617149292149292, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.26247011952191235, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.49800796812749004, "calib/gap": 0.0762303732303734, "calib/mean_conf": 0.8007171314741036, "calib/mu_c": 0.8344285714285715, "calib/mu_w": 0.7581981981981981, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.25270916334661353, "calib/std_conf": 0.20530873570676092, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.39652830188679244, "calib/step_q_c_n": 795.0, "calib/step_q_gap": -0.004236128690430607, "calib/step_q_w": 0.40076443057722305, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 521.0, "completions/mean_terminated_length": 523.0431518554688, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.04906666666666667, "grad_norm": 0.03554841876029968, "kl": 0.0423126220703125, "learning_rate": 4.277777777777778e-06, "loss": -0.1217, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03799302875995636, "mask/share_reasoning": 0.8365983963012695, "mask/share_step_conf": 0.1215023547410965, "num_tokens": 10937206.0, "reward": 1.3064703941345215, "reward_std": 0.21699872612953186, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.669989824295044, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8195223212242126, "step": 46 }, { "adv/mean_abs_final_conf": 0.7372655868530273, "adv/mean_abs_reasoning": 0.3531673550605774, "adv/mean_abs_step_conf": 0.7708775997161865, "adv/ratio_final_to_reasoning": 2.087581358493814, "adv/ratio_step_to_reasoning": 2.1827544043077283, "adv/std_final_conf": 0.9174608588218689, "adv/std_reasoning": 0.661200761795044, "adv/std_step_conf": 0.9348491430282593, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7244660194174756, "calib/avg_num_step_conf": 5.51171875, "calib/ece": 0.23446640316205525, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5454545454545454, "calib/gap": 0.15109967637540467, "calib/mean_conf": 0.8273517786561264, "calib/mu_c": 0.8888666666666666, "calib/mu_w": 0.7377669902912619, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.23446640316205525, "calib/std_conf": 0.18792528255905222, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.44727496917385945, "calib/step_q_c_n": 811.0, "calib/step_q_gap": 0.05620830250719283, "calib/step_q_w": 0.3910666666666666, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 514.8359375, "completions/mean_terminated_length": 518.8897705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.050133333333333335, "grad_norm": 0.0314362570643425, "kl": 0.036376953125, "learning_rate": 4.25e-06, "loss": -0.1308, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03257117420434952, "mask/share_reasoning": 0.8494213819503784, "mask/share_step_conf": 0.11019494384527206, "num_tokens": 11174980.0, "reward": 1.3781641721725464, "reward_std": 0.173916757106781, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7321382761001587, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8550637364387512, "step": 47 }, { "adv/mean_abs_final_conf": 0.7383579015731812, "adv/mean_abs_reasoning": 0.47094452381134033, "adv/mean_abs_step_conf": 0.7545657157897949, "adv/ratio_final_to_reasoning": 1.567823521118097, "adv/ratio_step_to_reasoning": 1.602239069865632, "adv/std_final_conf": 0.9191538095474243, "adv/std_reasoning": 0.7392716407775879, "adv/std_step_conf": 0.9354907274246216, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6004624871531347, "calib/avg_num_step_conf": 4.953125, "calib/ece": 0.26035856573705174, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4701195219123506, "calib/gap": 0.078800102774923, "calib/mean_conf": 0.7968525896414342, "calib/mu_c": 0.8320143884892086, "calib/mu_w": 0.7532142857142856, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2517131474103585, "calib/std_conf": 0.20617956767526674, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45039573820395734, "calib/step_q_c_n": 657.0, "calib/step_q_gap": 0.04181963345763978, "calib/step_q_w": 0.40857610474631756, "calib/step_q_w_n": 611.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 457.046875, "completions/mean_terminated_length": 462.4664306640625, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.0512, "grad_norm": 0.08840116858482361, "kl": 0.0417633056640625, "learning_rate": 4.222222222222223e-06, "loss": -0.1124, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03736000880599022, "mask/share_reasoning": 0.8268569707870483, "mask/share_step_conf": 0.12406426668167114, "num_tokens": 11395672.0, "reward": 1.3278179168701172, "reward_std": 0.19362950325012207, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.676763653755188, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8370923399925232, "step": 48 }, { "adv/mean_abs_final_conf": 0.7822320461273193, "adv/mean_abs_reasoning": 0.4077925682067871, "adv/mean_abs_step_conf": 0.7859296202659607, "adv/ratio_final_to_reasoning": 1.9182106470627442, "adv/ratio_step_to_reasoning": 1.9272779386882413, "adv/std_final_conf": 0.9106044769287109, "adv/std_reasoning": 0.6613754034042358, "adv/std_step_conf": 0.934582531452179, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7893849206349206, "calib/avg_num_step_conf": 4.73828125, "calib/ece": 0.2482661290322581, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4879032258064516, "calib/gap": 0.1889603174603175, "calib/mean_conf": 0.8127822580645161, "calib/mu_c": 0.8950714285714286, "calib/mu_w": 0.7061111111111111, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2482661290322581, "calib/std_conf": 0.19813424336772567, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4865970149253731, "calib/step_q_c_n": 670.0, "calib/step_q_gap": 0.04882537588301583, "calib/step_q_w": 0.4377716390423573, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 489.18359375, "completions/mean_terminated_length": 491.10198974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.05226666666666667, "grad_norm": 0.045245908200740814, "kl": 0.041103363037109375, "learning_rate": 4.194444444444445e-06, "loss": -0.0491, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03568429872393608, "mask/share_reasoning": 0.8462221622467041, "mask/share_step_conf": 0.11418728530406952, "num_tokens": 11625439.0, "reward": 1.3370085954666138, "reward_std": 0.19713446497917175, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7224773168563843, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8245980143547058, "step": 49 }, { "adv/mean_abs_final_conf": 0.7385240197181702, "adv/mean_abs_reasoning": 0.38276487588882446, "adv/mean_abs_step_conf": 0.7558243274688721, "adv/ratio_final_to_reasoning": 1.9294456368370574, "adv/ratio_step_to_reasoning": 1.9746439004199647, "adv/std_final_conf": 0.9209765195846558, "adv/std_reasoning": 0.6613095998764038, "adv/std_step_conf": 0.9355844855308533, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8078224522292994, "calib/avg_num_step_conf": 4.7734375, "calib/ece": 0.13798418972332013, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.30434782608695654, "calib/gap": 0.22320660828025463, "calib/mean_conf": 0.7363241106719368, "calib/mu_c": 0.8210191082802546, "calib/mu_w": 0.5978125, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.12687747035573121, "calib/std_conf": 0.21101937253409556, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.47943396226415097, "calib/step_q_c_n": 742.0, "calib/step_q_gap": 0.04578812893081757, "calib/step_q_w": 0.4336458333333334, "calib/step_q_w_n": 480.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 451.03515625, "completions/mean_terminated_length": 452.803955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.05333333333333334, "grad_norm": 0.05697335675358772, "kl": 0.04230499267578125, "learning_rate": 4.166666666666667e-06, "loss": -0.1203, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037577033042907715, "mask/share_reasoning": 0.8432788252830505, "mask/share_step_conf": 0.11523788422346115, "num_tokens": 11846264.0, "reward": 1.378005027770996, "reward_std": 0.19581623375415802, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.789250373840332, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8247860670089722, "step": 50 }, { "adv/mean_abs_final_conf": 0.7307857871055603, "adv/mean_abs_reasoning": 0.3543166220188141, "adv/mean_abs_step_conf": 0.7410742044448853, "adv/ratio_final_to_reasoning": 2.062521884922339, "adv/ratio_step_to_reasoning": 2.0915592393673657, "adv/std_final_conf": 0.9202434420585632, "adv/std_reasoning": 0.6403036117553711, "adv/std_step_conf": 0.935762345790863, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7444818926777689, "calib/avg_num_step_conf": 4.0078125, "calib/ece": 0.08952569169960473, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2648221343873518, "calib/gap": 0.2010064763415278, "calib/mean_conf": 0.6851778656126482, "calib/mu_c": 0.7622435897435896, "calib/mu_w": 0.5612371134020618, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07905138339920946, "calib/std_conf": 0.24350793541255103, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47646280991735535, "calib/step_q_c_n": 605.0, "calib/step_q_gap": 0.038386800416167655, "calib/step_q_w": 0.4380760095011877, "calib/step_q_w_n": 421.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2557.0, "completions/max_terminated_length": 2557.0, "completions/mean_length": 453.87890625, "completions/mean_terminated_length": 455.6588439941406, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.0544, "grad_norm": 0.05265422165393829, "kl": 0.04512786865234375, "learning_rate": 4.138888888888889e-06, "loss": -0.0625, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03776656091213226, "mask/share_reasoning": 0.8576105833053589, "mask/share_step_conf": 0.10071656852960587, "num_tokens": 12071753.0, "reward": 1.3860688209533691, "reward_std": 0.1876891851425171, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7822070121765137, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8355902433395386, "step": 51 }, { "adv/mean_abs_final_conf": 0.714397668838501, "adv/mean_abs_reasoning": 0.35804280638694763, "adv/mean_abs_step_conf": 0.747557520866394, "adv/ratio_final_to_reasoning": 1.9952856365069096, "adv/ratio_step_to_reasoning": 2.0878998475352306, "adv/std_final_conf": 0.9077821373939514, "adv/std_reasoning": 0.6403156518936157, "adv/std_step_conf": 0.9354573488235474, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7871723790322583, "calib/avg_num_step_conf": 3.98046875, "calib/ece": 0.13412000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.268, "calib/gap": 0.2769590053763441, "calib/mean_conf": 0.6301199999999999, "calib/mu_c": 0.7010215053763441, "calib/mu_w": 0.4240625, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.010119999999999995, "calib/std_conf": 0.2800792487850537, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.486183699870634, "calib/step_q_c_n": 773.0, "calib/step_q_gap": 0.08187475678120304, "calib/step_q_w": 0.40430894308943094, "calib/step_q_w_n": 246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2815.0, "completions/max_terminated_length": 2815.0, "completions/mean_length": 443.59375, "completions/mean_terminated_length": 445.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.055466666666666664, "grad_norm": 0.06576532870531082, "kl": 0.05825042724609375, "learning_rate": 4.111111111111111e-06, "loss": -0.0366, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03848676383495331, "mask/share_reasoning": 0.8548662662506104, "mask/share_step_conf": 0.10274067521095276, "num_tokens": 12293265.0, "reward": 1.3783490657806396, "reward_std": 0.19893211126327515, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7833257913589478, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8191080093383789, "step": 52 }, { "adv/mean_abs_final_conf": 0.7468120455741882, "adv/mean_abs_reasoning": 0.41244423389434814, "adv/mean_abs_step_conf": 0.744057297706604, "adv/ratio_final_to_reasoning": 1.8106982331228125, "adv/ratio_step_to_reasoning": 1.804019153525618, "adv/std_final_conf": 0.9090811014175415, "adv/std_reasoning": 0.6613813638687134, "adv/std_step_conf": 0.9356816411018372, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7108433734939759, "calib/avg_num_step_conf": 4.59375, "calib/ece": 0.1287007874015748, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.38188976377952755, "calib/gap": 0.19396631982475365, "calib/mean_conf": 0.6846062992125984, "calib/mu_c": 0.7518072289156627, "calib/mu_w": 0.557840909090909, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07988188976377955, "calib/std_conf": 0.2762980148882472, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5022727272727272, "calib/step_q_c_n": 704.0, "calib/step_q_gap": 0.09506933744221868, "calib/step_q_w": 0.4072033898305085, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1378.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 453.46484375, "completions/mean_terminated_length": 457.0354309082031, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.05653333333333333, "grad_norm": 1.054863452911377, "kl": 0.2156829833984375, "learning_rate": 4.083333333333334e-06, "loss": -0.0359, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03535082936286926, "mask/share_reasoning": 0.8552125096321106, "mask/share_step_conf": 0.10162418335676193, "num_tokens": 12515176.0, "reward": 1.39669668674469, "reward_std": 0.18330219388008118, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7779816389083862, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.843643307685852, "step": 53 }, { "adv/mean_abs_final_conf": 0.7108153104782104, "adv/mean_abs_reasoning": 0.28954315185546875, "adv/mean_abs_step_conf": 0.774592399597168, "adv/ratio_final_to_reasoning": 2.454954661932492, "adv/ratio_step_to_reasoning": 2.6752226555294985, "adv/std_final_conf": 0.9168617725372314, "adv/std_reasoning": 0.5959535241127014, "adv/std_step_conf": 0.9359902739524841, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7334784836065573, "calib/avg_num_step_conf": 4.21875, "calib/ece": 0.08545454545454545, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4980237154150198, "calib/gap": 0.22697660519125684, "calib/mean_conf": 0.7989723320158103, "calib/mu_c": 0.8536979166666666, "calib/mu_w": 0.6267213114754098, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.06276679841897231, "calib/std_conf": 0.23509591690110984, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5720314547837483, "calib/step_q_c_n": 763.0, "calib/step_q_gap": 0.1498232528910038, "calib/step_q_w": 0.4222082018927445, "calib/step_q_w_n": 317.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 420.83984375, "completions/mean_terminated_length": 422.490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.0576, "grad_norm": 0.0555671751499176, "kl": 0.046390533447265625, "learning_rate": 4.055555555555556e-06, "loss": -0.0081, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.041799530386924744, "mask/share_reasoning": 0.8436595797538757, "mask/share_step_conf": 0.11063466221094131, "num_tokens": 12729143.0, "reward": 1.4318828582763672, "reward_std": 0.1910424828529358, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.8274754285812378, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8450983166694641, "step": 54 }, { "adv/mean_abs_final_conf": 0.7284795641899109, "adv/mean_abs_reasoning": 0.4116016924381256, "adv/mean_abs_step_conf": 0.7531675100326538, "adv/ratio_final_to_reasoning": 1.7698653275081473, "adv/ratio_step_to_reasoning": 1.8298455129551596, "adv/std_final_conf": 0.9045955538749695, "adv/std_reasoning": 0.6817708015441895, "adv/std_step_conf": 0.9273386001586914, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8350949305147778, "calib/avg_num_step_conf": 3.62109375, "calib/ece": 0.24459677419354844, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5403225806451613, "calib/gap": 0.2914164546225616, "calib/mean_conf": 0.7728225806451613, "calib/mu_c": 0.9103053435114504, "calib/mu_w": 0.6188888888888888, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.24459677419354844, "calib/std_conf": 0.26942759061923977, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6509382151029748, "calib/step_q_c_n": 437.0, "calib/step_q_gap": 0.12850964367440332, "calib/step_q_w": 0.5224285714285715, "calib/step_q_w_n": 490.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2688.0, "completions/max_terminated_length": 2688.0, "completions/mean_length": 440.0703125, "completions/mean_terminated_length": 445.2885437011719, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.058666666666666666, "grad_norm": 0.03891420364379883, "kl": 0.048374176025390625, "learning_rate": 4.027777777777779e-06, "loss": -0.1015, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.039170823991298676, "mask/share_reasoning": 0.8518698215484619, "mask/share_step_conf": 0.0972406268119812, "num_tokens": 12949625.0, "reward": 1.320771336555481, "reward_std": 0.2680937945842743, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7362039089202881, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.805013120174408, "step": 55 }, { "adv/mean_abs_final_conf": 0.7066901922225952, "adv/mean_abs_reasoning": 0.5397046804428101, "adv/mean_abs_step_conf": 0.7587162256240845, "adv/ratio_final_to_reasoning": 1.3094016372857449, "adv/ratio_step_to_reasoning": 1.4057988620769095, "adv/std_final_conf": 0.9016559720039368, "adv/std_reasoning": 0.8098204135894775, "adv/std_step_conf": 0.9363850951194763, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7619768360094761, "calib/avg_num_step_conf": 4.05078125, "calib/ece": 0.30882591093117423, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6275303643724697, "calib/gap": 0.17795143458804952, "calib/mean_conf": 0.8336032388663968, "calib/mu_c": 0.917175572519084, "calib/mu_w": 0.7392241379310345, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.30603238866396776, "calib/std_conf": 0.22455257463439346, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.6526278659611994, "calib/step_q_c_n": 567.0, "calib/step_q_gap": 0.08932999362077387, "calib/step_q_w": 0.5632978723404255, "calib/step_q_w_n": 470.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2894.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 454.89453125, "completions/mean_terminated_length": 458.47637939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.05973333333333333, "grad_norm": 0.05582032352685928, "kl": 0.049285888671875, "learning_rate": 4.000000000000001e-06, "loss": -0.0568, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03810788691043854, "mask/share_reasoning": 0.8576039671897888, "mask/share_step_conf": 0.09647566825151443, "num_tokens": 13172918.0, "reward": 1.1864277124404907, "reward_std": 0.32152634859085083, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6578918099403381, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7117786407470703, "step": 56 }, { "adv/mean_abs_final_conf": 0.7351686954498291, "adv/mean_abs_reasoning": 0.49061357975006104, "adv/mean_abs_step_conf": 0.797583281993866, "adv/ratio_final_to_reasoning": 1.4984678895850265, "adv/ratio_step_to_reasoning": 1.6256852947286704, "adv/std_final_conf": 0.8790720701217651, "adv/std_reasoning": 0.7209060192108154, "adv/std_step_conf": 0.9364942908287048, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7373493975903614, "calib/avg_num_step_conf": 3.42578125, "calib/ece": 0.18983739837398383, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.6463414634146342, "calib/gap": 0.18938704819277108, "calib/mean_conf": 0.8464227642276423, "calib/mu_c": 0.9080120481927711, "calib/mu_w": 0.7186250000000001, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.18073170731707328, "calib/std_conf": 0.2420445772331386, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.7040172413793103, "calib/step_q_c_n": 580.0, "calib/step_q_gap": 0.1559027632648321, "calib/step_q_w": 0.5481144781144782, "calib/step_q_w_n": 297.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2834.0, "completions/max_terminated_length": 2834.0, "completions/mean_length": 482.3984375, "completions/mean_terminated_length": 484.29022216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.0608, "grad_norm": 0.033215321600437164, "kl": 0.05623626708984375, "learning_rate": 3.972222222222223e-06, "loss": -0.0313, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.0380302369594574, "mask/share_reasoning": 0.8718808889389038, "mask/share_step_conf": 0.0861825942993164, "num_tokens": 13403204.0, "reward": 1.2480785846710205, "reward_std": 0.3816050887107849, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7185015678405762, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.7314058542251587, "step": 57 }, { "adv/mean_abs_final_conf": 0.7716591954231262, "adv/mean_abs_reasoning": 0.5968241095542908, "adv/mean_abs_step_conf": 0.7493637800216675, "adv/ratio_final_to_reasoning": 1.29294239805326, "adv/ratio_step_to_reasoning": 1.2555856374188596, "adv/std_final_conf": 0.9079048037528992, "adv/std_reasoning": 0.8268017172813416, "adv/std_step_conf": 0.9220500588417053, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.56484375, "calib/avg_num_step_conf": 3.56640625, "calib/ece": 0.41373949579831926, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.859375, "calib/frac_conf_gt_0.9": 0.6092436974789915, "calib/gap": 0.0316548295454544, "calib/mean_conf": 0.8175210084033613, "calib/mu_c": 0.8345454545454544, "calib/mu_w": 0.802890625, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.875, "calib/pce": 0.38453781512605034, "calib/std_conf": 0.251095393461271, "calib/step_conf_rate": 0.875, "calib/step_q_c": 0.6568894601542417, "calib/step_q_c_n": 389.0, "calib/step_q_gap": 0.03402686473439431, "calib/step_q_w": 0.6228625954198473, "calib/step_q_w_n": 524.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2422.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 514.76171875, "completions/mean_terminated_length": 516.7803955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.06186666666666667, "grad_norm": 0.03600175306200981, "kl": 0.0579376220703125, "learning_rate": 3.944444444444445e-06, "loss": -0.2256, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.03492421656847, "mask/share_reasoning": 0.8780443668365479, "mask/share_step_conf": 0.08312517404556274, "num_tokens": 13641303.0, "reward": 0.9850034713745117, "reward_std": 0.45760661363601685, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.48338747024536133, "rewards/format_reward_step": 0.859375, "rewards/step_l2_reward": 0.614403486251831, "step": 58 }, { "adv/mean_abs_final_conf": 0.7581777572631836, "adv/mean_abs_reasoning": 0.6072738766670227, "adv/mean_abs_step_conf": 0.7742183804512024, "adv/ratio_final_to_reasoning": 1.2484939438271008, "adv/ratio_step_to_reasoning": 1.274908093693149, "adv/std_final_conf": 0.910625696182251, "adv/std_reasoning": 0.8101241588592529, "adv/std_step_conf": 0.9366313815116882, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.599885498753957, "calib/avg_num_step_conf": 3.5625, "calib/ece": 0.32088709677419364, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.6653225806451613, "calib/gap": 0.05568667070788702, "calib/mean_conf": 0.8576612903225808, "calib/mu_c": 0.8803401360544217, "calib/mu_w": 0.8246534653465347, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.29290322580645173, "calib/std_conf": 0.22982149058305867, "calib/step_conf_rate": 0.91015625, "calib/step_q_c": 0.7102025782688766, "calib/step_q_c_n": 543.0, "calib/step_q_gap": 0.038468160924703176, "calib/step_q_w": 0.6717344173441734, "calib/step_q_w_n": 369.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2824.0, "completions/max_terminated_length": 2824.0, "completions/mean_length": 482.671875, "completions/mean_terminated_length": 482.671875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.06293333333333333, "grad_norm": 0.036133717745542526, "kl": 0.0674896240234375, "learning_rate": 3.916666666666667e-06, "loss": -0.122, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.038872212171554565, "mask/share_reasoning": 0.8735235929489136, "mask/share_step_conf": 0.08760420233011246, "num_tokens": 13871115.0, "reward": 1.1213886737823486, "reward_std": 0.4144839942455292, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.5979918241500854, "rewards/format_reward_step": 0.90625, "rewards/step_l2_reward": 0.674345850944519, "step": 59 }, { "adv/mean_abs_final_conf": 0.7874451875686646, "adv/mean_abs_reasoning": 0.6605117917060852, "adv/mean_abs_step_conf": 0.7728654742240906, "adv/ratio_final_to_reasoning": 1.1921743070395665, "adv/ratio_step_to_reasoning": 1.1701009488835903, "adv/std_final_conf": 0.9360385537147522, "adv/std_reasoning": 0.8748874664306641, "adv/std_step_conf": 0.936693012714386, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6743748223927253, "calib/avg_num_step_conf": 3.37109375, "calib/ece": 0.27641666666666664, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.5916666666666667, "calib/gap": 0.15312020460358078, "calib/mean_conf": 0.8160833333333334, "calib/mu_c": 0.881159420289855, "calib/mu_w": 0.7280392156862743, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.25875000000000004, "calib/std_conf": 0.25857234910605237, "calib/step_conf_rate": 0.91015625, "calib/step_q_c": 0.725909090909091, "calib/step_q_c_n": 484.0, "calib/step_q_gap": 0.11179299592228353, "calib/step_q_w": 0.6141160949868074, "calib/step_q_w_n": 379.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2992.0, "completions/max_terminated_length": 2992.0, "completions/mean_length": 465.4453125, "completions/mean_terminated_length": 470.9644470214844, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.064, "grad_norm": 0.04495846852660179, "kl": 0.067657470703125, "learning_rate": 3.88888888888889e-06, "loss": 0.0181, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.03766370192170143, "mask/share_reasoning": 0.8644055128097534, "mask/share_step_conf": 0.08621197193861008, "num_tokens": 14099125.0, "reward": 1.1500201225280762, "reward_std": 0.43505987524986267, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6390793323516846, "rewards/format_reward_step": 0.8984375, "rewards/step_l2_reward": 0.6867303848266602, "step": 60 }, { "adv/mean_abs_final_conf": 0.7372546195983887, "adv/mean_abs_reasoning": 0.6561082601547241, "adv/mean_abs_step_conf": 0.7634576559066772, "adv/ratio_final_to_reasoning": 1.1236783079434614, "adv/ratio_step_to_reasoning": 1.163615370010184, "adv/std_final_conf": 0.9167507886886597, "adv/std_reasoning": 0.8747683763504028, "adv/std_step_conf": 0.9367372393608093, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6563605442176872, "calib/avg_num_step_conf": 3.6328125, "calib/ece": 0.28923387096774206, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.6895161290322581, "calib/gap": 0.10141088435374157, "calib/mean_conf": 0.8524596774193549, "calib/mu_c": 0.8925333333333333, "calib/mu_w": 0.7911224489795917, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.2684274193548388, "calib/std_conf": 0.23937081165184632, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.7221122153209111, "calib/step_q_c_n": 483.0, "calib/step_q_gap": 0.08810774104798047, "calib/step_q_w": 0.6340044742729306, "calib/step_q_w_n": 447.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2458.0, "completions/max_terminated_length": 2458.0, "completions/mean_length": 398.2109375, "completions/mean_terminated_length": 401.3464660644531, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.06506666666666666, "grad_norm": 0.02924097701907158, "kl": 0.0796356201171875, "learning_rate": 3.861111111111112e-06, "loss": -0.1026, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.044213712215423584, "mask/share_reasoning": 0.8519928455352783, "mask/share_step_conf": 0.0959809198975563, "num_tokens": 14305131.0, "reward": 1.125230073928833, "reward_std": 0.4287129044532776, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6329164505004883, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.6572092771530151, "step": 61 }, { "adv/mean_abs_final_conf": 0.7798162698745728, "adv/mean_abs_reasoning": 0.609207272529602, "adv/mean_abs_step_conf": 0.7699545621871948, "adv/ratio_final_to_reasoning": 1.2800508218435305, "adv/ratio_step_to_reasoning": 1.2638630510599853, "adv/std_final_conf": 0.931247889995575, "adv/std_reasoning": 0.8267863392829895, "adv/std_step_conf": 0.9367672801017761, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5772306397306397, "calib/avg_num_step_conf": 3.3125, "calib/ece": 0.31995833333333334, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.5416666666666666, "calib/gap": 0.07760101010101017, "calib/mean_conf": 0.7546250000000001, "calib/mu_c": 0.7895454545454544, "calib/mu_w": 0.7119444444444443, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.9140625, "calib/pce": 0.26229166666666665, "calib/std_conf": 0.31616165597417617, "calib/step_conf_rate": 0.9140625, "calib/step_q_c": 0.678656591099916, "calib/step_q_c_n": 397.0, "calib/step_q_gap": 0.09136169087818657, "calib/step_q_w": 0.5872949002217295, "calib/step_q_w_n": 451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 484.515625, "completions/mean_terminated_length": 486.41571044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.06613333333333334, "grad_norm": 0.04646060988306999, "kl": 0.07982635498046875, "learning_rate": 3.833333333333334e-06, "loss": -0.1848, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.03795677050948143, "mask/share_reasoning": 0.8766661882400513, "mask/share_step_conf": 0.08147076517343521, "num_tokens": 14536247.0, "reward": 1.0513150691986084, "reward_std": 0.428830087184906, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5791070461273193, "rewards/format_reward_step": 0.890625, "rewards/step_l2_reward": 0.6211365461349487, "step": 62 }, { "adv/mean_abs_final_conf": 0.7935585379600525, "adv/mean_abs_reasoning": 0.7317885756492615, "adv/mean_abs_step_conf": 0.7919634580612183, "adv/ratio_final_to_reasoning": 1.084409574522241, "adv/ratio_step_to_reasoning": 1.0822298740569543, "adv/std_final_conf": 0.9344494342803955, "adv/std_reasoning": 0.8906053304672241, "adv/std_step_conf": 0.9364073276519775, "calib/answer_extract_rate": 0.8984375, "calib/auroc": 0.6406417112299465, "calib/avg_num_step_conf": 3.07421875, "calib/ece": 0.2033187134502923, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.83203125, "calib/frac_conf_gt_0.9": 0.4649122807017544, "calib/gap": 0.16477197312491432, "calib/mean_conf": 0.7373830409356724, "calib/mu_c": 0.7988111888111888, "calib/mu_w": 0.6340392156862745, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 0.921875, "calib/nonempty_step_conf_rate": 0.86328125, "calib/pce": 0.15675438596491217, "calib/std_conf": 0.29977562022356213, "calib/step_conf_rate": 0.86328125, "calib/step_q_c": 0.6830984996738423, "calib/step_q_c_n": 511.0, "calib/step_q_gap": 0.0007398040216683999, "calib/step_q_w": 0.6823586956521739, "calib/step_q_w_n": 276.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 513.73046875, "completions/mean_terminated_length": 515.7451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.0672, "grad_norm": 0.031390607357025146, "kl": 0.0851898193359375, "learning_rate": 3.8055555555555556e-06, "loss": -0.196, "mask/has_final_conf_rate": 0.890625, "mask/share_final_conf": 0.03502983972430229, "mask/share_reasoning": 0.8833227157592773, "mask/share_step_conf": 0.07774117588996887, "num_tokens": 14776402.0, "reward": 1.0807759761810303, "reward_std": 0.4876406192779541, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6108878254890442, "rewards/format_reward_step": 0.83203125, "rewards/step_l2_reward": 0.6362695693969727, "step": 63 }, { "adv/mean_abs_final_conf": 0.777665376663208, "adv/mean_abs_reasoning": 0.5823945999145508, "adv/mean_abs_step_conf": 0.7876840829849243, "adv/ratio_final_to_reasoning": 1.3352894700213696, "adv/ratio_step_to_reasoning": 1.3524920785675103, "adv/std_final_conf": 0.9342407584190369, "adv/std_reasoning": 0.8099990487098694, "adv/std_step_conf": 0.9365174770355225, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5644473360803876, "calib/avg_num_step_conf": 3.72265625, "calib/ece": 0.2408536585365854, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.34552845528455284, "calib/gap": 0.07828453697679472, "calib/mean_conf": 0.6684146341463415, "calib/mu_c": 0.6961006289308177, "calib/mu_w": 0.617816091954023, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.1314634146341464, "calib/std_conf": 0.31550631464772577, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.6019636645962733, "calib/step_q_c_n": 644.0, "calib/step_q_gap": -0.03713989527427686, "calib/step_q_w": 0.6391035598705501, "calib/step_q_w_n": 309.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2418.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 439.484375, "completions/mean_terminated_length": 448.2390441894531, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.06826666666666667, "grad_norm": 0.030355457216501236, "kl": 0.09368896484375, "learning_rate": 3.777777777777778e-06, "loss": -0.1502, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03789021074771881, "mask/share_reasoning": 0.853408694267273, "mask/share_step_conf": 0.08916984498500824, "num_tokens": 14992686.0, "reward": 1.1851518154144287, "reward_std": 0.35710498690605164, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6556019186973572, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7014913558959961, "step": 64 }, { "adv/mean_abs_final_conf": 0.7822667360305786, "adv/mean_abs_reasoning": 0.5045019388198853, "adv/mean_abs_step_conf": 0.7830569744110107, "adv/ratio_final_to_reasoning": 1.5505723087218095, "adv/ratio_step_to_reasoning": 1.5521386820488985, "adv/std_final_conf": 0.9360599517822266, "adv/std_reasoning": 0.7577037811279297, "adv/std_step_conf": 0.9366105794906616, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6241640208065933, "calib/avg_num_step_conf": 3.10546875, "calib/ece": 0.2878688524590164, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.45901639344262296, "calib/gap": 0.1069729108964399, "calib/mean_conf": 0.7355737704918033, "calib/mu_c": 0.7851145038167938, "calib/mu_w": 0.6781415929203539, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.24327868852459014, "calib/std_conf": 0.2929626764688911, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.6546169724770643, "calib/step_q_c_n": 436.0, "calib/step_q_gap": 0.012208894482635402, "calib/step_q_w": 0.6424080779944289, "calib/step_q_w_n": 359.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 361.3828125, "completions/mean_terminated_length": 362.8000183105469, "completions/min_length": 0.0, "completions/min_terminated_length": 71.0, "epoch": 0.06933333333333333, "grad_norm": 0.034218862652778625, "kl": 0.1111907958984375, "learning_rate": 3.7500000000000005e-06, "loss": -0.1079, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.04415278136730194, "mask/share_reasoning": 0.8564043045043945, "mask/share_step_conf": 0.09553661197423935, "num_tokens": 15190224.0, "reward": 1.1099624633789062, "reward_std": 0.36487138271331787, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6112703084945679, "rewards/format_reward_step": 0.90625, "rewards/step_l2_reward": 0.662139892578125, "step": 65 }, { "adv/mean_abs_final_conf": 0.7913875579833984, "adv/mean_abs_reasoning": 0.6099362373352051, "adv/mean_abs_step_conf": 0.7825324535369873, "adv/ratio_final_to_reasoning": 1.2974922779485103, "adv/ratio_step_to_reasoning": 1.2829741957222454, "adv/std_final_conf": 0.9346356987953186, "adv/std_reasoning": 0.8429571390151978, "adv/std_step_conf": 0.9366850852966309, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6495956873315365, "calib/avg_num_step_conf": 3.93359375, "calib/ece": 0.24447698744769875, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.8828125, "calib/frac_conf_gt_0.9": 0.3138075313807531, "calib/gap": 0.19739608455100033, "calib/mean_conf": 0.592510460251046, "calib/mu_c": 0.7023584905660379, "calib/mu_w": 0.5049624060150376, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.19673640167364015, "calib/std_conf": 0.35774803388463744, "calib/step_conf_rate": 0.91796875, "calib/step_q_c": 0.6350295857988166, "calib/step_q_c_n": 338.0, "calib/step_q_gap": 0.11145210697470109, "calib/step_q_w": 0.5235774788241155, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 500.46875, "completions/mean_terminated_length": 506.4031677246094, "completions/min_length": 0.0, "completions/min_terminated_length": 80.0, "epoch": 0.0704, "grad_norm": 0.03262786567211151, "kl": 0.09377288818359375, "learning_rate": 3.7222222222222225e-06, "loss": -0.2462, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.03721674531698227, "mask/share_reasoning": 0.8671850562095642, "mask/share_step_conf": 0.08387944102287292, "num_tokens": 15424696.0, "reward": 1.1032490730285645, "reward_std": 0.4415382444858551, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.6247620582580566, "rewards/format_reward_step": 0.8828125, "rewards/step_l2_reward": 0.660789966583252, "step": 66 }, { "adv/mean_abs_final_conf": 0.7765552401542664, "adv/mean_abs_reasoning": 0.5262472629547119, "adv/mean_abs_step_conf": 0.7633633613586426, "adv/ratio_final_to_reasoning": 1.4756470861133877, "adv/ratio_step_to_reasoning": 1.4505792525600965, "adv/std_final_conf": 0.9337157011032104, "adv/std_reasoning": 0.7755067944526672, "adv/std_step_conf": 0.9365633726119995, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7346554719166185, "calib/avg_num_step_conf": 3.46484375, "calib/ece": 0.12836734693877544, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.2938775510204082, "calib/gap": 0.26189200926462064, "calib/mean_conf": 0.6271428571428571, "calib/mu_c": 0.7212101910828025, "calib/mu_w": 0.45931818181818185, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.057346938775510135, "calib/std_conf": 0.3254133948549451, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.6145028935185185, "calib/step_q_c_n": 576.0, "calib/step_q_gap": 0.13908488708765027, "calib/step_q_w": 0.47541800643086823, "calib/step_q_w_n": 311.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2619.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 465.90234375, "completions/mean_terminated_length": 467.72943115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.07146666666666666, "grad_norm": 0.04384162649512291, "kl": 0.1028289794921875, "learning_rate": 3.694444444444445e-06, "loss": -0.1681, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.0390692800283432, "mask/share_reasoning": 0.8731340169906616, "mask/share_step_conf": 0.0838904082775116, "num_tokens": 15648975.0, "reward": 1.2170250415802002, "reward_std": 0.37866127490997314, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7070285081863403, "rewards/format_reward_step": 0.90234375, "rewards/step_l2_reward": 0.7115576267242432, "step": 67 }, { "adv/mean_abs_final_conf": 0.7916392087936401, "adv/mean_abs_reasoning": 0.47120824456214905, "adv/mean_abs_step_conf": 0.7685818672180176, "adv/ratio_final_to_reasoning": 1.6800198594344171, "adv/ratio_step_to_reasoning": 1.6310874779625106, "adv/std_final_conf": 0.9357117414474487, "adv/std_reasoning": 0.757487416267395, "adv/std_step_conf": 0.9363951683044434, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7330974047859791, "calib/avg_num_step_conf": 4.34765625, "calib/ece": 0.17592896174863387, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.3401639344262295, "calib/gap": 0.28620918997865397, "calib/mean_conf": 0.6250546448087431, "calib/mu_c": 0.7599483204134366, "calib/mu_w": 0.47373913043478266, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.13614754098360654, "calib/std_conf": 0.35058583123815296, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5797899999999999, "calib/step_q_c_n": 480.0, "calib/step_q_gap": 0.174296055818852, "calib/step_q_w": 0.4054939441811479, "calib/step_q_w_n": 633.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2959.0, "completions/max_terminated_length": 2959.0, "completions/mean_length": 491.796875, "completions/mean_terminated_length": 491.796875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.07253333333333334, "grad_norm": 0.03584762290120125, "kl": 0.10675811767578125, "learning_rate": 3.6666666666666666e-06, "loss": -0.0105, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.04064110666513443, "mask/share_reasoning": 0.8522640466690063, "mask/share_step_conf": 0.10709486156702042, "num_tokens": 15878963.0, "reward": 1.2514537572860718, "reward_std": 0.31897321343421936, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7216943502426147, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7452940344810486, "step": 68 }, { "adv/mean_abs_final_conf": 0.8010131120681763, "adv/mean_abs_reasoning": 0.6304863691329956, "adv/mean_abs_step_conf": 0.7542392611503601, "adv/ratio_final_to_reasoning": 1.2704685640859739, "adv/ratio_step_to_reasoning": 1.1962816296687613, "adv/std_final_conf": 0.9363143444061279, "adv/std_reasoning": 0.8432164192199707, "adv/std_step_conf": 0.9367656111717224, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6672566371681415, "calib/avg_num_step_conf": 4.203125, "calib/ece": 0.20028806584362135, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.205761316872428, "calib/gap": 0.19340231449965972, "calib/mean_conf": 0.5725514403292181, "calib/mu_c": 0.6760176991150443, "calib/mu_w": 0.48261538461538456, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.15390946502057612, "calib/std_conf": 0.33092598669606615, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.5628824833702881, "calib/step_q_c_n": 451.0, "calib/step_q_gap": 0.08485656337028818, "calib/step_q_w": 0.47802591999999994, "calib/step_q_w_n": 625.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 536.20703125, "completions/mean_terminated_length": 540.4291381835938, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.0736, "grad_norm": 38426.73046875, "kl": 115712.10214233398, "learning_rate": 3.638888888888889e-06, "loss": 2490.8416, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03471978008747101, "mask/share_reasoning": 0.8694478273391724, "mask/share_step_conf": 0.08801987767219543, "num_tokens": 16120728.0, "reward": 1.1657168865203857, "reward_std": 0.40911099314689636, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.6522519588470459, "rewards/format_reward_step": 0.8984375, "rewards/step_l2_reward": 0.7056064605712891, "step": 69 }, { "adv/mean_abs_final_conf": 0.7606246471405029, "adv/mean_abs_reasoning": 0.5701305270195007, "adv/mean_abs_step_conf": 0.7519538402557373, "adv/ratio_final_to_reasoning": 1.3341236981588367, "adv/ratio_step_to_reasoning": 1.3189152389133822, "adv/std_final_conf": 0.9351255297660828, "adv/std_reasoning": 0.8099208474159241, "adv/std_step_conf": 0.936662495136261, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7202919407894737, "calib/avg_num_step_conf": 4.546875, "calib/ece": 0.19057231404958683, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.2727272727272727, "calib/gap": 0.2710372121710526, "calib/mean_conf": 0.5600888429752066, "calib/mu_c": 0.7034473684210526, "calib/mu_w": 0.43241015625, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.1397933884297521, "calib/std_conf": 0.35751327398275423, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.6014413223140495, "calib/step_q_c_n": 484.0, "calib/step_q_gap": 0.16143151839248088, "calib/step_q_w": 0.4400098039215686, "calib/step_q_w_n": 680.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 476.52734375, "completions/mean_terminated_length": 482.1778869628906, "completions/min_length": 0.0, "completions/min_terminated_length": 54.0, "epoch": 0.07466666666666667, "grad_norm": 0.024441594257950783, "kl": 0.1150970458984375, "learning_rate": 3.6111111111111115e-06, "loss": -0.1595, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03888453543186188, "mask/share_reasoning": 0.8433162569999695, "mask/share_step_conf": 0.10608047246932983, "num_tokens": 16349711.0, "reward": 1.228013515472412, "reward_std": 0.35695314407348633, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.702923059463501, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.7386612892150879, "step": 70 }, { "adv/mean_abs_final_conf": 0.8182979226112366, "adv/mean_abs_reasoning": 0.6510483026504517, "adv/mean_abs_step_conf": 0.7783514261245728, "adv/ratio_final_to_reasoning": 1.256892797784593, "adv/ratio_step_to_reasoning": 1.1955356045870382, "adv/std_final_conf": 0.9314525127410889, "adv/std_reasoning": 0.8592411875724792, "adv/std_step_conf": 0.9367839097976685, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6540616246498598, "calib/avg_num_step_conf": 4.046875, "calib/ece": 0.2361694214876034, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.24793388429752067, "calib/gap": 0.17637172917947663, "calib/mean_conf": 0.5520371900826447, "calib/mu_c": 0.6416806722689076, "calib/mu_w": 0.46530894308943094, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.1482355371900827, "calib/std_conf": 0.3500814820458849, "calib/step_conf_rate": 0.9296875, "calib/step_q_c": 0.5092372161172162, "calib/step_q_c_n": 455.0, "calib/step_q_gap": 0.036463968551058656, "calib/step_q_w": 0.4727732475661575, "calib/step_q_w_n": 581.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2597.0, "completions/max_terminated_length": 2597.0, "completions/mean_length": 476.046875, "completions/mean_terminated_length": 481.69171142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.07573333333333333, "grad_norm": 0.026767849922180176, "kl": 0.1395721435546875, "learning_rate": 3.5833333333333335e-06, "loss": -0.2371, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03905648738145828, "mask/share_reasoning": 0.8491967916488647, "mask/share_step_conf": 0.10002797096967697, "num_tokens": 16575987.0, "reward": 1.149741768836975, "reward_std": 0.42206907272338867, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6432366967201233, "rewards/format_reward_step": 0.90234375, "rewards/step_l2_reward": 0.6910139918327332, "step": 71 }, { "adv/mean_abs_final_conf": 0.8043488264083862, "adv/mean_abs_reasoning": 0.606508731842041, "adv/mean_abs_step_conf": 0.7735322713851929, "adv/ratio_final_to_reasoning": 1.3261949650180316, "adv/ratio_step_to_reasoning": 1.27538521833953, "adv/std_final_conf": 0.9366946220397949, "adv/std_reasoning": 0.8430470824241638, "adv/std_step_conf": 0.9365664124488831, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6659829286045126, "calib/avg_num_step_conf": 3.79296875, "calib/ece": 0.23981029810298105, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.27235772357723576, "calib/gap": 0.18124484439445065, "calib/mean_conf": 0.5645257452574526, "calib/mu_c": 0.6580952380952381, "calib/mu_w": 0.4768503937007874, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.16029810298102987, "calib/std_conf": 0.3556680369373666, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5695942720763723, "calib/step_q_c_n": 419.0, "calib/step_q_gap": 0.09832506917782152, "calib/step_q_w": 0.47126920289855073, "calib/step_q_w_n": 552.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2108.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 428.80859375, "completions/mean_terminated_length": 433.893310546875, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.0768, "grad_norm": 0.03424179181456566, "kl": 0.17218017578125, "learning_rate": 3.555555555555556e-06, "loss": -0.1104, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.038318004459142685, "mask/share_reasoning": 0.8529452085494995, "mask/share_step_conf": 0.0970180556178093, "num_tokens": 16790170.0, "reward": 1.2140846252441406, "reward_std": 0.374131441116333, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6577873229980469, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.7453470826148987, "step": 72 }, { "adv/mean_abs_final_conf": 0.8027011156082153, "adv/mean_abs_reasoning": 0.6620787382125854, "adv/mean_abs_step_conf": 0.7813224196434021, "adv/ratio_final_to_reasoning": 1.2123952473919768, "adv/ratio_step_to_reasoning": 1.1801049853265775, "adv/std_final_conf": 0.9355809688568115, "adv/std_reasoning": 0.859298050403595, "adv/std_step_conf": 0.9367927312850952, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7339769375167604, "calib/avg_num_step_conf": 3.4921875, "calib/ece": 0.15781700680272104, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.3020408163265306, "calib/gap": 0.265606474032359, "calib/mean_conf": 0.6284414965986395, "calib/mu_c": 0.7509457070707071, "calib/mu_w": 0.4853392330383481, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.1237414965986394, "calib/std_conf": 0.3376719177174699, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.5467737634408601, "calib/step_q_c_n": 465.0, "calib/step_q_gap": 0.04966700353410025, "calib/step_q_w": 0.4971067599067599, "calib/step_q_w_n": 429.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 425.296875, "completions/mean_terminated_length": 430.3399353027344, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.07786666666666667, "grad_norm": 0.033793821930885315, "kl": 0.16473388671875, "learning_rate": 3.5277777777777784e-06, "loss": -0.2237, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03980312496423721, "mask/share_reasoning": 0.8606506586074829, "mask/share_step_conf": 0.08782745897769928, "num_tokens": 17006078.0, "reward": 1.1668775081634521, "reward_std": 0.42709246277809143, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6866916418075562, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": 0.6809535622596741, "step": 73 }, { "adv/mean_abs_final_conf": 0.8049176931381226, "adv/mean_abs_reasoning": 0.6102519035339355, "adv/mean_abs_step_conf": 0.7477874159812927, "adv/ratio_final_to_reasoning": 1.31899251518412, "adv/ratio_step_to_reasoning": 1.2253749831027096, "adv/std_final_conf": 0.9247974753379822, "adv/std_reasoning": 0.8268208503723145, "adv/std_step_conf": 0.9366647601127625, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6851890756302521, "calib/avg_num_step_conf": 3.5703125, "calib/ece": 0.2299170124481328, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.2572614107883817, "calib/gap": 0.22164705882352942, "calib/mean_conf": 0.5289211618257261, "calib/mu_c": 0.654, "calib/mu_w": 0.4323529411764706, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.16157676348547723, "calib/std_conf": 0.3664901344931509, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5905740196078421, "calib/step_q_c_n": 340.0, "calib/step_q_gap": 0.1016081369133009, "calib/step_q_w": 0.48896588269454117, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2911.0, "completions/max_terminated_length": 2911.0, "completions/mean_length": 482.46484375, "completions/mean_terminated_length": 482.46484375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.07893333333333333, "grad_norm": 0.11903407424688339, "kl": 0.85650634765625, "learning_rate": 3.5e-06, "loss": -0.0307, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.04106244444847107, "mask/share_reasoning": 0.8614534139633179, "mask/share_step_conf": 0.09748411923646927, "num_tokens": 17233517.0, "reward": 1.189833641052246, "reward_std": 0.39333751797676086, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.6666750311851501, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.7236837148666382, "step": 74 }, { "adv/mean_abs_final_conf": 0.7921708822250366, "adv/mean_abs_reasoning": 0.6618683934211731, "adv/mean_abs_step_conf": 0.7805386781692505, "adv/ratio_final_to_reasoning": 1.1968706922691001, "adv/ratio_step_to_reasoning": 1.1792958931528292, "adv/std_final_conf": 0.9356744289398193, "adv/std_reasoning": 0.8592137098312378, "adv/std_step_conf": 0.9367683529853821, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6576481770656527, "calib/avg_num_step_conf": 3.640625, "calib/ece": 0.23174796747967477, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.4186991869918699, "calib/gap": 0.17265734265734278, "calib/mean_conf": 0.6803658536585365, "calib/mu_c": 0.7526573426573429, "calib/mu_w": 0.5800000000000001, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.16540650406504062, "calib/std_conf": 0.3361236892035184, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5404908675799087, "calib/step_q_c_n": 511.0, "calib/step_q_gap": 0.05061430384277493, "calib/step_q_w": 0.4898765637371338, "calib/step_q_w_n": 421.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 385.39453125, "completions/mean_terminated_length": 386.9059143066406, "completions/min_length": 0.0, "completions/min_terminated_length": 67.0, "epoch": 0.08, "grad_norm": 0.03059612214565277, "kl": 0.230804443359375, "learning_rate": 3.4722222222222224e-06, "loss": -0.0291, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.04576520621776581, "mask/share_reasoning": 0.8473720550537109, "mask/share_step_conf": 0.10295650362968445, "num_tokens": 17436930.0, "reward": 1.1896072626113892, "reward_std": 0.41622453927993774, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6638140678405762, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.7092627286911011, "step": 75 }, { "adv/mean_abs_final_conf": 0.8058186769485474, "adv/mean_abs_reasoning": 0.6808255910873413, "adv/mean_abs_step_conf": 0.7750113010406494, "adv/ratio_final_to_reasoning": 1.1835904635452679, "adv/ratio_step_to_reasoning": 1.138340437237215, "adv/std_final_conf": 0.9322393536567688, "adv/std_reasoning": 0.8903933167457581, "adv/std_step_conf": 0.9368122220039368, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6704371548339877, "calib/avg_num_step_conf": 3.60546875, "calib/ece": 0.24695121951219523, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.2926829268292683, "calib/gap": 0.20995741566305148, "calib/mean_conf": 0.5502032520325203, "calib/mu_c": 0.6637168141592921, "calib/mu_w": 0.4537593984962406, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.16890243902439034, "calib/std_conf": 0.36900156342563445, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5367044917257684, "calib/step_q_c_n": 423.0, "calib/step_q_gap": 0.04805522505910176, "calib/step_q_w": 0.48864926666666664, "calib/step_q_w_n": 500.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1858.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 400.859375, "completions/mean_terminated_length": 400.859375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.08106666666666666, "grad_norm": 0.031853899359703064, "kl": 0.2403564453125, "learning_rate": 3.444444444444445e-06, "loss": -0.0718, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.042230501770973206, "mask/share_reasoning": 0.8568806052207947, "mask/share_step_conf": 0.1008889302611351, "num_tokens": 17642606.0, "reward": 1.1947133541107178, "reward_std": 0.4264411926269531, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.6630562543869019, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.72685706615448, "step": 76 }, { "adv/mean_abs_final_conf": 0.8104222416877747, "adv/mean_abs_reasoning": 0.6468051075935364, "adv/mean_abs_step_conf": 0.7566022276878357, "adv/ratio_final_to_reasoning": 1.2529620316434764, "adv/ratio_step_to_reasoning": 1.169753019580819, "adv/std_final_conf": 0.9349915981292725, "adv/std_reasoning": 0.8748694658279419, "adv/std_step_conf": 0.936540424823761, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6321777896067602, "calib/avg_num_step_conf": 4.1875, "calib/ece": 0.2494715447154472, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.23170731707317074, "calib/gap": 0.15541752611617538, "calib/mean_conf": 0.505, "calib/mu_c": 0.576390977443609, "calib/mu_w": 0.42097345132743363, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.10691056910569105, "calib/std_conf": 0.35322756892286095, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.4897856976744186, "calib/step_q_c_n": 516.0, "calib/step_q_gap": 0.028134618537727973, "calib/step_q_w": 0.4616510791366906, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2631.0, "completions/max_terminated_length": 2631.0, "completions/mean_length": 422.625, "completions/mean_terminated_length": 425.9527587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.08213333333333334, "grad_norm": 0.0389665924012661, "kl": 0.227203369140625, "learning_rate": 3.416666666666667e-06, "loss": -0.1589, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.043137792497873306, "mask/share_reasoning": 0.833725094795227, "mask/share_step_conf": 0.11532455682754517, "num_tokens": 17855462.0, "reward": 1.2275618314743042, "reward_std": 0.3440520763397217, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6616062521934509, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7510555982589722, "step": 77 }, { "adv/mean_abs_final_conf": 0.8046442866325378, "adv/mean_abs_reasoning": 0.6689764261245728, "adv/mean_abs_step_conf": 0.7666066288948059, "adv/ratio_final_to_reasoning": 1.2027991648284209, "adv/ratio_step_to_reasoning": 1.1459396758355325, "adv/std_final_conf": 0.9366459846496582, "adv/std_reasoning": 0.8904491662979126, "adv/std_step_conf": 0.9367106556892395, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.634294068504595, "calib/avg_num_step_conf": 4.1640625, "calib/ece": 0.21283333333333337, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.19166666666666668, "calib/gap": 0.16725981620718455, "calib/mean_conf": 0.5100833333333333, "calib/mu_c": 0.5978947368421051, "calib/mu_w": 0.4306349206349206, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.12395833333333332, "calib/std_conf": 0.34734947779177994, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.5521872037914692, "calib/step_q_c_n": 422.0, "calib/step_q_gap": 0.108439274184844, "calib/step_q_w": 0.44374792960662524, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 444.2265625, "completions/mean_terminated_length": 451.2778015136719, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.0832, "grad_norm": 0.02942829020321369, "kl": 0.23907470703125, "learning_rate": 3.3888888888888893e-06, "loss": -0.1937, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.03722963482141495, "mask/share_reasoning": 0.8538007736206055, "mask/share_step_conf": 0.09334458410739899, "num_tokens": 18077208.0, "reward": 1.1721746921539307, "reward_std": 0.4041286110877991, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.650941014289856, "rewards/format_reward_step": 0.9140625, "rewards/step_l2_reward": 0.7103760242462158, "step": 78 }, { "adv/mean_abs_final_conf": 0.7829466462135315, "adv/mean_abs_reasoning": 0.48934206366539, "adv/mean_abs_step_conf": 0.7657012939453125, "adv/ratio_final_to_reasoning": 1.599998660137476, "adv/ratio_step_to_reasoning": 1.5647567433910519, "adv/std_final_conf": 0.9350141882896423, "adv/std_reasoning": 0.7577651739120483, "adv/std_step_conf": 0.9365842938423157, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6615384615384615, "calib/avg_num_step_conf": 4.2890625, "calib/ece": 0.2012749003984064, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.15139442231075698, "calib/gap": 0.1808150031786398, "calib/mean_conf": 0.4870119521912351, "calib/mu_c": 0.5806611570247936, "calib/mu_w": 0.3998461538461538, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.10310756972111554, "calib/std_conf": 0.3280684687315595, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5104648698884758, "calib/step_q_c_n": 538.0, "calib/step_q_gap": 0.04709552465038064, "calib/step_q_w": 0.4633693452380952, "calib/step_q_w_n": 560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 433.28515625, "completions/mean_terminated_length": 434.9843444824219, "completions/min_length": 0.0, "completions/min_terminated_length": 87.0, "epoch": 0.08426666666666667, "grad_norm": 0.027330050244927406, "kl": 0.232757568359375, "learning_rate": 3.3611111111111117e-06, "loss": -0.0585, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.038672707974910736, "mask/share_reasoning": 0.856682300567627, "mask/share_step_conf": 0.1007387787103653, "num_tokens": 18294505.0, "reward": 1.2576024532318115, "reward_std": 0.3016151189804077, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.699630856513977, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7648181915283203, "step": 79 }, { "adv/mean_abs_final_conf": 0.7790245413780212, "adv/mean_abs_reasoning": 0.5155444145202637, "adv/mean_abs_step_conf": 0.7440930604934692, "adv/ratio_final_to_reasoning": 1.511071635026707, "adv/ratio_step_to_reasoning": 1.4433151432469304, "adv/std_final_conf": 0.9263374209403992, "adv/std_reasoning": 0.7754383087158203, "adv/std_step_conf": 0.9366190433502197, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6552506908803789, "calib/avg_num_step_conf": 4.5625, "calib/ece": 0.21924302788844624, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.32270916334661354, "calib/gap": 0.17512830635609938, "calib/mean_conf": 0.5974900398406374, "calib/mu_c": 0.6686577181208053, "calib/mu_w": 0.49352941176470594, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.11155378486055782, "calib/std_conf": 0.3541642917041823, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5173976034858387, "calib/step_q_c_n": 612.0, "calib/step_q_gap": 0.07182925816209773, "calib/step_q_w": 0.445568345323741, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 391.9296875, "completions/mean_terminated_length": 393.4666748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 83.0, "epoch": 0.08533333333333333, "grad_norm": 0.036177873611450195, "kl": 0.247222900390625, "learning_rate": 3.3333333333333333e-06, "loss": -0.028, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04190149903297424, "mask/share_reasoning": 0.8368489742279053, "mask/share_step_conf": 0.1173432394862175, "num_tokens": 18496999.0, "reward": 1.2550415992736816, "reward_std": 0.3323076367378235, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6922093629837036, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7546399831771851, "step": 80 }, { "adv/mean_abs_final_conf": 0.7841025590896606, "adv/mean_abs_reasoning": 0.6617062091827393, "adv/mean_abs_step_conf": 0.7557902336120605, "adv/ratio_final_to_reasoning": 1.184970834803093, "adv/ratio_step_to_reasoning": 1.142183983048191, "adv/std_final_conf": 0.9362009167671204, "adv/std_reasoning": 0.8591841459274292, "adv/std_step_conf": 0.936693012714386, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7397331305309734, "calib/avg_num_step_conf": 4.49609375, "calib/ece": 0.15922406639004152, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.2074688796680498, "calib/gap": 0.30157072732300894, "calib/mean_conf": 0.4990331950207469, "calib/mu_c": 0.6592035398230089, "calib/mu_w": 0.3576328125, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.09468879668049796, "calib/std_conf": 0.35698346448937585, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5209234234234235, "calib/step_q_c_n": 444.0, "calib/step_q_gap": 0.14357052384775154, "calib/step_q_w": 0.3773528995756719, "calib/step_q_w_n": 707.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 444.83203125, "completions/mean_terminated_length": 448.33465576171875, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.0864, "grad_norm": 0.022152747958898544, "kl": 0.228851318359375, "learning_rate": 3.3055555555555558e-06, "loss": -0.119, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.037026625126600266, "mask/share_reasoning": 0.8459908962249756, "mask/share_step_conf": 0.10916998982429504, "num_tokens": 18717124.0, "reward": 1.2390631437301636, "reward_std": 0.3699154853820801, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.7167019844055176, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7436027526855469, "step": 81 }, { "adv/mean_abs_final_conf": 0.746849775314331, "adv/mean_abs_reasoning": 0.5456186532974243, "adv/mean_abs_step_conf": 0.75649493932724, "adv/ratio_final_to_reasoning": 1.3688127610754774, "adv/ratio_step_to_reasoning": 1.3864902432411232, "adv/std_final_conf": 0.918038010597229, "adv/std_reasoning": 0.8099225163459778, "adv/std_step_conf": 0.9364215135574341, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7458376872169976, "calib/avg_num_step_conf": 3.703125, "calib/ece": 0.1523770491803279, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.32786885245901637, "calib/gap": 0.328181121560432, "calib/mean_conf": 0.559672131147541, "calib/mu_c": 0.6928275862068967, "calib/mu_w": 0.36464646464646466, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.05889344262295089, "calib/std_conf": 0.3713641984788674, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.596678961748634, "calib/step_q_c_n": 488.0, "calib/step_q_gap": 0.1554093965312427, "calib/step_q_w": 0.4412695652173913, "calib/step_q_w_n": 460.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2872.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 436.87890625, "completions/mean_terminated_length": 436.87890625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.08746666666666666, "grad_norm": 0.034861210733652115, "kl": 0.22509765625, "learning_rate": 3.277777777777778e-06, "loss": -0.0454, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.041458241641521454, "mask/share_reasoning": 0.856322169303894, "mask/share_step_conf": 0.10221955180168152, "num_tokens": 18934517.0, "reward": 1.294608235359192, "reward_std": 0.2984686493873596, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7305476665496826, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7789437770843506, "step": 82 }, { "adv/mean_abs_final_conf": 0.7801580429077148, "adv/mean_abs_reasoning": 0.607515275478363, "adv/mean_abs_step_conf": 0.7453281879425049, "adv/ratio_final_to_reasoning": 1.2841784797812883, "adv/ratio_step_to_reasoning": 1.2268468267824653, "adv/std_final_conf": 0.9362401962280273, "adv/std_reasoning": 0.8590611815452576, "adv/std_step_conf": 0.9365143775939941, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.685905567300916, "calib/avg_num_step_conf": 4.73828125, "calib/ece": 0.18514644351464435, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.22594142259414227, "calib/gap": 0.2133467230443974, "calib/mean_conf": 0.5450627615062762, "calib/mu_c": 0.6432558139534883, "calib/mu_w": 0.4299090909090909, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.09523012552301255, "calib/std_conf": 0.3402753872339691, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5234452296819788, "calib/step_q_c_n": 566.0, "calib/step_q_gap": 0.11635094838367899, "calib/step_q_w": 0.40709428129829983, "calib/step_q_w_n": 647.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1963.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 493.2109375, "completions/mean_terminated_length": 503.0358581542969, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.08853333333333334, "grad_norm": 0.03021882101893425, "kl": 0.208709716796875, "learning_rate": 3.2500000000000002e-06, "loss": -0.1775, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.03436541557312012, "mask/share_reasoning": 0.841633677482605, "mask/share_step_conf": 0.1044696718454361, "num_tokens": 19168043.0, "reward": 1.2520440816879272, "reward_std": 0.3603915572166443, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6813781261444092, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7683862447738647, "step": 83 }, { "adv/mean_abs_final_conf": 0.7931539416313171, "adv/mean_abs_reasoning": 0.6649503707885742, "adv/mean_abs_step_conf": 0.7759151458740234, "adv/ratio_final_to_reasoning": 1.192801713443222, "adv/ratio_step_to_reasoning": 1.1668767775162747, "adv/std_final_conf": 0.936396062374115, "adv/std_reasoning": 0.8590731620788574, "adv/std_step_conf": 0.9365212321281433, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7538262079433342, "calib/avg_num_step_conf": 4.4140625, "calib/ece": 0.17948412698412697, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.28174603174603174, "calib/gap": 0.2864191753098913, "calib/mean_conf": 0.5808333333333333, "calib/mu_c": 0.733135593220339, "calib/mu_w": 0.44671641791044775, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.146031746031746, "calib/std_conf": 0.3371915109835434, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5514638783269963, "calib/step_q_c_n": 526.0, "calib/step_q_gap": 0.07481156044620152, "calib/step_q_w": 0.47665231788079476, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2123.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 431.24609375, "completions/mean_terminated_length": 432.9372863769531, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.0896, "grad_norm": 0.02368287183344364, "kl": 0.212493896484375, "learning_rate": 3.2222222222222227e-06, "loss": -0.0731, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04129483923316002, "mask/share_reasoning": 0.8416211605072021, "mask/share_step_conf": 0.11317770928144455, "num_tokens": 19384362.0, "reward": 1.2972630262374878, "reward_std": 0.3043041229248047, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7445859313011169, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7816106677055359, "step": 84 }, { "adv/mean_abs_final_conf": 0.8060850501060486, "adv/mean_abs_reasoning": 0.6809021830558777, "adv/mean_abs_step_conf": 0.7726335525512695, "adv/ratio_final_to_reasoning": 1.183848532381483, "adv/ratio_step_to_reasoning": 1.134720333960016, "adv/std_final_conf": 0.9317854642868042, "adv/std_reasoning": 0.8592715859413147, "adv/std_step_conf": 0.9366334080696106, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.676860119047619, "calib/avg_num_step_conf": 4.94921875, "calib/ece": 0.2394353448275861, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.8828125, "calib/frac_conf_gt_0.9": 0.34913793103448276, "calib/gap": 0.20447916666666666, "calib/mean_conf": 0.6247025862068966, "calib/mu_c": 0.7234166666666667, "calib/mu_w": 0.5189375, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.1734482758620689, "calib/std_conf": 0.34420667457342796, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.51070281124498, "calib/step_q_c_n": 498.0, "calib/step_q_gap": 0.12859617925538303, "calib/step_q_w": 0.38210663198959693, "calib/step_q_w_n": 769.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2768.0, "completions/max_terminated_length": 2768.0, "completions/mean_length": 482.8203125, "completions/mean_terminated_length": 498.3951416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.09066666666666667, "grad_norm": 0.042419079691171646, "kl": 0.1931610107421875, "learning_rate": 3.1944444444444443e-06, "loss": -0.1715, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.035991325974464417, "mask/share_reasoning": 0.8205660581588745, "mask/share_step_conf": 0.11219260841608047, "num_tokens": 19615788.0, "reward": 1.1812331676483154, "reward_std": 0.4072074890136719, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6329659223556519, "rewards/format_reward_step": 0.8828125, "rewards/step_l2_reward": 0.7295938730239868, "step": 85 }, { "adv/mean_abs_final_conf": 0.7583739757537842, "adv/mean_abs_reasoning": 0.6253935098648071, "adv/mean_abs_step_conf": 0.7672119140625, "adv/ratio_final_to_reasoning": 1.21263486715384, "adv/ratio_step_to_reasoning": 1.2267666708411318, "adv/std_final_conf": 0.9313982129096985, "adv/std_reasoning": 0.8590084314346313, "adv/std_step_conf": 0.9365409016609192, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7173065286088885, "calib/avg_num_step_conf": 4.6875, "calib/ece": 0.2385230352303524, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.3861788617886179, "calib/gap": 0.2747786817275151, "calib/mean_conf": 0.5890379403794039, "calib/mu_c": 0.7442990654205606, "calib/mu_w": 0.4695203836930455, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.19630081300813013, "calib/std_conf": 0.37293087749823367, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5390138067061143, "calib/step_q_c_n": 507.0, "calib/step_q_gap": 0.09472809242039998, "calib/step_q_w": 0.44428571428571434, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2954.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 475.75, "completions/mean_terminated_length": 481.3913269042969, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.09173333333333333, "grad_norm": 0.031027162447571754, "kl": 0.208587646484375, "learning_rate": 3.1666666666666667e-06, "loss": -0.0867, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.036850061267614365, "mask/share_reasoning": 0.8430253863334656, "mask/share_step_conf": 0.10840578377246857, "num_tokens": 19843092.0, "reward": 1.250777006149292, "reward_std": 0.35842105746269226, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.6893526911735535, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7697724103927612, "step": 86 }, { "adv/mean_abs_final_conf": 0.7686463594436646, "adv/mean_abs_reasoning": 0.6159418225288391, "adv/mean_abs_step_conf": 0.7691528797149658, "adv/ratio_final_to_reasoning": 1.2479203900911853, "adv/ratio_step_to_reasoning": 1.2487427409249405, "adv/std_final_conf": 0.9339625239372253, "adv/std_reasoning": 0.8430215120315552, "adv/std_step_conf": 0.9363822340965271, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6926729024943311, "calib/avg_num_step_conf": 4.11328125, "calib/ece": 0.1801920438957477, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.448559670781893, "calib/gap": 0.21848143424036282, "calib/mean_conf": 0.7293552812071331, "calib/mu_c": 0.8156689342403628, "calib/mu_w": 0.5971875, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.15230452674897127, "calib/std_conf": 0.3049039748892971, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5789740484429065, "calib/step_q_c_n": 578.0, "calib/step_q_gap": 0.0984287852850117, "calib/step_q_w": 0.48054526315789475, "calib/step_q_w_n": 475.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2204.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 426.203125, "completions/mean_terminated_length": 426.203125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.0928, "grad_norm": 0.02669893018901348, "kl": 0.227569580078125, "learning_rate": 3.138888888888889e-06, "loss": -0.0314, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.04013894498348236, "mask/share_reasoning": 0.8468420505523682, "mask/share_step_conf": 0.11301898956298828, "num_tokens": 20057696.0, "reward": 1.2784401178359985, "reward_std": 0.34235256910324097, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7148365378379822, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7686781287193298, "step": 87 }, { "adv/mean_abs_final_conf": 0.7558637261390686, "adv/mean_abs_reasoning": 0.6402626037597656, "adv/mean_abs_step_conf": 0.7453916072845459, "adv/ratio_final_to_reasoning": 1.1805526696397186, "adv/ratio_step_to_reasoning": 1.164196695086421, "adv/std_final_conf": 0.9355050325393677, "adv/std_reasoning": 0.8902518153190613, "adv/std_step_conf": 0.9365984797477722, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7807500829737802, "calib/avg_num_step_conf": 4.51171875, "calib/ece": 0.15808943089430894, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.3821138211382114, "calib/gap": 0.35411682708264186, "calib/mean_conf": 0.6736178861788618, "calib/mu_c": 0.8391603053435115, "calib/mu_w": 0.48504347826086963, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.14959349593495935, "calib/std_conf": 0.32852887215994614, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5536013986013986, "calib/step_q_c_n": 572.0, "calib/step_q_gap": 0.077744623301227, "calib/step_q_w": 0.47585677530017156, "calib/step_q_w_n": 583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 459.27734375, "completions/mean_terminated_length": 461.0784606933594, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.09386666666666667, "grad_norm": 0.02687120996415615, "kl": 0.2032012939453125, "learning_rate": 3.1111111111111116e-06, "loss": 0.0127, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.036057278513908386, "mask/share_reasoning": 0.849613606929779, "mask/share_step_conf": 0.11042284220457077, "num_tokens": 20285119.0, "reward": 1.3039413690567017, "reward_std": 0.3519474267959595, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7495101690292358, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7838737964630127, "step": 88 }, { "adv/mean_abs_final_conf": 0.7642644643783569, "adv/mean_abs_reasoning": 0.4801882207393646, "adv/mean_abs_step_conf": 0.7683746814727783, "adv/ratio_final_to_reasoning": 1.5915935280577873, "adv/ratio_step_to_reasoning": 1.6001531238931304, "adv/std_final_conf": 0.912933349609375, "adv/std_reasoning": 0.7394396662712097, "adv/std_step_conf": 0.9365631341934204, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8067254540668597, "calib/avg_num_step_conf": 4.4765625, "calib/ece": 0.22570850202429146, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4048582995951417, "calib/gap": 0.32661489865754134, "calib/mean_conf": 0.6598785425101215, "calib/mu_c": 0.833103448275862, "calib/mu_w": 0.5064885496183207, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.20797570850202426, "calib/std_conf": 0.34517434634925115, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5599941176470588, "calib/step_q_c_n": 493.0, "calib/step_q_gap": 0.12964189712638496, "calib/step_q_w": 0.4303522205206738, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 474.0625, "completions/mean_terminated_length": 481.58734130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.09493333333333333, "grad_norm": 0.046834517270326614, "kl": 0.19915771484375, "learning_rate": 3.0833333333333336e-06, "loss": -0.2275, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.037030596286058426, "mask/share_reasoning": 0.8412870168685913, "mask/share_step_conf": 0.10605736076831818, "num_tokens": 20515367.0, "reward": 1.2782588005065918, "reward_std": 0.299771249294281, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.721447229385376, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7769100666046143, "step": 89 }, { "adv/mean_abs_final_conf": 0.785028874874115, "adv/mean_abs_reasoning": 0.5897440910339355, "adv/mean_abs_step_conf": 0.7673530578613281, "adv/ratio_final_to_reasoning": 1.331134786781513, "adv/ratio_step_to_reasoning": 1.3011627747147236, "adv/std_final_conf": 0.9305712580680847, "adv/std_reasoning": 0.8100884556770325, "adv/std_step_conf": 0.936499834060669, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7446981524850376, "calib/avg_num_step_conf": 4.984375, "calib/ece": 0.2431048387096774, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5443548387096774, "calib/gap": 0.29383944834764497, "calib/mean_conf": 0.7241532258064517, "calib/mu_c": 0.8734426229508196, "calib/mu_w": 0.5796031746031747, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.2376612903225806, "calib/std_conf": 0.3348893785665514, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5530253807106599, "calib/step_q_c_n": 591.0, "calib/step_q_gap": 0.08176625662306863, "calib/step_q_w": 0.47125912408759124, "calib/step_q_w_n": 685.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2429.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 471.6015625, "completions/mean_terminated_length": 475.3149719238281, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.096, "grad_norm": 0.022943692281842232, "kl": 0.201751708984375, "learning_rate": 3.055555555555556e-06, "loss": -0.0693, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03743113577365875, "mask/share_reasoning": 0.8343207836151123, "mask/share_step_conf": 0.12043557316064835, "num_tokens": 20739417.0, "reward": 1.2747962474822998, "reward_std": 0.33885419368743896, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7002820372581482, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7812958359718323, "step": 90 }, { "adv/mean_abs_final_conf": 0.7473675608634949, "adv/mean_abs_reasoning": 0.6477749347686768, "adv/mean_abs_step_conf": 0.7765180468559265, "adv/ratio_final_to_reasoning": 1.1537457236291229, "adv/ratio_step_to_reasoning": 1.1987466713778057, "adv/std_final_conf": 0.9242017269134521, "adv/std_reasoning": 0.874825119972229, "adv/std_step_conf": 0.9364200830459595, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6350736641998778, "calib/avg_num_step_conf": 5.046875, "calib/ece": 0.2510162601626016, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.540650406504065, "calib/gap": 0.1434374363500579, "calib/mean_conf": 0.7913414634146342, "calib/mu_c": 0.8513986013986014, "calib/mu_w": 0.7079611650485435, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.23052845528455282, "calib/std_conf": 0.2702577610106254, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5467656250000001, "calib/step_q_c_n": 640.0, "calib/step_q_gap": 0.10696501150306764, "calib/step_q_w": 0.4398006134969325, "calib/step_q_w_n": 652.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 462.6953125, "completions/mean_terminated_length": 466.3385925292969, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.09706666666666666, "grad_norm": 0.03227195516228676, "kl": 0.222259521484375, "learning_rate": 3.0277777777777776e-06, "loss": -0.0292, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03430453687906265, "mask/share_reasoning": 0.84847092628479, "mask/share_step_conf": 0.10941202938556671, "num_tokens": 20965579.0, "reward": 1.2857780456542969, "reward_std": 0.32636329531669617, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6812605261802673, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.793975830078125, "step": 91 }, { "adv/mean_abs_final_conf": 0.7557482123374939, "adv/mean_abs_reasoning": 0.578435480594635, "adv/mean_abs_step_conf": 0.7436249256134033, "adv/ratio_final_to_reasoning": 1.3065384778274327, "adv/ratio_step_to_reasoning": 1.2855797242052867, "adv/std_final_conf": 0.9175612330436707, "adv/std_reasoning": 0.8266932368278503, "adv/std_step_conf": 0.9365796446800232, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7118818453368736, "calib/avg_num_step_conf": 4.46484375, "calib/ece": 0.27937398373983746, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.508130081300813, "calib/gap": 0.25143909724527036, "calib/mean_conf": 0.7268861788617887, "calib/mu_c": 0.8607826086956522, "calib/mu_w": 0.6093435114503818, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2693902439024391, "calib/std_conf": 0.3191577509188485, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5954941295546559, "calib/step_q_c_n": 494.0, "calib/step_q_gap": 0.1347545301709886, "calib/step_q_w": 0.46073959938366726, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 443.3046875, "completions/mean_terminated_length": 443.3046875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.09813333333333334, "grad_norm": 0.036638762801885605, "kl": 0.224090576171875, "learning_rate": 3e-06, "loss": -0.0083, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03690659627318382, "mask/share_reasoning": 0.8470161557197571, "mask/share_step_conf": 0.1160772442817688, "num_tokens": 21185785.0, "reward": 1.2579727172851562, "reward_std": 0.34334200620651245, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.6777262091636658, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7780939340591431, "step": 92 }, { "adv/mean_abs_final_conf": 0.7904902696609497, "adv/mean_abs_reasoning": 0.6281392574310303, "adv/mean_abs_step_conf": 0.7773154973983765, "adv/ratio_final_to_reasoning": 1.2584634064966804, "adv/ratio_step_to_reasoning": 1.2374891207682968, "adv/std_final_conf": 0.9221128821372986, "adv/std_reasoning": 0.8431077003479004, "adv/std_step_conf": 0.9364581108093262, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6762441937624419, "calib/avg_num_step_conf": 5.32421875, "calib/ece": 0.29676113360323886, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4574898785425101, "calib/gap": 0.16911413404114128, "calib/mean_conf": 0.6739271255060728, "calib/mu_c": 0.7677272727272726, "calib/mu_w": 0.5986131386861313, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.26267206477732796, "calib/std_conf": 0.34761122757976415, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5337713310580204, "calib/step_q_c_n": 586.0, "calib/step_q_gap": 0.07632088060757003, "calib/step_q_w": 0.4574504504504504, "calib/step_q_w_n": 777.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 473.28515625, "completions/mean_terminated_length": 478.8972473144531, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.0992, "grad_norm": 0.03193812072277069, "kl": 0.194610595703125, "learning_rate": 2.9722222222222225e-06, "loss": -0.1757, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.036378346383571625, "mask/share_reasoning": 0.8255068063735962, "mask/share_step_conf": 0.12639610469341278, "num_tokens": 21412722.0, "reward": 1.2335413694381714, "reward_std": 0.3283604383468628, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.6352039575576782, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7772674560546875, "step": 93 }, { "adv/mean_abs_final_conf": 0.7623587846755981, "adv/mean_abs_reasoning": 0.5652198791503906, "adv/mean_abs_step_conf": 0.7501968145370483, "adv/ratio_final_to_reasoning": 1.3487826822749698, "adv/ratio_step_to_reasoning": 1.327265445201088, "adv/std_final_conf": 0.9355966448783875, "adv/std_reasoning": 0.8266376256942749, "adv/std_step_conf": 0.9366117119789124, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7823311941964285, "calib/avg_num_step_conf": 5.703125, "calib/ece": 0.22558333333333333, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.45416666666666666, "calib/gap": 0.34285714285714286, "calib/mean_conf": 0.6625, "calib/mu_c": 0.8453571428571428, "calib/mu_w": 0.5025, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.21070833333333333, "calib/std_conf": 0.3472001104070485, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5995357833655707, "calib/step_q_c_n": 517.0, "calib/step_q_gap": 0.21405328071445723, "calib/step_q_w": 0.38548250265111345, "calib/step_q_w_n": 943.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 476.62890625, "completions/mean_terminated_length": 486.12353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.10026666666666667, "grad_norm": 0.0320698544383049, "kl": 0.205352783203125, "learning_rate": 2.944444444444445e-06, "loss": -0.0988, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.036758385598659515, "mask/share_reasoning": 0.8234202861785889, "mask/share_step_conf": 0.120290108025074, "num_tokens": 21643419.0, "reward": 1.2477514743804932, "reward_std": 0.38341718912124634, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.709418773651123, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7559326887130737, "step": 94 }, { "adv/mean_abs_final_conf": 0.7547749280929565, "adv/mean_abs_reasoning": 0.5156925916671753, "adv/mean_abs_step_conf": 0.7422448992729187, "adv/ratio_final_to_reasoning": 1.463614060564367, "adv/ratio_step_to_reasoning": 1.4393165836905386, "adv/std_final_conf": 0.9236453175544739, "adv/std_reasoning": 0.7754721641540527, "adv/std_step_conf": 0.9363172054290771, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8249806501547987, "calib/avg_num_step_conf": 5.3984375, "calib/ece": 0.16419999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.528, "calib/gap": 0.4083655830753353, "calib/mean_conf": 0.6818, "calib/mu_c": 0.8680147058823529, "calib/mu_w": 0.45964912280701753, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.15099999999999997, "calib/std_conf": 0.36050902901314413, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5535278154681139, "calib/step_q_c_n": 737.0, "calib/step_q_gap": 0.11976037360764874, "calib/step_q_w": 0.4337674418604652, "calib/step_q_w_n": 645.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 468.7578125, "completions/mean_terminated_length": 470.5960998535156, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.10133333333333333, "grad_norm": 0.02963675558567047, "kl": 0.1885986328125, "learning_rate": 2.916666666666667e-06, "loss": -0.0364, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03658926114439964, "mask/share_reasoning": 0.8279895186424255, "mask/share_step_conf": 0.13151498138904572, "num_tokens": 21869549.0, "reward": 1.3565418720245361, "reward_std": 0.2720193862915039, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7827984094619751, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8147519826889038, "step": 95 }, { "adv/mean_abs_final_conf": 0.731205940246582, "adv/mean_abs_reasoning": 0.47899991273880005, "adv/mean_abs_step_conf": 0.7501848340034485, "adv/ratio_final_to_reasoning": 1.5265262493801552, "adv/ratio_step_to_reasoning": 1.5661481642326025, "adv/std_final_conf": 0.910809338092804, "adv/std_reasoning": 0.739449679851532, "adv/std_step_conf": 0.9365458488464355, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7924603174603175, "calib/avg_num_step_conf": 5.375, "calib/ece": 0.13610655737704916, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.45081967213114754, "calib/gap": 0.38388311688311694, "calib/mean_conf": 0.6702868852459016, "calib/mu_c": 0.8118831168831169, "calib/mu_w": 0.42799999999999994, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.08762295081967211, "calib/std_conf": 0.35880882117901175, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.537461139896373, "calib/step_q_c_n": 772.0, "calib/step_q_gap": 0.1175108087705452, "calib/step_q_w": 0.4199503311258278, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2643.0, "completions/max_terminated_length": 2643.0, "completions/mean_length": 448.65625, "completions/mean_terminated_length": 455.7778015136719, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.1024, "grad_norm": 0.05616743117570877, "kl": 0.2193756103515625, "learning_rate": 2.888888888888889e-06, "loss": -0.08, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03599565476179123, "mask/share_reasoning": 0.8235194087028503, "mask/share_step_conf": 0.12485991418361664, "num_tokens": 22090221.0, "reward": 1.309035301208496, "reward_std": 0.31176167726516724, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7697300910949707, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7694828510284424, "step": 96 }, { "adv/mean_abs_final_conf": 0.7836225032806396, "adv/mean_abs_reasoning": 0.6345371007919312, "adv/mean_abs_step_conf": 0.7422855496406555, "adv/ratio_final_to_reasoning": 1.234951435152717, "adv/ratio_step_to_reasoning": 1.1698063812411432, "adv/std_final_conf": 0.9360610842704773, "adv/std_reasoning": 0.8430257439613342, "adv/std_step_conf": 0.9365617036819458, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7327619414483824, "calib/avg_num_step_conf": 5.15625, "calib/ece": 0.17587999999999993, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.38, "calib/gap": 0.31287236774524946, "calib/mean_conf": 0.5725999999999999, "calib/mu_c": 0.7377966101694918, "calib/mu_w": 0.42492424242424237, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13823999999999992, "calib/std_conf": 0.371011644022125, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5475830258302583, "calib/step_q_c_n": 542.0, "calib/step_q_gap": 0.17355988958347168, "calib/step_q_w": 0.3740231362467866, "calib/step_q_w_n": 778.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2624.0, "completions/max_terminated_length": 2624.0, "completions/mean_length": 450.6328125, "completions/mean_terminated_length": 450.6328125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.10346666666666667, "grad_norm": 0.0345601886510849, "kl": 0.227081298828125, "learning_rate": 2.861111111111111e-06, "loss": 0.0051, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.037733692675828934, "mask/share_reasoning": 0.8358029127120972, "mask/share_step_conf": 0.1264633983373642, "num_tokens": 22310655.0, "reward": 1.3154611587524414, "reward_std": 0.32647430896759033, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7387187480926514, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.802351713180542, "step": 97 }, { "adv/mean_abs_final_conf": 0.7813445329666138, "adv/mean_abs_reasoning": 0.5696825981140137, "adv/mean_abs_step_conf": 0.7636961936950684, "adv/ratio_final_to_reasoning": 1.3715436201725772, "adv/ratio_step_to_reasoning": 1.3405643707976238, "adv/std_final_conf": 0.9365703463554382, "adv/std_reasoning": 0.8099415898323059, "adv/std_step_conf": 0.9364157915115356, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7403743315508021, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.17791327913279126, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.33739837398373984, "calib/gap": 0.2944015151515151, "calib/mean_conf": 0.597940379403794, "calib/mu_c": 0.7295833333333333, "calib/mu_w": 0.43518181818181817, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.11150406504065036, "calib/std_conf": 0.3598716323190871, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5329219117647058, "calib/step_q_c_n": 680.0, "calib/step_q_gap": 0.15070401293202101, "calib/step_q_w": 0.3822178988326848, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2342.0, "completions/max_terminated_length": 2342.0, "completions/mean_length": 472.16796875, "completions/mean_terminated_length": 481.57373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 56.0, "epoch": 0.10453333333333334, "grad_norm": 0.06730476021766663, "kl": 0.202850341796875, "learning_rate": 2.8333333333333335e-06, "loss": -0.1437, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03540075570344925, "mask/share_reasoning": 0.8269611597061157, "mask/share_step_conf": 0.11810681968927383, "num_tokens": 22537714.0, "reward": 1.2975715398788452, "reward_std": 0.3349180817604065, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7235423922538757, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7881441116333008, "step": 98 }, { "adv/mean_abs_final_conf": 0.8026585578918457, "adv/mean_abs_reasoning": 0.5982890725135803, "adv/mean_abs_step_conf": 0.7559108734130859, "adv/ratio_final_to_reasoning": 1.341589868121194, "adv/ratio_step_to_reasoning": 1.263454253371689, "adv/std_final_conf": 0.9297617077827454, "adv/std_reasoning": 0.8265897035598755, "adv/std_step_conf": 0.9366044998168945, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7273783587509077, "calib/avg_num_step_conf": 5.72265625, "calib/ece": 0.21506072874493934, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.2874493927125506, "calib/gap": 0.30488961510530127, "calib/mean_conf": 0.4822672064777328, "calib/mu_c": 0.6822352941176469, "calib/mu_w": 0.37734567901234567, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.17659919028340088, "calib/std_conf": 0.3781166324165204, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5168824940047961, "calib/step_q_c_n": 417.0, "calib/step_q_gap": 0.15879089095136106, "calib/step_q_w": 0.35809160305343507, "calib/step_q_w_n": 1048.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2325.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 498.96875, "completions/mean_terminated_length": 510.94403076171875, "completions/min_length": 0.0, "completions/min_terminated_length": 87.0, "epoch": 0.1056, "grad_norm": 0.04307432472705841, "kl": 0.2559814453125, "learning_rate": 2.805555555555556e-06, "loss": -0.2176, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033838823437690735, "mask/share_reasoning": 0.8234269618988037, "mask/share_step_conf": 0.11929672211408615, "num_tokens": 22771250.0, "reward": 1.2689709663391113, "reward_std": 0.3343353867530823, "rewards/accuracy_reward_step": 0.33203125, "rewards/final_brier_reward_step": 0.7190163731575012, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7805565595626831, "step": 99 }, { "adv/mean_abs_final_conf": 0.7759315967559814, "adv/mean_abs_reasoning": 0.5533714890480042, "adv/mean_abs_step_conf": 0.7387323379516602, "adv/ratio_final_to_reasoning": 1.4021893287109168, "adv/ratio_step_to_reasoning": 1.3349663879910811, "adv/std_final_conf": 0.9354363679885864, "adv/std_reasoning": 0.8099173307418823, "adv/std_step_conf": 0.9364297389984131, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7968636273407002, "calib/avg_num_step_conf": 5.67578125, "calib/ece": 0.14951219512195119, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.34552845528455284, "calib/gap": 0.3771825580625951, "calib/mean_conf": 0.5680487804878049, "calib/mu_c": 0.7627731092436975, "calib/mu_w": 0.38559055118110236, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.116910569105691, "calib/std_conf": 0.3771548571056062, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5040644171779141, "calib/step_q_c_n": 652.0, "calib/step_q_gap": 0.11798451705307011, "calib/step_q_w": 0.38607990012484394, "calib/step_q_w_n": 801.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2272.0, "completions/max_terminated_length": 2272.0, "completions/mean_length": 488.25, "completions/mean_terminated_length": 497.9761047363281, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.10666666666666667, "grad_norm": 0.034269217401742935, "kl": 0.18756103515625, "learning_rate": 2.7777777777777783e-06, "loss": -0.1659, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03305002674460411, "mask/share_reasoning": 0.8251796960830688, "mask/share_step_conf": 0.12223898619413376, "num_tokens": 23003650.0, "reward": 1.328675627708435, "reward_std": 0.32988256216049194, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7545890808105469, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8091936111450195, "step": 100 }, { "adv/mean_abs_final_conf": 0.7679054737091064, "adv/mean_abs_reasoning": 0.5656028985977173, "adv/mean_abs_step_conf": 0.7648128271102905, "adv/ratio_final_to_reasoning": 1.3576759871863315, "adv/ratio_step_to_reasoning": 1.3522081110377415, "adv/std_final_conf": 0.9275540709495544, "adv/std_reasoning": 0.7929666638374329, "adv/std_step_conf": 0.9364734292030334, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7262038073908176, "calib/avg_num_step_conf": 6.6015625, "calib/ece": 0.210609756097561, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.2926829268292683, "calib/gap": 0.29192049272116477, "calib/mean_conf": 0.47207317073170735, "calib/mu_c": 0.6524468085106384, "calib/mu_w": 0.36052631578947364, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.15028455284552847, "calib/std_conf": 0.3820816043329242, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4556084656084656, "calib/step_q_c_n": 567.0, "calib/step_q_gap": 0.07670997941078078, "calib/step_q_w": 0.3788984861976848, "calib/step_q_w_n": 1123.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3043.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 560.671875, "completions/mean_terminated_length": 567.3201904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.10773333333333333, "grad_norm": 0.027241338044404984, "kl": 0.192413330078125, "learning_rate": 2.7500000000000004e-06, "loss": -0.0628, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03061143308877945, "mask/share_reasoning": 0.8381972312927246, "mask/share_step_conf": 0.11947259306907654, "num_tokens": 23254174.0, "reward": 1.2669718265533447, "reward_std": 0.31347090005874634, "rewards/accuracy_reward_step": 0.3671875, "rewards/final_brier_reward_step": 0.7102582454681396, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7802020907402039, "step": 101 }, { "adv/mean_abs_final_conf": 0.7464408278465271, "adv/mean_abs_reasoning": 0.5585829019546509, "adv/mean_abs_step_conf": 0.7430539131164551, "adv/ratio_final_to_reasoning": 1.336311629365139, "adv/ratio_step_to_reasoning": 1.3302482237037407, "adv/std_final_conf": 0.9207327961921692, "adv/std_reasoning": 0.8097980618476868, "adv/std_step_conf": 0.9364011883735657, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7597402597402597, "calib/avg_num_step_conf": 5.81640625, "calib/ece": 0.1727016129032258, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.34274193548387094, "calib/gap": 0.33634698634698634, "calib/mean_conf": 0.5458467741935484, "calib/mu_c": 0.6882517482517483, "calib/mu_w": 0.35190476190476194, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.07096774193548386, "calib/std_conf": 0.38226100575634475, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5010460251046025, "calib/step_q_c_n": 717.0, "calib/step_q_gap": 0.14460820127040558, "calib/step_q_w": 0.3564378238341969, "calib/step_q_w_n": 772.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2934.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 458.13671875, "completions/mean_terminated_length": 461.74407958984375, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.1088, "grad_norm": 0.045535702258348465, "kl": 0.1992034912109375, "learning_rate": 2.7222222222222224e-06, "loss": 0.022, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.038107555359601974, "mask/share_reasoning": 0.8163143396377563, "mask/share_step_conf": 0.13776561617851257, "num_tokens": 23478153.0, "reward": 1.326077938079834, "reward_std": 0.2865089774131775, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7478504180908203, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7994182109832764, "step": 102 }, { "adv/mean_abs_final_conf": 0.7523104548454285, "adv/mean_abs_reasoning": 0.547295331954956, "adv/mean_abs_step_conf": 0.7360366582870483, "adv/ratio_final_to_reasoning": 1.3745968783584395, "adv/ratio_step_to_reasoning": 1.3448619334243221, "adv/std_final_conf": 0.9252135157585144, "adv/std_reasoning": 0.8100307583808899, "adv/std_step_conf": 0.9364539384841919, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8137295794230102, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.1308953168044077, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.33884297520661155, "calib/gap": 0.4076239137990963, "calib/mean_conf": 0.5544765840220386, "calib/mu_c": 0.731338199513382, "calib/mu_w": 0.3237142857142857, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.05962809917355369, "calib/std_conf": 0.3759544042808338, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.552143928035982, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.18616189979464692, "calib/step_q_w": 0.3659820282413351, "calib/step_q_w_n": 779.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2507.0, "completions/max_terminated_length": 2507.0, "completions/mean_length": 548.4296875, "completions/mean_terminated_length": 559.3546142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.10986666666666667, "grad_norm": 0.03547711670398712, "kl": 0.194061279296875, "learning_rate": 2.6944444444444444e-06, "loss": -0.1787, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.032446619123220444, "mask/share_reasoning": 0.836654782295227, "mask/share_step_conf": 0.1113673597574234, "num_tokens": 23723103.0, "reward": 1.322067141532898, "reward_std": 0.32839345932006836, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7610505819320679, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7942761778831482, "step": 103 }, { "adv/mean_abs_final_conf": 0.773780345916748, "adv/mean_abs_reasoning": 0.5395663976669312, "adv/mean_abs_step_conf": 0.7615673542022705, "adv/ratio_final_to_reasoning": 1.4340780842961143, "adv/ratio_step_to_reasoning": 1.4114432579479834, "adv/std_final_conf": 0.9268155694007874, "adv/std_reasoning": 0.7929793000221252, "adv/std_step_conf": 0.9363500475883484, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7784725023530994, "calib/avg_num_step_conf": 6.234375, "calib/ece": 0.1491836734693877, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.31020408163265306, "calib/gap": 0.370834341804491, "calib/mean_conf": 0.5094285714285715, "calib/mu_c": 0.7122522522522522, "calib/mu_w": 0.3414179104477612, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1027755102040816, "calib/std_conf": 0.3827404939567876, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4928595317725753, "calib/step_q_c_n": 598.0, "calib/step_q_gap": 0.14775933137177366, "calib/step_q_w": 0.34510020040080164, "calib/step_q_w_n": 998.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 517.3125, "completions/mean_terminated_length": 523.4466552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.11093333333333333, "grad_norm": 0.02864542417228222, "kl": 0.182952880859375, "learning_rate": 2.666666666666667e-06, "loss": -0.0655, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03282156586647034, "mask/share_reasoning": 0.8282309174537659, "mask/share_step_conf": 0.1272287517786026, "num_tokens": 23962215.0, "reward": 1.3129898309707642, "reward_std": 0.2917391061782837, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.7525316476821899, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7976614832878113, "step": 104 }, { "adv/mean_abs_final_conf": 0.7816280126571655, "adv/mean_abs_reasoning": 0.6181885600090027, "adv/mean_abs_step_conf": 0.7564232349395752, "adv/ratio_final_to_reasoning": 1.26438446652229, "adv/ratio_step_to_reasoning": 1.223612476634248, "adv/std_final_conf": 0.9332498908042908, "adv/std_reasoning": 0.8429936170578003, "adv/std_step_conf": 0.9363241195678711, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7555118110236221, "calib/avg_num_step_conf": 6.9296875, "calib/ece": 0.20573549257759788, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.41295546558704455, "calib/gap": 0.3361561679790027, "calib/mean_conf": 0.5773414304993252, "calib/mu_c": 0.7406561679790027, "calib/mu_w": 0.40449999999999997, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13445344129554662, "calib/std_conf": 0.39609121884439313, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.47596306068601585, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.14378786383562214, "calib/step_q_w": 0.3321751968503937, "calib/step_q_w_n": 1016.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3050.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 548.26953125, "completions/mean_terminated_length": 554.770751953125, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.112, "grad_norm": 0.024644332006573677, "kl": 0.1766510009765625, "learning_rate": 2.6388888888888893e-06, "loss": -0.0284, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03252999484539032, "mask/share_reasoning": 0.8243792057037354, "mask/share_step_conf": 0.13137198984622955, "num_tokens": 24208332.0, "reward": 1.310977578163147, "reward_std": 0.3071625828742981, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7267365455627441, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8019061088562012, "step": 105 }, { "adv/mean_abs_final_conf": 0.7614802122116089, "adv/mean_abs_reasoning": 0.5627346038818359, "adv/mean_abs_step_conf": 0.7822858095169067, "adv/ratio_final_to_reasoning": 1.3531782246174182, "adv/ratio_step_to_reasoning": 1.3901505329876116, "adv/std_final_conf": 0.9094207286834717, "adv/std_reasoning": 0.7929022908210754, "adv/std_step_conf": 0.9362094402313232, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7898050103305785, "calib/avg_num_step_conf": 5.71484375, "calib/ece": 0.2254618473895582, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4779116465863454, "calib/gap": 0.3493950154958677, "calib/mean_conf": 0.6636144578313253, "calib/mu_c": 0.8432231404958677, "calib/mu_w": 0.49382812499999995, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20156626506024097, "calib/std_conf": 0.36601482944451913, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5342596646341463, "calib/step_q_c_n": 656.0, "calib/step_q_gap": 0.12753104009883032, "calib/step_q_w": 0.406728624535316, "calib/step_q_w_n": 807.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 486.57421875, "completions/mean_terminated_length": 490.405517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.11306666666666666, "grad_norm": 0.04457806795835495, "kl": 0.196075439453125, "learning_rate": 2.6111111111111113e-06, "loss": -0.0506, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03309110552072525, "mask/share_reasoning": 0.8332442045211792, "mask/share_step_conf": 0.12585218250751495, "num_tokens": 24437479.0, "reward": 1.3296716213226318, "reward_std": 0.29688361287117004, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.738463282585144, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8155180215835571, "step": 106 }, { "adv/mean_abs_final_conf": 0.741417646408081, "adv/mean_abs_reasoning": 0.5233364105224609, "adv/mean_abs_step_conf": 0.7449461221694946, "adv/ratio_final_to_reasoning": 1.4167132870955867, "adv/ratio_step_to_reasoning": 1.4234555578233028, "adv/std_final_conf": 0.9123862981796265, "adv/std_reasoning": 0.7754129767417908, "adv/std_step_conf": 0.9363296031951904, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6859736400051686, "calib/avg_num_step_conf": 6.2421875, "calib/ece": 0.26199203187250997, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5936254980079682, "calib/gap": 0.21708618684584546, "calib/mean_conf": 0.7286852589641435, "calib/mu_c": 0.8229577464788731, "calib/mu_w": 0.6058715596330276, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.21247011952191236, "calib/std_conf": 0.35288038522525694, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4779481132075472, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.06538811320754717, "calib/step_q_w": 0.41256000000000004, "calib/step_q_w_n": 750.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2853.0, "completions/max_terminated_length": 2853.0, "completions/mean_length": 479.109375, "completions/mean_terminated_length": 482.88189697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.11413333333333334, "grad_norm": 0.02329426445066929, "kl": 0.2061767578125, "learning_rate": 2.5833333333333337e-06, "loss": -0.0092, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03365754708647728, "mask/share_reasoning": 0.8238677382469177, "mask/share_step_conf": 0.13466224074363708, "num_tokens": 24664747.0, "reward": 1.2976536750793457, "reward_std": 0.2505210340023041, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6960461139678955, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7961149215698242, "step": 107 }, { "adv/mean_abs_final_conf": 0.721204400062561, "adv/mean_abs_reasoning": 0.5183454751968384, "adv/mean_abs_step_conf": 0.7541326284408569, "adv/ratio_final_to_reasoning": 1.3913585332035323, "adv/ratio_step_to_reasoning": 1.4548841738311304, "adv/std_final_conf": 0.9127658009529114, "adv/std_reasoning": 0.7754495739936829, "adv/std_step_conf": 0.9363890886306763, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7076661656100909, "calib/avg_num_step_conf": 6.37109375, "calib/ece": 0.24965333333333334, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.676, "calib/gap": 0.28732152582619863, "calib/mean_conf": 0.7634133333333334, "calib/mu_c": 0.8863869463869463, "calib/mu_w": 0.5990654205607476, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22053333333333333, "calib/std_conf": 0.35792586792984565, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5244013683010262, "calib/step_q_c_n": 877.0, "calib/step_q_gap": 0.10637749562198107, "calib/step_q_w": 0.41802387267904517, "calib/step_q_w_n": 754.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2622.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 523.984375, "completions/mean_terminated_length": 528.1102294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.1152, "grad_norm": 0.023294299840927124, "kl": 0.1746368408203125, "learning_rate": 2.5555555555555557e-06, "loss": -0.0822, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032426171004772186, "mask/share_reasoning": 0.8296167850494385, "mask/share_step_conf": 0.13014449179172516, "num_tokens": 24902119.0, "reward": 1.3081953525543213, "reward_std": 0.2910231947898865, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.713525652885437, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7975262999534607, "step": 108 }, { "adv/mean_abs_final_conf": 0.7435927391052246, "adv/mean_abs_reasoning": 0.4847774803638458, "adv/mean_abs_step_conf": 0.7423934936523438, "adv/ratio_final_to_reasoning": 1.5338846568267301, "adv/ratio_step_to_reasoning": 1.5314108507993116, "adv/std_final_conf": 0.9094902873039246, "adv/std_reasoning": 0.757621169090271, "adv/std_step_conf": 0.9364386796951294, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.8225327812284333, "calib/avg_num_step_conf": 7.19921875, "calib/ece": 0.20417136929460583, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5186721991701245, "calib/gap": 0.409628171152519, "calib/mean_conf": 0.6638784232365145, "calib/mu_c": 0.8780408695652174, "calib/mu_w": 0.46841269841269845, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19543568464730293, "calib/std_conf": 0.3838578300950947, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5061785714285715, "calib/step_q_c_n": 728.0, "calib/step_q_gap": 0.16293821268417685, "calib/step_q_w": 0.3432403587443946, "calib/step_q_w_n": 1115.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2760.0, "completions/max_terminated_length": 2760.0, "completions/mean_length": 537.6953125, "completions/mean_terminated_length": 548.4063720703125, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.11626666666666667, "grad_norm": 0.033814504742622375, "kl": 0.176971435546875, "learning_rate": 2.5277777777777778e-06, "loss": -0.1109, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.030990304425358772, "mask/share_reasoning": 0.8147619366645813, "mask/share_step_conf": 0.13471654057502747, "num_tokens": 25144369.0, "reward": 1.2998316287994385, "reward_std": 0.31807374954223633, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7274287939071655, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7970547676086426, "step": 109 }, { "adv/mean_abs_final_conf": 0.769821047782898, "adv/mean_abs_reasoning": 0.543070912361145, "adv/mean_abs_step_conf": 0.7480874061584473, "adv/ratio_final_to_reasoning": 1.4175331991836877, "adv/ratio_step_to_reasoning": 1.37751330283174, "adv/std_final_conf": 0.9342584609985352, "adv/std_reasoning": 0.7755556106567383, "adv/std_step_conf": 0.9363867044448853, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7072878102289868, "calib/avg_num_step_conf": 5.984375, "calib/ece": 0.30103319727891154, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5510204081632653, "calib/gap": 0.2845284473696238, "calib/mean_conf": 0.7037014965986396, "calib/mu_c": 0.8697732026143791, "calib/mu_w": 0.5852447552447553, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.29420408163265305, "calib/std_conf": 0.3655013675119274, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5206039076376554, "calib/step_q_c_n": 563.0, "calib/step_q_gap": 0.09992279308657176, "calib/step_q_w": 0.42068111455108365, "calib/step_q_w_n": 969.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2786.0, "completions/max_terminated_length": 2786.0, "completions/mean_length": 500.484375, "completions/mean_terminated_length": 508.4285888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 84.0, "epoch": 0.11733333333333333, "grad_norm": 0.04026241973042488, "kl": 0.18853759765625, "learning_rate": 2.5e-06, "loss": -0.1173, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.0325818732380867, "mask/share_reasoning": 0.8240237236022949, "mask/share_step_conf": 0.12776941061019897, "num_tokens": 25377413.0, "reward": 1.2209287881851196, "reward_std": 0.3078498840332031, "rewards/accuracy_reward_step": 0.3984375, "rewards/final_brier_reward_step": 0.64644455909729, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7625502347946167, "step": 110 }, { "adv/mean_abs_final_conf": 0.7420628666877747, "adv/mean_abs_reasoning": 0.5744061470031738, "adv/mean_abs_step_conf": 0.7535001039505005, "adv/ratio_final_to_reasoning": 1.291878352206552, "adv/ratio_step_to_reasoning": 1.3117897638834584, "adv/std_final_conf": 0.9208226203918457, "adv/std_reasoning": 0.809952437877655, "adv/std_step_conf": 0.9363412261009216, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7463254422476588, "calib/avg_num_step_conf": 6.58984375, "calib/ece": 0.2692876344086022, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6169354838709677, "calib/gap": 0.30018817204301085, "calib/mean_conf": 0.740739247311828, "calib/mu_c": 0.8908333333333334, "calib/mu_w": 0.5906451612903225, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2550134408602151, "calib/std_conf": 0.35988541102546445, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5089107611548557, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.1425540043980989, "calib/step_q_w": 0.3663567567567568, "calib/step_q_w_n": 925.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2574.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 498.91015625, "completions/mean_terminated_length": 504.82611083984375, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.1184, "grad_norm": 0.038521915674209595, "kl": 0.1896514892578125, "learning_rate": 2.4722222222222226e-06, "loss": -0.0369, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03448629006743431, "mask/share_reasoning": 0.8184767961502075, "mask/share_step_conf": 0.13531821966171265, "num_tokens": 25612542.0, "reward": 1.2936921119689941, "reward_std": 0.2981795072555542, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6903517842292786, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8028130531311035, "step": 111 }, { "adv/mean_abs_final_conf": 0.7690993547439575, "adv/mean_abs_reasoning": 0.5475917458534241, "adv/mean_abs_step_conf": 0.7494326829910278, "adv/ratio_final_to_reasoning": 1.404512322488194, "adv/ratio_step_to_reasoning": 1.3685974791731637, "adv/std_final_conf": 0.9229586124420166, "adv/std_reasoning": 0.7755283713340759, "adv/std_step_conf": 0.9363421201705933, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7962740384615385, "calib/avg_num_step_conf": 6.2109375, "calib/ece": 0.20719306122448983, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.49387755102040815, "calib/gap": 0.3853264623397437, "calib/mean_conf": 0.6222763265306122, "calib/mu_c": 0.8235897435897437, "calib/mu_w": 0.43826328124999997, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1759591836734694, "calib/std_conf": 0.3985463575458145, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5046802325581395, "calib/step_q_c_n": 688.0, "calib/step_q_gap": 0.1419640463053679, "calib/step_q_w": 0.36271618625277163, "calib/step_q_w_n": 902.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2592.0, "completions/max_terminated_length": 2592.0, "completions/mean_length": 533.59375, "completions/mean_terminated_length": 546.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.11946666666666667, "grad_norm": 0.0229276642203331, "kl": 0.18072509765625, "learning_rate": 2.4444444444444447e-06, "loss": -0.1488, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.030761126428842545, "mask/share_reasoning": 0.8221441507339478, "mask/share_step_conf": 0.12365718185901642, "num_tokens": 25857062.0, "reward": 1.2952532768249512, "reward_std": 0.30912482738494873, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.722603440284729, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7933263778686523, "step": 112 }, { "adv/mean_abs_final_conf": 0.7608456015586853, "adv/mean_abs_reasoning": 0.6191291809082031, "adv/mean_abs_step_conf": 0.7413913607597351, "adv/ratio_final_to_reasoning": 1.2288963677056826, "adv/ratio_step_to_reasoning": 1.1974744263744523, "adv/std_final_conf": 0.9008378982543945, "adv/std_reasoning": 0.84315425157547, "adv/std_step_conf": 0.9363915920257568, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7687163978494624, "calib/avg_num_step_conf": 7.5390625, "calib/ece": 0.2684426229508198, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5655737704918032, "calib/gap": 0.3054381720430108, "calib/mean_conf": 0.7083606557377048, "calib/mu_c": 0.8635833333333334, "calib/mu_w": 0.5581451612903225, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.24250000000000008, "calib/std_conf": 0.35913799497635385, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5165575916230366, "calib/step_q_c_n": 764.0, "calib/step_q_gap": 0.17800699127998343, "calib/step_q_w": 0.3385506003430532, "calib/step_q_w_n": 1166.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2978.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 503.640625, "completions/mean_terminated_length": 515.72802734375, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.12053333333333334, "grad_norm": 0.04532643035054207, "kl": 0.1916046142578125, "learning_rate": 2.4166666666666667e-06, "loss": -0.1054, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03311200439929962, "mask/share_reasoning": 0.8013951778411865, "mask/share_step_conf": 0.14205534756183624, "num_tokens": 26091194.0, "reward": 1.3048895597457886, "reward_std": 0.34084320068359375, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6927961111068726, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8163040280342102, "step": 113 }, { "adv/mean_abs_final_conf": 0.7304548025131226, "adv/mean_abs_reasoning": 0.39244359731674194, "adv/mean_abs_step_conf": 0.7630499601364136, "adv/ratio_final_to_reasoning": 1.861298814676727, "adv/ratio_step_to_reasoning": 1.9443557376235, "adv/std_final_conf": 0.8721811771392822, "adv/std_reasoning": 0.6404451727867126, "adv/std_step_conf": 0.9361764192581177, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7922496965050156, "calib/avg_num_step_conf": 6.10546875, "calib/ece": 0.23928571428571427, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6825396825396826, "calib/gap": 0.3352846463484762, "calib/mean_conf": 0.7964285714285714, "calib/mu_c": 0.944113475177305, "calib/mu_w": 0.6088288288288288, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23809523809523808, "calib/std_conf": 0.3278682088252483, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5436524288107203, "calib/step_q_c_n": 796.0, "calib/step_q_gap": 0.1709014509749967, "calib/step_q_w": 0.3727509778357236, "calib/step_q_w_n": 767.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2365.0, "completions/max_terminated_length": 2365.0, "completions/mean_length": 481.078125, "completions/mean_terminated_length": 482.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.1216, "grad_norm": 0.026106946170330048, "kl": 0.1927032470703125, "learning_rate": 2.388888888888889e-06, "loss": 0.0689, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03571019694209099, "mask/share_reasoning": 0.8188567161560059, "mask/share_step_conf": 0.14152684807777405, "num_tokens": 26319374.0, "reward": 1.363892674446106, "reward_std": 0.24893157184123993, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.743388295173645, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8386828899383545, "step": 114 }, { "adv/mean_abs_final_conf": 0.7304072380065918, "adv/mean_abs_reasoning": 0.5260987281799316, "adv/mean_abs_step_conf": 0.7568455934524536, "adv/ratio_final_to_reasoning": 1.3883463290882245, "adv/ratio_step_to_reasoning": 1.4385999298473955, "adv/std_final_conf": 0.8974595665931702, "adv/std_reasoning": 0.7576373815536499, "adv/std_step_conf": 0.9363742470741272, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7158929722759511, "calib/avg_num_step_conf": 6.94140625, "calib/ece": 0.33760956175298806, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5896414342629482, "calib/gap": 0.2438639587362993, "calib/mean_conf": 0.7250996015936255, "calib/mu_c": 0.8620909090909091, "calib/mu_w": 0.6182269503546098, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31223107569721115, "calib/std_conf": 0.37384761851903137, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4810737386804657, "calib/step_q_c_n": 773.0, "calib/step_q_gap": 0.07554784226612304, "calib/step_q_w": 0.40552589641434267, "calib/step_q_w_n": 1004.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2595.0, "completions/max_terminated_length": 2595.0, "completions/mean_length": 489.9296875, "completions/mean_terminated_length": 493.78741455078125, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.12266666666666666, "grad_norm": 0.0611027330160141, "kl": 0.188079833984375, "learning_rate": 2.361111111111111e-06, "loss": -0.0067, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0336696058511734, "mask/share_reasoning": 0.811077356338501, "mask/share_step_conf": 0.14744055271148682, "num_tokens": 26550060.0, "reward": 1.253339409828186, "reward_std": 0.2882465422153473, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.6391078233718872, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7927697896957397, "step": 115 }, { "adv/mean_abs_final_conf": 0.7632418870925903, "adv/mean_abs_reasoning": 0.5066288113594055, "adv/mean_abs_step_conf": 0.7639197111129761, "adv/ratio_final_to_reasoning": 1.5065110194673512, "adv/ratio_step_to_reasoning": 1.507848929995114, "adv/std_final_conf": 0.9044250249862671, "adv/std_reasoning": 0.7394497394561768, "adv/std_step_conf": 0.936267077922821, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7873931623931624, "calib/avg_num_step_conf": 6.55078125, "calib/ece": 0.24788487282463187, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5783132530120482, "calib/gap": 0.3496574721574721, "calib/mean_conf": 0.7074029451137885, "calib/mu_c": 0.8927635327635327, "calib/mu_w": 0.5431060606060606, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2427041499330656, "calib/std_conf": 0.372757543371515, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.501568047337278, "calib/step_q_c_n": 676.0, "calib/step_q_gap": 0.13382848689771754, "calib/step_q_w": 0.36773956043956046, "calib/step_q_w_n": 1001.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 522.34765625, "completions/mean_terminated_length": 530.638916015625, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.12373333333333333, "grad_norm": 0.04662587121129036, "kl": 0.1765594482421875, "learning_rate": 2.3333333333333336e-06, "loss": -0.1362, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03099883534014225, "mask/share_reasoning": 0.8247101902961731, "mask/share_step_conf": 0.128665953874588, "num_tokens": 26788301.0, "reward": 1.3226664066314697, "reward_std": 0.2959708571434021, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.7060322761535645, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8270720839500427, "step": 116 }, { "adv/mean_abs_final_conf": 0.7685558199882507, "adv/mean_abs_reasoning": 0.4718118906021118, "adv/mean_abs_step_conf": 0.7780332565307617, "adv/ratio_final_to_reasoning": 1.6289454235827832, "adv/ratio_step_to_reasoning": 1.6490327438290282, "adv/std_final_conf": 0.9033436179161072, "adv/std_reasoning": 0.7208157777786255, "adv/std_step_conf": 0.9363489151000977, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7734520329524315, "calib/avg_num_step_conf": 6.6171875, "calib/ece": 0.23354798387096778, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4717741935483871, "calib/gap": 0.3672770528833378, "calib/mean_conf": 0.5995165322580646, "calib/mu_c": 0.8098122641509434, "calib/mu_w": 0.44253521126760564, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20282258064516134, "calib/std_conf": 0.4084491915841145, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.510756843800322, "calib/step_q_c_n": 621.0, "calib/step_q_gap": 0.12838032935484206, "calib/step_q_w": 0.38237651444548, "calib/step_q_w_n": 1073.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 511.90234375, "completions/mean_terminated_length": 515.9330444335938, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.1248, "grad_norm": 0.030565468594431877, "kl": 0.1809539794921875, "learning_rate": 2.305555555555556e-06, "loss": -0.0003, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03183672949671745, "mask/share_reasoning": 0.8255809545516968, "mask/share_step_conf": 0.13476982712745667, "num_tokens": 27025948.0, "reward": 1.298375129699707, "reward_std": 0.30287715792655945, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.711667537689209, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8046505451202393, "step": 117 }, { "adv/mean_abs_final_conf": 0.7155437469482422, "adv/mean_abs_reasoning": 0.509640634059906, "adv/mean_abs_step_conf": 0.7621777057647705, "adv/ratio_final_to_reasoning": 1.404016279565599, "adv/ratio_step_to_reasoning": 1.495519891522582, "adv/std_final_conf": 0.9100464582443237, "adv/std_reasoning": 0.7926938533782959, "adv/std_step_conf": 0.9363301396369934, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7660634670862837, "calib/avg_num_step_conf": 7.6640625, "calib/ece": 0.22732793522267203, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5263157894736842, "calib/gap": 0.34154864935746126, "calib/mean_conf": 0.6694331983805667, "calib/mu_c": 0.839516129032258, "calib/mu_w": 0.49796747967479676, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19736842105263155, "calib/std_conf": 0.3901765439081213, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.48639664804469274, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.18926181658401858, "calib/step_q_w": 0.29713483146067415, "calib/step_q_w_n": 1246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2633.0, "completions/max_terminated_length": 2633.0, "completions/mean_length": 536.75, "completions/mean_terminated_length": 540.9763793945312, "completions/min_length": 0.0, "completions/min_terminated_length": 25.0, "epoch": 0.12586666666666665, "grad_norm": 0.029284710064530373, "kl": 0.173187255859375, "learning_rate": 2.277777777777778e-06, "loss": -0.1268, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03445703908801079, "mask/share_reasoning": 0.816845178604126, "mask/share_step_conf": 0.14088527858257294, "num_tokens": 27267364.0, "reward": 1.2850561141967773, "reward_std": 0.29840779304504395, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7069492340087891, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7870502471923828, "step": 118 }, { "adv/mean_abs_final_conf": 0.7403872013092041, "adv/mean_abs_reasoning": 0.5154287815093994, "adv/mean_abs_step_conf": 0.7521135807037354, "adv/ratio_final_to_reasoning": 1.436449084471047, "adv/ratio_step_to_reasoning": 1.459199811274062, "adv/std_final_conf": 0.9069008827209473, "adv/std_reasoning": 0.7754497528076172, "adv/std_step_conf": 0.936379611492157, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7275557011795543, "calib/avg_num_step_conf": 6.23046875, "calib/ece": 0.26562248995983945, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.42971887550200805, "calib/gap": 0.2937332896461336, "calib/mean_conf": 0.5562248995983936, "calib/mu_c": 0.7213761467889908, "calib/mu_w": 0.42764285714285716, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1920481927710844, "calib/std_conf": 0.42390919030967705, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5179632721202003, "calib/step_q_c_n": 599.0, "calib/step_q_gap": 0.16260182633706782, "calib/step_q_w": 0.3553614457831325, "calib/step_q_w_n": 996.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 516.97265625, "completions/mean_terminated_length": 521.0433349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.12693333333333334, "grad_norm": 0.03230740875005722, "kl": 0.19720458984375, "learning_rate": 2.25e-06, "loss": -0.0263, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03197638690471649, "mask/share_reasoning": 0.8323537707328796, "mask/share_step_conf": 0.12785735726356506, "num_tokens": 27504773.0, "reward": 1.308273434638977, "reward_std": 0.30329740047454834, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.6854601502418518, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8256995677947998, "step": 119 }, { "adv/mean_abs_final_conf": 0.7392501831054688, "adv/mean_abs_reasoning": 0.5184506177902222, "adv/mean_abs_step_conf": 0.7542674541473389, "adv/ratio_final_to_reasoning": 1.425883503150897, "adv/ratio_step_to_reasoning": 1.4548491761129196, "adv/std_final_conf": 0.9314262270927429, "adv/std_reasoning": 0.7927581667900085, "adv/std_step_conf": 0.9363489747047424, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.792329093799682, "calib/avg_num_step_conf": 5.953125, "calib/ece": 0.20263157894736847, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3684210526315789, "calib/gap": 0.41860559088500265, "calib/mean_conf": 0.4960728744939271, "calib/mu_c": 0.6841911764705882, "calib/mu_w": 0.2655855855855856, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.07404858299595149, "calib/std_conf": 0.42423587267105084, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.49790523690773075, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.12909637264180274, "calib/step_q_w": 0.368808864265928, "calib/step_q_w_n": 722.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2826.0, "completions/max_terminated_length": 2826.0, "completions/mean_length": 486.69921875, "completions/mean_terminated_length": 494.42462158203125, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.128, "grad_norm": 0.05551515147089958, "kl": 0.199859619140625, "learning_rate": 2.222222222222222e-06, "loss": -0.0976, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033555854111909866, "mask/share_reasoning": 0.8180158138275146, "mask/share_step_conf": 0.13280335068702698, "num_tokens": 27736056.0, "reward": 1.3415977954864502, "reward_std": 0.2857484221458435, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.74946129322052, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8172576427459717, "step": 120 }, { "adv/mean_abs_final_conf": 0.8071632385253906, "adv/mean_abs_reasoning": 0.6904492378234863, "adv/mean_abs_step_conf": 0.7211095094680786, "adv/ratio_final_to_reasoning": 1.1690406684636565, "adv/ratio_step_to_reasoning": 1.0444062647403929, "adv/std_final_conf": 0.9355504512786865, "adv/std_reasoning": 0.8749426007270813, "adv/std_step_conf": 0.9364193677902222, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6235042735042735, "calib/avg_num_step_conf": 6.79296875, "calib/ece": 0.29651821862348177, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.30364372469635625, "calib/gap": 0.18208547008547005, "calib/mean_conf": 0.4480971659919028, "calib/mu_c": 0.5439316239316239, "calib/mu_w": 0.3618461538461538, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1354655870445344, "calib/std_conf": 0.40929684776102304, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4465653896961691, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.08508677462488601, "calib/step_q_w": 0.3614786150712831, "calib/step_q_w_n": 982.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2646.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 552.10546875, "completions/mean_terminated_length": 560.8690795898438, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.12906666666666666, "grad_norm": 0.05335312709212303, "kl": 0.1780548095703125, "learning_rate": 2.1944444444444445e-06, "loss": -0.1412, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.029981641098856926, "mask/share_reasoning": 0.8297488689422607, "mask/share_step_conf": 0.1246444582939148, "num_tokens": 27982451.0, "reward": 1.2759337425231934, "reward_std": 0.30707472562789917, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.6496335864067078, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.808929443359375, "step": 121 }, { "adv/mean_abs_final_conf": 0.7398604154586792, "adv/mean_abs_reasoning": 0.6184444427490234, "adv/mean_abs_step_conf": 0.7338098883628845, "adv/ratio_final_to_reasoning": 1.196324785731689, "adv/ratio_step_to_reasoning": 1.1865413247163392, "adv/std_final_conf": 0.9198704361915588, "adv/std_reasoning": 0.858933687210083, "adv/std_step_conf": 0.9362030029296875, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8391908328532104, "calib/avg_num_step_conf": 6.453125, "calib/ece": 0.16308000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.368, "calib/gap": 0.4945758914282057, "calib/mean_conf": 0.5140399999999999, "calib/mu_c": 0.7652845528455285, "calib/mu_w": 0.27070866141732286, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09256, "calib/std_conf": 0.4218806447326068, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5378229317851959, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.1784771373926725, "calib/step_q_w": 0.3593457943925234, "calib/step_q_w_n": 963.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 504.703125, "completions/mean_terminated_length": 508.6771545410156, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.13013333333333332, "grad_norm": 0.02880645915865898, "kl": 0.1920318603515625, "learning_rate": 2.166666666666667e-06, "loss": -0.0227, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03213570639491081, "mask/share_reasoning": 0.8285558223724365, "mask/share_step_conf": 0.13149595260620117, "num_tokens": 28218999.0, "reward": 1.376186728477478, "reward_std": 0.2694529592990875, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.799628496170044, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8306692838668823, "step": 122 }, { "adv/mean_abs_final_conf": 0.7473129630088806, "adv/mean_abs_reasoning": 0.532801628112793, "adv/mean_abs_step_conf": 0.7362229824066162, "adv/ratio_final_to_reasoning": 1.402610134011595, "adv/ratio_step_to_reasoning": 1.381795669458351, "adv/std_final_conf": 0.9205432534217834, "adv/std_reasoning": 0.7930430173873901, "adv/std_step_conf": 0.9363547563552856, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7428940568475452, "calib/avg_num_step_conf": 6.90625, "calib/ece": 0.22855962059620608, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.3170731707317073, "calib/gap": 0.3538009010799709, "calib/mean_conf": 0.47003116531165307, "calib/mu_c": 0.6383023255813954, "calib/mu_w": 0.2845014245014245, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08710027100271014, "calib/std_conf": 0.4104095641191893, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.49329949238578685, "calib/step_q_c_n": 788.0, "calib/step_q_gap": 0.16405119306605898, "calib/step_q_w": 0.3292482993197279, "calib/step_q_w_n": 980.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 589.2109375, "completions/mean_terminated_length": 596.1976318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.1312, "grad_norm": 0.06741070747375488, "kl": 0.176788330078125, "learning_rate": 2.138888888888889e-06, "loss": -0.0264, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.028647879138588905, "mask/share_reasoning": 0.8349099159240723, "mask/share_step_conf": 0.12472347915172577, "num_tokens": 28475125.0, "reward": 1.3184385299682617, "reward_std": 0.2987160086631775, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7228055596351624, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8113325834274292, "step": 123 }, { "adv/mean_abs_final_conf": 0.7560903429985046, "adv/mean_abs_reasoning": 0.5550056099891663, "adv/mean_abs_step_conf": 0.7471824288368225, "adv/ratio_final_to_reasoning": 1.3623111719776375, "adv/ratio_step_to_reasoning": 1.3462610384269946, "adv/std_final_conf": 0.9353420734405518, "adv/std_reasoning": 0.8098151087760925, "adv/std_step_conf": 0.9362608194351196, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.716503937007874, "calib/avg_num_step_conf": 6.27734375, "calib/ece": 0.22730158730158734, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.38095238095238093, "calib/gap": 0.31188787401574797, "calib/mean_conf": 0.5223015873015873, "calib/mu_c": 0.6770078740157479, "calib/mu_w": 0.36511999999999994, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12281746031746037, "calib/std_conf": 0.4147712368117748, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5220954907161803, "calib/step_q_c_n": 754.0, "calib/step_q_gap": 0.12324437699988489, "calib/step_q_w": 0.39885111371629545, "calib/step_q_w_n": 853.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 493.953125, "completions/mean_terminated_length": 497.842529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.13226666666666667, "grad_norm": 0.03444257378578186, "kl": 0.1947021484375, "learning_rate": 2.1111111111111114e-06, "loss": -0.0103, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031655892729759216, "mask/share_reasoning": 0.8293701410293579, "mask/share_step_conf": 0.13116146624088287, "num_tokens": 28708393.0, "reward": 1.3348913192749023, "reward_std": 0.2748156785964966, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7221164107322693, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8257861733436584, "step": 124 }, { "adv/mean_abs_final_conf": 0.7547118067741394, "adv/mean_abs_reasoning": 0.6525298953056335, "adv/mean_abs_step_conf": 0.7577742338180542, "adv/ratio_final_to_reasoning": 1.1565934560295443, "adv/ratio_step_to_reasoning": 1.1612866157850532, "adv/std_final_conf": 0.9224909543991089, "adv/std_reasoning": 0.8747900128364563, "adv/std_step_conf": 0.9363504648208618, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6063936063936064, "calib/avg_num_step_conf": 6.5625, "calib/ece": 0.2936827956989248, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3790322580645161, "calib/gap": 0.19537040737040734, "calib/mean_conf": 0.5385215053763441, "calib/mu_c": 0.6511746031746032, "calib/mu_w": 0.45580419580419584, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2044086021505377, "calib/std_conf": 0.4167364032866065, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5129478827361564, "calib/step_q_c_n": 614.0, "calib/step_q_gap": 0.11568052813953345, "calib/step_q_w": 0.397267354596623, "calib/step_q_w_n": 1066.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2944.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 511.86328125, "completions/mean_terminated_length": 522.0597534179688, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.13333333333333333, "grad_norm": 0.04801061376929283, "kl": 0.182464599609375, "learning_rate": 2.0833333333333334e-06, "loss": -0.1594, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03276129066944122, "mask/share_reasoning": 0.8192596435546875, "mask/share_step_conf": 0.12844780087471008, "num_tokens": 28944238.0, "reward": 1.2482175827026367, "reward_std": 0.31207823753356934, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.6435756087303162, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7885391712188721, "step": 125 }, { "adv/mean_abs_final_conf": 0.6897639632225037, "adv/mean_abs_reasoning": 0.4571602940559387, "adv/mean_abs_step_conf": 0.7568371295928955, "adv/ratio_final_to_reasoning": 1.5088011189749198, "adv/ratio_step_to_reasoning": 1.6555180741490378, "adv/std_final_conf": 0.8660165667533875, "adv/std_reasoning": 0.7207136154174805, "adv/std_step_conf": 0.936290979385376, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7953821138211381, "calib/avg_num_step_conf": 6.68359375, "calib/ece": 0.22935483870967743, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.49193548387096775, "calib/gap": 0.4246009756097562, "calib/mean_conf": 0.5835483870967741, "calib/mu_c": 0.7975609756097561, "calib/mu_w": 0.37295999999999996, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15846774193548388, "calib/std_conf": 0.43630171113870253, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5008433734939759, "calib/step_q_c_n": 747.0, "calib/step_q_gap": 0.14420436934459835, "calib/step_q_w": 0.35663900414937755, "calib/step_q_w_n": 964.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2722.0, "completions/max_terminated_length": 2722.0, "completions/mean_length": 511.95703125, "completions/mean_terminated_length": 518.0277099609375, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.1344, "grad_norm": 0.03157423064112663, "kl": 0.174652099609375, "learning_rate": 2.0555555555555555e-06, "loss": -0.0076, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.033401016145944595, "mask/share_reasoning": 0.8195816278457642, "mask/share_step_conf": 0.13529860973358154, "num_tokens": 29180763.0, "reward": 1.298039436340332, "reward_std": 0.28187108039855957, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7335789203643799, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7871094346046448, "step": 126 }, { "adv/mean_abs_final_conf": 0.722667396068573, "adv/mean_abs_reasoning": 0.5203970670700073, "adv/mean_abs_step_conf": 0.7545014023780823, "adv/ratio_final_to_reasoning": 1.3886846060401699, "adv/ratio_step_to_reasoning": 1.449857138177494, "adv/std_final_conf": 0.9137582182884216, "adv/std_reasoning": 0.7927393317222595, "adv/std_step_conf": 0.9363046288490295, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8147929769392033, "calib/avg_num_step_conf": 6.8125, "calib/ece": 0.17755999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.444, "calib/gap": 0.4920073375262055, "calib/mean_conf": 0.545, "calib/mu_c": 0.8283962264150944, "calib/mu_w": 0.33638888888888885, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14927999999999997, "calib/std_conf": 0.43970831240721386, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48094224924012163, "calib/step_q_c_n": 658.0, "calib/step_q_gap": 0.13905919214988222, "calib/step_q_w": 0.3418830570902394, "calib/step_q_w_n": 1086.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 507.9140625, "completions/mean_terminated_length": 515.9761962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.13546666666666668, "grad_norm": 0.03133423998951912, "kl": 0.1859588623046875, "learning_rate": 2.027777777777778e-06, "loss": -0.1211, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03366788104176521, "mask/share_reasoning": 0.8146689534187317, "mask/share_step_conf": 0.1360381543636322, "num_tokens": 29414461.0, "reward": 1.3259928226470947, "reward_std": 0.29253286123275757, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.7657339572906494, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.80445396900177, "step": 127 }, { "adv/mean_abs_final_conf": 0.7339892983436584, "adv/mean_abs_reasoning": 0.5475684404373169, "adv/mean_abs_step_conf": 0.7649527788162231, "adv/ratio_final_to_reasoning": 1.3404521592907293, "adv/ratio_step_to_reasoning": 1.3969993928161595, "adv/std_final_conf": 0.892899751663208, "adv/std_reasoning": 0.7755528688430786, "adv/std_step_conf": 0.9360752701759338, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7845185482790946, "calib/avg_num_step_conf": 6.484375, "calib/ece": 0.20612244897959187, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.46530612244897956, "calib/gap": 0.4364115441274944, "calib/mean_conf": 0.562530612244898, "calib/mu_c": 0.7958771929824562, "calib/mu_w": 0.3594656488549618, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1516734693877551, "calib/std_conf": 0.43586943851368776, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5166235864297254, "calib/step_q_c_n": 619.0, "calib/step_q_gap": 0.17665240487352946, "calib/step_q_w": 0.33997118155619593, "calib/step_q_w_n": 1041.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3028.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 529.4375, "completions/mean_terminated_length": 537.84130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.13653333333333334, "grad_norm": 0.02912876568734646, "kl": 0.181182861328125, "learning_rate": 2.0000000000000003e-06, "loss": 0.033, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03279316425323486, "mask/share_reasoning": 0.8236231803894043, "mask/share_step_conf": 0.12795865535736084, "num_tokens": 29656661.0, "reward": 1.3163447380065918, "reward_std": 0.2899932861328125, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.7358843684196472, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8081681728363037, "step": 128 }, { "adv/mean_abs_final_conf": 0.7602497339248657, "adv/mean_abs_reasoning": 0.6379810571670532, "adv/mean_abs_step_conf": 0.7466095685958862, "adv/ratio_final_to_reasoning": 1.191649384232104, "adv/ratio_step_to_reasoning": 1.1702691799521392, "adv/std_final_conf": 0.9339715242385864, "adv/std_reasoning": 0.8747352361679077, "adv/std_step_conf": 0.9363745450973511, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.673697518131612, "calib/avg_num_step_conf": 6.15234375, "calib/ece": 0.2922764227642277, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4796747967479675, "calib/gap": 0.22183511877037726, "calib/mean_conf": 0.610731707317073, "calib/mu_c": 0.7126315789473684, "calib/mu_w": 0.49079646017699113, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18117886178861797, "calib/std_conf": 0.41257799603927525, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.498955223880597, "calib/step_q_c_n": 804.0, "calib/step_q_gap": 0.0724312290686644, "calib/step_q_w": 0.4265239948119326, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 469.109375, "completions/mean_terminated_length": 476.5555725097656, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.1376, "grad_norm": 0.035489436239004135, "kl": 0.188812255859375, "learning_rate": 1.9722222222222224e-06, "loss": -0.1051, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03501635044813156, "mask/share_reasoning": 0.810721218585968, "mask/share_step_conf": 0.1386374533176422, "num_tokens": 29879137.0, "reward": 1.2699917554855347, "reward_std": 0.31520164012908936, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.659880518913269, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7916139364242554, "step": 129 }, { "adv/mean_abs_final_conf": 0.6919351816177368, "adv/mean_abs_reasoning": 0.4041707217693329, "adv/mean_abs_step_conf": 0.7728725671768188, "adv/ratio_final_to_reasoning": 1.711987396288037, "adv/ratio_step_to_reasoning": 1.9122428358823833, "adv/std_final_conf": 0.8840898275375366, "adv/std_reasoning": 0.6816485524177551, "adv/std_step_conf": 0.9361173510551453, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7564238410596027, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.2087250996015937, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.50199203187251, "calib/gap": 0.4129847682119204, "calib/mean_conf": 0.6027490039840637, "calib/mu_c": 0.7672847682119205, "calib/mu_w": 0.35430000000000006, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10494023904382477, "calib/std_conf": 0.4313372294700475, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5396012345679012, "calib/step_q_c_n": 810.0, "calib/step_q_gap": 0.16350139057414143, "calib/step_q_w": 0.3760998439937598, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1455.0, "completions/max_terminated_length": 1455.0, "completions/mean_length": 431.69140625, "completions/mean_terminated_length": 438.5436706542969, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.13866666666666666, "grad_norm": 0.049744050949811935, "kl": 0.205169677734375, "learning_rate": 1.944444444444445e-06, "loss": -0.0721, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035941801965236664, "mask/share_reasoning": 0.8154062032699585, "mask/share_step_conf": 0.13302695751190186, "num_tokens": 30094938.0, "reward": 1.3648321628570557, "reward_std": 0.23609794676303864, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7537527084350586, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8313151597976685, "step": 130 }, { "adv/mean_abs_final_conf": 0.7291051149368286, "adv/mean_abs_reasoning": 0.4814450740814209, "adv/mean_abs_step_conf": 0.7603658437728882, "adv/ratio_final_to_reasoning": 1.5144097513676587, "adv/ratio_step_to_reasoning": 1.579340790273195, "adv/std_final_conf": 0.9119905233383179, "adv/std_reasoning": 0.7574331164360046, "adv/std_step_conf": 0.9364023804664612, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.802385752688172, "calib/avg_num_step_conf": 6.0390625, "calib/ece": 0.18179282868525895, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.36254980079681276, "calib/gap": 0.43690120967741936, "calib/mean_conf": 0.48426294820717136, "calib/mu_c": 0.7540625, "calib/mu_w": 0.3171612903225806, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14179282868525894, "calib/std_conf": 0.42399888486199633, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4926744186046511, "calib/step_q_c_n": 602.0, "calib/step_q_gap": 0.0861278084351596, "calib/step_q_w": 0.4065466101694915, "calib/step_q_w_n": 944.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2307.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 471.90234375, "completions/mean_terminated_length": 477.498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.13973333333333332, "grad_norm": 0.03378261998295784, "kl": 0.1905059814453125, "learning_rate": 1.916666666666667e-06, "loss": -0.1266, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03321835398674011, "mask/share_reasoning": 0.8248918056488037, "mask/share_step_conf": 0.1301710605621338, "num_tokens": 30321953.0, "reward": 1.32669997215271, "reward_std": 0.27674633264541626, "rewards/accuracy_reward_step": 0.375, "rewards/final_brier_reward_step": 0.7648215293884277, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8087424039840698, "step": 131 }, { "adv/mean_abs_final_conf": 0.672884464263916, "adv/mean_abs_reasoning": 0.5584405660629272, "adv/mean_abs_step_conf": 0.7435173988342285, "adv/ratio_final_to_reasoning": 1.2049347865392945, "adv/ratio_step_to_reasoning": 1.3314172429773772, "adv/std_final_conf": 0.8716760873794556, "adv/std_reasoning": 0.792852520942688, "adv/std_step_conf": 0.9363497495651245, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8161107897950003, "calib/avg_num_step_conf": 6.43359375, "calib/ece": 0.19391999999999993, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.48, "calib/gap": 0.4710121457489878, "calib/mean_conf": 0.57904, "calib/mu_c": 0.7994736842105263, "calib/mu_w": 0.3284615384615385, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12047999999999992, "calib/std_conf": 0.43685132299215945, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5411795865633074, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.1681097125655983, "calib/step_q_w": 0.37306987399770913, "calib/step_q_w_n": 873.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 504.84375, "completions/mean_terminated_length": 508.81890869140625, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.1408, "grad_norm": 0.02799972891807556, "kl": 0.177398681640625, "learning_rate": 1.888888888888889e-06, "loss": -0.0943, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03382992744445801, "mask/share_reasoning": 0.8206533193588257, "mask/share_step_conf": 0.1377042829990387, "num_tokens": 30556785.0, "reward": 1.359838843345642, "reward_std": 0.28009533882141113, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7739390134811401, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8228692412376404, "step": 132 }, { "adv/mean_abs_final_conf": 0.7618807554244995, "adv/mean_abs_reasoning": 0.5961230993270874, "adv/mean_abs_step_conf": 0.7423065900802612, "adv/ratio_final_to_reasoning": 1.278059441555816, "adv/ratio_step_to_reasoning": 1.245223664236779, "adv/std_final_conf": 0.9019972681999207, "adv/std_reasoning": 0.8100215792655945, "adv/std_step_conf": 0.9363874793052673, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6911414338314922, "calib/avg_num_step_conf": 6.4609375, "calib/ece": 0.2834920634920636, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3611111111111111, "calib/gap": 0.2903573749187784, "calib/mean_conf": 0.47753968253968254, "calib/mu_c": 0.6745679012345679, "calib/mu_w": 0.3842105263157895, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21980158730158736, "calib/std_conf": 0.43238391690706907, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.46067924528301885, "calib/step_q_c_n": 530.0, "calib/step_q_gap": 0.06793013496273409, "calib/step_q_w": 0.39274911032028476, "calib/step_q_w_n": 1124.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 539.03515625, "completions/mean_terminated_length": 543.279541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.14186666666666667, "grad_norm": 0.024806467816233635, "kl": 0.16949462890625, "learning_rate": 1.8611111111111113e-06, "loss": -0.0913, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03038078360259533, "mask/share_reasoning": 0.8374552726745605, "mask/share_step_conf": 0.12435144186019897, "num_tokens": 30801122.0, "reward": 1.2896798849105835, "reward_std": 0.29440850019454956, "rewards/accuracy_reward_step": 0.31640625, "rewards/final_brier_reward_step": 0.6855859160423279, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8171993494033813, "step": 133 }, { "adv/mean_abs_final_conf": 0.7816120982170105, "adv/mean_abs_reasoning": 0.6557806730270386, "adv/mean_abs_step_conf": 0.7567148804664612, "adv/ratio_final_to_reasoning": 1.1918803501926074, "adv/ratio_step_to_reasoning": 1.1539145808819238, "adv/std_final_conf": 0.9028863906860352, "adv/std_reasoning": 0.8591221570968628, "adv/std_step_conf": 0.9363250136375427, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.73828125, "calib/avg_num_step_conf": 5.8828125, "calib/ece": 0.23370967741935483, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.45161290322580644, "calib/gap": 0.396828125, "calib/mean_conf": 0.5344354838709677, "calib/mu_c": 0.73925, "calib/mu_w": 0.342421875, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14213709677419356, "calib/std_conf": 0.4472592205352485, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.49645251396648044, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.1322120076373665, "calib/step_q_w": 0.3642405063291139, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2401.0, "completions/max_terminated_length": 2401.0, "completions/mean_length": 531.12109375, "completions/mean_terminated_length": 539.5516357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 66.0, "epoch": 0.14293333333333333, "grad_norm": 0.051934972405433655, "kl": 0.1751556396484375, "learning_rate": 1.8333333333333333e-06, "loss": -0.1266, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.030912235379219055, "mask/share_reasoning": 0.8365188837051392, "mask/share_step_conf": 0.11694388091564178, "num_tokens": 31046041.0, "reward": 1.3200663328170776, "reward_std": 0.33075568079948425, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7186558246612549, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8173789978027344, "step": 134 }, { "adv/mean_abs_final_conf": 0.8009029626846313, "adv/mean_abs_reasoning": 0.665070652961731, "adv/mean_abs_step_conf": 0.7526130080223083, "adv/ratio_final_to_reasoning": 1.2042374131500226, "adv/ratio_step_to_reasoning": 1.131628654295192, "adv/std_final_conf": 0.9338492155075073, "adv/std_reasoning": 0.8591198921203613, "adv/std_step_conf": 0.9364001154899597, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6511030129498274, "calib/avg_num_step_conf": 6.71875, "calib/ece": 0.29258064516129023, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3790322580645161, "calib/gap": 0.23656146287499186, "calib/mean_conf": 0.517258064516129, "calib/mu_c": 0.6326771653543307, "calib/mu_w": 0.39611570247933886, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.14887096774193545, "calib/std_conf": 0.4273555947573957, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4868894952251024, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.12857135946015807, "calib/step_q_w": 0.3583181357649443, "calib/step_q_w_n": 987.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 3000.0, "completions/mean_length": 540.61328125, "completions/mean_terminated_length": 547.0237426757812, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.144, "grad_norm": 0.03680422529578209, "kl": 0.170989990234375, "learning_rate": 1.8055555555555557e-06, "loss": -0.0591, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03190198540687561, "mask/share_reasoning": 0.8299878239631653, "mask/share_step_conf": 0.1263914406299591, "num_tokens": 31290318.0, "reward": 1.2803423404693604, "reward_std": 0.31323179602622986, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6570253372192383, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8061264753341675, "step": 135 }, { "adv/mean_abs_final_conf": 0.6936593055725098, "adv/mean_abs_reasoning": 0.5179922580718994, "adv/mean_abs_step_conf": 0.7456120252609253, "adv/ratio_final_to_reasoning": 1.339130642906688, "adv/ratio_step_to_reasoning": 1.4394269675695257, "adv/std_final_conf": 0.877546489238739, "adv/std_reasoning": 0.7755007147789001, "adv/std_step_conf": 0.9362220168113708, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7837920489296637, "calib/avg_num_step_conf": 6.82421875, "calib/ece": 0.2108196721311475, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.3442622950819672, "calib/gap": 0.44151002378525317, "calib/mean_conf": 0.42663934426229505, "calib/mu_c": 0.6709174311926606, "calib/mu_w": 0.2294074074074074, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.09536885245901636, "calib/std_conf": 0.4467214092344606, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5147012578616352, "calib/step_q_c_n": 636.0, "calib/step_q_gap": 0.19264905264111315, "calib/step_q_w": 0.3220522052205221, "calib/step_q_w_n": 1111.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 518.37890625, "completions/mean_terminated_length": 528.7052001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.14506666666666668, "grad_norm": 0.03678504377603531, "kl": 0.1739654541015625, "learning_rate": 1.777777777777778e-06, "loss": -0.152, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.032310232520103455, "mask/share_reasoning": 0.8122735023498535, "mask/share_step_conf": 0.13588495552539825, "num_tokens": 31531511.0, "reward": 1.3241932392120361, "reward_std": 0.3050885796546936, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.7310745716094971, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8211559057235718, "step": 136 }, { "adv/mean_abs_final_conf": 0.7300252914428711, "adv/mean_abs_reasoning": 0.5708158612251282, "adv/mean_abs_step_conf": 0.7334673404693604, "adv/ratio_final_to_reasoning": 1.2789155681063866, "adv/ratio_step_to_reasoning": 1.2849456195823594, "adv/std_final_conf": 0.902219831943512, "adv/std_reasoning": 0.8099907040596008, "adv/std_step_conf": 0.9362177848815918, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7611658836763051, "calib/avg_num_step_conf": 6.953125, "calib/ece": 0.23313008130081314, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.37398373983739835, "calib/gap": 0.3961655528353075, "calib/mean_conf": 0.48085365853658535, "calib/mu_c": 0.6853781512605043, "calib/mu_w": 0.28921259842519675, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11512195121951233, "calib/std_conf": 0.4436375907399072, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4547826086956521, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.14203356655005828, "calib/step_q_w": 0.31274904214559385, "calib/step_q_w_n": 1044.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 511.68359375, "completions/mean_terminated_length": 517.7510375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.14613333333333334, "grad_norm": 0.029734788462519646, "kl": 0.171844482421875, "learning_rate": 1.75e-06, "loss": -0.1109, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.032275162637233734, "mask/share_reasoning": 0.8221548795700073, "mask/share_step_conf": 0.13385118544101715, "num_tokens": 31769486.0, "reward": 1.3177752494812012, "reward_std": 0.30651336908340454, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7180605530738831, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8165574669837952, "step": 137 }, { "adv/mean_abs_final_conf": 0.7477434873580933, "adv/mean_abs_reasoning": 0.649196982383728, "adv/mean_abs_step_conf": 0.7544237375259399, "adv/ratio_final_to_reasoning": 1.1517975401125884, "adv/ratio_step_to_reasoning": 1.1620875604748488, "adv/std_final_conf": 0.908065140247345, "adv/std_reasoning": 0.8430894017219543, "adv/std_step_conf": 0.9363033175468445, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7634548611111112, "calib/avg_num_step_conf": 6.54296875, "calib/ece": 0.22817204301075272, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3870967741935484, "calib/gap": 0.4108618233618235, "calib/mean_conf": 0.4977956989247312, "calib/mu_c": 0.6700925925925927, "calib/mu_w": 0.2592307692307692, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.07266129032258065, "calib/std_conf": 0.4400136213060975, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.48257706535141803, "calib/step_q_c_n": 811.0, "calib/step_q_gap": 0.13929002831438098, "calib/step_q_w": 0.34328703703703706, "calib/step_q_w_n": 864.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 492.703125, "completions/mean_terminated_length": 504.52801513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.1472, "grad_norm": 0.03782318904995918, "kl": 0.1793670654296875, "learning_rate": 1.7222222222222224e-06, "loss": -0.1363, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0330810621380806, "mask/share_reasoning": 0.8155226707458496, "mask/share_step_conf": 0.12795880436897278, "num_tokens": 31999954.0, "reward": 1.335602045059204, "reward_std": 0.2940444350242615, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7324857711791992, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8158434629440308, "step": 138 }, { "adv/mean_abs_final_conf": 0.729174017906189, "adv/mean_abs_reasoning": 0.4861128628253937, "adv/mean_abs_step_conf": 0.7631297707557678, "adv/ratio_final_to_reasoning": 1.500009717225072, "adv/ratio_step_to_reasoning": 1.5698612999464603, "adv/std_final_conf": 0.9038439989089966, "adv/std_reasoning": 0.7393209338188171, "adv/std_step_conf": 0.9362198114395142, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7629672006102213, "calib/avg_num_step_conf": 5.83984375, "calib/ece": 0.23633597883597884, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.25396825396825395, "calib/gap": 0.38138952453597774, "calib/mean_conf": 0.38017195767195766, "calib/mu_c": 0.5527053140096619, "calib/mu_w": 0.1713157894736842, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.034444444444444465, "calib/std_conf": 0.41144764911398296, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5170434010152285, "calib/step_q_c_n": 788.0, "calib/step_q_gap": 0.15315372633347463, "calib/step_q_w": 0.36388967468175387, "calib/step_q_w_n": 707.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1991.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 474.29296875, "completions/mean_terminated_length": 474.29296875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.14826666666666666, "grad_norm": 0.04484456777572632, "kl": 0.183837890625, "learning_rate": 1.6944444444444446e-06, "loss": 0.0127, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033740751445293427, "mask/share_reasoning": 0.8341965675354004, "mask/share_step_conf": 0.1320626437664032, "num_tokens": 32224469.0, "reward": 1.3563064336776733, "reward_std": 0.2344740480184555, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7322814464569092, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8378219604492188, "step": 139 }, { "adv/mean_abs_final_conf": 0.7029258012771606, "adv/mean_abs_reasoning": 0.547933042049408, "adv/mean_abs_step_conf": 0.7588620185852051, "adv/ratio_final_to_reasoning": 1.2828680647694481, "adv/ratio_step_to_reasoning": 1.3849539274851348, "adv/std_final_conf": 0.8725690245628357, "adv/std_reasoning": 0.7755323052406311, "adv/std_step_conf": 0.9360032081604004, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7954575596816975, "calib/avg_num_step_conf": 6.09375, "calib/ece": 0.1889558232931728, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.40562248995983935, "calib/gap": 0.4704297082228117, "calib/mean_conf": 0.5177911646586346, "calib/mu_c": 0.7142758620689655, "calib/mu_w": 0.2438461538461538, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.06220883534136556, "calib/std_conf": 0.4367012393525546, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5363191763191764, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.19448009585940623, "calib/step_q_w": 0.34183908045977013, "calib/step_q_w_n": 783.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 475.67578125, "completions/mean_terminated_length": 483.2262268066406, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.14933333333333335, "grad_norm": 0.06325884163379669, "kl": 0.1822509765625, "learning_rate": 1.6666666666666667e-06, "loss": -0.1048, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.034293144941329956, "mask/share_reasoning": 0.8276205658912659, "mask/share_step_conf": 0.12246125936508179, "num_tokens": 32451258.0, "reward": 1.3602712154388428, "reward_std": 0.26220160722732544, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7613804340362549, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8264559507369995, "step": 140 }, { "adv/mean_abs_final_conf": 0.6900002956390381, "adv/mean_abs_reasoning": 0.5364717245101929, "adv/mean_abs_step_conf": 0.7486765384674072, "adv/ratio_final_to_reasoning": 1.2861820374019137, "adv/ratio_step_to_reasoning": 1.3955563811885159, "adv/std_final_conf": 0.8891619443893433, "adv/std_reasoning": 0.7928355932235718, "adv/std_step_conf": 0.9361876249313354, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8163955757970073, "calib/avg_num_step_conf": 5.87109375, "calib/ece": 0.15534262948207167, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.46215139442231074, "calib/gap": 0.572893558880937, "calib/mean_conf": 0.5417091633466136, "calib/mu_c": 0.783648275862069, "calib/mu_w": 0.21075471698113207, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.059681274900398415, "calib/std_conf": 0.4546037535970848, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5296890547263682, "calib/step_q_c_n": 804.0, "calib/step_q_gap": 0.1800610146691436, "calib/step_q_w": 0.3496280400572246, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2258.0, "completions/max_terminated_length": 2258.0, "completions/mean_length": 520.58203125, "completions/mean_terminated_length": 522.62353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.1504, "grad_norm": 0.0328042171895504, "kl": 0.1726837158203125, "learning_rate": 1.638888888888889e-06, "loss": -0.0226, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03201006352901459, "mask/share_reasoning": 0.8415316343307495, "mask/share_step_conf": 0.12255203723907471, "num_tokens": 32691623.0, "reward": 1.4064525365829468, "reward_std": 0.2672592103481293, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.8114435076713562, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8464338779449463, "step": 141 }, { "adv/mean_abs_final_conf": 0.6953755617141724, "adv/mean_abs_reasoning": 0.5196616649627686, "adv/mean_abs_step_conf": 0.746796727180481, "adv/ratio_final_to_reasoning": 1.338131343138411, "adv/ratio_step_to_reasoning": 1.437082581864078, "adv/std_final_conf": 0.8990840911865234, "adv/std_reasoning": 0.7753031253814697, "adv/std_step_conf": 0.9361851811408997, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8236506746626686, "calib/avg_num_step_conf": 6.05859375, "calib/ece": 0.1650393700787402, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3858267716535433, "calib/gap": 0.5172363818090955, "calib/mean_conf": 0.4834645669291339, "calib/mu_c": 0.7644827586206896, "calib/mu_w": 0.24724637681159417, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09590551181102364, "calib/std_conf": 0.44580967294634705, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5294489164086688, "calib/step_q_c_n": 646.0, "calib/step_q_gap": 0.14894062911585104, "calib/step_q_w": 0.3805082872928177, "calib/step_q_w_n": 905.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1654.0, "completions/max_terminated_length": 1654.0, "completions/mean_length": 507.08984375, "completions/mean_terminated_length": 509.0784606933594, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.15146666666666667, "grad_norm": 0.039702072739601135, "kl": 0.1666717529296875, "learning_rate": 1.6111111111111113e-06, "loss": -0.0314, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0334194153547287, "mask/share_reasoning": 0.8337759971618652, "mask/share_step_conf": 0.12889830768108368, "num_tokens": 32926598.0, "reward": 1.3794302940368652, "reward_std": 0.23660717904567719, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.7992148399353027, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8352916240692139, "step": 142 }, { "adv/mean_abs_final_conf": 0.7410321831703186, "adv/mean_abs_reasoning": 0.6267070174217224, "adv/mean_abs_step_conf": 0.7720884084701538, "adv/ratio_final_to_reasoning": 1.1824220290669967, "adv/ratio_step_to_reasoning": 1.2319766445994678, "adv/std_final_conf": 0.9093261361122131, "adv/std_reasoning": 0.8429367542266846, "adv/std_step_conf": 0.9363172650337219, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7883629191321498, "calib/avg_num_step_conf": 6.21875, "calib/ece": 0.1897688259109311, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4008097165991903, "calib/gap": 0.45389829059829045, "calib/mean_conf": 0.499542914979757, "calib/mu_c": 0.738436752136752, "calib/mu_w": 0.2845384615384615, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10781376518218619, "calib/std_conf": 0.4385892954247043, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5287832692307692, "calib/step_q_c_n": 650.0, "calib/step_q_gap": 0.19730874693777556, "calib/step_q_w": 0.3314745222929937, "calib/step_q_w_n": 942.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2351.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 500.0625, "completions/mean_terminated_length": 508.0000305175781, "completions/min_length": 0.0, "completions/min_terminated_length": 5.0, "epoch": 0.15253333333333333, "grad_norm": 0.03443131968379021, "kl": 0.1702728271484375, "learning_rate": 1.5833333333333333e-06, "loss": -0.1148, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03215855732560158, "mask/share_reasoning": 0.8261256217956543, "mask/share_step_conf": 0.12609079480171204, "num_tokens": 33161950.0, "reward": 1.3402912616729736, "reward_std": 0.2977481484413147, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.7564218044281006, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8198927640914917, "step": 143 }, { "adv/mean_abs_final_conf": 0.7304239273071289, "adv/mean_abs_reasoning": 0.613550066947937, "adv/mean_abs_step_conf": 0.7624738812446594, "adv/ratio_final_to_reasoning": 1.1904878943955999, "adv/ratio_step_to_reasoning": 1.242724795121503, "adv/std_final_conf": 0.9233477711677551, "adv/std_reasoning": 0.8429200053215027, "adv/std_step_conf": 0.9363064169883728, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7127394636015326, "calib/avg_num_step_conf": 6.1328125, "calib/ece": 0.2686454183266933, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.47410358565737054, "calib/gap": 0.3362190293742019, "calib/mean_conf": 0.5545418326693227, "calib/mu_c": 0.709925925925926, "calib/mu_w": 0.37370689655172407, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1426693227091634, "calib/std_conf": 0.4515340047143453, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5160025873221216, "calib/step_q_c_n": 773.0, "calib/step_q_gap": 0.13324223600468116, "calib/step_q_w": 0.3827603513174404, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2208.0, "completions/max_terminated_length": 2208.0, "completions/mean_length": 507.17578125, "completions/mean_terminated_length": 509.16473388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.1536, "grad_norm": 0.03177357837557793, "kl": 0.1861419677734375, "learning_rate": 1.5555555555555558e-06, "loss": -0.0211, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03406987339258194, "mask/share_reasoning": 0.8311489820480347, "mask/share_step_conf": 0.1308748722076416, "num_tokens": 33395915.0, "reward": 1.3351563215255737, "reward_std": 0.28177350759506226, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7004636526107788, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8341432809829712, "step": 144 }, { "adv/mean_abs_final_conf": 0.6659106016159058, "adv/mean_abs_reasoning": 0.5489901900291443, "adv/mean_abs_step_conf": 0.7527850866317749, "adv/ratio_final_to_reasoning": 1.2129735898933904, "adv/ratio_step_to_reasoning": 1.371217738138111, "adv/std_final_conf": 0.8503638505935669, "adv/std_reasoning": 0.8099202513694763, "adv/std_step_conf": 0.9362788796424866, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7704562402548979, "calib/avg_num_step_conf": 6.66015625, "calib/ece": 0.1995161290322581, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.3710514541387024, "calib/mean_conf": 0.6395967741935484, "calib/mu_c": 0.7877181208053691, "calib/mu_w": 0.4166666666666667, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11915322580645166, "calib/std_conf": 0.40531130845781704, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5009357997823721, "calib/step_q_c_n": 919.0, "calib/step_q_gap": 0.12276786085107438, "calib/step_q_w": 0.37816793893129774, "calib/step_q_w_n": 786.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2130.0, "completions/max_terminated_length": 2130.0, "completions/mean_length": 478.796875, "completions/mean_terminated_length": 482.5669250488281, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.15466666666666667, "grad_norm": 0.039771489799022675, "kl": 0.1626129150390625, "learning_rate": 1.527777777777778e-06, "loss": -0.0248, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03535020351409912, "mask/share_reasoning": 0.8120906352996826, "mask/share_step_conf": 0.14474664628505707, "num_tokens": 33621191.0, "reward": 1.34993577003479, "reward_std": 0.280239462852478, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.748228132724762, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8207435607910156, "step": 145 }, { "adv/mean_abs_final_conf": 0.7082700729370117, "adv/mean_abs_reasoning": 0.5225511193275452, "adv/mean_abs_step_conf": 0.7382391095161438, "adv/ratio_final_to_reasoning": 1.3554082016864923, "adv/ratio_step_to_reasoning": 1.4127595984602632, "adv/std_final_conf": 0.8754147887229919, "adv/std_reasoning": 0.7753732800483704, "adv/std_step_conf": 0.936151385307312, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7566608649082875, "calib/avg_num_step_conf": 5.98828125, "calib/ece": 0.2459760956175298, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4103585657370518, "calib/gap": 0.40891150087026373, "calib/mean_conf": 0.5188047808764941, "calib/mu_c": 0.7696907216494845, "calib/mu_w": 0.36077922077922076, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18916334661354578, "calib/std_conf": 0.44450537188487804, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4835314091680815, "calib/step_q_c_n": 589.0, "calib/step_q_gap": 0.09526975662570863, "calib/step_q_w": 0.3882616525423729, "calib/step_q_w_n": 944.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 510.49609375, "completions/mean_terminated_length": 510.49609375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.15573333333333333, "grad_norm": 0.05050409957766533, "kl": 0.1649322509765625, "learning_rate": 1.5e-06, "loss": -0.0258, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03330950438976288, "mask/share_reasoning": 0.8382708430290222, "mask/share_step_conf": 0.1284196376800537, "num_tokens": 33859094.0, "reward": 1.312046766281128, "reward_std": 0.2700773775577545, "rewards/accuracy_reward_step": 0.37890625, "rewards/final_brier_reward_step": 0.7233164310455322, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8148416876792908, "step": 146 }, { "adv/mean_abs_final_conf": 0.6891236901283264, "adv/mean_abs_reasoning": 0.4682924151420593, "adv/mean_abs_step_conf": 0.7646053433418274, "adv/ratio_final_to_reasoning": 1.4715670547840853, "adv/ratio_step_to_reasoning": 1.6327519272544266, "adv/std_final_conf": 0.8768998980522156, "adv/std_reasoning": 0.7574830055236816, "adv/std_step_conf": 0.9362419247627258, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7668350168350169, "calib/avg_num_step_conf": 6.58984375, "calib/ece": 0.27280478087649396, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5059760956175299, "calib/gap": 0.3902264957264957, "calib/mean_conf": 0.6112908366533865, "calib/mu_c": 0.833611111111111, "calib/mu_w": 0.44338461538461527, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.22690836653386448, "calib/std_conf": 0.4360786771044835, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5402945508100148, "calib/step_q_c_n": 679.0, "calib/step_q_gap": 0.1667528841433481, "calib/step_q_w": 0.37354166666666666, "calib/step_q_w_n": 1008.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2456.0, "completions/max_terminated_length": 2456.0, "completions/mean_length": 520.25390625, "completions/mean_terminated_length": 522.2941284179688, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.1568, "grad_norm": 0.026210254058241844, "kl": 0.1572113037109375, "learning_rate": 1.4722222222222225e-06, "loss": -0.0103, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032685164362192154, "mask/share_reasoning": 0.8305171728134155, "mask/share_step_conf": 0.13289138674736023, "num_tokens": 34095959.0, "reward": 1.2982693910598755, "reward_std": 0.2956669330596924, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.7013327479362488, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8081498146057129, "step": 147 }, { "adv/mean_abs_final_conf": 0.6476035118103027, "adv/mean_abs_reasoning": 0.610388457775116, "adv/mean_abs_step_conf": 0.7429471611976624, "adv/ratio_final_to_reasoning": 1.0609694589750873, "adv/ratio_step_to_reasoning": 1.2171710518670795, "adv/std_final_conf": 0.852942705154419, "adv/std_reasoning": 0.8267537951469421, "adv/std_step_conf": 0.9362772107124329, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.8134118967452301, "calib/avg_num_step_conf": 6.578125, "calib/ece": 0.19139917695473252, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.551440329218107, "calib/gap": 0.4754292929292929, "calib/mean_conf": 0.6388065843621399, "calib/mu_c": 0.8325, "calib/mu_w": 0.3570707070707071, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11880658436213992, "calib/std_conf": 0.43242523038594965, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4981235154394299, "calib/step_q_c_n": 842.0, "calib/step_q_gap": 0.1455819477434679, "calib/step_q_w": 0.352541567695962, "calib/step_q_w_n": 842.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2380.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 480.74609375, "completions/mean_terminated_length": 492.2840270996094, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.15786666666666666, "grad_norm": 0.036793362349271774, "kl": 0.1713104248046875, "learning_rate": 1.4444444444444445e-06, "loss": -0.1015, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03547060489654541, "mask/share_reasoning": 0.80145263671875, "mask/share_step_conf": 0.139639213681221, "num_tokens": 34324142.0, "reward": 1.3244909048080444, "reward_std": 0.3026449680328369, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7584339380264282, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7941020727157593, "step": 148 }, { "adv/mean_abs_final_conf": 0.7106456756591797, "adv/mean_abs_reasoning": 0.557725191116333, "adv/mean_abs_step_conf": 0.7336307764053345, "adv/ratio_final_to_reasoning": 1.27418608120742, "adv/ratio_step_to_reasoning": 1.3153983146016983, "adv/std_final_conf": 0.9167241454124451, "adv/std_reasoning": 0.8099157810211182, "adv/std_step_conf": 0.9361780881881714, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8617852071769717, "calib/avg_num_step_conf": 6.671875, "calib/ece": 0.13771084337349393, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5060240963855421, "calib/gap": 0.5810152317025944, "calib/mean_conf": 0.6085542168674699, "calib/mu_c": 0.8932283464566928, "calib/mu_w": 0.3122131147540984, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11811244979919672, "calib/std_conf": 0.43541550814275554, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5515058479532164, "calib/step_q_c_n": 684.0, "calib/step_q_gap": 0.21153416826571647, "calib/step_q_w": 0.3399716796875, "calib/step_q_w_n": 1024.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2627.0, "completions/max_terminated_length": 2627.0, "completions/mean_length": 548.19140625, "completions/mean_terminated_length": 552.5078735351562, "completions/min_length": 0.0, "completions/min_terminated_length": 95.0, "epoch": 0.15893333333333334, "grad_norm": 0.03566073626279831, "kl": 0.157867431640625, "learning_rate": 1.4166666666666667e-06, "loss": 0.0299, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.031022585928440094, "mask/share_reasoning": 0.8345236778259277, "mask/share_step_conf": 0.12664125859737396, "num_tokens": 34568935.0, "reward": 1.3916404247283936, "reward_std": 0.2857462763786316, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.818198025226593, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8356664180755615, "step": 149 }, { "adv/mean_abs_final_conf": 0.7155439257621765, "adv/mean_abs_reasoning": 0.5725694894790649, "adv/mean_abs_step_conf": 0.7481136918067932, "adv/ratio_final_to_reasoning": 1.249706697458142, "adv/ratio_step_to_reasoning": 1.3065902140322598, "adv/std_final_conf": 0.8824063539505005, "adv/std_reasoning": 0.8098407983779907, "adv/std_step_conf": 0.9362457990646362, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7600504095778198, "calib/avg_num_step_conf": 5.8515625, "calib/ece": 0.20920948616600785, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6007905138339921, "calib/gap": 0.4052028985507246, "calib/mean_conf": 0.688498023715415, "calib/mu_c": 0.8726811594202898, "calib/mu_w": 0.4674782608695652, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17612648221343868, "calib/std_conf": 0.4065629056384185, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5358167330677291, "calib/step_q_c_n": 753.0, "calib/step_q_gap": 0.13855498810128614, "calib/step_q_w": 0.397261744966443, "calib/step_q_w_n": 745.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 463.2421875, "completions/mean_terminated_length": 465.058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.16, "grad_norm": 0.047138091176748276, "kl": 0.1817474365234375, "learning_rate": 1.3888888888888892e-06, "loss": -0.03, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.038503583520650864, "mask/share_reasoning": 0.8154249787330627, "mask/share_step_conf": 0.1421651542186737, "num_tokens": 34792485.0, "reward": 1.3712739944458008, "reward_std": 0.2648713290691376, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7582472562789917, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8394160270690918, "step": 150 }, { "adv/mean_abs_final_conf": 0.7568658590316772, "adv/mean_abs_reasoning": 0.5882517099380493, "adv/mean_abs_step_conf": 0.7372068762779236, "adv/ratio_final_to_reasoning": 1.2866360543369866, "adv/ratio_step_to_reasoning": 1.2532167162855528, "adv/std_final_conf": 0.9322020411491394, "adv/std_reasoning": 0.8429966568946838, "adv/std_step_conf": 0.9365764260292053, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7093149691917362, "calib/avg_num_step_conf": 6.69140625, "calib/ece": 0.2847540983606558, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.46311475409836067, "calib/gap": 0.3561826748822038, "calib/mean_conf": 0.565983606557377, "calib/mu_c": 0.792247191011236, "calib/mu_w": 0.4360645161290322, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24299180327868858, "calib/std_conf": 0.4431947354174538, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4878900709219857, "calib/step_q_c_n": 564.0, "calib/step_q_gap": 0.10971774716219462, "calib/step_q_w": 0.3781723237597911, "calib/step_q_w_n": 1149.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2342.0, "completions/max_terminated_length": 2342.0, "completions/mean_length": 524.9296875, "completions/mean_terminated_length": 537.5280151367188, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.16106666666666666, "grad_norm": 0.032195936888456345, "kl": 0.1460113525390625, "learning_rate": 1.3611111111111112e-06, "loss": -0.1395, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.030664097517728806, "mask/share_reasoning": 0.8197251558303833, "mask/share_step_conf": 0.1261732429265976, "num_tokens": 35033891.0, "reward": 1.2379226684570312, "reward_std": 0.34976986050605774, "rewards/accuracy_reward_step": 0.34765625, "rewards/final_brier_reward_step": 0.6637921333312988, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7759484052658081, "step": 151 }, { "adv/mean_abs_final_conf": 0.7919584512710571, "adv/mean_abs_reasoning": 0.6617571115493774, "adv/mean_abs_step_conf": 0.7613580226898193, "adv/ratio_final_to_reasoning": 1.1967509490254487, "adv/ratio_step_to_reasoning": 1.1505097707333216, "adv/std_final_conf": 0.927963137626648, "adv/std_reasoning": 0.8590885996818542, "adv/std_step_conf": 0.9362490773200989, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6599537037037038, "calib/avg_num_step_conf": 6.19921875, "calib/ece": 0.31602409638554213, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.39759036144578314, "calib/gap": 0.22109325396825386, "calib/mean_conf": 0.5323293172690764, "calib/mu_c": 0.6601904761904761, "calib/mu_w": 0.43909722222222225, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21333333333333332, "calib/std_conf": 0.4290423116718848, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45851910828025483, "calib/step_q_c_n": 628.0, "calib/step_q_gap": 0.05716352955241333, "calib/step_q_w": 0.4013555787278415, "calib/step_q_w_n": 959.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2635.0, "completions/max_terminated_length": 2635.0, "completions/mean_length": 494.09765625, "completions/mean_terminated_length": 501.94049072265625, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.16213333333333332, "grad_norm": 0.03179221972823143, "kl": 0.16754150390625, "learning_rate": 1.3333333333333334e-06, "loss": -0.108, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033517200499773026, "mask/share_reasoning": 0.8194787502288818, "mask/share_step_conf": 0.13137902319431305, "num_tokens": 35265772.0, "reward": 1.2691566944122314, "reward_std": 0.30439138412475586, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.6493926048278809, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8061790466308594, "step": 152 }, { "adv/mean_abs_final_conf": 0.7420669198036194, "adv/mean_abs_reasoning": 0.49139267206192017, "adv/mean_abs_step_conf": 0.760850191116333, "adv/ratio_final_to_reasoning": 1.510130211526866, "adv/ratio_step_to_reasoning": 1.5483547768910535, "adv/std_final_conf": 0.9150649905204773, "adv/std_reasoning": 0.7575465440750122, "adv/std_step_conf": 0.9362543821334839, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6268711574445335, "calib/avg_num_step_conf": 6.4765625, "calib/ece": 0.342326530612245, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5061224489795918, "calib/gap": 0.18099572306869827, "calib/mean_conf": 0.6292653061224489, "calib/mu_c": 0.7149612403100776, "calib/mu_w": 0.5339655172413793, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.22253061224489806, "calib/std_conf": 0.42697257836292707, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5168028169014085, "calib/step_q_c_n": 710.0, "calib/step_q_gap": 0.15653910382123976, "calib/step_q_w": 0.36026371308016875, "calib/step_q_w_n": 948.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 523.55859375, "completions/mean_terminated_length": 529.766845703125, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.1632, "grad_norm": 0.042571209371089935, "kl": 0.1490936279296875, "learning_rate": 1.3055555555555556e-06, "loss": -0.002, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.032519422471523285, "mask/share_reasoning": 0.8294144868850708, "mask/share_step_conf": 0.1263473629951477, "num_tokens": 35507123.0, "reward": 1.244004726409912, "reward_std": 0.3157392144203186, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6202394366264343, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7877911329269409, "step": 153 }, { "adv/mean_abs_final_conf": 0.717631459236145, "adv/mean_abs_reasoning": 0.549142599105835, "adv/mean_abs_step_conf": 0.7597761154174805, "adv/ratio_final_to_reasoning": 1.3068216896752487, "adv/ratio_step_to_reasoning": 1.3835679778888372, "adv/std_final_conf": 0.9057055711746216, "adv/std_reasoning": 0.8098655939102173, "adv/std_step_conf": 0.9360241293907166, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.774935064935065, "calib/avg_num_step_conf": 6.35546875, "calib/ece": 0.2357200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.54, "calib/gap": 0.45655844155844166, "calib/mean_conf": 0.6286, "calib/mu_c": 0.8842727272727273, "calib/mu_w": 0.42771428571428566, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21216000000000007, "calib/std_conf": 0.43497590737878805, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5320440251572327, "calib/step_q_c_n": 636.0, "calib/step_q_gap": 0.15046985764966458, "calib/step_q_w": 0.3815741675075681, "calib/step_q_w_n": 991.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 480.4921875, "completions/mean_terminated_length": 488.11907958984375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.16426666666666667, "grad_norm": 0.0457763709127903, "kl": 0.167510986328125, "learning_rate": 1.2777777777777779e-06, "loss": -0.0284, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033241137862205505, "mask/share_reasoning": 0.8187060952186584, "mask/share_step_conf": 0.13242775201797485, "num_tokens": 35734569.0, "reward": 1.3308035135269165, "reward_std": 0.2905329465866089, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.728347659111023, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8267859220504761, "step": 154 }, { "adv/mean_abs_final_conf": 0.7934857606887817, "adv/mean_abs_reasoning": 0.6398835778236389, "adv/mean_abs_step_conf": 0.7779837846755981, "adv/ratio_final_to_reasoning": 1.240047077606792, "adv/ratio_step_to_reasoning": 1.2158208330985196, "adv/std_final_conf": 0.9311753511428833, "adv/std_reasoning": 0.8429872393608093, "adv/std_step_conf": 0.9361916184425354, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7321982424399069, "calib/avg_num_step_conf": 5.4765625, "calib/ece": 0.2558862433862434, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4166666666666667, "calib/gap": 0.3731976393555614, "calib/mean_conf": 0.5428703703703704, "calib/mu_c": 0.7590880503144655, "calib/mu_w": 0.3858904109589041, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18906084656084654, "calib/std_conf": 0.4343976621428475, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49617021276595746, "calib/step_q_c_n": 611.0, "calib/step_q_gap": 0.08285794980767686, "calib/step_q_w": 0.4133122629582806, "calib/step_q_w_n": 791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 448.10546875, "completions/mean_terminated_length": 453.4189758300781, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.16533333333333333, "grad_norm": 0.055559657514095306, "kl": 0.1867218017578125, "learning_rate": 1.25e-06, "loss": -0.0928, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035333290696144104, "mask/share_reasoning": 0.8192988634109497, "mask/share_step_conf": 0.13364915549755096, "num_tokens": 35956500.0, "reward": 1.3064546585083008, "reward_std": 0.294373095035553, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.7230759859085083, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8050729036331177, "step": 155 }, { "adv/mean_abs_final_conf": 0.723948061466217, "adv/mean_abs_reasoning": 0.5916285514831543, "adv/mean_abs_step_conf": 0.7519354820251465, "adv/ratio_final_to_reasoning": 1.2236530161557464, "adv/ratio_step_to_reasoning": 1.2709587462270349, "adv/std_final_conf": 0.9007982015609741, "adv/std_reasoning": 0.8098682761192322, "adv/std_step_conf": 0.9361743927001953, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7168635170603674, "calib/avg_num_step_conf": 6.16015625, "calib/ece": 0.2771574898785426, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5263157894736842, "calib/gap": 0.32841032152230953, "calib/mean_conf": 0.6245429149797571, "calib/mu_c": 0.7840944881889762, "calib/mu_w": 0.45568416666666667, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.19376518218623492, "calib/std_conf": 0.43705373911094175, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.482020202020202, "calib/step_q_c_n": 792.0, "calib/step_q_gap": 0.09669536125587086, "calib/step_q_w": 0.38532484076433116, "calib/step_q_w_n": 785.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 480.828125, "completions/mean_terminated_length": 486.5296630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.1664, "grad_norm": 0.024023277685046196, "kl": 0.1688385009765625, "learning_rate": 1.2222222222222223e-06, "loss": -0.0595, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033583179116249084, "mask/share_reasoning": 0.8209841251373291, "mask/share_step_conf": 0.13371390104293823, "num_tokens": 36184352.0, "reward": 1.29721999168396, "reward_std": 0.26833876967430115, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6821708679199219, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8100408315658569, "step": 156 }, { "adv/mean_abs_final_conf": 0.7223216891288757, "adv/mean_abs_reasoning": 0.617812991142273, "adv/mean_abs_step_conf": 0.7440441250801086, "adv/ratio_final_to_reasoning": 1.1691591136556985, "adv/ratio_step_to_reasoning": 1.204319325989645, "adv/std_final_conf": 0.9130797982215881, "adv/std_reasoning": 0.8430401086807251, "adv/std_step_conf": 0.9360954761505127, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7498316498316497, "calib/avg_num_step_conf": 6.734375, "calib/ece": 0.21815261044176704, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5742971887550201, "calib/gap": 0.39060000000000006, "calib/mean_conf": 0.6653012048192771, "calib/mu_c": 0.8206, "calib/mu_w": 0.42999999999999994, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14052208835341362, "calib/std_conf": 0.4235393155846786, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5152123142250531, "calib/step_q_c_n": 942.0, "calib/step_q_gap": 0.1559540022045927, "calib/step_q_w": 0.3592583120204604, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 521.7890625, "completions/mean_terminated_length": 523.8353271484375, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.16746666666666668, "grad_norm": 0.037799037992954254, "kl": 0.149688720703125, "learning_rate": 1.1944444444444446e-06, "loss": -0.0506, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03428104519844055, "mask/share_reasoning": 0.8231499195098877, "mask/share_step_conf": 0.13866281509399414, "num_tokens": 36421658.0, "reward": 1.3527193069458008, "reward_std": 0.2752750515937805, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7433562278747559, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8251818418502808, "step": 157 }, { "adv/mean_abs_final_conf": 0.7296226024627686, "adv/mean_abs_reasoning": 0.5862867832183838, "adv/mean_abs_step_conf": 0.7673842906951904, "adv/ratio_final_to_reasoning": 1.2444807274309544, "adv/ratio_step_to_reasoning": 1.3088889476284686, "adv/std_final_conf": 0.9110372066497803, "adv/std_reasoning": 0.8098851442337036, "adv/std_step_conf": 0.9361823201179504, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6482356647763075, "calib/avg_num_step_conf": 5.90625, "calib/ece": 0.33415019762845855, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6561264822134387, "calib/gap": 0.17831884057970993, "calib/mean_conf": 0.7725691699604743, "calib/mu_c": 0.853623188405797, "calib/mu_w": 0.6753043478260871, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2806324110671937, "calib/std_conf": 0.3613087215264664, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5248849104859336, "calib/step_q_c_n": 782.0, "calib/step_q_gap": 0.09002189678730343, "calib/step_q_w": 0.4348630136986301, "calib/step_q_w_n": 730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1921.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 454.515625, "completions/mean_terminated_length": 456.2980651855469, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.16853333333333334, "grad_norm": 0.03632142394781113, "kl": 0.177001953125, "learning_rate": 1.1666666666666668e-06, "loss": -0.0558, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03808081895112991, "mask/share_reasoning": 0.8153369426727295, "mask/share_step_conf": 0.1426759660243988, "num_tokens": 36643254.0, "reward": 1.2715260982513428, "reward_std": 0.27559033036231995, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6506484150886536, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7934674024581909, "step": 158 }, { "adv/mean_abs_final_conf": 0.7299063801765442, "adv/mean_abs_reasoning": 0.5525414943695068, "adv/mean_abs_step_conf": 0.7550630569458008, "adv/ratio_final_to_reasoning": 1.320998309836304, "adv/ratio_step_to_reasoning": 1.3665273371140876, "adv/std_final_conf": 0.8906761407852173, "adv/std_reasoning": 0.7755322456359863, "adv/std_step_conf": 0.9361896514892578, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7569686411149826, "calib/avg_num_step_conf": 5.78515625, "calib/ece": 0.24634538152610447, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5180722891566265, "calib/gap": 0.3564614789005032, "calib/mean_conf": 0.6423293172690763, "calib/mu_c": 0.8184126984126984, "calib/mu_w": 0.46195121951219514, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19132530120481933, "calib/std_conf": 0.4102543485521629, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5589057750759878, "calib/step_q_c_n": 658.0, "calib/step_q_gap": 0.12097138868473628, "calib/step_q_w": 0.43793438639125154, "calib/step_q_w_n": 823.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 459.9765625, "completions/mean_terminated_length": 467.2778015136719, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.1696, "grad_norm": 0.027983862906694412, "kl": 0.1714935302734375, "learning_rate": 1.138888888888889e-06, "loss": -0.0764, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03498659282922745, "mask/share_reasoning": 0.8221287727355957, "mask/share_step_conf": 0.12725967168807983, "num_tokens": 36865792.0, "reward": 1.3289536237716675, "reward_std": 0.28263670206069946, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7210820317268372, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8219281435012817, "step": 159 }, { "adv/mean_abs_final_conf": 0.7490166425704956, "adv/mean_abs_reasoning": 0.6461282968521118, "adv/mean_abs_step_conf": 0.7792707681655884, "adv/ratio_final_to_reasoning": 1.1592382599859006, "adv/ratio_step_to_reasoning": 1.206061972462956, "adv/std_final_conf": 0.8929372429847717, "adv/std_reasoning": 0.8590587377548218, "adv/std_step_conf": 0.9364528059959412, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7202148437500001, "calib/avg_num_step_conf": 6.2109375, "calib/ece": 0.29913951612903233, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5403225806451613, "calib/gap": 0.2767598958333334, "calib/mean_conf": 0.6469895161290323, "calib/mu_c": 0.7898333333333334, "calib/mu_w": 0.5130734375, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23112903225806458, "calib/std_conf": 0.4231781631810817, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5374873668188737, "calib/step_q_c_n": 657.0, "calib/step_q_gap": 0.1430393496698919, "calib/step_q_w": 0.39444801714898176, "calib/step_q_w_n": 933.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 525.546875, "completions/mean_terminated_length": 529.68505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.17066666666666666, "grad_norm": 0.028019089251756668, "kl": 0.1685791015625, "learning_rate": 1.111111111111111e-06, "loss": -0.0121, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03304993361234665, "mask/share_reasoning": 0.8344004154205322, "mask/share_step_conf": 0.12473714351654053, "num_tokens": 37105172.0, "reward": 1.2739359140396118, "reward_std": 0.3185603618621826, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6611646413803101, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7999942898750305, "step": 160 }, { "adv/mean_abs_final_conf": 0.6585201621055603, "adv/mean_abs_reasoning": 0.5039167404174805, "adv/mean_abs_step_conf": 0.7582094669342041, "adv/ratio_final_to_reasoning": 1.3068035040074186, "adv/ratio_step_to_reasoning": 1.5046324246065916, "adv/std_final_conf": 0.8666963577270508, "adv/std_reasoning": 0.7575646638870239, "adv/std_step_conf": 0.9358623027801514, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7797034662392306, "calib/avg_num_step_conf": 5.8359375, "calib/ece": 0.17889763779527565, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.531496062992126, "calib/gap": 0.4497709209911172, "calib/mean_conf": 0.6440157480314961, "calib/mu_c": 0.8086956521739129, "calib/mu_w": 0.3589247311827957, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09452755905511817, "calib/std_conf": 0.41721472377691976, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5666857798165137, "calib/step_q_c_n": 872.0, "calib/step_q_gap": 0.17173401132776767, "calib/step_q_w": 0.394951768488746, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1779.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 478.234375, "completions/mean_terminated_length": 480.1098327636719, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.17173333333333332, "grad_norm": 0.06240729242563248, "kl": 0.1655120849609375, "learning_rate": 1.0833333333333335e-06, "loss": 0.011, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03659544885158539, "mask/share_reasoning": 0.8229581713676453, "mask/share_step_conf": 0.13654009997844696, "num_tokens": 37331520.0, "reward": 1.4037129878997803, "reward_std": 0.23486876487731934, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7923383116722107, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8458250761032104, "step": 161 }, { "adv/mean_abs_final_conf": 0.6904687881469727, "adv/mean_abs_reasoning": 0.6107890605926514, "adv/mean_abs_step_conf": 0.7381073832511902, "adv/ratio_final_to_reasoning": 1.1304537567798083, "adv/ratio_step_to_reasoning": 1.2084489243062102, "adv/std_final_conf": 0.878374457359314, "adv/std_reasoning": 0.8428692817687988, "adv/std_step_conf": 0.9361791014671326, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7453532892446919, "calib/avg_num_step_conf": 5.4765625, "calib/ece": 0.2343307086614173, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5984251968503937, "calib/gap": 0.3466982248520711, "calib/mean_conf": 0.6826771653543308, "calib/mu_c": 0.7986982248520711, "calib/mu_w": 0.452, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1258267716535433, "calib/std_conf": 0.41175435842660285, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5216063829787234, "calib/step_q_c_n": 940.0, "calib/step_q_gap": 0.06203928341162385, "calib/step_q_w": 0.45956709956709957, "calib/step_q_w_n": 462.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 451.19921875, "completions/mean_terminated_length": 451.19921875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.1728, "grad_norm": 0.034508116543293, "kl": 0.1730499267578125, "learning_rate": 1.0555555555555557e-06, "loss": 0.0406, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03664601221680641, "mask/share_reasoning": 0.8280225992202759, "mask/share_step_conf": 0.1353314220905304, "num_tokens": 37551171.0, "reward": 1.3770867586135864, "reward_std": 0.26874053478240967, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.748751163482666, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8382580280303955, "step": 162 }, { "adv/mean_abs_final_conf": 0.7308937311172485, "adv/mean_abs_reasoning": 0.555732011795044, "adv/mean_abs_step_conf": 0.7637782096862793, "adv/ratio_final_to_reasoning": 1.3151909834317856, "adv/ratio_step_to_reasoning": 1.3743642501702125, "adv/std_final_conf": 0.8922277688980103, "adv/std_reasoning": 0.8098985552787781, "adv/std_step_conf": 0.9362261891365051, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8334797000607984, "calib/avg_num_step_conf": 6.6484375, "calib/ece": 0.21786885245901638, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.44842396811457136, "calib/mean_conf": 0.6450000000000001, "calib/mu_c": 0.8857522123893805, "calib/mu_w": 0.43732824427480915, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19987704918032786, "calib/std_conf": 0.4110208203413768, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5253343465045592, "calib/step_q_c_n": 658.0, "calib/step_q_gap": 0.1565220859681608, "calib/step_q_w": 0.36881226053639843, "calib/step_q_w_n": 1044.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 533.70703125, "completions/mean_terminated_length": 537.909423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.17386666666666667, "grad_norm": 0.027746781706809998, "kl": 0.1472320556640625, "learning_rate": 1.0277777777777777e-06, "loss": 0.0927, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03390224650502205, "mask/share_reasoning": 0.8214752078056335, "mask/share_step_conf": 0.13681000471115112, "num_tokens": 37792632.0, "reward": 1.310382604598999, "reward_std": 0.3030407428741455, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.732288658618927, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8051756620407104, "step": 163 }, { "adv/mean_abs_final_conf": 0.73732590675354, "adv/mean_abs_reasoning": 0.5500204563140869, "adv/mean_abs_step_conf": 0.7625619173049927, "adv/ratio_final_to_reasoning": 1.3405426985291855, "adv/ratio_step_to_reasoning": 1.3864246475762618, "adv/std_final_conf": 0.9164314270019531, "adv/std_reasoning": 0.7928760051727295, "adv/std_step_conf": 0.9362967014312744, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7497416020671834, "calib/avg_num_step_conf": 5.98046875, "calib/ece": 0.22694779116465869, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4859437751004016, "calib/gap": 0.34820348837209314, "calib/mean_conf": 0.6128112449799197, "calib/mu_c": 0.7806201550387598, "calib/mu_w": 0.43241666666666667, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.16084337349397596, "calib/std_conf": 0.4210706955329504, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5458677685950414, "calib/step_q_c_n": 726.0, "calib/step_q_gap": 0.15917211642112833, "calib/step_q_w": 0.38669565217391305, "calib/step_q_w_n": 805.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2020.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 522.65234375, "completions/mean_terminated_length": 524.7019653320312, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.17493333333333333, "grad_norm": 0.02840862236917019, "kl": 0.1704559326171875, "learning_rate": 1.0000000000000002e-06, "loss": -0.0138, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03115016594529152, "mask/share_reasoning": 0.8422520160675049, "mask/share_step_conf": 0.1226915642619133, "num_tokens": 38032567.0, "reward": 1.32122802734375, "reward_std": 0.2894755005836487, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7138882875442505, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8170182704925537, "step": 164 }, { "adv/mean_abs_final_conf": 0.7540109157562256, "adv/mean_abs_reasoning": 0.5042850971221924, "adv/mean_abs_step_conf": 0.7738252282142639, "adv/ratio_final_to_reasoning": 1.4952076118432718, "adv/ratio_step_to_reasoning": 1.5344994976656225, "adv/std_final_conf": 0.9058592319488525, "adv/std_reasoning": 0.757585883140564, "adv/std_step_conf": 0.936016321182251, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7407310875089592, "calib/avg_num_step_conf": 6.0234375, "calib/ece": 0.3011111111111112, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5436507936507936, "calib/gap": 0.36137095197758523, "calib/mean_conf": 0.6255555555555556, "calib/mu_c": 0.8392233009708737, "calib/mu_w": 0.4778523489932885, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25896825396825407, "calib/std_conf": 0.4335769563437503, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5241541038525963, "calib/step_q_c_n": 597.0, "calib/step_q_gap": 0.13628108797958044, "calib/step_q_w": 0.38787301587301587, "calib/step_q_w_n": 945.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 508.125, "completions/mean_terminated_length": 510.11767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.176, "grad_norm": 0.05751694738864899, "kl": 0.1658935546875, "learning_rate": 9.722222222222224e-07, "loss": -0.0496, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03207601606845856, "mask/share_reasoning": 0.8398910760879517, "mask/share_step_conf": 0.12412666529417038, "num_tokens": 38268223.0, "reward": 1.3164559602737427, "reward_std": 0.2493373155593872, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.6870867013931274, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.834240734577179, "step": 165 }, { "adv/mean_abs_final_conf": 0.7017987966537476, "adv/mean_abs_reasoning": 0.5672744512557983, "adv/mean_abs_step_conf": 0.7582178711891174, "adv/ratio_final_to_reasoning": 1.2371415548508262, "adv/ratio_step_to_reasoning": 1.3365979545008946, "adv/std_final_conf": 0.8928700685501099, "adv/std_reasoning": 0.8098106384277344, "adv/std_step_conf": 0.9361168742179871, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8369572251342781, "calib/avg_num_step_conf": 6.4609375, "calib/ece": 0.17043307086614184, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5354330708661418, "calib/gap": 0.5123956513298391, "calib/mean_conf": 0.6342913385826772, "calib/mu_c": 0.8380392156862747, "calib/mu_w": 0.32564356435643566, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10118110236220484, "calib/std_conf": 0.43354618332374484, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5169125395152793, "calib/step_q_c_n": 949.0, "calib/step_q_gap": 0.1139044544088963, "calib/step_q_w": 0.403008085106383, "calib/step_q_w_n": 705.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 524.88671875, "completions/mean_terminated_length": 526.9451293945312, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.17706666666666668, "grad_norm": 0.025736430659890175, "kl": 0.149139404296875, "learning_rate": 9.444444444444445e-07, "loss": -0.0181, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0328802764415741, "mask/share_reasoning": 0.8294801712036133, "mask/share_step_conf": 0.1337333470582962, "num_tokens": 38508778.0, "reward": 1.4159806966781616, "reward_std": 0.2332712709903717, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.8105738162994385, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.851709246635437, "step": 166 }, { "adv/mean_abs_final_conf": 0.6965587735176086, "adv/mean_abs_reasoning": 0.5031703114509583, "adv/mean_abs_step_conf": 0.7456004619598389, "adv/ratio_final_to_reasoning": 1.3843399693216978, "adv/ratio_step_to_reasoning": 1.4818053549499, "adv/std_final_conf": 0.8917292952537537, "adv/std_reasoning": 0.7393972277641296, "adv/std_step_conf": 0.9362724423408508, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6883830455259028, "calib/avg_num_step_conf": 5.72265625, "calib/ece": 0.2430155378486055, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6215139442231076, "calib/gap": 0.2955332548403976, "calib/mean_conf": 0.7614466135458168, "calib/mu_c": 0.8838986394557823, "calib/mu_w": 0.5883653846153847, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.20940239043824693, "calib/std_conf": 0.36386610615123716, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.500891861761427, "calib/step_q_c_n": 897.0, "calib/step_q_gap": 0.002577953310722747, "calib/step_q_w": 0.4983139084507042, "calib/step_q_w_n": 568.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 477.23046875, "completions/mean_terminated_length": 480.9881896972656, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.17813333333333334, "grad_norm": 0.0267344880849123, "kl": 0.15435791015625, "learning_rate": 9.166666666666666e-07, "loss": -0.0979, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03314051032066345, "mask/share_reasoning": 0.834426760673523, "mask/share_step_conf": 0.12462019920349121, "num_tokens": 38736557.0, "reward": 1.3204123973846436, "reward_std": 0.28089213371276855, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7191629409790039, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8057527542114258, "step": 167 }, { "adv/mean_abs_final_conf": 0.750705897808075, "adv/mean_abs_reasoning": 0.5782451629638672, "adv/mean_abs_step_conf": 0.7345632314682007, "adv/ratio_final_to_reasoning": 1.2982484694904128, "adv/ratio_step_to_reasoning": 1.2703318220650663, "adv/std_final_conf": 0.8839114904403687, "adv/std_reasoning": 0.8267065286636353, "adv/std_step_conf": 0.936215877532959, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7323698648467818, "calib/avg_num_step_conf": 6.25, "calib/ece": 0.27145748987854246, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6072874493927125, "calib/gap": 0.2645269637361167, "calib/mean_conf": 0.7159109311740892, "calib/mu_c": 0.8294326241134753, "calib/mu_w": 0.5649056603773586, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.20825910931174085, "calib/std_conf": 0.3865950712978174, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5384486873508353, "calib/step_q_c_n": 838.0, "calib/step_q_gap": 0.16377677134033658, "calib/step_q_w": 0.3746719160104987, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 529.08984375, "completions/mean_terminated_length": 535.3636474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 5.0, "epoch": 0.1792, "grad_norm": 0.05579206719994545, "kl": 0.156463623046875, "learning_rate": 8.88888888888889e-07, "loss": -0.124, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.032205671072006226, "mask/share_reasoning": 0.8297091126441956, "mask/share_step_conf": 0.1263664960861206, "num_tokens": 38976676.0, "reward": 1.3209105730056763, "reward_std": 0.3096902370452881, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6887921690940857, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.82534259557724, "step": 168 }, { "adv/mean_abs_final_conf": 0.7418397068977356, "adv/mean_abs_reasoning": 0.4948415160179138, "adv/mean_abs_step_conf": 0.7277093529701233, "adv/ratio_final_to_reasoning": 1.4991460556249694, "adv/ratio_step_to_reasoning": 1.4705907435296504, "adv/std_final_conf": 0.9168627858161926, "adv/std_reasoning": 0.757630467414856, "adv/std_step_conf": 0.9362656474113464, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7354974160206718, "calib/avg_num_step_conf": 5.64453125, "calib/ece": 0.25493975903614463, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5421686746987951, "calib/gap": 0.33155038759689937, "calib/mean_conf": 0.651566265060241, "calib/mu_c": 0.8233333333333335, "calib/mu_w": 0.4917829457364341, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.21228915662650608, "calib/std_conf": 0.4101341202740365, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5571779141104295, "calib/step_q_c_n": 652.0, "calib/step_q_gap": 0.1413897678304799, "calib/step_q_w": 0.4157881462799496, "calib/step_q_w_n": 793.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 506.43359375, "completions/mean_terminated_length": 512.4387817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.18026666666666666, "grad_norm": 0.046045560389757156, "kl": 0.16729736328125, "learning_rate": 8.611111111111112e-07, "loss": -0.0953, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032685041427612305, "mask/share_reasoning": 0.8343138694763184, "mask/share_step_conf": 0.12128231674432755, "num_tokens": 39210507.0, "reward": 1.3042843341827393, "reward_std": 0.3071857690811157, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6927363276481628, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8145567774772644, "step": 169 }, { "adv/mean_abs_final_conf": 0.72629714012146, "adv/mean_abs_reasoning": 0.5629629492759705, "adv/mean_abs_step_conf": 0.7413750290870667, "adv/ratio_final_to_reasoning": 1.2901331092135893, "adv/ratio_step_to_reasoning": 1.3169162021062895, "adv/std_final_conf": 0.8827546238899231, "adv/std_reasoning": 0.7929536700248718, "adv/std_step_conf": 0.9361873269081116, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8037874281181835, "calib/avg_num_step_conf": 5.98046875, "calib/ece": 0.26321422764227637, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.532520325203252, "calib/gap": 0.35340894308943105, "calib/mean_conf": 0.691907723577236, "calib/mu_c": 0.8686121951219513, "calib/mu_w": 0.5152032520325203, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.22756097560975605, "calib/std_conf": 0.39446754890037317, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.523393351800554, "calib/step_q_c_n": 722.0, "calib/step_q_gap": 0.11769495872268004, "calib/step_q_w": 0.405698393077874, "calib/step_q_w_n": 809.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 516.30078125, "completions/mean_terminated_length": 522.4229736328125, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.18133333333333335, "grad_norm": 0.03416380286216736, "kl": 0.151458740234375, "learning_rate": 8.333333333333333e-07, "loss": -0.0485, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03184031695127487, "mask/share_reasoning": 0.8301719427108765, "mask/share_step_conf": 0.12626899778842926, "num_tokens": 39446832.0, "reward": 1.315016508102417, "reward_std": 0.29806241393089294, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7016829252243042, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8204249143600464, "step": 170 }, { "adv/mean_abs_final_conf": 0.7734802961349487, "adv/mean_abs_reasoning": 0.6048990488052368, "adv/mean_abs_step_conf": 0.7889896035194397, "adv/ratio_final_to_reasoning": 1.2786931929595264, "adv/ratio_step_to_reasoning": 1.3043326900212662, "adv/std_final_conf": 0.9199161529541016, "adv/std_reasoning": 0.8098820447921753, "adv/std_step_conf": 0.9362908005714417, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7333517801143701, "calib/avg_num_step_conf": 5.85546875, "calib/ece": 0.2887109375000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.578125, "calib/gap": 0.3302373485826725, "calib/mean_conf": 0.6685546875, "calib/mu_c": 0.847863247863248, "calib/mu_w": 0.5176258992805755, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25011718750000006, "calib/std_conf": 0.4151776974342159, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5139170506912443, "calib/step_q_c_n": 651.0, "calib/step_q_gap": 0.1098604469176594, "calib/step_q_w": 0.4040566037735849, "calib/step_q_w_n": 848.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 472.85546875, "completions/mean_terminated_length": 474.7098388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.1824, "grad_norm": 0.0407269224524498, "kl": 0.153961181640625, "learning_rate": 8.055555555555557e-07, "loss": -0.0005, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03458784520626068, "mask/share_reasoning": 0.8299896717071533, "mask/share_step_conf": 0.13151618838310242, "num_tokens": 39674779.0, "reward": 1.3298715353012085, "reward_std": 0.28158968687057495, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.6986308693885803, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.834852933883667, "step": 171 }, { "adv/mean_abs_final_conf": 0.7229958772659302, "adv/mean_abs_reasoning": 0.5644388198852539, "adv/mean_abs_step_conf": 0.7528114318847656, "adv/ratio_final_to_reasoning": 1.2809109717381058, "adv/ratio_step_to_reasoning": 1.3337343310968697, "adv/std_final_conf": 0.8914034366607666, "adv/std_reasoning": 0.792880654335022, "adv/std_step_conf": 0.9360430836677551, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7570093457943925, "calib/avg_num_step_conf": 6.3828125, "calib/ece": 0.24393574297188755, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5502008032128514, "calib/gap": 0.26905752270633165, "calib/mean_conf": 0.7326907630522088, "calib/mu_c": 0.8483098591549297, "calib/mu_w": 0.5792523364485981, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20317269076305222, "calib/std_conf": 0.35359483307726064, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.521430412371134, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.12123227717299878, "calib/step_q_w": 0.4001981351981352, "calib/step_q_w_n": 858.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2584.0, "completions/max_terminated_length": 2584.0, "completions/mean_length": 482.5234375, "completions/mean_terminated_length": 490.18255615234375, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.18346666666666667, "grad_norm": 0.03096671588718891, "kl": 0.1573486328125, "learning_rate": 7.777777777777779e-07, "loss": -0.0987, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03464246541261673, "mask/share_reasoning": 0.8165166974067688, "mask/share_step_conf": 0.13321584463119507, "num_tokens": 39901657.0, "reward": 1.3283240795135498, "reward_std": 0.2723774313926697, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.711389422416687, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8202854990959167, "step": 172 }, { "adv/mean_abs_final_conf": 0.7424459457397461, "adv/mean_abs_reasoning": 0.4958227872848511, "adv/mean_abs_step_conf": 0.7755153775215149, "adv/ratio_final_to_reasoning": 1.4974018233518775, "adv/ratio_step_to_reasoning": 1.564097894266364, "adv/std_final_conf": 0.9158072471618652, "adv/std_reasoning": 0.7393432855606079, "adv/std_step_conf": 0.9363213777542114, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6958172458172459, "calib/avg_num_step_conf": 6.42578125, "calib/ece": 0.28857312252964423, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7312252964426877, "calib/gap": 0.20472252252252243, "calib/mean_conf": 0.8242252964426877, "calib/mu_c": 0.9091891891891892, "calib/mu_w": 0.7044666666666668, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26390909090909087, "calib/std_conf": 0.31273512361407085, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5037248677248677, "calib/step_q_c_n": 945.0, "calib/step_q_gap": 0.05908201058201057, "calib/step_q_w": 0.4446428571428571, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 505.04296875, "completions/mean_terminated_length": 509.0196838378906, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.18453333333333333, "grad_norm": 0.05622625723481178, "kl": 0.1695098876953125, "learning_rate": 7.5e-07, "loss": -0.0635, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036126069724559784, "mask/share_reasoning": 0.8191242814064026, "mask/share_step_conf": 0.13693715631961823, "num_tokens": 40134108.0, "reward": 1.3052852153778076, "reward_std": 0.2680211067199707, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6933628916740417, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.801572322845459, "step": 173 }, { "adv/mean_abs_final_conf": 0.8037816882133484, "adv/mean_abs_reasoning": 0.693625271320343, "adv/mean_abs_step_conf": 0.761623740196228, "adv/ratio_final_to_reasoning": 1.1588125771186484, "adv/ratio_step_to_reasoning": 1.0980334363343585, "adv/std_final_conf": 0.9364737272262573, "adv/std_reasoning": 0.8591563701629639, "adv/std_step_conf": 0.9362601637840271, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6913145539906104, "calib/avg_num_step_conf": 6.12890625, "calib/ece": 0.31102459016393447, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4385245901639344, "calib/gap": 0.2563117923225626, "calib/mean_conf": 0.5779918032786884, "calib/mu_c": 0.7271568627450979, "calib/mu_w": 0.4708450704225353, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.23549180327868852, "calib/std_conf": 0.42763798455522295, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5175798319327731, "calib/step_q_c_n": 595.0, "calib/step_q_gap": 0.0935757251565924, "calib/step_q_w": 0.4240041067761807, "calib/step_q_w_n": 974.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2813.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 534.12890625, "completions/mean_terminated_length": 540.4624633789062, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.1856, "grad_norm": 0.05832623690366745, "kl": 0.149993896484375, "learning_rate": 7.222222222222222e-07, "loss": -0.0732, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03035867214202881, "mask/share_reasoning": 0.840134859085083, "mask/share_step_conf": 0.11778773367404938, "num_tokens": 40375077.0, "reward": 1.2217075824737549, "reward_std": 0.32435470819473267, "rewards/accuracy_reward_step": 0.3984375, "rewards/final_brier_reward_step": 0.6375941634178162, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7681448459625244, "step": 174 }, { "adv/mean_abs_final_conf": 0.7520076036453247, "adv/mean_abs_reasoning": 0.6014555096626282, "adv/mean_abs_step_conf": 0.7382874488830566, "adv/ratio_final_to_reasoning": 1.2503129351448539, "adv/ratio_step_to_reasoning": 1.2275013480168817, "adv/std_final_conf": 0.919256865978241, "adv/std_reasoning": 0.8266659379005432, "adv/std_step_conf": 0.9361310601234436, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7569778311965811, "calib/avg_num_step_conf": 5.9765625, "calib/ece": 0.2595564516129033, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4274193548387097, "calib/gap": 0.36887286324786345, "calib/mean_conf": 0.551008064516129, "calib/mu_c": 0.7651923076923078, "calib/mu_w": 0.3963194444444444, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1956048387096775, "calib/std_conf": 0.4343601522589836, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5354068716094034, "calib/step_q_c_n": 553.0, "calib/step_q_gap": 0.15549899034021197, "calib/step_q_w": 0.3799078812691914, "calib/step_q_w_n": 977.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2779.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 538.66796875, "completions/mean_terminated_length": 540.7803955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.18666666666666668, "grad_norm": 0.03274182230234146, "kl": 0.146697998046875, "learning_rate": 6.944444444444446e-07, "loss": 0.0414, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03309997171163559, "mask/share_reasoning": 0.8338706493377686, "mask/share_step_conf": 0.12912318110466003, "num_tokens": 40618800.0, "reward": 1.3135071992874146, "reward_std": 0.28972986340522766, "rewards/accuracy_reward_step": 0.40625, "rewards/final_brier_reward_step": 0.7034190893173218, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8250787854194641, "step": 175 }, { "adv/mean_abs_final_conf": 0.7331399917602539, "adv/mean_abs_reasoning": 0.6168496012687683, "adv/mean_abs_step_conf": 0.7408859729766846, "adv/ratio_final_to_reasoning": 1.1885230861012044, "adv/ratio_step_to_reasoning": 1.2010804115829723, "adv/std_final_conf": 0.8998392820358276, "adv/std_reasoning": 0.8429999947547913, "adv/std_step_conf": 0.9361100792884827, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7558459862899609, "calib/avg_num_step_conf": 5.671875, "calib/ece": 0.24537680000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.48, "calib/gap": 0.3795347684028445, "calib/mean_conf": 0.5871831999999999, "calib/mu_c": 0.7830231404958677, "calib/mu_w": 0.40348837209302324, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17428000000000005, "calib/std_conf": 0.4378722281645183, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5473282442748092, "calib/step_q_c_n": 655.0, "calib/step_q_gap": 0.14457291177794596, "calib/step_q_w": 0.40275533249686324, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2071.0, "completions/max_terminated_length": 2071.0, "completions/mean_length": 487.06640625, "completions/mean_terminated_length": 490.9015808105469, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.18773333333333334, "grad_norm": 0.031166227534413338, "kl": 0.14788818359375, "learning_rate": 6.666666666666667e-07, "loss": -0.017, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0351436585187912, "mask/share_reasoning": 0.8261876106262207, "mask/share_step_conf": 0.1308562457561493, "num_tokens": 40847553.0, "reward": 1.346287727355957, "reward_std": 0.29176032543182373, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7162603139877319, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8436262607574463, "step": 176 }, { "adv/mean_abs_final_conf": 0.7423900365829468, "adv/mean_abs_reasoning": 0.5823933482170105, "adv/mean_abs_step_conf": 0.7375138401985168, "adv/ratio_final_to_reasoning": 1.2747227262395149, "adv/ratio_step_to_reasoning": 1.2663500406665111, "adv/std_final_conf": 0.9186911582946777, "adv/std_reasoning": 0.8266282081604004, "adv/std_step_conf": 0.9363012313842773, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7760194562899786, "calib/avg_num_step_conf": 6.390625, "calib/ece": 0.22763821138211385, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.43902439024390244, "calib/gap": 0.383294776119403, "calib/mean_conf": 0.5449634146341463, "calib/mu_c": 0.75375, "calib/mu_w": 0.37045522388059704, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.15865853658536588, "calib/std_conf": 0.4306305674860772, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5332937181663837, "calib/step_q_c_n": 589.0, "calib/step_q_gap": 0.14477413841471232, "calib/step_q_w": 0.3885195797516714, "calib/step_q_w_n": 1047.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2894.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 521.9375, "completions/mean_terminated_length": 530.2222290039062, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.1888, "grad_norm": 0.0248885378241539, "kl": 0.151275634765625, "learning_rate": 6.388888888888889e-07, "loss": -0.1378, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03300992771983147, "mask/share_reasoning": 0.8253693580627441, "mask/share_step_conf": 0.1259956657886505, "num_tokens": 41085001.0, "reward": 1.2926164865493774, "reward_std": 0.31039512157440186, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.7154799103736877, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7954233884811401, "step": 177 }, { "adv/mean_abs_final_conf": 0.7208993434906006, "adv/mean_abs_reasoning": 0.5811500549316406, "adv/mean_abs_step_conf": 0.7468301057815552, "adv/ratio_final_to_reasoning": 1.2404702320391217, "adv/ratio_step_to_reasoning": 1.2850899684925663, "adv/std_final_conf": 0.9046099781990051, "adv/std_reasoning": 0.8098732829093933, "adv/std_step_conf": 0.9358938932418823, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7845218146837956, "calib/avg_num_step_conf": 5.70703125, "calib/ece": 0.18523622047244093, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4566929133858268, "calib/gap": 0.4271315087196672, "calib/mean_conf": 0.5947637795275591, "calib/mu_c": 0.8016030534351144, "calib/mu_w": 0.3744715447154472, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1321259842519685, "calib/std_conf": 0.4155611699042295, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5520152671755726, "calib/step_q_c_n": 655.0, "calib/step_q_gap": 0.18025348057507634, "calib/step_q_w": 0.37176178660049625, "calib/step_q_w_n": 806.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1507.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 463.15625, "completions/mean_terminated_length": 464.9725646972656, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.18986666666666666, "grad_norm": 0.031120169907808304, "kl": 0.1669769287109375, "learning_rate": 6.111111111111112e-07, "loss": -0.0598, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034978143870830536, "mask/share_reasoning": 0.8296171426773071, "mask/share_step_conf": 0.13149845600128174, "num_tokens": 41309641.0, "reward": 1.3911949396133423, "reward_std": 0.24237877130508423, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7785370945930481, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8515357375144958, "step": 178 }, { "adv/mean_abs_final_conf": 0.748763918876648, "adv/mean_abs_reasoning": 0.6570069789886475, "adv/mean_abs_step_conf": 0.7516197562217712, "adv/ratio_final_to_reasoning": 1.1396590033628637, "adv/ratio_step_to_reasoning": 1.1440057415809561, "adv/std_final_conf": 0.9030925035476685, "adv/std_reasoning": 0.874788761138916, "adv/std_step_conf": 0.9364590644836426, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7651485788113694, "calib/avg_num_step_conf": 5.70703125, "calib/ece": 0.2261847389558233, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.39759036144578314, "calib/gap": 0.3853565891472869, "calib/mean_conf": 0.5206425702811245, "calib/mu_c": 0.7063565891472868, "calib/mu_w": 0.32099999999999995, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11437751004016064, "calib/std_conf": 0.43710482870833245, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5291322314049587, "calib/step_q_c_n": 726.0, "calib/step_q_gap": 0.11379889807162535, "calib/step_q_w": 0.41533333333333333, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 492.3671875, "completions/mean_terminated_length": 496.24407958984375, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.19093333333333334, "grad_norm": 0.03421265259385109, "kl": 0.165557861328125, "learning_rate": 5.833333333333334e-07, "loss": -0.0614, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03357543796300888, "mask/share_reasoning": 0.8331658840179443, "mask/share_step_conf": 0.12544618546962738, "num_tokens": 41541951.0, "reward": 1.33596932888031, "reward_std": 0.300996333360672, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7311320304870605, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8223564028739929, "step": 179 }, { "adv/mean_abs_final_conf": 0.7390499711036682, "adv/mean_abs_reasoning": 0.4804949462413788, "adv/mean_abs_step_conf": 0.743632435798645, "adv/ratio_final_to_reasoning": 1.5381014449471506, "adv/ratio_step_to_reasoning": 1.5476384124653788, "adv/std_final_conf": 0.903160810470581, "adv/std_reasoning": 0.7393597364425659, "adv/std_step_conf": 0.9363460540771484, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7838493223491895, "calib/avg_num_step_conf": 6.37890625, "calib/ece": 0.20865591397849462, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.45564516129032256, "calib/gap": 0.39409070776862437, "calib/mean_conf": 0.5893279569892472, "calib/mu_c": 0.7577699530516432, "calib/mu_w": 0.3636792452830188, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1127016129032258, "calib/std_conf": 0.42477539496987315, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4794345898004435, "calib/step_q_c_n": 902.0, "calib/step_q_gap": 0.07114047215338476, "calib/step_q_w": 0.40829411764705875, "calib/step_q_w_n": 731.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2855.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 563.41015625, "completions/mean_terminated_length": 570.0909423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.192, "grad_norm": 0.04266770929098129, "kl": 0.1641082763671875, "learning_rate": 5.555555555555555e-07, "loss": -0.004, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03042653203010559, "mask/share_reasoning": 0.8367537260055542, "mask/share_step_conf": 0.12110098451375961, "num_tokens": 41790040.0, "reward": 1.3342862129211426, "reward_std": 0.26652398705482483, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7434638738632202, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8102104067802429, "step": 180 }, { "adv/mean_abs_final_conf": 0.7349035143852234, "adv/mean_abs_reasoning": 0.5623339414596558, "adv/mean_abs_step_conf": 0.7635202407836914, "adv/ratio_final_to_reasoning": 1.3068809477827839, "adv/ratio_step_to_reasoning": 1.357770151312251, "adv/std_final_conf": 0.908625066280365, "adv/std_reasoning": 0.7929151058197021, "adv/std_step_conf": 0.9361339211463928, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7744262295081967, "calib/avg_num_step_conf": 5.5859375, "calib/ece": 0.2017004048582996, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.3562753036437247, "calib/gap": 0.39174754098360665, "calib/mean_conf": 0.4978947368421053, "calib/mu_c": 0.6961475409836066, "calib/mu_w": 0.3044, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10283400809716597, "calib/std_conf": 0.4215448034412612, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5263936781609195, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.11937732938707757, "calib/step_q_w": 0.40701634877384196, "calib/step_q_w_n": 734.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 475.109375, "completions/mean_terminated_length": 478.85040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.19306666666666666, "grad_norm": 0.04156802222132683, "kl": 0.1792144775390625, "learning_rate": 5.277777777777779e-07, "loss": -0.1099, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03539499640464783, "mask/share_reasoning": 0.8246821761131287, "mask/share_step_conf": 0.1321103572845459, "num_tokens": 42017932.0, "reward": 1.3226896524429321, "reward_std": 0.2621150612831116, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7333539128303528, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8126533031463623, "step": 181 }, { "adv/mean_abs_final_conf": 0.7645466327667236, "adv/mean_abs_reasoning": 0.5872629880905151, "adv/mean_abs_step_conf": 0.7413380146026611, "adv/ratio_final_to_reasoning": 1.301881181466324, "adv/ratio_step_to_reasoning": 1.2623612072218628, "adv/std_final_conf": 0.9094211459159851, "adv/std_reasoning": 0.8265780210494995, "adv/std_step_conf": 0.9362402558326721, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7184949553370605, "calib/avg_num_step_conf": 6.16015625, "calib/ece": 0.25443999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.46, "calib/gap": 0.29653042863569185, "calib/mean_conf": 0.59972, "calib/mu_c": 0.7384962406015038, "calib/mu_w": 0.4419658119658119, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16107999999999997, "calib/std_conf": 0.4127841101592938, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48672823218997363, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.07880393426567567, "calib/step_q_w": 0.40792429792429796, "calib/step_q_w_n": 819.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 485.36328125, "completions/mean_terminated_length": 489.1850280761719, "completions/min_length": 0.0, "completions/min_terminated_length": 46.0, "epoch": 0.19413333333333332, "grad_norm": 0.030157707631587982, "kl": 0.1594390869140625, "learning_rate": 5.000000000000001e-07, "loss": -0.0533, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033575158566236496, "mask/share_reasoning": 0.8242987394332886, "mask/share_step_conf": 0.13431356847286224, "num_tokens": 42248345.0, "reward": 1.3270999193191528, "reward_std": 0.26894062757492065, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7067433595657349, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8241187930107117, "step": 182 }, { "adv/mean_abs_final_conf": 0.7454149723052979, "adv/mean_abs_reasoning": 0.6075682640075684, "adv/mean_abs_step_conf": 0.7380647659301758, "adv/ratio_final_to_reasoning": 1.2268826672882511, "adv/ratio_step_to_reasoning": 1.2147849215524231, "adv/std_final_conf": 0.9097917079925537, "adv/std_reasoning": 0.8431042432785034, "adv/std_step_conf": 0.9364758133888245, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7013360010326578, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.2636546184738956, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3815261044176707, "calib/gap": 0.27784368142506755, "calib/mean_conf": 0.5104819277108434, "calib/mu_c": 0.6466141732283462, "calib/mu_w": 0.3687704918032787, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.13204819277108434, "calib/std_conf": 0.4287890025519976, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5201923076923076, "calib/step_q_c_n": 676.0, "calib/step_q_gap": 0.08576493339063168, "calib/step_q_w": 0.43442737430167594, "calib/step_q_w_n": 716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 511.19921875, "completions/mean_terminated_length": 513.2039794921875, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.1952, "grad_norm": 0.0336591973900795, "kl": 0.1542510986328125, "learning_rate": 4.7222222222222226e-07, "loss": -0.0883, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0331193245947361, "mask/share_reasoning": 0.8445234894752502, "mask/share_step_conf": 0.11845093220472336, "num_tokens": 42485892.0, "reward": 1.2918308973312378, "reward_std": 0.29553234577178955, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6858261823654175, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8020428419113159, "step": 183 }, { "adv/mean_abs_final_conf": 0.7678780555725098, "adv/mean_abs_reasoning": 0.6362776756286621, "adv/mean_abs_step_conf": 0.7713160514831543, "adv/ratio_final_to_reasoning": 1.2068285356920976, "adv/ratio_step_to_reasoning": 1.2122318305778528, "adv/std_final_conf": 0.9147010445594788, "adv/std_reasoning": 0.8429502248764038, "adv/std_step_conf": 0.9361193180084229, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7704157267160812, "calib/avg_num_step_conf": 6.2890625, "calib/ece": 0.23265873015873023, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.47619047619047616, "calib/gap": 0.3899329680953915, "calib/mean_conf": 0.5756746031746032, "calib/mu_c": 0.7412413793103448, "calib/mu_w": 0.3513084112149533, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11646825396825404, "calib/std_conf": 0.4313672675502065, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5164947245017585, "calib/step_q_c_n": 853.0, "calib/step_q_gap": 0.14344320534720106, "calib/step_q_w": 0.3730515191545574, "calib/step_q_w_n": 757.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2428.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 486.18359375, "completions/mean_terminated_length": 490.0118103027344, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.19626666666666667, "grad_norm": 0.04198385775089264, "kl": 0.1663665771484375, "learning_rate": 4.444444444444445e-07, "loss": 0.0095, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032702185213565826, "mask/share_reasoning": 0.8291609287261963, "mask/share_step_conf": 0.1303243637084961, "num_tokens": 42715635.0, "reward": 1.378580093383789, "reward_std": 0.25681740045547485, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.748262882232666, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8493705987930298, "step": 184 }, { "adv/mean_abs_final_conf": 0.7074645757675171, "adv/mean_abs_reasoning": 0.5239733457565308, "adv/mean_abs_step_conf": 0.751834511756897, "adv/ratio_final_to_reasoning": 1.3501919162434788, "adv/ratio_step_to_reasoning": 1.4348716739997, "adv/std_final_conf": 0.8876582384109497, "adv/std_reasoning": 0.7929244637489319, "adv/std_step_conf": 0.9362289309501648, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.8213764844780888, "calib/avg_num_step_conf": 6.546875, "calib/ece": 0.18537500000000012, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.4875, "calib/gap": 0.46675810820195845, "calib/mean_conf": 0.6085416666666668, "calib/mu_c": 0.8438655462184874, "calib/mu_w": 0.37710743801652896, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1490416666666668, "calib/std_conf": 0.4239093337777418, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5482189781021898, "calib/step_q_c_n": 685.0, "calib/step_q_gap": 0.20910697002953588, "calib/step_q_w": 0.3391120080726539, "calib/step_q_w_n": 991.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2869.0, "completions/max_terminated_length": 2869.0, "completions/mean_length": 515.0390625, "completions/mean_terminated_length": 533.8056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.19733333333333333, "grad_norm": 0.03402863070368767, "kl": 0.1391143798828125, "learning_rate": 4.1666666666666667e-07, "loss": -0.1259, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.031878791749477386, "mask/share_reasoning": 0.8101688027381897, "mask/share_step_conf": 0.12279614061117172, "num_tokens": 42954405.0, "reward": 1.288568377494812, "reward_std": 0.3048917055130005, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7415418028831482, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7775630950927734, "step": 185 }, { "adv/mean_abs_final_conf": 0.7325637936592102, "adv/mean_abs_reasoning": 0.5513955950737, "adv/mean_abs_step_conf": 0.7554478645324707, "adv/ratio_final_to_reasoning": 1.3285630139306701, "adv/ratio_step_to_reasoning": 1.370065106217428, "adv/std_final_conf": 0.9124969840049744, "adv/std_reasoning": 0.7928171753883362, "adv/std_step_conf": 0.936065673828125, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7283317505539728, "calib/avg_num_step_conf": 5.89453125, "calib/ece": 0.20779404761904754, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.44841269841269843, "calib/gap": 0.3931548148148149, "calib/mean_conf": 0.5706186507936508, "calib/mu_c": 0.7531548148148149, "calib/mu_w": 0.36000000000000004, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12134920634920626, "calib/std_conf": 0.43167462008825397, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4990738255033557, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.08035654801644476, "calib/step_q_w": 0.41871727748691095, "calib/step_q_w_n": 764.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2509.0, "completions/max_terminated_length": 2509.0, "completions/mean_length": 505.578125, "completions/mean_terminated_length": 505.578125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.1984, "grad_norm": 0.023764928802847862, "kl": 0.1626434326171875, "learning_rate": 3.8888888888888895e-07, "loss": 0.0044, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03325989097356796, "mask/share_reasoning": 0.840133786201477, "mask/share_step_conf": 0.12660633027553558, "num_tokens": 43188873.0, "reward": 1.3611667156219482, "reward_std": 0.24674609303474426, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7474247813224792, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.83628249168396, "step": 186 }, { "adv/mean_abs_final_conf": 0.7524943351745605, "adv/mean_abs_reasoning": 0.6573784947395325, "adv/mean_abs_step_conf": 0.7297996282577515, "adv/ratio_final_to_reasoning": 1.144689613664218, "adv/ratio_step_to_reasoning": 1.1101665693321985, "adv/std_final_conf": 0.9055358171463013, "adv/std_reasoning": 0.85908043384552, "adv/std_step_conf": 0.9362545609474182, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7031581038135593, "calib/avg_num_step_conf": 7.46875, "calib/ece": 0.2866666666666667, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4146341463414634, "calib/gap": 0.27080111228813564, "calib/mean_conf": 0.5493495934959349, "calib/mu_c": 0.6902542372881356, "calib/mu_w": 0.419453125, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1781707317073171, "calib/std_conf": 0.4234656470212806, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.49556300268096515, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.1635501381869685, "calib/step_q_w": 0.33201286449399664, "calib/step_q_w_n": 1166.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 552.9453125, "completions/mean_terminated_length": 559.5020141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.19946666666666665, "grad_norm": 0.02803722955286503, "kl": 0.153594970703125, "learning_rate": 3.611111111111111e-07, "loss": -0.1201, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03030928410589695, "mask/share_reasoning": 0.8272897601127625, "mask/share_step_conf": 0.13068220019340515, "num_tokens": 43431971.0, "reward": 1.2936794757843018, "reward_std": 0.31476491689682007, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6723042726516724, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8149491548538208, "step": 187 }, { "adv/mean_abs_final_conf": 0.7178840041160583, "adv/mean_abs_reasoning": 0.6043578386306763, "adv/mean_abs_step_conf": 0.7314138412475586, "adv/ratio_final_to_reasoning": 1.1878459386621079, "adv/ratio_step_to_reasoning": 1.210233068052463, "adv/std_final_conf": 0.90956711769104, "adv/std_reasoning": 0.8746255040168762, "adv/std_step_conf": 0.9362061619758606, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.801168646694215, "calib/avg_num_step_conf": 6.15234375, "calib/ece": 0.178714859437751, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.44176706827309237, "calib/gap": 0.44120544938016537, "calib/mean_conf": 0.6133333333333333, "calib/mu_c": 0.827734375, "calib/mu_w": 0.38652892561983465, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13899598393574297, "calib/std_conf": 0.4134482933796286, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5371942446043165, "calib/step_q_c_n": 695.0, "calib/step_q_gap": 0.1473078809679529, "calib/step_q_w": 0.38988636363636364, "calib/step_q_w_n": 880.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 520.61328125, "completions/mean_terminated_length": 526.7865600585938, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.20053333333333334, "grad_norm": 0.036614056676626205, "kl": 0.1472015380859375, "learning_rate": 3.3333333333333335e-07, "loss": -0.0877, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0330745093524456, "mask/share_reasoning": 0.8302721381187439, "mask/share_step_conf": 0.12493464350700378, "num_tokens": 43669320.0, "reward": 1.336802363395691, "reward_std": 0.306719034910202, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7565621137619019, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8124275207519531, "step": 188 }, { "adv/mean_abs_final_conf": 0.755175769329071, "adv/mean_abs_reasoning": 0.5345438122749329, "adv/mean_abs_step_conf": 0.7734057307243347, "adv/ratio_final_to_reasoning": 1.4127481265102741, "adv/ratio_step_to_reasoning": 1.446851900563293, "adv/std_final_conf": 0.9205330014228821, "adv/std_reasoning": 0.7753702402114868, "adv/std_step_conf": 0.9360719323158264, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.753629875722899, "calib/avg_num_step_conf": 5.4140625, "calib/ece": 0.21056392156862744, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.3254901960784314, "calib/gap": 0.35917858988556667, "calib/mean_conf": 0.46504392156862745, "calib/mu_c": 0.6467460317460317, "calib/mu_w": 0.28756744186046507, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09074509803921568, "calib/std_conf": 0.4212535678470118, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5273546511627907, "calib/step_q_c_n": 688.0, "calib/step_q_gap": 0.12582169987339242, "calib/step_q_w": 0.40153295128939825, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2591.0, "completions/max_terminated_length": 2591.0, "completions/mean_length": 488.1484375, "completions/mean_terminated_length": 488.1484375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.2016, "grad_norm": 0.03007141873240471, "kl": 0.1778106689453125, "learning_rate": 3.055555555555556e-07, "loss": 0.0315, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034504421055316925, "mask/share_reasoning": 0.844056248664856, "mask/share_step_conf": 0.12143930792808533, "num_tokens": 43902054.0, "reward": 1.363713264465332, "reward_std": 0.21309733390808105, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7483644485473633, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.840703010559082, "step": 189 }, { "adv/mean_abs_final_conf": 0.7530200481414795, "adv/mean_abs_reasoning": 0.6540989875793457, "adv/mean_abs_step_conf": 0.7409170866012573, "adv/ratio_final_to_reasoning": 1.1512325541554735, "adv/ratio_step_to_reasoning": 1.132729297354829, "adv/std_final_conf": 0.926189124584198, "adv/std_reasoning": 0.8746927380561829, "adv/std_step_conf": 0.9360911846160889, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7530051150895141, "calib/avg_num_step_conf": 6.28515625, "calib/ece": 0.21569721115537854, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.38247011952191234, "calib/gap": 0.3762877237851662, "calib/mean_conf": 0.5174501992031872, "calib/mu_c": 0.6898529411764706, "calib/mu_w": 0.3135652173913044, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09565737051792834, "calib/std_conf": 0.42127092912643754, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49150823827629914, "calib/step_q_c_n": 789.0, "calib/step_q_gap": 0.12434970169093329, "calib/step_q_w": 0.36715853658536585, "calib/step_q_w_n": 820.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2532.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 550.61328125, "completions/mean_terminated_length": 552.7725830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.20266666666666666, "grad_norm": 0.04317953810095787, "kl": 0.1549224853515625, "learning_rate": 2.7777777777777776e-07, "loss": -0.0855, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030800120905041695, "mask/share_reasoning": 0.8427908420562744, "mask/share_step_conf": 0.12250283360481262, "num_tokens": 44148619.0, "reward": 1.3544282913208008, "reward_std": 0.27702847123146057, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.745659351348877, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8300361037254333, "step": 190 }, { "adv/mean_abs_final_conf": 0.7219338417053223, "adv/mean_abs_reasoning": 0.5049686431884766, "adv/mean_abs_step_conf": 0.7618149518966675, "adv/ratio_final_to_reasoning": 1.4296607352624562, "adv/ratio_step_to_reasoning": 1.508638134610518, "adv/std_final_conf": 0.915317177772522, "adv/std_reasoning": 0.7576092481613159, "adv/std_step_conf": 0.9361116886138916, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7212732386636485, "calib/avg_num_step_conf": 6.703125, "calib/ece": 0.20628099173553713, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.45867768595041325, "calib/gap": 0.39857446662550583, "calib/mean_conf": 0.5647933884297521, "calib/mu_c": 0.7772566371681415, "calib/mu_w": 0.3786821705426357, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1520661157024793, "calib/std_conf": 0.4258003629016728, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5134523809523809, "calib/step_q_c_n": 672.0, "calib/step_q_gap": 0.14288916256157624, "calib/step_q_w": 0.37056321839080464, "calib/step_q_w_n": 1044.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 497.3515625, "completions/mean_terminated_length": 513.3951416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.20373333333333332, "grad_norm": 0.0278775691986084, "kl": 0.1644744873046875, "learning_rate": 2.5000000000000004e-07, "loss": -0.0643, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03402933105826378, "mask/share_reasoning": 0.798499584197998, "mask/share_step_conf": 0.13622108101844788, "num_tokens": 44380109.0, "reward": 1.2860069274902344, "reward_std": 0.25992679595947266, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.7171406745910645, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7887647151947021, "step": 191 }, { "adv/mean_abs_final_conf": 0.7224618792533875, "adv/mean_abs_reasoning": 0.5776307582855225, "adv/mean_abs_step_conf": 0.7494103312492371, "adv/ratio_final_to_reasoning": 1.250733048561578, "adv/ratio_step_to_reasoning": 1.2973864713741647, "adv/std_final_conf": 0.8930702209472656, "adv/std_reasoning": 0.809941291809082, "adv/std_step_conf": 0.9360520839691162, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8213797146765682, "calib/avg_num_step_conf": 6.2890625, "calib/ece": 0.17500000000000004, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4596774193548387, "calib/gap": 0.49003452543808224, "calib/mean_conf": 0.5714516129032258, "calib/mu_c": 0.8065891472868216, "calib/mu_w": 0.3165546218487394, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11314516129032262, "calib/std_conf": 0.43953273126382436, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5200802139037433, "calib/step_q_c_n": 748.0, "calib/step_q_gap": 0.16219158281325602, "calib/step_q_w": 0.35788863109048724, "calib/step_q_w_n": 862.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2824.0, "completions/max_terminated_length": 2824.0, "completions/mean_length": 485.30078125, "completions/mean_terminated_length": 496.9480285644531, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.2048, "grad_norm": 0.024019304662942886, "kl": 0.1692352294921875, "learning_rate": 2.2222222222222224e-07, "loss": -0.1015, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03512055426836014, "mask/share_reasoning": 0.8116319179534912, "mask/share_step_conf": 0.12981006503105164, "num_tokens": 44609322.0, "reward": 1.3618290424346924, "reward_std": 0.28302812576293945, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7742304801940918, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8274481296539307, "step": 192 }, { "adv/mean_abs_final_conf": 0.7940802574157715, "adv/mean_abs_reasoning": 0.6738282442092896, "adv/mean_abs_step_conf": 0.7666121125221252, "adv/ratio_final_to_reasoning": 1.178460927157473, "adv/ratio_step_to_reasoning": 1.1376966149908332, "adv/std_final_conf": 0.9310760498046875, "adv/std_reasoning": 0.8748868703842163, "adv/std_step_conf": 0.9364278316497803, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7163778457356439, "calib/avg_num_step_conf": 6.9296875, "calib/ece": 0.21254098360655727, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.27459016393442626, "calib/gap": 0.31756574923547376, "calib/mean_conf": 0.44319672131147547, "calib/mu_c": 0.6188990825688071, "calib/mu_w": 0.30133333333333334, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1045081967213114, "calib/std_conf": 0.4121470499820693, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4966328257191201, "calib/step_q_c_n": 591.0, "calib/step_q_gap": 0.1378416169279113, "calib/step_q_w": 0.3587912087912088, "calib/step_q_w_n": 1183.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2802.0, "completions/max_terminated_length": 2802.0, "completions/mean_length": 521.84765625, "completions/mean_terminated_length": 536.51806640625, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.20586666666666667, "grad_norm": 0.03789614513516426, "kl": 0.1476287841796875, "learning_rate": 1.9444444444444447e-07, "loss": -0.1338, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03177924454212189, "mask/share_reasoning": 0.8226780891418457, "mask/share_step_conf": 0.1181989535689354, "num_tokens": 44848627.0, "reward": 1.2911839485168457, "reward_std": 0.30973750352859497, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.7052562236785889, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8006651401519775, "step": 193 }, { "adv/mean_abs_final_conf": 0.7157238125801086, "adv/mean_abs_reasoning": 0.5523072481155396, "adv/mean_abs_step_conf": 0.7511518001556396, "adv/ratio_final_to_reasoning": 1.2958798115037289, "adv/ratio_step_to_reasoning": 1.3600252444967063, "adv/std_final_conf": 0.9128319621086121, "adv/std_reasoning": 0.7754966616630554, "adv/std_step_conf": 0.9362004995346069, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8436547256097561, "calib/avg_num_step_conf": 5.703125, "calib/ece": 0.140597609561753, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.43824701195219123, "calib/gap": 0.5422256097560976, "calib/mean_conf": 0.5455378486055777, "calib/mu_c": 0.81125, "calib/mu_w": 0.26902439024390246, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08808764940239043, "calib/std_conf": 0.43715579466195675, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5473294629898404, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.14849677816493767, "calib/step_q_w": 0.39883268482490275, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 479.7109375, "completions/mean_terminated_length": 481.5921936035156, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.20693333333333333, "grad_norm": 0.03265403211116791, "kl": 0.160736083984375, "learning_rate": 1.6666666666666668e-07, "loss": 0.0757, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03432423248887062, "mask/share_reasoning": 0.8371787071228027, "mask/share_step_conf": 0.12459082901477814, "num_tokens": 45077377.0, "reward": 1.3855798244476318, "reward_std": 0.2459278106689453, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.8086410164833069, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.833603024482727, "step": 194 }, { "adv/mean_abs_final_conf": 0.7741556167602539, "adv/mean_abs_reasoning": 0.5333757400512695, "adv/mean_abs_step_conf": 0.7351083755493164, "adv/ratio_final_to_reasoning": 1.4514263747463279, "adv/ratio_step_to_reasoning": 1.3782186184145828, "adv/std_final_conf": 0.9205122590065002, "adv/std_reasoning": 0.7577498555183411, "adv/std_step_conf": 0.9364016652107239, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7477688749918572, "calib/avg_num_step_conf": 5.87890625, "calib/ece": 0.22201612903225815, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.375, "calib/gap": 0.3719054133281219, "calib/mean_conf": 0.5161290322580645, "calib/mu_c": 0.7095798319327731, "calib/mu_w": 0.3376744186046512, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12915322580645167, "calib/std_conf": 0.43146358138359003, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5005189340813464, "calib/step_q_c_n": 713.0, "calib/step_q_gap": 0.10002398458639689, "calib/step_q_w": 0.4004949494949495, "calib/step_q_w_n": 792.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2083.0, "completions/max_terminated_length": 2083.0, "completions/mean_length": 501.7734375, "completions/mean_terminated_length": 507.72332763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.208, "grad_norm": 0.03370565548539162, "kl": 0.1640777587890625, "learning_rate": 1.3888888888888888e-07, "loss": -0.0795, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03386705741286278, "mask/share_reasoning": 0.8268728256225586, "mask/share_step_conf": 0.12754136323928833, "num_tokens": 45311815.0, "reward": 1.2978198528289795, "reward_std": 0.30957794189453125, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7212796807289124, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7942112684249878, "step": 195 }, { "adv/mean_abs_final_conf": 0.7403755187988281, "adv/mean_abs_reasoning": 0.5320618748664856, "adv/mean_abs_step_conf": 0.7526148557662964, "adv/ratio_final_to_reasoning": 1.3915214635226707, "adv/ratio_step_to_reasoning": 1.4145250605583342, "adv/std_final_conf": 0.9307049512863159, "adv/std_reasoning": 0.8262983560562134, "adv/std_step_conf": 0.9359113574028015, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7265818858560794, "calib/avg_num_step_conf": 5.75, "calib/ece": 0.22925196850393706, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.421259842519685, "calib/gap": 0.34886972704714647, "calib/mean_conf": 0.6013779527559056, "calib/mu_c": 0.7716923076923077, "calib/mu_w": 0.4228225806451612, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1594094488188977, "calib/std_conf": 0.4127267580818497, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5210863509749304, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.06977335362745024, "calib/step_q_w": 0.45131299734748015, "calib/step_q_w_n": 754.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2160.0, "completions/max_terminated_length": 2160.0, "completions/mean_length": 438.05078125, "completions/mean_terminated_length": 438.05078125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.20906666666666668, "grad_norm": 0.029743339866399765, "kl": 0.1661529541015625, "learning_rate": 1.1111111111111112e-07, "loss": 0.017, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03638753294944763, "mask/share_reasoning": 0.8245117664337158, "mask/share_step_conf": 0.13910070061683655, "num_tokens": 45526500.0, "reward": 1.3533697128295898, "reward_std": 0.24870756268501282, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.740282416343689, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8332284688949585, "step": 196 }, { "adv/mean_abs_final_conf": 0.7403615713119507, "adv/mean_abs_reasoning": 0.5740138292312622, "adv/mean_abs_step_conf": 0.7356804609298706, "adv/ratio_final_to_reasoning": 1.2897974466982212, "adv/ratio_step_to_reasoning": 1.2816423986075693, "adv/std_final_conf": 0.9200189709663391, "adv/std_reasoning": 0.8099476099014282, "adv/std_step_conf": 0.9362643957138062, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7839766980346968, "calib/avg_num_step_conf": 6.39453125, "calib/ece": 0.19609279999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.38, "calib/gap": 0.40051044107291467, "calib/mean_conf": 0.5369472, "calib/mu_c": 0.7404065040650406, "calib/mu_w": 0.33989606299212594, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12051999999999996, "calib/std_conf": 0.4242358605447682, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4830428954423592, "calib/step_q_c_n": 746.0, "calib/step_q_gap": 0.1014604038598676, "calib/step_q_w": 0.3815824915824916, "calib/step_q_w_n": 891.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 483.8515625, "completions/mean_terminated_length": 485.7490539550781, "completions/min_length": 0.0, "completions/min_terminated_length": 76.0, "epoch": 0.21013333333333334, "grad_norm": 0.024252887815237045, "kl": 0.171661376953125, "learning_rate": 8.333333333333334e-08, "loss": -0.0756, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03442458435893059, "mask/share_reasoning": 0.8235999345779419, "mask/share_step_conf": 0.1380692720413208, "num_tokens": 45755422.0, "reward": 1.3640518188476562, "reward_std": 0.2602362632751465, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7502653002738953, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8428254723548889, "step": 197 }, { "adv/mean_abs_final_conf": 0.7204113006591797, "adv/mean_abs_reasoning": 0.5719847679138184, "adv/mean_abs_step_conf": 0.7723501920700073, "adv/ratio_final_to_reasoning": 1.259493855556176, "adv/ratio_step_to_reasoning": 1.3502985313524611, "adv/std_final_conf": 0.917206883430481, "adv/std_reasoning": 0.792879045009613, "adv/std_step_conf": 0.9358345866203308, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8144945963127781, "calib/avg_num_step_conf": 5.953125, "calib/ece": 0.15970783532536514, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.3944223107569721, "calib/gap": 0.46558317440135627, "calib/mean_conf": 0.5441965471447543, "calib/mu_c": 0.7686410256410257, "calib/mu_w": 0.3030578512396694, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09298804780876489, "calib/std_conf": 0.42244257842246163, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5274160206718347, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.11324268733850135, "calib/step_q_w": 0.41417333333333334, "calib/step_q_w_n": 750.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3034.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 477.4140625, "completions/mean_terminated_length": 481.1732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.2112, "grad_norm": 0.03475111722946167, "kl": 0.1588134765625, "learning_rate": 5.555555555555556e-08, "loss": 0.0329, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0367676243185997, "mask/share_reasoning": 0.8161733746528625, "mask/share_step_conf": 0.13924649357795715, "num_tokens": 45983024.0, "reward": 1.3777697086334229, "reward_std": 0.22293974459171295, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7879693508148193, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8349568247795105, "step": 198 }, { "adv/mean_abs_final_conf": 0.753053605556488, "adv/mean_abs_reasoning": 0.6612117290496826, "adv/mean_abs_step_conf": 0.7454158663749695, "adv/ratio_final_to_reasoning": 1.1388993456586196, "adv/ratio_step_to_reasoning": 1.1273482208888037, "adv/std_final_conf": 0.9273759126663208, "adv/std_reasoning": 0.8747655153274536, "adv/std_step_conf": 0.9363988041877747, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6334498834498836, "calib/avg_num_step_conf": 5.91015625, "calib/ece": 0.318433734939759, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4738955823293173, "calib/gap": 0.18263014763014762, "calib/mean_conf": 0.6010040160642571, "calib/mu_c": 0.6868181818181818, "calib/mu_w": 0.5041880341880342, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1946586345381526, "calib/std_conf": 0.4170418342925998, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5158849797023004, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.09007102621392826, "calib/step_q_w": 0.42581395348837214, "calib/step_q_w_n": 774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2658.0, "completions/max_terminated_length": 2658.0, "completions/mean_length": 552.53125, "completions/mean_terminated_length": 554.6980590820312, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.21226666666666666, "grad_norm": 0.03976122662425041, "kl": 0.1533050537109375, "learning_rate": 2.777777777777778e-08, "loss": -0.025, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03416062518954277, "mask/share_reasoning": 0.8343572616577148, "mask/share_step_conf": 0.1275758445262909, "num_tokens": 46228672.0, "reward": 1.2902672290802002, "reward_std": 0.31403863430023193, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.64481520652771, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8190314769744873, "step": 199 }, { "adv/mean_abs_final_conf": 0.7184133529663086, "adv/mean_abs_reasoning": 0.5303160548210144, "adv/mean_abs_step_conf": 0.746435284614563, "adv/ratio_final_to_reasoning": 1.3546890508694451, "adv/ratio_step_to_reasoning": 1.4075291099125604, "adv/std_final_conf": 0.9026347994804382, "adv/std_reasoning": 0.7754338383674622, "adv/std_step_conf": 0.9360775947570801, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8091293047433399, "calib/avg_num_step_conf": 5.77734375, "calib/ece": 0.18152610441767067, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4859437751004016, "calib/gap": 0.49189473684210544, "calib/mean_conf": 0.5721285140562248, "calib/mu_c": 0.7973333333333334, "calib/mu_w": 0.305438596491228, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10574297188755019, "calib/std_conf": 0.44323188690852416, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.547486033519553, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.17153583692715457, "calib/step_q_w": 0.3759501965923985, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2934.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 518.390625, "completions/mean_terminated_length": 520.423583984375, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.21333333333333335, "grad_norm": 0.028024643659591675, "kl": 0.15216064453125, "learning_rate": 0.0, "loss": -0.0122, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033296119421720505, "mask/share_reasoning": 0.8391964435577393, "mask/share_step_conf": 0.12360115349292755, "num_tokens": 46469428.0, "reward": 1.3780052661895752, "reward_std": 0.2766263782978058, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7767866849899292, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.839611828327179, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 12.39864625471324, "train_runtime": 13271.4088, "train_samples_per_second": 3.858, "train_steps_per_second": 0.015 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 46469428, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }