{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.006959581281989813, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.0031, "num_tokens": 229171.0, "reward": 0.40509337186813354, "reward_std": 0.17624244093894958, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.23456206917762756, "step": 1 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.0071344017051160336, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0015, "num_tokens": 458661.0, "reward": 0.3357120156288147, "reward_std": 0.18962696194648743, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.2669021487236023, "step": 2 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4613862050456253, "calib/avg_num_step_conf": 4.8671875, "calib/ece": 0.25074803149606306, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.37401574803149606, "calib/gap": -0.003988191089640103, "calib/mean_conf": 0.8885433070866141, "calib/mu_c": 0.8870987654320989, "calib/mu_w": 0.891086956521739, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.25074803149606306, "calib/std_conf": 0.04568641072021581, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8050111576011157, "calib/step_q_c_n": 717.0, "calib/step_q_gap": 0.05370680977502895, "calib/step_q_w": 0.7513043478260868, "calib/step_q_w_n": 529.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 497.15234375, "completions/mean_terminated_length": 497.15234375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.0032, "grad_norm": 0.0076317982748150826, "kl": 0.001775592565536499, "learning_rate": 7.5e-07, "loss": 0.0464, "num_tokens": 691188.0, "reward": 0.3898078203201294, "reward_std": 0.17493298649787903, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.694400429725647, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.23822227120399475, "step": 3 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.44732205778717404, "calib/avg_num_step_conf": 4.9375, "calib/ece": 0.22003984063745022, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.24302788844621515, "calib/gap": -0.004470049330514536, "calib/mean_conf": 0.8774103585657371, "calib/mu_c": 0.8758787878787879, "calib/mu_w": 0.8803488372093025, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.22003984063745022, "calib/std_conf": 0.047352906365984486, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7971202916160388, "calib/step_q_c_n": 823.0, "calib/step_q_gap": 0.008820971888147677, "calib/step_q_w": 0.7882993197278911, "calib/step_q_w_n": 441.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2366.0, "completions/max_terminated_length": 2366.0, "completions/mean_length": 522.60546875, "completions/mean_terminated_length": 522.60546875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.004266666666666667, "grad_norm": 0.007802617270499468, "kl": 0.0003025531768798828, "learning_rate": 1.0000000000000002e-06, "loss": 0.0305, "num_tokens": 931143.0, "reward": 0.38953256607055664, "reward_std": 0.18929244577884674, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7030750513076782, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.24744734168052673, "step": 4 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.47041553748870824, "calib/avg_num_step_conf": 4.9375, "calib/ece": 0.3760240963855421, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.30120481927710846, "calib/gap": -0.005913666279519836, "calib/mean_conf": 0.8820481927710844, "calib/mu_c": 0.8791269841269842, "calib/mu_w": 0.885040650406504, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.3760240963855421, "calib/std_conf": 0.043853325339365874, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7959670164917543, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.010941890863613635, "calib/step_q_w": 0.7850251256281406, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2906.0, "completions/max_terminated_length": 2906.0, "completions/mean_length": 543.76953125, "completions/mean_terminated_length": 545.9019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.005333333333333333, "grad_norm": 0.006589222699403763, "kl": 0.00029391050338745117, "learning_rate": 1.25e-06, "loss": 0.0478, "num_tokens": 1177036.0, "reward": 0.29454368352890015, "reward_std": 0.16011402010917664, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5850351452827454, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": -0.287354052066803, "step": 5 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5200569440050617, "calib/avg_num_step_conf": 5.375, "calib/ece": 0.30322834645669294, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.2677165354330709, "calib/gap": 0.0029939892439100335, "calib/mean_conf": 0.8740944881889764, "calib/mu_c": 0.8753793103448275, "calib/mu_w": 0.8723853211009175, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30322834645669294, "calib/std_conf": 0.046063194023381486, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7994713656387666, "calib/step_q_c_n": 681.0, "calib/step_q_gap": 0.01912604189775935, "calib/step_q_w": 0.7803453237410073, "calib/step_q_w_n": 695.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3036.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 450.3828125, "completions/mean_terminated_length": 450.3828125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.0064, "grad_norm": 0.009682250209152699, "kl": 0.0006236135959625244, "learning_rate": 1.5e-06, "loss": 0.015, "num_tokens": 1398286.0, "reward": 0.3437407612800598, "reward_std": 0.16819913685321808, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6572445034980774, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.28148171305656433, "step": 6 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4516723356009071, "calib/avg_num_step_conf": 5.40625, "calib/ece": 0.220952380952381, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.32936507936507936, "calib/gap": -0.010833333333333584, "calib/mean_conf": 0.8801587301587303, "calib/mu_c": 0.876547619047619, "calib/mu_w": 0.8873809523809526, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.21722222222222226, "calib/std_conf": 0.0510210682748932, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7923841807909604, "calib/step_q_c_n": 885.0, "calib/step_q_gap": 0.010961335099577751, "calib/step_q_w": 0.7814228456913827, "calib/step_q_w_n": 499.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 533.61328125, "completions/mean_terminated_length": 539.9407348632812, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.007466666666666667, "grad_norm": 0.007666449528187513, "kl": 0.000966191291809082, "learning_rate": 1.75e-06, "loss": 0.0273, "num_tokens": 1642315.0, "reward": 0.39989861845970154, "reward_std": 0.17968562245368958, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7019117474555969, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.22789573669433594, "step": 7 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.45037819509650495, "calib/avg_num_step_conf": 4.8828125, "calib/ece": 0.3014000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.296, "calib/gap": 0.010517736045905002, "calib/mean_conf": 0.8694000000000001, "calib/mu_c": 0.873943661971831, "calib/mu_w": 0.863425925925926, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3014000000000001, "calib/std_conf": 0.09368479065462014, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8057534246575342, "calib/step_q_c_n": 657.0, "calib/step_q_gap": 0.0400535932915983, "calib/step_q_w": 0.7656998313659359, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 536.734375, "completions/mean_terminated_length": 540.9606323242188, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.008533333333333334, "grad_norm": 0.007007604464888573, "kl": 0.0007126033306121826, "learning_rate": 2.0000000000000003e-06, "loss": -0.0149, "num_tokens": 1886231.0, "reward": 0.34858155250549316, "reward_std": 0.17785796523094177, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6407878398895264, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.24909357726573944, "step": 8 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5257313472893184, "calib/avg_num_step_conf": 4.859375, "calib/ece": 0.2355905511811025, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.23622047244094488, "calib/gap": 0.007353730542136394, "calib/mean_conf": 0.8733858267716536, "calib/mu_c": 0.8760493827160495, "calib/mu_w": 0.8686956521739131, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2355905511811025, "calib/std_conf": 0.048011161308004424, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7510578105781059, "calib/step_q_c_n": 813.0, "calib/step_q_gap": -0.04105355833140689, "calib/step_q_w": 0.7921113689095128, "calib/step_q_w_n": 431.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 511.203125, "completions/mean_terminated_length": 513.2078857421875, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.0096, "grad_norm": 0.006651603616774082, "kl": 0.0003667175769805908, "learning_rate": 2.25e-06, "loss": 0.0019, "num_tokens": 2124635.0, "reward": 0.37856239080429077, "reward_std": 0.18048423528671265, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7026296854019165, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.2673799693584442, "step": 9 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.44472446236559143, "calib/avg_num_step_conf": 5.04296875, "calib/ece": 0.24976284584980227, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2885375494071146, "calib/gap": -0.007983870967742002, "calib/mean_conf": 0.8804347826086957, "calib/mu_c": 0.8774999999999998, "calib/mu_w": 0.8854838709677418, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24889328063241095, "calib/std_conf": 0.04371359236849638, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7954461942257218, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.005143736758803086, "calib/step_q_w": 0.7903024574669187, "calib/step_q_w_n": 529.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 514.5625, "completions/mean_terminated_length": 514.5625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.010666666666666666, "grad_norm": 0.007601493038237095, "kl": 0.0004711151123046875, "learning_rate": 2.5e-06, "loss": 0.0195, "num_tokens": 2363163.0, "reward": 0.3792797327041626, "reward_std": 0.18951097130775452, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6921863555908203, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.25628310441970825, "step": 10 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.45581132785548517, "calib/avg_num_step_conf": 5.22265625, "calib/ece": 0.2950390624999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.21875, "calib/gap": -0.010393903280436545, "calib/mean_conf": 0.8705859375, "calib/mu_c": 0.866241610738255, "calib/mu_w": 0.8766355140186916, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2917968749999998, "calib/std_conf": 0.05131140031460937, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7901703800786369, "calib/step_q_c_n": 763.0, "calib/step_q_gap": 0.003828916664002757, "calib/step_q_w": 0.7863414634146342, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 504.53125, "completions/mean_terminated_length": 506.50982666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.011733333333333333, "grad_norm": 0.006837381515651941, "kl": 0.0004589557647705078, "learning_rate": 2.7500000000000004e-06, "loss": 0.0013, "num_tokens": 2596803.0, "reward": 0.3525460362434387, "reward_std": 0.14610227942466736, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6657754182815552, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.27708953619003296, "step": 11 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.45588134076674236, "calib/avg_num_step_conf": 5.42578125, "calib/ece": 0.22531496062992135, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2795275590551181, "calib/gap": -0.00828205657650205, "calib/mean_conf": 0.8769685039370079, "calib/mu_c": 0.8741317365269462, "calib/mu_w": 0.8824137931034483, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2224015748031497, "calib/std_conf": 0.04995138376743412, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7984235294117646, "calib/step_q_c_n": 850.0, "calib/step_q_gap": 0.022968983957219136, "calib/step_q_w": 0.7754545454545455, "calib/step_q_w_n": 539.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2436.0, "completions/max_terminated_length": 2436.0, "completions/mean_length": 484.28515625, "completions/mean_terminated_length": 484.28515625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.0128, "grad_norm": 0.007849551737308502, "kl": 0.0010522007942199707, "learning_rate": 3e-06, "loss": 0.0313, "num_tokens": 2824956.0, "reward": 0.4141455888748169, "reward_std": 0.17251336574554443, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7109042406082153, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.21073806285858154, "step": 12 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5100972070098576, "calib/avg_num_step_conf": 4.83203125, "calib/ece": 0.23007874015748023, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2992125984251969, "calib/gap": 0.00084200438116111, "calib/mean_conf": 0.8795275590551181, "calib/mu_c": 0.8798192771084338, "calib/mu_w": 0.8789772727272727, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22803149606299206, "calib/std_conf": 0.04864070858691373, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7896358907672302, "calib/step_q_c_n": 769.0, "calib/step_q_gap": -0.0006418870105476815, "calib/step_q_w": 0.7902777777777779, "calib/step_q_w_n": 468.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2676.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 504.5703125, "completions/mean_terminated_length": 504.5703125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.013866666666666666, "grad_norm": 0.00747161079198122, "kl": 0.0017485618591308594, "learning_rate": 3.2500000000000002e-06, "loss": 0.0506, "num_tokens": 3058718.0, "reward": 0.40975651144981384, "reward_std": 0.1711602509021759, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7134867310523987, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.22131745517253876, "step": 13 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4449854266030736, "calib/avg_num_step_conf": 4.921875, "calib/ece": 0.32983805668016203, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3076923076923077, "calib/gap": -0.00949920508744051, "calib/mean_conf": 0.8804453441295548, "calib/mu_c": 0.8761764705882352, "calib/mu_w": 0.8856756756756757, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.32983805668016203, "calib/std_conf": 0.046114231169776115, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.804, "calib/step_q_c_n": 655.0, "calib/step_q_gap": 0.015983471074380362, "calib/step_q_w": 0.7880165289256197, "calib/step_q_w_n": 605.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2528.0, "completions/max_terminated_length": 2528.0, "completions/mean_length": 571.23828125, "completions/mean_terminated_length": 573.4784545898438, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.014933333333333333, "grad_norm": 0.0067930989898741245, "kl": 0.002413034439086914, "learning_rate": 3.5e-06, "loss": 0.0179, "num_tokens": 3310355.0, "reward": 0.33452653884887695, "reward_std": 0.19036152958869934, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6139481067657471, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": -0.24333252012729645, "step": 14 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5114159512255817, "calib/avg_num_step_conf": 4.92578125, "calib/ece": 0.33219607843137255, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.37254901960784315, "calib/gap": 0.0015173572228442955, "calib/mean_conf": 0.8851372549019607, "calib/mu_c": 0.8858156028368794, "calib/mu_w": 0.8842982456140351, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33219607843137255, "calib/std_conf": 0.044772636779947896, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7979115853658537, "calib/step_q_c_n": 656.0, "calib/step_q_gap": 0.004985965531143055, "calib/step_q_w": 0.7929256198347107, "calib/step_q_w_n": 605.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 488.13671875, "completions/mean_terminated_length": 490.0509948730469, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.016, "grad_norm": 0.006855354178696871, "kl": 0.005002260208129883, "learning_rate": 3.7500000000000005e-06, "loss": 0.0159, "num_tokens": 3543198.0, "reward": 0.3387652635574341, "reward_std": 0.17916326224803925, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6386894583702087, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.27053388953208923, "step": 15 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.44479110146500267, "calib/avg_num_step_conf": 5.828125, "calib/ece": 0.28172690763052205, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.41365461847389556, "calib/gap": 0.002976804123711152, "calib/mean_conf": 0.8875903614457831, "calib/mu_c": 0.8887499999999998, "calib/mu_w": 0.8857731958762887, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.279437751004016, "calib/std_conf": 0.07968699351343808, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.768563656147987, "calib/step_q_c_n": 919.0, "calib/step_q_gap": -0.016130933729848862, "calib/step_q_w": 0.7846945898778359, "calib/step_q_w_n": 573.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2768.0, "completions/max_terminated_length": 2768.0, "completions/mean_length": 679.59765625, "completions/mean_terminated_length": 682.2627563476562, "completions/min_length": 0.0, "completions/min_terminated_length": 220.0, "epoch": 0.017066666666666667, "grad_norm": 0.006001787725836039, "kl": 0.005553245544433594, "learning_rate": 4.000000000000001e-06, "loss": 0.0197, "num_tokens": 3826023.0, "reward": 0.358783483505249, "reward_std": 0.1683286428451538, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.654166042804718, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": -0.2490990310907364, "step": 16 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4394421335943235, "calib/avg_num_step_conf": 4.99609375, "calib/ece": 0.13988, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.244, "calib/gap": -0.005791534132615572, "calib/mean_conf": 0.8718800000000001, "calib/mu_c": 0.870327868852459, "calib/mu_w": 0.8761194029850746, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.13988, "calib/std_conf": 0.05323969947323144, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.7790087241003272, "calib/step_q_c_n": 917.0, "calib/step_q_gap": 0.000997674376570279, "calib/step_q_w": 0.7780110497237569, "calib/step_q_w_n": 362.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 546.64453125, "completions/mean_terminated_length": 550.9487915039062, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.018133333333333335, "grad_norm": 0.0068136779591441154, "kl": 0.00926065444946289, "learning_rate": 4.25e-06, "loss": 0.0026, "num_tokens": 4069492.0, "reward": 0.45704808831214905, "reward_std": 0.19913692772388458, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7529730796813965, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": -0.17325183749198914, "step": 17 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.464132475194422, "calib/avg_num_step_conf": 4.50390625, "calib/ece": 0.34877551020408165, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.3510204081632653, "calib/gap": -0.008856261732367976, "calib/mean_conf": 0.8793877551020409, "calib/mu_c": 0.8753030303030302, "calib/mu_w": 0.8841592920353982, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.34469387755102043, "calib/std_conf": 0.07845467073582399, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.7885454545454545, "calib/step_q_c_n": 550.0, "calib/step_q_gap": 0.07280747776270158, "calib/step_q_w": 0.715737976782753, "calib/step_q_w_n": 603.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 570.30078125, "completions/mean_terminated_length": 574.7913208007812, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.0192, "grad_norm": 0.006491835694760084, "kl": 0.013482093811035156, "learning_rate": 4.5e-06, "loss": -0.0022, "num_tokens": 4326209.0, "reward": 0.2919595241546631, "reward_std": 0.14282935857772827, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5701101422309875, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": -0.27290987968444824, "step": 18 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5346534653465347, "calib/avg_num_step_conf": 4.4453125, "calib/ece": 0.28625498007968125, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.38247011952191234, "calib/gap": 0.01972079207920774, "calib/mean_conf": 0.8838645418326693, "calib/mu_c": 0.8917999999999999, "calib/mu_w": 0.8720792079207922, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.28625498007968125, "calib/std_conf": 0.0766985976284869, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.789051987767584, "calib/step_q_c_n": 654.0, "calib/step_q_gap": 0.056799921651881524, "calib/step_q_w": 0.7322520661157025, "calib/step_q_w_n": 484.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 542.04296875, "completions/mean_terminated_length": 548.4703979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 254.0, "epoch": 0.020266666666666665, "grad_norm": 0.007006355561316013, "kl": 0.020032882690429688, "learning_rate": 4.75e-06, "loss": 0.0123, "num_tokens": 4569732.0, "reward": 0.3702203035354614, "reward_std": 0.1917993575334549, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6583890914916992, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": -0.22810472548007965, "step": 19 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49956510103037605, "calib/avg_num_step_conf": 4.80859375, "calib/ece": 0.31429149797570843, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3522267206477733, "calib/gap": -0.00024755787501651305, "calib/mean_conf": 0.8851417004048583, "calib/mu_c": 0.8850354609929078, "calib/mu_w": 0.8852830188679243, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.31429149797570843, "calib/std_conf": 0.048096143611593256, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.794763948497854, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.0019443996256736096, "calib/step_q_w": 0.7928195488721804, "calib/step_q_w_n": 532.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 565.83203125, "completions/mean_terminated_length": 565.83203125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.021333333333333333, "grad_norm": 0.01030268706381321, "kl": 0.0440673828125, "learning_rate": 5e-06, "loss": 0.0425, "num_tokens": 4819457.0, "reward": 0.3507111668586731, "reward_std": 0.17901304364204407, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6296054720878601, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": -0.2305269092321396, "step": 20 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5494007704380083, "calib/avg_num_step_conf": 4.96484375, "calib/ece": 0.23751004016064262, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3534136546184739, "calib/gap": 0.011117848480524994, "calib/mean_conf": 0.8853012048192771, "calib/mu_c": 0.8891411042944786, "calib/mu_w": 0.8780232558139536, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23409638554216872, "calib/std_conf": 0.054728332700301, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7738012422360248, "calib/step_q_c_n": 805.0, "calib/step_q_gap": 0.009273345240316644, "calib/step_q_w": 0.7645278969957081, "calib/step_q_w_n": 466.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2869.0, "completions/max_terminated_length": 2869.0, "completions/mean_length": 582.05078125, "completions/mean_terminated_length": 582.05078125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.0224, "grad_norm": 0.006925127934664488, "kl": 0.023862838745117188, "learning_rate": 4.9722222222222224e-06, "loss": 0.0254, "num_tokens": 5071422.0, "reward": 0.41713836789131165, "reward_std": 0.21328508853912354, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7029625177383423, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.19056078791618347, "step": 21 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5403197031328053, "calib/avg_num_step_conf": 4.39453125, "calib/ece": 0.1998031496062992, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3346456692913386, "calib/gap": 0.009304931135374561, "calib/mean_conf": 0.8809055118110235, "calib/mu_c": 0.8838728323699423, "calib/mu_w": 0.8745679012345677, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1998031496062992, "calib/std_conf": 0.04911386947028659, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7907911802853437, "calib/step_q_c_n": 771.0, "calib/step_q_gap": 0.0250002198333662, "calib/step_q_w": 0.7657909604519775, "calib/step_q_w_n": 354.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2940.0, "completions/max_terminated_length": 2940.0, "completions/mean_length": 542.12109375, "completions/mean_terminated_length": 542.12109375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.023466666666666667, "grad_norm": 0.007413065526634455, "kl": 0.030803680419921875, "learning_rate": 4.944444444444445e-06, "loss": 0.0027, "num_tokens": 5312021.0, "reward": 0.4359588921070099, "reward_std": 0.12823528051376343, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7348839640617371, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.19577866792678833, "step": 22 }, { "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.5566195115441078, "calib/avg_num_step_conf": 4.09765625, "calib/ece": 0.3977551020408163, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.24897959183673468, "calib/gap": 0.005119444815160912, "calib/mean_conf": 0.8751428571428572, "calib/mu_c": 0.8777966101694916, "calib/mu_w": 0.8726771653543307, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.3956326530612245, "calib/std_conf": 0.05291194070706304, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.7744303797468355, "calib/step_q_c_n": 474.0, "calib/step_q_gap": -0.004352228948816594, "calib/step_q_w": 0.7787826086956521, "calib/step_q_w_n": 575.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 583.55078125, "completions/mean_terminated_length": 583.55078125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.024533333333333334, "grad_norm": 0.007403541821986437, "kl": 0.03454780578613281, "learning_rate": 4.9166666666666665e-06, "loss": -0.004, "num_tokens": 5565346.0, "reward": 0.2809341549873352, "reward_std": 0.24287301301956177, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.5553504228591919, "rewards/format_reward_step": 0.9140625, "rewards/step_l2_reward": -0.26848214864730835, "step": 23 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.516395618984401, "calib/avg_num_step_conf": 3.78515625, "calib/ece": 0.3439024390243901, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.2967479674796748, "calib/gap": 0.001826750746763972, "calib/mean_conf": 0.8760162601626016, "calib/mu_c": 0.8768702290076337, "calib/mu_w": 0.8750434782608697, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.3436991869918698, "calib/std_conf": 0.05450053833400678, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.7803929273084479, "calib/step_q_c_n": 509.0, "calib/step_q_gap": 0.008936405569317407, "calib/step_q_w": 0.7714565217391305, "calib/step_q_w_n": 460.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2609.0, "completions/max_terminated_length": 2609.0, "completions/mean_length": 577.32421875, "completions/mean_terminated_length": 581.8700561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.0256, "grad_norm": 0.006647349335253239, "kl": 0.030107498168945312, "learning_rate": 4.888888888888889e-06, "loss": -0.0012, "num_tokens": 5817653.0, "reward": 0.32674214243888855, "reward_std": 0.22721973061561584, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5986887216567993, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": -0.23817317187786102, "step": 24 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5208709612683123, "calib/avg_num_step_conf": 3.76953125, "calib/ece": 0.26552, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.28, "calib/gap": 0.01258411933908632, "calib/mean_conf": 0.86952, "calib/mu_c": 0.8745033112582782, "calib/mu_w": 0.8619191919191919, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.26552, "calib/std_conf": 0.06923416497654897, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.7856052141527002, "calib/step_q_c_n": 537.0, "calib/step_q_gap": 0.02074540106858802, "calib/step_q_w": 0.7648598130841122, "calib/step_q_w_n": 428.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 522.05859375, "completions/mean_terminated_length": 524.1058959960938, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.02666666666666667, "grad_norm": 0.006916355807334185, "kl": 0.03505706787109375, "learning_rate": 4.861111111111111e-06, "loss": 0.0308, "num_tokens": 6054524.0, "reward": 0.3672490119934082, "reward_std": 0.1614658534526825, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6542297005653381, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": -0.22754418849945068, "step": 25 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4935005298481102, "calib/avg_num_step_conf": 3.5234375, "calib/ece": 0.26204918032786867, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.2827868852459016, "calib/gap": 0.004257152949487919, "calib/mean_conf": 0.8727049180327868, "calib/mu_c": 0.8743624161073825, "calib/mu_w": 0.8701052631578946, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.26204918032786867, "calib/std_conf": 0.054765324754675604, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8019260700389106, "calib/step_q_c_n": 514.0, "calib/step_q_gap": 0.018008544265714743, "calib/step_q_w": 0.7839175257731958, "calib/step_q_w_n": 388.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2383.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 551.98046875, "completions/mean_terminated_length": 554.1451416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 0.027733333333333332, "grad_norm": 0.006535641383379698, "kl": 0.028377532958984375, "learning_rate": 4.833333333333333e-06, "loss": 0.0319, "num_tokens": 6301071.0, "reward": 0.3780941367149353, "reward_std": 0.1892511248588562, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6601343154907227, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": -0.21097734570503235, "step": 26 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5303893637226972, "calib/avg_num_step_conf": 3.546875, "calib/ece": 0.3560082304526748, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.1934156378600823, "calib/gap": -0.00423076923076926, "calib/mean_conf": 0.8625925925925927, "calib/mu_c": 0.8605555555555555, "calib/mu_w": 0.8647863247863248, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.3500411522633744, "calib/std_conf": 0.0613183914524468, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.7823225806451612, "calib/step_q_c_n": 465.0, "calib/step_q_gap": 0.00038127139008226063, "calib/step_q_w": 0.781941309255079, "calib/step_q_w_n": 443.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 524.75390625, "completions/mean_terminated_length": 535.2072143554688, "completions/min_length": 0.0, "completions/min_terminated_length": 234.0, "epoch": 0.0288, "grad_norm": 0.0070977299474179745, "kl": 0.036457061767578125, "learning_rate": 4.805555555555556e-06, "loss": 0.0038, "num_tokens": 6540624.0, "reward": 0.30288949608802795, "reward_std": 0.21201197803020477, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5817859172821045, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": -0.26116320490837097, "step": 27 }, { "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6194423223834988, "calib/avg_num_step_conf": 3.0703125, "calib/ece": 0.22330543933054398, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.88671875, "calib/frac_conf_gt_0.9": 0.25523012552301255, "calib/gap": 0.0263071046600456, "calib/mean_conf": 0.8676569037656906, "calib/mu_c": 0.877012987012987, "calib/mu_w": 0.8507058823529414, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.9453125, "calib/nonempty_step_conf_rate": 0.90234375, "calib/pce": 0.22330543933054398, "calib/std_conf": 0.05225011725913397, "calib/step_conf_rate": 0.90234375, "calib/step_q_c": 0.7906796116504855, "calib/step_q_c_n": 515.0, "calib/step_q_gap": 0.013447139325762247, "calib/step_q_w": 0.7772324723247233, "calib/step_q_w_n": 271.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 588.74609375, "completions/mean_terminated_length": 595.727294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.029866666666666666, "grad_norm": 0.006945778150111437, "kl": 0.026153564453125, "learning_rate": 4.777777777777778e-06, "loss": 0.0547, "num_tokens": 6798287.0, "reward": 0.3795027434825897, "reward_std": 0.18767467141151428, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.643653929233551, "rewards/format_reward_step": 0.88671875, "rewards/step_l2_reward": -0.18230466544628143, "step": 28 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5269381946326918, "calib/avg_num_step_conf": 3.046875, "calib/ece": 0.38341563786008237, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.2222222222222222, "calib/gap": 0.004739766874491624, "calib/mean_conf": 0.8731275720164609, "calib/mu_c": 0.8755462184873948, "calib/mu_w": 0.8708064516129032, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.38341563786008237, "calib/std_conf": 0.04838218243560813, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.792140921409214, "calib/step_q_c_n": 369.0, "calib/step_q_gap": 0.015279607540600981, "calib/step_q_w": 0.776861313868613, "calib/step_q_w_n": 411.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 616.796875, "completions/mean_terminated_length": 616.796875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.030933333333333334, "grad_norm": 0.005464597605168819, "kl": 0.026638031005859375, "learning_rate": 4.75e-06, "loss": 0.0254, "num_tokens": 7063315.0, "reward": 0.30837979912757874, "reward_std": 0.17728251218795776, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.5679382681846619, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": -0.232428640127182, "step": 29 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5185072610604524, "calib/avg_num_step_conf": 3.21875, "calib/ece": 0.2950000000000002, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.21951219512195122, "calib/gap": 0.01468085106382977, "calib/mean_conf": 0.8650813008130082, "calib/mu_c": 0.8713475177304966, "calib/mu_w": 0.8566666666666668, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.2934552845528457, "calib/std_conf": 0.07664054982829054, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.7726666666666667, "calib/step_q_c_n": 450.0, "calib/step_q_gap": 0.024324420677361958, "calib/step_q_w": 0.7483422459893048, "calib/step_q_w_n": 374.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 609.2734375, "completions/mean_terminated_length": 609.2734375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.032, "grad_norm": 0.0061422535218298435, "kl": 0.033168792724609375, "learning_rate": 4.722222222222222e-06, "loss": -0.0106, "num_tokens": 7326273.0, "reward": 0.33184754848480225, "reward_std": 0.27306443452835083, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.614844560623169, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": -0.24568068981170654, "step": 30 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6292620865139948, "calib/avg_num_step_conf": 2.9765625, "calib/ece": 0.3940239043824701, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": 0.029038167938931436, "calib/mean_conf": 0.8688446215139443, "calib/mu_c": 0.8840000000000001, "calib/mu_w": 0.8549618320610687, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3923904382470119, "calib/std_conf": 0.08539024409036974, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.8020317585301837, "calib/step_q_c_n": 381.0, "calib/step_q_gap": 0.028094750656167955, "calib/step_q_w": 0.7739370078740158, "calib/step_q_w_n": 381.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 581.9375, "completions/mean_terminated_length": 584.2196655273438, "completions/min_length": 0.0, "completions/min_terminated_length": 215.0, "epoch": 0.03306666666666667, "grad_norm": 0.0060067446902394295, "kl": 0.02797698974609375, "learning_rate": 4.694444444444445e-06, "loss": 0.0063, "num_tokens": 7581161.0, "reward": 0.29306304454803467, "reward_std": 0.19945606589317322, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.5801421403884888, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": -0.27995362877845764, "step": 31 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6632510013351135, "calib/avg_num_step_conf": 3.20703125, "calib/ece": 0.30012145748987856, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.2591093117408907, "calib/gap": 0.036122830440587395, "calib/mean_conf": 0.8669230769230769, "calib/mu_c": 0.8825714285714286, "calib/mu_w": 0.8464485981308412, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.30012145748987856, "calib/std_conf": 0.07387651197851379, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.8015618221258134, "calib/step_q_c_n": 461.0, "calib/step_q_gap": 0.07678404434803554, "calib/step_q_w": 0.7247777777777779, "calib/step_q_w_n": 360.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2846.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 579.53515625, "completions/mean_terminated_length": 581.807861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.034133333333333335, "grad_norm": 0.0065063112415373325, "kl": 0.030467987060546875, "learning_rate": 4.666666666666667e-06, "loss": 0.004, "num_tokens": 7836226.0, "reward": 0.36128664016723633, "reward_std": 0.20347735285758972, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6428042650222778, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": -0.22023098170757294, "step": 32 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5393358876117497, "calib/avg_num_step_conf": 2.97265625, "calib/ece": 0.33298804780876473, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.24701195219123506, "calib/gap": 0.008127075351213486, "calib/mean_conf": 0.8708366533864541, "calib/mu_c": 0.8745925925925926, "calib/mu_w": 0.8664655172413791, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.33298804780876473, "calib/std_conf": 0.04760914500809453, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7982195121951222, "calib/step_q_c_n": 410.0, "calib/step_q_gap": 0.015797859773469813, "calib/step_q_w": 0.7824216524216524, "calib/step_q_w_n": 351.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 513.703125, "completions/mean_terminated_length": 519.7944946289062, "completions/min_length": 0.0, "completions/min_terminated_length": 215.0, "epoch": 0.0352, "grad_norm": 0.006140530575066805, "kl": 0.03408050537109375, "learning_rate": 4.638888888888889e-06, "loss": -0.0129, "num_tokens": 8074606.0, "reward": 0.34142816066741943, "reward_std": 0.17432644963264465, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6297796964645386, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.24848589301109314, "step": 33 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.44686621789425524, "calib/avg_num_step_conf": 3.3359375, "calib/ece": 0.30467999999999995, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.284, "calib/gap": -0.005585909417685109, "calib/mean_conf": 0.8731599999999999, "calib/mu_c": 0.8707692307692307, "calib/mu_w": 0.8763551401869158, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.30291999999999997, "calib/std_conf": 0.05450884698835594, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.7898726114649681, "calib/step_q_c_n": 471.0, "calib/step_q_gap": 0.022274700237814082, "calib/step_q_w": 0.767597911227154, "calib/step_q_w_n": 383.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 520.2265625, "completions/mean_terminated_length": 524.3228149414062, "completions/min_length": 0.0, "completions/min_terminated_length": 260.0, "epoch": 0.03626666666666667, "grad_norm": 0.007740476168692112, "kl": 0.03607749938964844, "learning_rate": 4.611111111111112e-06, "loss": 0.0164, "num_tokens": 8312896.0, "reward": 0.3710823357105255, "reward_std": 0.20131167769432068, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6339741945266724, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": -0.1957157701253891, "step": 34 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5440991735537191, "calib/avg_num_step_conf": 2.4921875, "calib/ece": 0.3594715447154472, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.21138211382113822, "calib/gap": 0.011381818181818493, "calib/mean_conf": 0.8676016260162602, "calib/mu_c": 0.8732000000000002, "calib/mu_w": 0.8618181818181817, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.3594715447154472, "calib/std_conf": 0.054990204573015435, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.8078145695364237, "calib/step_q_c_n": 302.0, "calib/step_q_gap": 0.030820521917376165, "calib/step_q_w": 0.7769940476190476, "calib/step_q_w_n": 336.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 557.87890625, "completions/mean_terminated_length": 566.734130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 251.0, "epoch": 0.037333333333333336, "grad_norm": 0.006301682908087969, "kl": 0.02927398681640625, "learning_rate": 4.583333333333333e-06, "loss": -0.0127, "num_tokens": 8564969.0, "reward": 0.3017095923423767, "reward_std": 0.22571972012519836, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5847018957138062, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": -0.26722028851509094, "step": 35 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5850586611456177, "calib/avg_num_step_conf": 3.109375, "calib/ece": 0.12700404858299594, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.242914979757085, "calib/gap": 0.010705659075224139, "calib/mean_conf": 0.8719433198380567, "calib/mu_c": 0.8746739130434783, "calib/mu_w": 0.8639682539682542, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.12700404858299594, "calib/std_conf": 0.04224792865476782, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.8208474576271185, "calib/step_q_c_n": 590.0, "calib/step_q_gap": 0.035701826559157324, "calib/step_q_w": 0.7851456310679612, "calib/step_q_w_n": 206.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2694.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 545.00390625, "completions/mean_terminated_length": 545.00390625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.0384, "grad_norm": 0.00643440755084157, "kl": 0.039272308349609375, "learning_rate": 4.555555555555556e-06, "loss": 0.0129, "num_tokens": 8807202.0, "reward": 0.4795789420604706, "reward_std": 0.1412101686000824, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7681589722633362, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": -0.14571987092494965, "step": 36 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6778419779629132, "calib/avg_num_step_conf": 2.5625, "calib/ece": 0.36663934426229505, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.21311475409836064, "calib/gap": 0.03360655737704932, "calib/mean_conf": 0.8666393442622953, "calib/mu_c": 0.8834426229508198, "calib/mu_w": 0.8498360655737704, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.36663934426229505, "calib/std_conf": 0.0658564093217231, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.8231741573033708, "calib/step_q_c_n": 356.0, "calib/step_q_gap": 0.047040823970037504, "calib/step_q_w": 0.7761333333333333, "calib/step_q_w_n": 300.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 585.0625, "completions/mean_terminated_length": 589.6693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.039466666666666664, "grad_norm": 0.00586958322674036, "kl": 0.027469635009765625, "learning_rate": 4.527777777777778e-06, "loss": 0.0286, "num_tokens": 9064074.0, "reward": 0.3125787377357483, "reward_std": 0.1971425712108612, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.5849460959434509, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": -0.2418198585510254, "step": 37 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6426985370950888, "calib/avg_num_step_conf": 2.55859375, "calib/ece": 0.33435483870967736, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.1975806451612903, "calib/gap": 0.03169801462904909, "calib/mean_conf": 0.8666129032258065, "calib/mu_c": 0.8814393939393939, "calib/mu_w": 0.8497413793103448, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.33435483870967736, "calib/std_conf": 0.06886039051511297, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.8236464088397789, "calib/step_q_c_n": 362.0, "calib/step_q_gap": 0.06634265457356736, "calib/step_q_w": 0.7573037542662115, "calib/step_q_w_n": 293.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2634.0, "completions/max_terminated_length": 2634.0, "completions/mean_length": 510.89453125, "completions/mean_terminated_length": 516.9525756835938, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.04053333333333333, "grad_norm": 0.007309312000870705, "kl": 0.028207778930664062, "learning_rate": 4.5e-06, "loss": 0.0083, "num_tokens": 9301751.0, "reward": 0.33864691853523254, "reward_std": 0.2210758626461029, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6288172006607056, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": -0.24761712551116943, "step": 38 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5811217948717949, "calib/avg_num_step_conf": 2.38671875, "calib/ece": 0.3505999999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.26, "calib/gap": 0.024551282051281986, "calib/mean_conf": 0.8706, "calib/mu_c": 0.8823846153846154, "calib/mu_w": 0.8578333333333334, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.3505999999999999, "calib/std_conf": 0.08131445136013649, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.8277044025157234, "calib/step_q_c_n": 318.0, "calib/step_q_gap": 0.02500815678193502, "calib/step_q_w": 0.8026962457337884, "calib/step_q_w_n": 293.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 509.51953125, "completions/mean_terminated_length": 509.51953125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.0416, "grad_norm": 0.0069496543146669865, "kl": 0.03579139709472656, "learning_rate": 4.472222222222223e-06, "loss": 0.0183, "num_tokens": 9538276.0, "reward": 0.33049649000167847, "reward_std": 0.18329477310180664, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6080214977264404, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": -0.24077847599983215, "step": 39 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5396220472440945, "calib/avg_num_step_conf": 2.109375, "calib/ece": 0.3734126984126984, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.23015873015873015, "calib/gap": 0.0073826771653544565, "calib/mean_conf": 0.874920634920635, "calib/mu_c": 0.8785826771653544, "calib/mu_w": 0.8712, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.37218253968253967, "calib/std_conf": 0.05515844467209437, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.8313358778625954, "calib/step_q_c_n": 262.0, "calib/step_q_gap": 0.0030624965676313565, "calib/step_q_w": 0.8282733812949641, "calib/step_q_w_n": 278.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 503.171875, "completions/mean_terminated_length": 503.171875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.042666666666666665, "grad_norm": 0.006470134947448969, "kl": 0.0303955078125, "learning_rate": 4.444444444444444e-06, "loss": -0.0316, "num_tokens": 9773848.0, "reward": 0.2879479229450226, "reward_std": 0.22962933778762817, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5858784914016724, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": -0.3006076514720917, "step": 40 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4538764510779436, "calib/avg_num_step_conf": 2.421875, "calib/ece": 0.0720481927710844, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.21285140562248997, "calib/gap": 0.013168532338308392, "calib/mean_conf": 0.875421686746988, "calib/mu_c": 0.8779601990049751, "calib/mu_w": 0.8647916666666667, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.07012048192771092, "calib/std_conf": 0.06796469444003686, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.8239172749391728, "calib/step_q_c_n": 411.0, "calib/step_q_gap": 0.4042043562788856, "calib/step_q_w": 0.41971291866028715, "calib/step_q_w_n": 209.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3052.0, "completions/max_terminated_length": 3052.0, "completions/mean_length": 457.76171875, "completions/mean_terminated_length": 459.556884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.04373333333333333, "grad_norm": 0.006787601392716169, "kl": 0.0326080322265625, "learning_rate": 4.416666666666667e-06, "loss": 0.0792, "num_tokens": 9998283.0, "reward": 0.5055362582206726, "reward_std": 0.1886957734823227, "rewards/accuracy_reward_step": 0.78515625, "rewards/final_brier_reward_step": 0.7994054555892944, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": -0.13442669808864594, "step": 41 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5079997450280469, "calib/avg_num_step_conf": 2.19921875, "calib/ece": 0.3095669291338584, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3464566929133858, "calib/gap": -0.001931412544620037, "calib/mean_conf": 0.8892519685039371, "calib/mu_c": 0.8884459459459461, "calib/mu_w": 0.8903773584905661, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3080708661417324, "calib/std_conf": 0.040204099078814906, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8422923588039867, "calib/step_q_c_n": 301.0, "calib/step_q_gap": 0.005498465674215636, "calib/step_q_w": 0.836793893129771, "calib/step_q_w_n": 262.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 416.74609375, "completions/mean_terminated_length": 418.38043212890625, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.0448, "grad_norm": 0.006533120293170214, "kl": 0.03894805908203125, "learning_rate": 4.388888888888889e-06, "loss": 0.0001, "num_tokens": 10209338.0, "reward": 0.351402223110199, "reward_std": 0.161943718791008, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6509097814559937, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.2606053054332733, "step": 42 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.565625, "calib/avg_num_step_conf": 2.01953125, "calib/ece": 0.26007905138339915, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.383399209486166, "calib/gap": 0.007639784946236516, "calib/mean_conf": 0.8856916996047431, "calib/mu_c": 0.8885, "calib/mu_w": 0.8808602150537634, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.25667984189723314, "calib/std_conf": 0.05129947539851385, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8256676557863502, "calib/step_q_c_n": 337.0, "calib/step_q_gap": -0.012776788658094396, "calib/step_q_w": 0.8384444444444445, "calib/step_q_w_n": 180.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2265.0, "completions/max_terminated_length": 2265.0, "completions/mean_length": 451.59375, "completions/mean_terminated_length": 451.59375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.04586666666666667, "grad_norm": 0.006118349730968475, "kl": 0.03265380859375, "learning_rate": 4.361111111111112e-06, "loss": -0.002, "num_tokens": 10430170.0, "reward": 0.39366674423217773, "reward_std": 0.2639361321926117, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6941671967506409, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.22870871424674988, "step": 43 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6036445012787724, "calib/avg_num_step_conf": 1.83203125, "calib/ece": 0.3441035856573706, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4063745019920319, "calib/gap": 0.0287698209718672, "calib/mean_conf": 0.8859362549800797, "calib/mu_c": 0.8991176470588236, "calib/mu_w": 0.8703478260869564, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3441035856573706, "calib/std_conf": 0.07584464568578271, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.8515450643776825, "calib/step_q_c_n": 233.0, "calib/step_q_gap": 0.024426420309885843, "calib/step_q_w": 0.8271186440677967, "calib/step_q_w_n": 236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2333.0, "completions/max_terminated_length": 2333.0, "completions/mean_length": 456.8203125, "completions/mean_terminated_length": 460.4173278808594, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.046933333333333334, "grad_norm": 0.007396138738840818, "kl": 0.033664703369140625, "learning_rate": 4.333333333333334e-06, "loss": 0.014, "num_tokens": 10653436.0, "reward": 0.31293433904647827, "reward_std": 0.2180282473564148, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6254706978797913, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.3019458055496216, "step": 44 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5447098302408212, "calib/avg_num_step_conf": 1.82421875, "calib/ece": 0.30597609561752986, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.49800796812749004, "calib/gap": 0.01829714436110008, "calib/mean_conf": 0.892430278884462, "calib/mu_c": 0.8998657718120804, "calib/mu_w": 0.8815686274509803, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.30239043824701195, "calib/std_conf": 0.09723298363772212, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.8685123966942149, "calib/step_q_c_n": 242.0, "calib/step_q_gap": 0.054823507805325966, "calib/step_q_w": 0.8136888888888889, "calib/step_q_w_n": 225.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2540.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 429.51171875, "completions/mean_terminated_length": 432.8937072753906, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.048, "grad_norm": 0.007078626658767462, "kl": 0.041751861572265625, "learning_rate": 4.305555555555556e-06, "loss": -0.0111, "num_tokens": 10868439.0, "reward": 0.35687994956970215, "reward_std": 0.22213514149188995, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6519414186477661, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.24990034103393555, "step": 45 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5891987179487179, "calib/avg_num_step_conf": 1.80078125, "calib/ece": 0.3853199999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.556, "calib/gap": 0.01215384615384596, "calib/mean_conf": 0.90532, "calib/mu_c": 0.9111538461538461, "calib/mu_w": 0.8990000000000001, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.3853199999999999, "calib/std_conf": 0.03882393076441384, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.8754716981132077, "calib/step_q_c_n": 212.0, "calib/step_q_gap": 0.052138364779874435, "calib/step_q_w": 0.8233333333333333, "calib/step_q_w_n": 249.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2502.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 440.96875, "completions/mean_terminated_length": 440.96875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.04906666666666667, "grad_norm": 0.006802982650697231, "kl": 0.0423736572265625, "learning_rate": 4.277777777777778e-06, "loss": 0.0166, "num_tokens": 11086095.0, "reward": 0.2893574833869934, "reward_std": 0.20869553089141846, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.581676185131073, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": -0.2959299087524414, "step": 46 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5944444444444444, "calib/avg_num_step_conf": 1.42578125, "calib/ece": 0.30052845528455285, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6138211382113821, "calib/gap": 0.011570833333333197, "calib/mean_conf": 0.9102845528455284, "calib/mu_c": 0.9148, "calib/mu_w": 0.9032291666666667, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.30052845528455285, "calib/std_conf": 0.03952358981202645, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.8533183856502243, "calib/step_q_c_n": 223.0, "calib/step_q_gap": -0.013090065054001121, "calib/step_q_w": 0.8664084507042255, "calib/step_q_w_n": 142.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 450.23828125, "completions/mean_terminated_length": 453.7834777832031, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.050133333333333335, "grad_norm": 0.007135537452995777, "kl": 0.040294647216796875, "learning_rate": 4.25e-06, "loss": -0.0103, "num_tokens": 11307332.0, "reward": 0.3479617238044739, "reward_std": 0.20676106214523315, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6331160068511963, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": -0.24266132712364197, "step": 47 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5348680840163934, "calib/avg_num_step_conf": 1.7578125, "calib/ece": 0.4072, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.62, "calib/gap": -0.0017546106557376762, "calib/mean_conf": 0.9032, "calib/mu_c": 0.9023437499999999, "calib/mu_w": 0.9040983606557376, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3992, "calib/std_conf": 0.10269255084961128, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.8484304932735426, "calib/step_q_c_n": 223.0, "calib/step_q_gap": -0.01430078425949699, "calib/step_q_w": 0.8627312775330396, "calib/step_q_w_n": 227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2224.0, "completions/max_terminated_length": 2224.0, "completions/mean_length": 420.06640625, "completions/mean_terminated_length": 425.0474548339844, "completions/min_length": 0.0, "completions/min_terminated_length": 105.0, "epoch": 0.0512, "grad_norm": 0.006774710491299629, "kl": 0.046047210693359375, "learning_rate": 4.222222222222223e-06, "loss": 0.0519, "num_tokens": 11518557.0, "reward": 0.2743079960346222, "reward_std": 0.24875454604625702, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5700136423110962, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": -0.3151476979255676, "step": 48 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5793089513032937, "calib/avg_num_step_conf": 1.53515625, "calib/ece": 0.3375000000000001, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8225806451612904, "calib/gap": 0.010764464201522217, "calib/mean_conf": 0.9302419354838709, "calib/mu_c": 0.9346258503401361, "calib/mu_w": 0.9238613861386139, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.3375000000000001, "calib/std_conf": 0.034964583686681205, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.908986784140969, "calib/step_q_c_n": 227.0, "calib/step_q_gap": 0.05904702510482429, "calib/step_q_w": 0.8499397590361447, "calib/step_q_w_n": 166.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 415.40625, "completions/mean_terminated_length": 417.0353088378906, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.05226666666666667, "grad_norm": 0.00680144689977169, "kl": 0.053955078125, "learning_rate": 4.194444444444445e-06, "loss": -0.0279, "num_tokens": 11729437.0, "reward": 0.32491326332092285, "reward_std": 0.20095054805278778, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.623964786529541, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": -0.2811695337295532, "step": 49 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6212825933756165, "calib/avg_num_step_conf": 1.57421875, "calib/ece": 0.2795617529880476, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8725099601593626, "calib/gap": 0.015852008456659794, "calib/mean_conf": 0.9369322709163347, "calib/mu_c": 0.9423636363636364, "calib/mu_w": 0.9265116279069766, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2795617529880476, "calib/std_conf": 0.034832430314519346, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.9162256809338523, "calib/step_q_c_n": 257.0, "calib/step_q_gap": 0.049033900111934314, "calib/step_q_w": 0.867191780821918, "calib/step_q_w_n": 146.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2453.0, "completions/max_terminated_length": 2453.0, "completions/mean_length": 452.24609375, "completions/mean_terminated_length": 452.24609375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.05333333333333334, "grad_norm": 0.006964311469346285, "kl": 0.045654296875, "learning_rate": 4.166666666666667e-06, "loss": 0.0524, "num_tokens": 11950572.0, "reward": 0.37954503297805786, "reward_std": 0.21893005073070526, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6810219287872314, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.24536937475204468, "step": 50 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5048956546598056, "calib/avg_num_step_conf": 1.4453125, "calib/ece": 0.3005668016194331, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9271255060728745, "calib/gap": 0.003136077758719269, "calib/mean_conf": 0.9442914979757084, "calib/mu_c": 0.9454088050314465, "calib/mu_w": 0.9422727272727273, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.3005668016194331, "calib/std_conf": 0.02931408942209797, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.9142060085836909, "calib/step_q_c_n": 233.0, "calib/step_q_gap": 0.07362206697785145, "calib/step_q_w": 0.8405839416058395, "calib/step_q_w_n": 137.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 436.2578125, "completions/mean_terminated_length": 441.43084716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.0544, "grad_norm": 0.0068186987191438675, "kl": 0.0561676025390625, "learning_rate": 4.138888888888889e-06, "loss": -0.0299, "num_tokens": 12171550.0, "reward": 0.3507247269153595, "reward_std": 0.22434017062187195, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6525163650512695, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": -0.26591068506240845, "step": 51 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6222362105594461, "calib/avg_num_step_conf": 1.375, "calib/ece": 0.22187999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9, "calib/gap": 0.05683924777716587, "calib/mean_conf": 0.93788, "calib/mu_c": 0.9540223463687152, "calib/mu_w": 0.8971830985915493, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.22187999999999994, "calib/std_conf": 0.09374809651400928, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.9365725806451612, "calib/step_q_c_n": 248.0, "calib/step_q_gap": 0.08916873449131513, "calib/step_q_w": 0.8474038461538461, "calib/step_q_w_n": 104.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2472.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 431.87890625, "completions/mean_terminated_length": 435.279541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.055466666666666664, "grad_norm": 0.013513702899217606, "kl": 0.07970809936523438, "learning_rate": 4.111111111111111e-06, "loss": 0.0306, "num_tokens": 12390063.0, "reward": 0.4429655075073242, "reward_std": 0.21031616628170013, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7162933349609375, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": -0.1592685878276825, "step": 52 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6598223824786325, "calib/avg_num_step_conf": 1.3671875, "calib/ece": 0.3287698412698412, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9206349206349206, "calib/gap": 0.022051282051281706, "calib/mean_conf": 0.9478174603174603, "calib/mu_c": 0.9562179487179484, "calib/mu_w": 0.9341666666666667, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3287698412698412, "calib/std_conf": 0.038426367599388646, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.9468877551020408, "calib/step_q_c_n": 196.0, "calib/step_q_gap": 0.07597866419294985, "calib/step_q_w": 0.870909090909091, "calib/step_q_w_n": 154.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 425.6953125, "completions/mean_terminated_length": 427.36474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.05653333333333333, "grad_norm": 0.006831077393144369, "kl": 0.040790557861328125, "learning_rate": 4.083333333333334e-06, "loss": -0.0349, "num_tokens": 12604865.0, "reward": 0.34930217266082764, "reward_std": 0.19114413857460022, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6468698978424072, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.26545315980911255, "step": 53 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6534778225806452, "calib/avg_num_step_conf": 1.828125, "calib/ece": 0.1966929133858267, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9330708661417323, "calib/gap": 0.05971774193548396, "calib/mean_conf": 0.9441732283464568, "calib/mu_c": 0.9587500000000001, "calib/mu_w": 0.8990322580645161, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19248031496062984, "calib/std_conf": 0.11960197075827311, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9289714285714286, "calib/step_q_c_n": 350.0, "calib/step_q_gap": 0.09202227602905577, "calib/step_q_w": 0.8369491525423728, "calib/step_q_w_n": 118.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 374.33984375, "completions/mean_terminated_length": 375.807861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.0576, "grad_norm": 0.007281064055860043, "kl": 0.052005767822265625, "learning_rate": 4.055555555555556e-06, "loss": -0.004, "num_tokens": 12806928.0, "reward": 0.4682023525238037, "reward_std": 0.15885430574417114, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7816210985183716, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.19365385174751282, "step": 54 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5886446886446887, "calib/avg_num_step_conf": 1.80078125, "calib/ece": 0.4545703125000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9375, "calib/gap": 0.04373748473748473, "calib/mean_conf": 0.9467578125, "calib/mu_c": 0.9689682539682539, "calib/mu_w": 0.9252307692307692, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4545703125000001, "calib/std_conf": 0.10804384570726294, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9283193277310924, "calib/step_q_c_n": 238.0, "calib/step_q_gap": 0.050561480197460096, "calib/step_q_w": 0.8777578475336323, "calib/step_q_w_n": 223.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1149.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 412.046875, "completions/mean_terminated_length": 413.66278076171875, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.058666666666666666, "grad_norm": 0.00746389152482152, "kl": 0.04831695556640625, "learning_rate": 4.027777777777779e-06, "loss": 0.0122, "num_tokens": 13020236.0, "reward": 0.2420576512813568, "reward_std": 0.26765334606170654, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5534613132476807, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.36622104048728943, "step": 55 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5570409982174688, "calib/avg_num_step_conf": 2.00390625, "calib/ece": 0.35388888888888886, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9523809523809523, "calib/gap": 0.013107546048722551, "calib/mean_conf": 0.9600793650793652, "calib/mu_c": 0.9652287581699347, "calib/mu_w": 0.9521212121212121, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.3534126984126984, "calib/std_conf": 0.0583910738958275, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.8669716088328077, "calib/step_q_c_n": 317.0, "calib/step_q_gap": 0.008502221077705485, "calib/step_q_w": 0.8584693877551022, "calib/step_q_w_n": 196.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 455.26953125, "completions/mean_terminated_length": 455.26953125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.05973333333333333, "grad_norm": 0.006870965473353863, "kl": 0.040874481201171875, "learning_rate": 4.000000000000001e-06, "loss": 0.047, "num_tokens": 13243625.0, "reward": 0.3485686480998993, "reward_std": 0.21139225363731384, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6283090114593506, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": -0.24289044737815857, "step": 56 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5856104651162791, "calib/avg_num_step_conf": 1.62890625, "calib/ece": 0.2908730158730158, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9603174603174603, "calib/gap": 0.015034883720930203, "calib/mean_conf": 0.9647619047619047, "calib/mu_c": 0.9695348837209302, "calib/mu_w": 0.9545, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.28654761904761894, "calib/std_conf": 0.08153015106621486, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.942695035460993, "calib/step_q_c_n": 282.0, "calib/step_q_gap": 0.01780614657210422, "calib/step_q_w": 0.9248888888888888, "calib/step_q_w_n": 135.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3005.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 431.2265625, "completions/mean_terminated_length": 432.91766357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.0608, "grad_norm": 0.0062557547353208065, "kl": 0.048091888427734375, "learning_rate": 3.972222222222223e-06, "loss": 0.0289, "num_tokens": 13460811.0, "reward": 0.38320237398147583, "reward_std": 0.21609210968017578, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6925468444824219, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.25739216804504395, "step": 57 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.47277458961697594, "calib/avg_num_step_conf": 1.75390625, "calib/ece": 0.45485714285714296, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9755102040816327, "calib/gap": -0.0031923128253035227, "calib/mean_conf": 0.9678367346938775, "calib/mu_c": 0.9662992125984252, "calib/mu_w": 0.9694915254237287, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.45216326530612255, "calib/std_conf": 0.04296906668032938, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.8950500000000001, "calib/step_q_c_n": 200.0, "calib/step_q_gap": 0.08822269076305223, "calib/step_q_w": 0.8068273092369479, "calib/step_q_w_n": 249.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2044.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 514.9296875, "completions/mean_terminated_length": 518.9842529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.06186666666666667, "grad_norm": 0.006043759640306234, "kl": 0.043193817138671875, "learning_rate": 3.944444444444445e-06, "loss": -0.0193, "num_tokens": 13698953.0, "reward": 0.2258208990097046, "reward_std": 0.29372695088386536, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5175651907920837, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": -0.355767160654068, "step": 58 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5737896169550845, "calib/avg_num_step_conf": 1.9921875, "calib/ece": 0.40988000000000013, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.932, "calib/gap": 0.030208697906539905, "calib/mean_conf": 0.9507599999999999, "calib/mu_c": 0.9641726618705037, "calib/mu_w": 0.9339639639639638, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.4023200000000001, "calib/std_conf": 0.1336855354928124, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.9121455938697319, "calib/step_q_c_n": 261.0, "calib/step_q_gap": 0.15475603563680007, "calib/step_q_w": 0.7573895582329319, "calib/step_q_w_n": 249.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2455.0, "completions/max_terminated_length": 2455.0, "completions/mean_length": 437.203125, "completions/mean_terminated_length": 440.6456604003906, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.06293333333333333, "grad_norm": 0.006019299384206533, "kl": 0.050403594970703125, "learning_rate": 3.916666666666667e-06, "loss": 0.0107, "num_tokens": 13917125.0, "reward": 0.2786928415298462, "reward_std": 0.25457003712654114, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5765078067779541, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.32224711775779724, "step": 59 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5588255124056095, "calib/avg_num_step_conf": 2.07421875, "calib/ece": 0.38817813765182196, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9676113360323887, "calib/gap": 0.011173139158576206, "calib/mean_conf": 0.971174089068826, "calib/mu_c": 0.9758333333333332, "calib/mu_w": 0.964660194174757, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.38817813765182196, "calib/std_conf": 0.04108643570655636, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.9142356687898091, "calib/step_q_c_n": 314.0, "calib/step_q_gap": 0.044420000587044095, "calib/step_q_w": 0.869815668202765, "calib/step_q_w_n": 217.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2847.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 463.4609375, "completions/mean_terminated_length": 468.95654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.064, "grad_norm": 0.006463638506829739, "kl": 0.053012847900390625, "learning_rate": 3.88888888888889e-06, "loss": -0.0161, "num_tokens": 14144627.0, "reward": 0.2865564227104187, "reward_std": 0.24216191470623016, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5803898572921753, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": -0.30962073802948, "step": 60 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5601492965834052, "calib/avg_num_step_conf": 2.45703125, "calib/ece": 0.28454545454545443, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9407114624505929, "calib/gap": 0.03187697387309785, "calib/mean_conf": 0.9643873517786562, "calib/mu_c": 0.9745930232558139, "calib/mu_w": 0.942716049382716, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.28454545454545443, "calib/std_conf": 0.07765409363964547, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8914861460957177, "calib/step_q_c_n": 397.0, "calib/step_q_gap": 0.10708959437157972, "calib/step_q_w": 0.784396551724138, "calib/step_q_w_n": 232.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1178.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 416.92578125, "completions/mean_terminated_length": 418.5608215332031, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.06506666666666666, "grad_norm": 0.0071130190044641495, "kl": 0.05157470703125, "learning_rate": 3.861111111111112e-06, "loss": 0.014, "num_tokens": 14355424.0, "reward": 0.384852796792984, "reward_std": 0.2306276559829712, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6892913579940796, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.2500545382499695, "step": 61 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5288929840726603, "calib/avg_num_step_conf": 2.08203125, "calib/ece": 0.40590361445783146, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9317269076305221, "calib/gap": -0.00025536395945779944, "calib/mean_conf": 0.9554618473895583, "calib/mu_c": 0.9553521126760561, "calib/mu_w": 0.9556074766355139, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.39554216867469894, "calib/std_conf": 0.1157717579982303, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.8620863309352517, "calib/step_q_c_n": 278.0, "calib/step_q_gap": 0.09330201720976139, "calib/step_q_w": 0.7687843137254903, "calib/step_q_w_n": 255.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2821.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 466.50390625, "completions/mean_terminated_length": 468.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.06613333333333334, "grad_norm": 0.005557631608098745, "kl": 0.043121337890625, "learning_rate": 3.833333333333334e-06, "loss": -0.0031, "num_tokens": 14581929.0, "reward": 0.25923866033554077, "reward_std": 0.3093259930610657, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5729343891143799, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": -0.35914450883865356, "step": 62 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6637687969924813, "calib/avg_num_step_conf": 1.73046875, "calib/ece": 0.36468, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.944, "calib/gap": 0.020064446831364324, "calib/mean_conf": 0.9692400000000002, "calib/mu_c": 0.9771052631578948, "calib/mu_w": 0.9570408163265305, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.36296, "calib/std_conf": 0.04494243429099051, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.918637992831541, "calib/step_q_c_n": 279.0, "calib/step_q_gap": 0.07058921234373627, "calib/step_q_w": 0.8480487804878047, "calib/step_q_w_n": 164.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 501.34765625, "completions/mean_terminated_length": 503.3137512207031, "completions/min_length": 0.0, "completions/min_terminated_length": 89.0, "epoch": 0.0672, "grad_norm": 0.005798683501780033, "kl": 0.044086456298828125, "learning_rate": 3.8055555555555556e-06, "loss": 0.0373, "num_tokens": 14818914.0, "reward": 0.3223249912261963, "reward_std": 0.2504933476448059, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6075301170349121, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": -0.2722550630569458, "step": 63 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5043233082706767, "calib/avg_num_step_conf": 1.9765625, "calib/ece": 0.2878486055776894, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8605577689243028, "calib/gap": 0.004703759398496543, "calib/mean_conf": 0.9174900398406375, "calib/mu_c": 0.9189142857142857, "calib/mu_w": 0.9142105263157891, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.25406374501992046, "calib/std_conf": 0.18744368317696553, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.8397640117994102, "calib/step_q_c_n": 339.0, "calib/step_q_gap": 0.02072209563174565, "calib/step_q_w": 0.8190419161676645, "calib/step_q_w_n": 167.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 435.98828125, "completions/mean_terminated_length": 437.69805908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.06826666666666667, "grad_norm": 0.006303312722593546, "kl": 0.049251556396484375, "learning_rate": 3.777777777777778e-06, "loss": 0.0548, "num_tokens": 15034303.0, "reward": 0.3864745497703552, "reward_std": 0.2799364924430847, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.6870027184486389, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.24530363082885742, "step": 64 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5792896283092362, "calib/avg_num_step_conf": 1.7890625, "calib/ece": 0.37591269841269853, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9563492063492064, "calib/gap": 0.0002792632204396961, "calib/mean_conf": 0.9728968253968254, "calib/mu_c": 0.9730065359477126, "calib/mu_w": 0.9727272727272729, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.37083333333333346, "calib/std_conf": 0.0524625836986189, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.9113553113553114, "calib/step_q_c_n": 273.0, "calib/step_q_gap": -0.008158202158202177, "calib/step_q_w": 0.9195135135135136, "calib/step_q_w_n": 185.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2712.0, "completions/max_terminated_length": 2712.0, "completions/mean_length": 380.93359375, "completions/mean_terminated_length": 383.9330749511719, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.06933333333333333, "grad_norm": 0.006032762583345175, "kl": 0.0626678466796875, "learning_rate": 3.7500000000000005e-06, "loss": -0.011, "num_tokens": 15236846.0, "reward": 0.2991948425769806, "reward_std": 0.1917351335287094, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.5997199416160583, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": -0.3146114945411682, "step": 65 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6051844783715014, "calib/avg_num_step_conf": 1.91796875, "calib/ece": 0.4349800796812749, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9083665338645418, "calib/gap": 0.03393702290076328, "calib/mean_conf": 0.9444621513944225, "calib/mu_c": 0.9606870229007634, "calib/mu_w": 0.9267500000000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.4287649402390438, "calib/std_conf": 0.14269185575553064, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.9092703862660945, "calib/step_q_c_n": 233.0, "calib/step_q_gap": 0.1638052699870246, "calib/step_q_w": 0.7454651162790699, "calib/step_q_w_n": 258.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2388.0, "completions/max_terminated_length": 2388.0, "completions/mean_length": 492.7421875, "completions/mean_terminated_length": 494.6745300292969, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.0704, "grad_norm": 0.005543500185012817, "kl": 0.049045562744140625, "learning_rate": 3.7222222222222225e-06, "loss": -0.0261, "num_tokens": 15469340.0, "reward": 0.26769766211509705, "reward_std": 0.24292393028736115, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5495148301124573, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": -0.31021326780319214, "step": 66 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6902941778718176, "calib/avg_num_step_conf": 1.7890625, "calib/ece": 0.32178571428571445, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9404761904761905, "calib/gap": 0.04612040133779283, "calib/mean_conf": 0.9606746031746033, "calib/mu_c": 0.977329192546584, "calib/mu_w": 0.9312087912087912, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.32178571428571445, "calib/std_conf": 0.10923226523193443, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.9151438848920863, "calib/step_q_c_n": 278.0, "calib/step_q_gap": 0.14947721822541948, "calib/step_q_w": 0.7656666666666668, "calib/step_q_w_n": 180.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2895.0, "completions/max_terminated_length": 2895.0, "completions/mean_length": 461.453125, "completions/mean_terminated_length": 461.453125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.07146666666666666, "grad_norm": 0.006038357503712177, "kl": 0.05561065673828125, "learning_rate": 3.694444444444445e-06, "loss": -0.0241, "num_tokens": 15692480.0, "reward": 0.36475488543510437, "reward_std": 0.1881740391254425, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.660184383392334, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.2517683506011963, "step": 67 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7296124927703874, "calib/avg_num_step_conf": 2.328125, "calib/ece": 0.41103999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.864, "calib/gap": 0.06692500481974173, "calib/mean_conf": 0.94304, "calib/mu_c": 0.9743609022556391, "calib/mu_w": 0.9074358974358974, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.41103999999999996, "calib/std_conf": 0.1097212759677903, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.873956834532374, "calib/step_q_c_n": 278.0, "calib/step_q_gap": 0.26574928736256265, "calib/step_q_w": 0.6082075471698114, "calib/step_q_w_n": 318.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3025.0, "completions/max_terminated_length": 3025.0, "completions/mean_length": 498.98828125, "completions/mean_terminated_length": 500.94512939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.07253333333333334, "grad_norm": 0.005618997849524021, "kl": 0.052722930908203125, "learning_rate": 3.6666666666666666e-06, "loss": 0.0285, "num_tokens": 15924309.0, "reward": 0.2997767925262451, "reward_std": 0.23451535403728485, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5811023712158203, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": -0.2784237563610077, "step": 68 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6614257812500001, "calib/avg_num_step_conf": 1.93359375, "calib/ece": 0.4521774193548388, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8024193548387096, "calib/gap": 0.04785937500000015, "calib/mean_conf": 0.9110483870967742, "calib/mu_c": 0.9357500000000002, "calib/mu_w": 0.887890625, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.4396774193548388, "calib/std_conf": 0.17850100997449472, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.8308000000000001, "calib/step_q_c_n": 200.0, "calib/step_q_gap": 0.09486779661016953, "calib/step_q_w": 0.7359322033898306, "calib/step_q_w_n": 295.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2933.0, "completions/max_terminated_length": 2933.0, "completions/mean_length": 543.10546875, "completions/mean_terminated_length": 547.3818969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.0736, "grad_norm": 0.005082065239548683, "kl": 0.041614532470703125, "learning_rate": 3.638888888888889e-06, "loss": 0.0109, "num_tokens": 16167840.0, "reward": 0.2741231620311737, "reward_std": 0.29141664505004883, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.5398664474487305, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": -0.2775576114654541, "step": 69 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6298890429958391, "calib/avg_num_step_conf": 1.99609375, "calib/ece": 0.35407407407407404, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.8271604938271605, "calib/gap": 0.08578224687933456, "calib/mean_conf": 0.8983539094650206, "calib/mu_c": 0.9347142857142858, "calib/mu_w": 0.8489320388349513, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.3381481481481481, "calib/std_conf": 0.21164398210110583, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.8400694444444444, "calib/step_q_c_n": 288.0, "calib/step_q_gap": 0.13997975834578968, "calib/step_q_w": 0.7000896860986547, "calib/step_q_w_n": 223.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2751.0, "completions/max_terminated_length": 2751.0, "completions/mean_length": 541.81640625, "completions/mean_terminated_length": 543.9412231445312, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.07466666666666667, "grad_norm": 0.005732155870646238, "kl": 0.042018890380859375, "learning_rate": 3.6111111111111115e-06, "loss": 0.0214, "num_tokens": 16413537.0, "reward": 0.3209608197212219, "reward_std": 0.2563707232475281, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6122055053710938, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": -0.2687213718891144, "step": 70 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6134592910011687, "calib/avg_num_step_conf": 1.984375, "calib/ece": 0.3526482213438736, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8181818181818182, "calib/gap": 0.021333593039864707, "calib/mean_conf": 0.9184189723320159, "calib/mu_c": 0.9270198675496686, "calib/mu_w": 0.9056862745098039, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3371146245059289, "calib/std_conf": 0.1589039482781675, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.8082517482517484, "calib/step_q_c_n": 286.0, "calib/step_q_gap": 0.07293643293643304, "calib/step_q_w": 0.7353153153153154, "calib/step_q_w_n": 222.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 455.125, "completions/mean_terminated_length": 455.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.07573333333333333, "grad_norm": 0.005377875175327063, "kl": 0.052242279052734375, "learning_rate": 3.5833333333333335e-06, "loss": -0.0012, "num_tokens": 16634457.0, "reward": 0.34045037627220154, "reward_std": 0.27277880907058716, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6282824277877808, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.2614441514015198, "step": 71 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7194603174603174, "calib/avg_num_step_conf": 2.109375, "calib/ece": 0.3391764705882353, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8274509803921568, "calib/gap": 0.091447619047619, "calib/mean_conf": 0.9274117647058824, "calib/mu_c": 0.9650666666666666, "calib/mu_w": 0.8736190476190476, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.3391764705882353, "calib/std_conf": 0.14417706658223547, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.8171304347826087, "calib/step_q_c_n": 345.0, "calib/step_q_gap": 0.12800222965440367, "calib/step_q_w": 0.689128205128205, "calib/step_q_w_n": 195.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 424.66015625, "completions/mean_terminated_length": 426.32550048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.0768, "grad_norm": 0.006489574443548918, "kl": 0.058135986328125, "learning_rate": 3.555555555555556e-06, "loss": -0.0029, "num_tokens": 16847578.0, "reward": 0.3816695511341095, "reward_std": 0.19125699996948242, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.658865213394165, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.20880737900733948, "step": 72 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6533471359558316, "calib/avg_num_step_conf": 1.953125, "calib/ece": 0.29110756972111546, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8406374501992032, "calib/gap": 0.09448378191856455, "calib/mean_conf": 0.9244940239043826, "calib/mu_c": 0.9583726708074535, "calib/mu_w": 0.8638888888888889, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.2870836653386454, "calib/std_conf": 0.1598070891455655, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.8070538922155687, "calib/step_q_c_n": 334.0, "calib/step_q_gap": 0.042234615107135, "calib/step_q_w": 0.7648192771084337, "calib/step_q_w_n": 166.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3051.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 472.703125, "completions/mean_terminated_length": 472.703125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.07786666666666667, "grad_norm": 0.005386182572692633, "kl": 0.046176910400390625, "learning_rate": 3.5277777777777784e-06, "loss": 0.0025, "num_tokens": 17075622.0, "reward": 0.3789359927177429, "reward_std": 0.2643182575702667, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6753011345863342, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": -0.23461660742759705, "step": 73 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6607167566071677, "calib/avg_num_step_conf": 2.27734375, "calib/ece": 0.32689795918367354, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.7795918367346939, "calib/gap": 0.07555901480559035, "calib/mean_conf": 0.8896734693877553, "calib/mu_c": 0.920205479452055, "calib/mu_w": 0.8446464646464646, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.31032653061224497, "calib/std_conf": 0.21682659183960734, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.7695512820512821, "calib/step_q_c_n": 312.0, "calib/step_q_gap": 0.15338892042766605, "calib/step_q_w": 0.616162361623616, "calib/step_q_w_n": 271.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 471.70703125, "completions/mean_terminated_length": 477.3004150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.07893333333333333, "grad_norm": 0.0063706678338348866, "kl": 0.051799774169921875, "learning_rate": 3.5e-06, "loss": 0.0317, "num_tokens": 17300307.0, "reward": 0.3643389344215393, "reward_std": 0.22409476339817047, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6295297145843506, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": -0.20475810766220093, "step": 74 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7152218152218153, "calib/avg_num_step_conf": 2.171875, "calib/ece": 0.16940944881889786, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7755905511811023, "calib/gap": 0.13941961741961728, "calib/mean_conf": 0.9071259842519686, "calib/mu_c": 0.9428042328042328, "calib/mu_w": 0.8033846153846155, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1662204724409451, "calib/std_conf": 0.16891970287012711, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7913853904282114, "calib/step_q_c_n": 397.0, "calib/step_q_gap": 0.17094513885588436, "calib/step_q_w": 0.6204402515723271, "calib/step_q_w_n": 159.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3039.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 413.25390625, "completions/mean_terminated_length": 414.8745422363281, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.08, "grad_norm": 0.006375730037689209, "kl": 0.05806732177734375, "learning_rate": 3.4722222222222224e-06, "loss": 0.0151, "num_tokens": 17510852.0, "reward": 0.49380946159362793, "reward_std": 0.20836231112480164, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.78885817527771, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.14420804381370544, "step": 75 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.728522920203735, "calib/avg_num_step_conf": 1.8359375, "calib/ece": 0.27392, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.68, "calib/gap": 0.14338879456706288, "calib/mean_conf": 0.84848, "calib/mu_c": 0.9029677419354838, "calib/mu_w": 0.7595789473684209, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2512, "calib/std_conf": 0.2451923522461498, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7692057761732852, "calib/step_q_c_n": 277.0, "calib/step_q_gap": 0.17977572435981382, "calib/step_q_w": 0.5894300518134714, "calib/step_q_w_n": 193.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2904.0, "completions/max_terminated_length": 2904.0, "completions/mean_length": 464.3984375, "completions/mean_terminated_length": 466.2196350097656, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.08106666666666666, "grad_norm": 0.0066471709869802, "kl": 0.058727264404296875, "learning_rate": 3.444444444444445e-06, "loss": 0.062, "num_tokens": 17732794.0, "reward": 0.4030284285545349, "reward_std": 0.258221834897995, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.702775776386261, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.21312521398067474, "step": 76 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6567358400109581, "calib/avg_num_step_conf": 2.41015625, "calib/ece": 0.2280799999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.536, "calib/gap": 0.13451133484007938, "calib/mean_conf": 0.782, "calib/mu_c": 0.8320382165605096, "calib/mu_w": 0.6975268817204302, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.19103999999999982, "calib/std_conf": 0.2722631080407333, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6617705735660848, "calib/step_q_c_n": 401.0, "calib/step_q_gap": 0.12436316615867737, "calib/step_q_w": 0.5374074074074074, "calib/step_q_w_n": 216.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 483.44140625, "completions/mean_terminated_length": 485.3372802734375, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.08213333333333334, "grad_norm": 0.0059607732109725475, "kl": 0.056232452392578125, "learning_rate": 3.416666666666667e-06, "loss": 0.0241, "num_tokens": 17961219.0, "reward": 0.4120379686355591, "reward_std": 0.20473268628120422, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7089191675186157, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": -0.20124945044517517, "step": 77 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6365384615384616, "calib/avg_num_step_conf": 2.375, "calib/ece": 0.23976095617529875, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5418326693227091, "calib/gap": 0.14284821428571448, "calib/mean_conf": 0.7667729083665339, "calib/mu_c": 0.8185625000000002, "calib/mu_w": 0.6757142857142857, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.18454183266932267, "calib/std_conf": 0.29337088865439276, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5924303797468354, "calib/step_q_c_n": 395.0, "calib/step_q_gap": 0.04768859570927664, "calib/step_q_w": 0.5447417840375588, "calib/step_q_w_n": 213.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 510.5078125, "completions/mean_terminated_length": 512.5098266601562, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.0832, "grad_norm": 0.005696735344827175, "kl": 0.05327606201171875, "learning_rate": 3.3888888888888893e-06, "loss": 0.022, "num_tokens": 18199933.0, "reward": 0.4230232834815979, "reward_std": 0.22738784551620483, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7163605690002441, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.19062650203704834, "step": 78 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6093189964157706, "calib/avg_num_step_conf": 2.15625, "calib/ece": 0.2187450980392157, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.47843137254901963, "calib/gap": 0.10993628036638792, "calib/mean_conf": 0.7760784313725491, "calib/mu_c": 0.8161728395061728, "calib/mu_w": 0.7062365591397849, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.179764705882353, "calib/std_conf": 0.2641440369947661, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6336994219653179, "calib/step_q_c_n": 346.0, "calib/step_q_gap": 0.0751071889556092, "calib/step_q_w": 0.5585922330097087, "calib/step_q_w_n": 206.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 475.4921875, "completions/mean_terminated_length": 475.4921875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.08426666666666667, "grad_norm": 0.005750508978962898, "kl": 0.047275543212890625, "learning_rate": 3.3611111111111117e-06, "loss": -0.0058, "num_tokens": 18428035.0, "reward": 0.42249900102615356, "reward_std": 0.21679085493087769, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7268054485321045, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.20758873224258423, "step": 79 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6096211153682418, "calib/avg_num_step_conf": 2.63671875, "calib/ece": 0.2477647058823531, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5803921568627451, "calib/gap": 0.037711792252022036, "calib/mean_conf": 0.8214117647058824, "calib/mu_c": 0.8333908045977011, "calib/mu_w": 0.7956790123456791, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1934117647058825, "calib/std_conf": 0.2444817789810696, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6234232209737828, "calib/step_q_c_n": 445.0, "calib/step_q_gap": 0.053944960104217565, "calib/step_q_w": 0.5694782608695652, "calib/step_q_w_n": 230.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 409.6953125, "completions/mean_terminated_length": 409.6953125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.08533333333333333, "grad_norm": 0.005894418340176344, "kl": 0.0623931884765625, "learning_rate": 3.3333333333333333e-06, "loss": 0.018, "num_tokens": 18635077.0, "reward": 0.4095434546470642, "reward_std": 0.21538078784942627, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7176773548126221, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.23374667763710022, "step": 80 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.623547335600907, "calib/avg_num_step_conf": 2.45703125, "calib/ece": 0.24091269841269836, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5119047619047619, "calib/gap": 0.08642857142857152, "calib/mean_conf": 0.7660714285714286, "calib/mu_c": 0.7948809523809525, "calib/mu_w": 0.708452380952381, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1701587301587301, "calib/std_conf": 0.2805678108501771, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6212747875354108, "calib/step_q_c_n": 353.0, "calib/step_q_gap": 0.1786298599991788, "calib/step_q_w": 0.44264492753623197, "calib/step_q_w_n": 276.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2770.0, "completions/max_terminated_length": 2770.0, "completions/mean_length": 441.99609375, "completions/mean_terminated_length": 445.47637939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.0864, "grad_norm": 0.006212920416146517, "kl": 0.050567626953125, "learning_rate": 3.3055555555555558e-06, "loss": 0.0056, "num_tokens": 18854476.0, "reward": 0.4199260473251343, "reward_std": 0.2087436318397522, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7162222266197205, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.2044951617717743, "step": 81 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7415920010388262, "calib/avg_num_step_conf": 2.515625, "calib/ece": 0.23395256916996054, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5217391304347826, "calib/gap": 0.15813465783664438, "calib/mean_conf": 0.8060474308300396, "calib/mu_c": 0.8698013245033113, "calib/mu_w": 0.7116666666666669, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22158102766798424, "calib/std_conf": 0.2462384800445783, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6318965517241381, "calib/step_q_c_n": 348.0, "calib/step_q_gap": 0.1533830382106246, "calib/step_q_w": 0.47851351351351346, "calib/step_q_w_n": 296.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2878.0, "completions/max_terminated_length": 2878.0, "completions/mean_length": 427.6484375, "completions/mean_terminated_length": 427.6484375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.08746666666666666, "grad_norm": 0.006339720916002989, "kl": 0.059291839599609375, "learning_rate": 3.277777777777778e-06, "loss": 0.0334, "num_tokens": 19069506.0, "reward": 0.4325372874736786, "reward_std": 0.19591914117336273, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7225097417831421, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.17306017875671387, "step": 82 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7263938124395747, "calib/avg_num_step_conf": 2.27734375, "calib/ece": 0.2421428571428571, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5198412698412699, "calib/gap": 0.2121882049629391, "calib/mean_conf": 0.7561111111111112, "calib/mu_c": 0.8462068965517241, "calib/mu_w": 0.634018691588785, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2114285714285714, "calib/std_conf": 0.30521257807149943, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6393235294117647, "calib/step_q_c_n": 340.0, "calib/step_q_gap": 0.16500254175744378, "calib/step_q_w": 0.47432098765432096, "calib/step_q_w_n": 243.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 461.54296875, "completions/mean_terminated_length": 465.1771545410156, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.08853333333333334, "grad_norm": 0.005662387236952782, "kl": 0.052764892578125, "learning_rate": 3.2500000000000002e-06, "loss": 0.0617, "num_tokens": 19294925.0, "reward": 0.42235541343688965, "reward_std": 0.21044637262821198, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7220921516418457, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.18753761053085327, "step": 83 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6842507645259939, "calib/avg_num_step_conf": 2.26171875, "calib/ece": 0.20549407114624516, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.40711462450592883, "calib/gap": 0.17332441386340436, "calib/mean_conf": 0.7326877470355732, "calib/mu_c": 0.8073611111111111, "calib/mu_w": 0.6340366972477067, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18450592885375502, "calib/std_conf": 0.28295848333032897, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.662012012012012, "calib/step_q_c_n": 333.0, "calib/step_q_gap": 0.1669099794916868, "calib/step_q_w": 0.4951020325203252, "calib/step_q_w_n": 246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1560.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 412.18359375, "completions/mean_terminated_length": 413.8000183105469, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.0896, "grad_norm": 0.006523535121232271, "kl": 0.056682586669921875, "learning_rate": 3.2222222222222227e-06, "loss": 0.0174, "num_tokens": 19506364.0, "reward": 0.40939784049987793, "reward_std": 0.222348153591156, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7226859331130981, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.21404653787612915, "step": 84 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7190209790209792, "calib/avg_num_step_conf": 2.66015625, "calib/ece": 0.21930041152263377, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.448559670781893, "calib/gap": 0.17769020979020966, "calib/mean_conf": 0.7566666666666667, "calib/mu_c": 0.8297902097902098, "calib/mu_w": 0.6521000000000001, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.19374485596707822, "calib/std_conf": 0.27149032374059606, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.6223906705539358, "calib/step_q_c_n": 343.0, "calib/step_q_gap": 0.16008297824624346, "calib/step_q_w": 0.46230769230769236, "calib/step_q_w_n": 338.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2584.0, "completions/max_terminated_length": 2584.0, "completions/mean_length": 482.578125, "completions/mean_terminated_length": 492.1912536621094, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.09066666666666667, "grad_norm": 0.005401519127190113, "kl": 0.051334381103515625, "learning_rate": 3.1944444444444443e-06, "loss": 0.0206, "num_tokens": 19737728.0, "reward": 0.42077910900115967, "reward_std": 0.20061340928077698, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6981011629104614, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": -0.1573241800069809, "step": 85 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7217864923747277, "calib/avg_num_step_conf": 2.390625, "calib/ece": 0.16023622047244102, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2992125984251969, "calib/gap": 0.23603112356053513, "calib/mean_conf": 0.6639370078740157, "calib/mu_c": 0.7745185185185184, "calib/mu_w": 0.5384873949579833, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.14633858267716543, "calib/std_conf": 0.30225180481232367, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.6139563862928349, "calib/step_q_c_n": 321.0, "calib/step_q_gap": 0.10969521790795511, "calib/step_q_w": 0.5042611683848798, "calib/step_q_w_n": 291.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2180.0, "completions/max_terminated_length": 2180.0, "completions/mean_length": 456.72265625, "completions/mean_terminated_length": 458.5137634277344, "completions/min_length": 0.0, "completions/min_terminated_length": 71.0, "epoch": 0.09173333333333333, "grad_norm": 0.00633582565933466, "kl": 0.056629180908203125, "learning_rate": 3.1666666666666667e-06, "loss": 0.0374, "num_tokens": 19960161.0, "reward": 0.43442198634147644, "reward_std": 0.18732504546642303, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7422605752944946, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.17419779300689697, "step": 86 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6566960218645612, "calib/avg_num_step_conf": 2.45703125, "calib/ece": 0.18626984126984128, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4642857142857143, "calib/gap": 0.10433647130276336, "calib/mean_conf": 0.793968253968254, "calib/mu_c": 0.8246067415730336, "calib/mu_w": 0.7202702702702702, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.13694444444444448, "calib/std_conf": 0.2305981847320427, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6517913832199547, "calib/step_q_c_n": 441.0, "calib/step_q_gap": 0.11157861726250784, "calib/step_q_w": 0.5402127659574468, "calib/step_q_w_n": 188.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 416.15234375, "completions/mean_terminated_length": 421.08697509765625, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.0928, "grad_norm": 0.006363210268318653, "kl": 0.05908203125, "learning_rate": 3.138888888888889e-06, "loss": 0.035, "num_tokens": 20172192.0, "reward": 0.4662480652332306, "reward_std": 0.17796066403388977, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7597273588180542, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.16160625219345093, "step": 87 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7926428571428571, "calib/avg_num_step_conf": 2.234375, "calib/ece": 0.13345098039215686, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5254901960784314, "calib/gap": 0.28195357142857136, "calib/mean_conf": 0.7933725490196079, "calib/mu_c": 0.8818285714285714, "calib/mu_w": 0.599875, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12027450980392158, "calib/std_conf": 0.26565272633863474, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6617391304347826, "calib/step_q_c_n": 368.0, "calib/step_q_gap": 0.18865089514066502, "calib/step_q_w": 0.4730882352941176, "calib/step_q_w_n": 204.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 440.140625, "completions/mean_terminated_length": 440.140625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.09386666666666667, "grad_norm": 0.0057291858829557896, "kl": 0.0485076904296875, "learning_rate": 3.1111111111111116e-06, "loss": 0.0265, "num_tokens": 20394716.0, "reward": 0.5039481520652771, "reward_std": 0.183674156665802, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.8173171877861023, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.14457716047763824, "step": 88 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7872451306103837, "calib/avg_num_step_conf": 1.90625, "calib/ece": 0.27201612903225797, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.43951612903225806, "calib/gap": 0.21800664451827267, "calib/mean_conf": 0.7491129032258066, "calib/mu_c": 0.8537209302325584, "calib/mu_w": 0.6357142857142857, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.25048387096774183, "calib/std_conf": 0.27707723418656127, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7401333333333332, "calib/step_q_c_n": 225.0, "calib/step_q_gap": 0.21549455006337115, "calib/step_q_w": 0.524638783269962, "calib/step_q_w_n": 263.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 494.5859375, "completions/mean_terminated_length": 502.4365234375, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.09493333333333333, "grad_norm": 0.005575645249336958, "kl": 0.04261016845703125, "learning_rate": 3.0833333333333336e-06, "loss": 0.0259, "num_tokens": 20630218.0, "reward": 0.4111081659793854, "reward_std": 0.18856662511825562, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7072281241416931, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": -0.17954307794570923, "step": 89 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6472014260249553, "calib/avg_num_step_conf": 3.0078125, "calib/ece": 0.18515999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.54, "calib/gap": 0.15693761140819973, "calib/mean_conf": 0.80252, "calib/mu_c": 0.8558787878787879, "calib/mu_w": 0.6989411764705882, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.16383999999999996, "calib/std_conf": 0.2593585348508894, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6243856332703214, "calib/step_q_c_n": 529.0, "calib/step_q_gap": 0.08670928472260364, "calib/step_q_w": 0.5376763485477177, "calib/step_q_w_n": 241.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 476.01171875, "completions/mean_terminated_length": 481.6561584472656, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.096, "grad_norm": 0.005511470139026642, "kl": 0.050212860107421875, "learning_rate": 3.055555555555556e-06, "loss": 0.0152, "num_tokens": 20855397.0, "reward": 0.4338238537311554, "reward_std": 0.20431192219257355, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7368851900100708, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.19267499446868896, "step": 90 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6859926470588235, "calib/avg_num_step_conf": 2.53515625, "calib/ece": 0.22027999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.592, "calib/gap": 0.10050735294117663, "calib/mean_conf": 0.83172, "calib/mu_c": 0.8638823529411765, "calib/mu_w": 0.7633749999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.18599999999999997, "calib/std_conf": 0.2354787497843489, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6749728260869565, "calib/step_q_c_n": 368.0, "calib/step_q_gap": 0.14499738124709882, "calib/step_q_w": 0.5299754448398577, "calib/step_q_w_n": 281.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2463.0, "completions/max_terminated_length": 2463.0, "completions/mean_length": 464.1875, "completions/mean_terminated_length": 466.00787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.09706666666666666, "grad_norm": 0.09468050301074982, "kl": 0.8864479064941406, "learning_rate": 3.0277777777777776e-06, "loss": 0.0433, "num_tokens": 21081941.0, "reward": 0.4435563385486603, "reward_std": 0.20487895607948303, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7295480370521545, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.16977913677692413, "step": 91 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7207602339181286, "calib/avg_num_step_conf": 2.40234375, "calib/ece": 0.21690196078431362, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6588235294117647, "calib/gap": 0.15466165413533828, "calib/mean_conf": 0.838, "calib/mu_c": 0.8889473684210526, "calib/mu_w": 0.7342857142857143, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1921568627450979, "calib/std_conf": 0.2520790414478262, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7228042328042329, "calib/step_q_c_n": 378.0, "calib/step_q_gap": 0.1458000133949502, "calib/step_q_w": 0.5770042194092827, "calib/step_q_w_n": 237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 417.71484375, "completions/mean_terminated_length": 419.35296630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.09813333333333334, "grad_norm": 0.006382127292454243, "kl": 0.05200958251953125, "learning_rate": 3e-06, "loss": -0.0076, "num_tokens": 21295596.0, "reward": 0.4563332200050354, "reward_std": 0.20540866255760193, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7529066801071167, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.17305275797843933, "step": 92 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6466410373362872, "calib/avg_num_step_conf": 2.78515625, "calib/ece": 0.26186507936507925, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5793650793650794, "calib/gap": 0.1254623053365479, "calib/mean_conf": 0.8167063492063492, "calib/mu_c": 0.867986577181208, "calib/mu_w": 0.7425242718446601, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.24365079365079353, "calib/std_conf": 0.2537728507956622, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7077556818181818, "calib/step_q_c_n": 352.0, "calib/step_q_gap": 0.14222474922353723, "calib/step_q_w": 0.5655309325946446, "calib/step_q_w_n": 361.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2864.0, "completions/max_terminated_length": 2864.0, "completions/mean_length": 479.4375, "completions/mean_terminated_length": 479.4375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.0992, "grad_norm": 0.006033513229340315, "kl": 0.053585052490234375, "learning_rate": 2.9722222222222225e-06, "loss": 0.0558, "num_tokens": 21524108.0, "reward": 0.40865015983581543, "reward_std": 0.25605103373527527, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6888468265533447, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.18404650688171387, "step": 93 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6965517241379311, "calib/avg_num_step_conf": 2.5546875, "calib/ece": 0.2849402390438247, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6095617529880478, "calib/gap": 0.13012296681847768, "calib/mean_conf": 0.8539442231075699, "calib/mu_c": 0.9088965517241381, "calib/mu_w": 0.7787735849056604, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.28059760956175295, "calib/std_conf": 0.2095812103732773, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7510869565217392, "calib/step_q_c_n": 322.0, "calib/step_q_gap": 0.19261858302776314, "calib/step_q_w": 0.558468373493976, "calib/step_q_w_n": 332.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2944.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 431.9453125, "completions/mean_terminated_length": 431.9453125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.10026666666666667, "grad_norm": 0.0064744469709694386, "kl": 0.056850433349609375, "learning_rate": 2.944444444444445e-06, "loss": 0.0457, "num_tokens": 21743366.0, "reward": 0.39161694049835205, "reward_std": 0.24791404604911804, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6812667846679688, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.2058454155921936, "step": 94 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6644457135605051, "calib/avg_num_step_conf": 2.171875, "calib/ece": 0.2740625000000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.63671875, "calib/gap": 0.13748461196776463, "calib/mean_conf": 0.8322656250000001, "calib/mu_c": 0.8875816993464053, "calib/mu_w": 0.7500970873786407, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2543359375000001, "calib/std_conf": 0.24190879881343585, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7588580246913581, "calib/step_q_c_n": 324.0, "calib/step_q_gap": 0.18584078331204779, "calib/step_q_w": 0.5730172413793103, "calib/step_q_w_n": 232.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 427.07421875, "completions/mean_terminated_length": 428.7490539550781, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.10133333333333333, "grad_norm": 0.00599477207288146, "kl": 0.05338287353515625, "learning_rate": 2.916666666666667e-06, "loss": 0.0132, "num_tokens": 21958825.0, "reward": 0.4048115015029907, "reward_std": 0.2196621149778366, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7120952606201172, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.22200357913970947, "step": 95 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7623239436619718, "calib/avg_num_step_conf": 2.2734375, "calib/ece": 0.1780392156862746, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7372549019607844, "calib/gap": 0.16595606246172678, "calib/mean_conf": 0.8996078431372551, "calib/mu_c": 0.9458152173913044, "calib/mu_w": 0.7798591549295776, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1780392156862746, "calib/std_conf": 0.17185904831432297, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8053508951406649, "calib/step_q_c_n": 391.0, "calib/step_q_gap": 0.22629330351762833, "calib/step_q_w": 0.5790575916230366, "calib/step_q_w_n": 191.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2568.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 389.19140625, "completions/mean_terminated_length": 389.19140625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.1024, "grad_norm": 0.006308667361736298, "kl": 0.062023162841796875, "learning_rate": 2.888888888888889e-06, "loss": 0.0673, "num_tokens": 22164274.0, "reward": 0.4969533085823059, "reward_std": 0.19700340926647186, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.8014000654220581, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.15046215057373047, "step": 96 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6387672505307854, "calib/avg_num_step_conf": 2.46875, "calib/ece": 0.24383399209486162, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6245059288537549, "calib/gap": 0.12807523885350303, "calib/mean_conf": 0.8454150197628459, "calib/mu_c": 0.8940127388535031, "calib/mu_w": 0.7659375, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.23434782608695648, "calib/std_conf": 0.22668152009606526, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7184895833333332, "calib/step_q_c_n": 384.0, "calib/step_q_gap": 0.09064079301075256, "calib/step_q_w": 0.6278487903225807, "calib/step_q_w_n": 248.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2265.0, "completions/max_terminated_length": 2265.0, "completions/mean_length": 417.40234375, "completions/mean_terminated_length": 420.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.10346666666666667, "grad_norm": 0.006150359287858009, "kl": 0.05887603759765625, "learning_rate": 2.861111111111111e-06, "loss": 0.0149, "num_tokens": 22376201.0, "reward": 0.40991461277008057, "reward_std": 0.21251186728477478, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7105230093002319, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.21022510528564453, "step": 97 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6510576923076923, "calib/avg_num_step_conf": 2.1171875, "calib/ece": 0.2783070866141732, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6732283464566929, "calib/gap": 0.15268333333333317, "calib/mean_conf": 0.8364173228346456, "calib/mu_c": 0.8989333333333331, "calib/mu_w": 0.74625, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2620866141732283, "calib/std_conf": 0.25232025459923296, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7754846416382253, "calib/step_q_c_n": 293.0, "calib/step_q_gap": 0.17363725207999225, "calib/step_q_w": 0.601847389558233, "calib/step_q_w_n": 249.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 432.28125, "completions/mean_terminated_length": 432.28125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.10453333333333334, "grad_norm": 0.00583996158093214, "kl": 0.0562286376953125, "learning_rate": 2.8333333333333335e-06, "loss": 0.0185, "num_tokens": 22593049.0, "reward": 0.39500892162323, "reward_std": 0.22828838229179382, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6983347535133362, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.22237937152385712, "step": 98 }, { "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6685130992196208, "calib/avg_num_step_conf": 2.5078125, "calib/ece": 0.3617355371900826, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5206611570247934, "calib/gap": 0.16863154960981064, "calib/mean_conf": 0.757107438016529, "calib/mu_c": 0.8532692307692309, "calib/mu_w": 0.6846376811594203, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.34454545454545454, "calib/std_conf": 0.2961388941956455, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.6900972, "calib/step_q_c_n": 250.0, "calib/step_q_gap": 0.15017219999999998, "calib/step_q_w": 0.539925, "calib/step_q_w_n": 392.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 471.0703125, "completions/mean_terminated_length": 490.219482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.1056, "grad_norm": 0.006908495910465717, "kl": 0.0541229248046875, "learning_rate": 2.805555555555556e-06, "loss": -0.0288, "num_tokens": 22819443.0, "reward": 0.30328214168548584, "reward_std": 0.2629527449607849, "rewards/accuracy_reward_step": 0.40625, "rewards/final_brier_reward_step": 0.6006648540496826, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": -0.2628505229949951, "step": 99 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6835552426065804, "calib/avg_num_step_conf": 2.71484375, "calib/ece": 0.25205533596837937, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6047430830039525, "calib/gap": 0.1741339137114326, "calib/mean_conf": 0.8114229249011857, "calib/mu_c": 0.8850684931506849, "calib/mu_w": 0.7109345794392523, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2432015810276679, "calib/std_conf": 0.25954115598178634, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6737652811735941, "calib/step_q_c_n": 409.0, "calib/step_q_gap": 0.09583171474002761, "calib/step_q_w": 0.5779335664335665, "calib/step_q_w_n": 286.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 455.66015625, "completions/mean_terminated_length": 455.66015625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.10666666666666667, "grad_norm": 0.006140739191323519, "kl": 0.05904388427734375, "learning_rate": 2.7777777777777783e-06, "loss": 0.0308, "num_tokens": 23043500.0, "reward": 0.400008887052536, "reward_std": 0.22616682946681976, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7102363109588623, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.22193726897239685, "step": 100 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7193985996499124, "calib/avg_num_step_conf": 2.7734375, "calib/ece": 0.33383399209486164, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6126482213438735, "calib/gap": 0.16348462115528872, "calib/mean_conf": 0.8145454545454545, "calib/mu_c": 0.8979032258064515, "calib/mu_w": 0.7344186046511628, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.32913043478260867, "calib/std_conf": 0.2489287143960703, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6824866449511401, "calib/step_q_c_n": 307.0, "calib/step_q_gap": 0.12615910152682241, "calib/step_q_w": 0.5563275434243177, "calib/step_q_w_n": 403.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 467.17578125, "completions/mean_terminated_length": 469.00787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.10773333333333333, "grad_norm": 0.005982357542961836, "kl": 0.059112548828125, "learning_rate": 2.7500000000000004e-06, "loss": -0.0133, "num_tokens": 23270089.0, "reward": 0.35126781463623047, "reward_std": 0.25247329473495483, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6566468477249146, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.24786126613616943, "step": 101 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7154881060704537, "calib/avg_num_step_conf": 3.0078125, "calib/ece": 0.2201176470588235, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5647058823529412, "calib/gap": 0.18852398284154415, "calib/mean_conf": 0.8247450980392158, "calib/mu_c": 0.8971974522292993, "calib/mu_w": 0.7086734693877551, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21458823529411764, "calib/std_conf": 0.23791379581831032, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.715117994100295, "calib/step_q_c_n": 452.0, "calib/step_q_gap": 0.15436013246507474, "calib/step_q_w": 0.5607578616352202, "calib/step_q_w_n": 318.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 383.58984375, "completions/mean_terminated_length": 385.0941467285156, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.1088, "grad_norm": 0.006579132750630379, "kl": 0.07131195068359375, "learning_rate": 2.7222222222222224e-06, "loss": -0.025, "num_tokens": 23474984.0, "reward": 0.43535923957824707, "reward_std": 0.1856844127178192, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.749351978302002, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.2005085051059723, "step": 102 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6476994372724263, "calib/avg_num_step_conf": 2.49609375, "calib/ece": 0.2566929133858268, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5551181102362205, "calib/gap": 0.11292287322078787, "calib/mean_conf": 0.8185826771653544, "calib/mu_c": 0.8608176100628931, "calib/mu_w": 0.7478947368421053, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22464566929133858, "calib/std_conf": 0.2499644838061726, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7548011363636365, "calib/step_q_c_n": 352.0, "calib/step_q_gap": 0.20926106667722544, "calib/step_q_w": 0.5455400696864111, "calib/step_q_w_n": 287.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 466.2734375, "completions/mean_terminated_length": 466.2734375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.10986666666666667, "grad_norm": 0.0054487548768520355, "kl": 0.05999755859375, "learning_rate": 2.6944444444444444e-06, "loss": 0.0057, "num_tokens": 23698902.0, "reward": 0.4195878803730011, "reward_std": 0.216641366481781, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7135539054870605, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.19703443348407745, "step": 103 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6941060661990894, "calib/avg_num_step_conf": 3.2734375, "calib/ece": 0.2768627450980392, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.43529411764705883, "calib/gap": 0.17975636766334446, "calib/mean_conf": 0.751921568627451, "calib/mu_c": 0.8428571428571429, "calib/mu_w": 0.6631007751937984, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2673333333333333, "calib/std_conf": 0.2707546348262433, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6844289044289045, "calib/step_q_c_n": 429.0, "calib/step_q_gap": 0.1231086110303714, "calib/step_q_w": 0.5613202933985331, "calib/step_q_w_n": 409.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1167.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 450.63671875, "completions/mean_terminated_length": 452.4039611816406, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.11093333333333333, "grad_norm": 0.006285542622208595, "kl": 0.06972503662109375, "learning_rate": 2.666666666666667e-06, "loss": -0.0145, "num_tokens": 23920945.0, "reward": 0.38449519872665405, "reward_std": 0.21741212904453278, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6973944902420044, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.22606037557125092, "step": 104 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6722027972027972, "calib/avg_num_step_conf": 3.4921875, "calib/ece": 0.2310588235294118, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4627450980392157, "calib/gap": 0.17593531468531465, "calib/mean_conf": 0.7574117647058823, "calib/mu_c": 0.8346853146853147, "calib/mu_w": 0.6587500000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.21384313725490198, "calib/std_conf": 0.2732526789040839, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6315923566878981, "calib/step_q_c_n": 471.0, "calib/step_q_gap": 0.126320489075605, "calib/step_q_w": 0.5052718676122931, "calib/step_q_w_n": 423.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1407.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 467.0078125, "completions/mean_terminated_length": 468.8392333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.112, "grad_norm": 0.00584400026127696, "kl": 0.0659332275390625, "learning_rate": 2.6388888888888893e-06, "loss": 0.019, "num_tokens": 24146259.0, "reward": 0.413488507270813, "reward_std": 0.2676308751106262, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.718665599822998, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.2010636329650879, "step": 105 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7931652414486923, "calib/avg_num_step_conf": 3.0078125, "calib/ece": 0.23413385826771643, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5393700787401575, "calib/gap": 0.2596755533199194, "calib/mean_conf": 0.7857086614173228, "calib/mu_c": 0.9002112676056336, "calib/mu_w": 0.6405357142857142, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23039370078740148, "calib/std_conf": 0.2775328769695504, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7003703703703702, "calib/step_q_c_n": 378.0, "calib/step_q_gap": 0.18276832955404376, "calib/step_q_w": 0.5176020408163264, "calib/step_q_w_n": 392.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2464.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 443.4296875, "completions/mean_terminated_length": 445.16864013671875, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.11306666666666666, "grad_norm": 0.005779189057648182, "kl": 0.0670166015625, "learning_rate": 2.6111111111111113e-06, "loss": 0.0032, "num_tokens": 24364361.0, "reward": 0.4434106647968292, "reward_std": 0.17971467971801758, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7472339868545532, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.1697876900434494, "step": 106 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6255549121791161, "calib/avg_num_step_conf": 3.34375, "calib/ece": 0.2551953124999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.53515625, "calib/gap": 0.07818374831113672, "calib/mean_conf": 0.7985546875, "calib/mu_c": 0.8287898089171973, "calib/mu_w": 0.7506060606060606, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22023437499999984, "calib/std_conf": 0.2446566889475482, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6505349794238683, "calib/step_q_c_n": 486.0, "calib/step_q_gap": 0.0916160605049493, "calib/step_q_w": 0.558918918918919, "calib/step_q_w_n": 370.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1247.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 430.19921875, "completions/mean_terminated_length": 431.88629150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.11413333333333334, "grad_norm": 0.0060838377103209496, "kl": 0.075714111328125, "learning_rate": 2.5833333333333337e-06, "loss": 0.0091, "num_tokens": 24579108.0, "reward": 0.4143851101398468, "reward_std": 0.19063109159469604, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7057347297668457, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.19962078332901, "step": 107 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6504048904414099, "calib/avg_num_step_conf": 3.1953125, "calib/ece": 0.17356862745098045, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5686274509803921, "calib/gap": 0.10877500793902806, "calib/mean_conf": 0.8009411764705883, "calib/mu_c": 0.8295212765957446, "calib/mu_w": 0.7207462686567165, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11862745098039224, "calib/std_conf": 0.2620984257394281, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6984483430799221, "calib/step_q_c_n": 513.0, "calib/step_q_gap": 0.1552352283258237, "calib/step_q_w": 0.5432131147540984, "calib/step_q_w_n": 305.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 453.625, "completions/mean_terminated_length": 453.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1152, "grad_norm": 0.0059150392189621925, "kl": 0.07221221923828125, "learning_rate": 2.5555555555555557e-06, "loss": 0.0307, "num_tokens": 24798468.0, "reward": 0.4753592908382416, "reward_std": 0.20767410099506378, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7676601409912109, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.16147282719612122, "step": 108 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7909703091521274, "calib/avg_num_step_conf": 3.4453125, "calib/ece": 0.26847656250000007, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.390625, "calib/gap": 0.2549207223752681, "calib/mean_conf": 0.7273046875, "calib/mu_c": 0.8617355371900828, "calib/mu_w": 0.6068148148148147, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26156250000000003, "calib/std_conf": 0.27908360246622754, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6148711340206187, "calib/step_q_c_n": 388.0, "calib/step_q_gap": 0.13713834049835144, "calib/step_q_w": 0.4777327935222672, "calib/step_q_w_n": 494.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1371.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 456.94140625, "completions/mean_terminated_length": 458.7333679199219, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.11626666666666667, "grad_norm": 0.005637164227664471, "kl": 0.07299041748046875, "learning_rate": 2.5277777777777778e-06, "loss": 0.0214, "num_tokens": 25020045.0, "reward": 0.4240063428878784, "reward_std": 0.17739006876945496, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.735093355178833, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.18161195516586304, "step": 109 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6757722007722008, "calib/avg_num_step_conf": 3.3984375, "calib/ece": 0.17805533596837952, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.383399209486166, "calib/gap": 0.18335971685971686, "calib/mean_conf": 0.7340237154150197, "calib/mu_c": 0.8101216216216216, "calib/mu_w": 0.6267619047619047, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16354940711462457, "calib/std_conf": 0.2678300140641943, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6310274261603375, "calib/step_q_c_n": 474.0, "calib/step_q_gap": 0.12784560797851918, "calib/step_q_w": 0.5031818181818183, "calib/step_q_w_n": 396.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2620.0, "completions/max_terminated_length": 2620.0, "completions/mean_length": 429.2734375, "completions/mean_terminated_length": 430.9568786621094, "completions/min_length": 0.0, "completions/min_terminated_length": 84.0, "epoch": 0.11733333333333333, "grad_norm": 0.0061676702462136745, "kl": 0.07923126220703125, "learning_rate": 2.5e-06, "loss": 0.0147, "num_tokens": 25234859.0, "reward": 0.42947834730148315, "reward_std": 0.21718741953372955, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7396499514579773, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.19319328665733337, "step": 110 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7256516211061665, "calib/avg_num_step_conf": 2.921875, "calib/ece": 0.18086956521739125, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.43873517786561267, "calib/gap": 0.2383286713286712, "calib/mean_conf": 0.7358893280632411, "calib/mu_c": 0.8395104895104893, "calib/mu_w": 0.6011818181818182, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17577075098814227, "calib/std_conf": 0.29037882372774526, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6409819121447029, "calib/step_q_c_n": 387.0, "calib/step_q_gap": 0.15715919746326246, "calib/step_q_w": 0.48382271468144045, "calib/step_q_w_n": 361.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 439.7578125, "completions/mean_terminated_length": 441.4823913574219, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.1184, "grad_norm": 0.006372205447405577, "kl": 0.0758514404296875, "learning_rate": 2.4722222222222226e-06, "loss": 0.0532, "num_tokens": 25454845.0, "reward": 0.44394993782043457, "reward_std": 0.1870567500591278, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7490593791007996, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.17053453624248505, "step": 111 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7689470365699873, "calib/avg_num_step_conf": 2.81640625, "calib/ece": 0.16642857142857145, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.32142857142857145, "calib/gap": 0.2834804539722572, "calib/mean_conf": 0.6296825396825397, "calib/mu_c": 0.7669230769230769, "calib/mu_w": 0.48344262295081974, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14011904761904764, "calib/std_conf": 0.31763745058257475, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6238775510204082, "calib/step_q_c_n": 294.0, "calib/step_q_gap": 0.224135162261626, "calib/step_q_w": 0.3997423887587822, "calib/step_q_w_n": 427.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2621.0, "completions/max_terminated_length": 2621.0, "completions/mean_length": 480.234375, "completions/mean_terminated_length": 480.234375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.11946666666666667, "grad_norm": 0.006004595663398504, "kl": 0.0638885498046875, "learning_rate": 2.4444444444444447e-06, "loss": 0.0478, "num_tokens": 25685705.0, "reward": 0.44127148389816284, "reward_std": 0.2001304030418396, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7588914036750793, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.17322342097759247, "step": 112 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8027427531522163, "calib/avg_num_step_conf": 2.78515625, "calib/ece": 0.1334117647058823, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.4235294117647059, "calib/gap": 0.3040881320681138, "calib/mean_conf": 0.7180392156862746, "calib/mu_c": 0.834904458598726, "calib/mu_w": 0.5308163265306122, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11788235294117644, "calib/std_conf": 0.29488861046126724, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.60359375, "calib/step_q_c_n": 384.0, "calib/step_q_gap": 0.18702840045592717, "calib/step_q_w": 0.41656534954407287, "calib/step_q_w_n": 329.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 398.43359375, "completions/mean_terminated_length": 398.43359375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.12053333333333334, "grad_norm": 0.006771065294742584, "kl": 0.08171844482421875, "learning_rate": 2.4166666666666667e-06, "loss": 0.0509, "num_tokens": 25892904.0, "reward": 0.49621909856796265, "reward_std": 0.1860068142414093, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.8066890239715576, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.1361258625984192, "step": 113 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8219507242452957, "calib/avg_num_step_conf": 2.79296875, "calib/ece": 0.1100392156862745, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.403921568627451, "calib/gap": 0.3354501150670095, "calib/mean_conf": 0.7237647058823531, "calib/mu_c": 0.8408433734939758, "calib/mu_w": 0.5053932584269663, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.09141176470588235, "calib/std_conf": 0.2902006464929282, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6508450704225351, "calib/step_q_c_n": 426.0, "calib/step_q_gap": 0.2275924752668258, "calib/step_q_w": 0.42325259515570934, "calib/step_q_w_n": 289.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 414.73046875, "completions/mean_terminated_length": 414.73046875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.1216, "grad_norm": 0.006451824214309454, "kl": 0.068511962890625, "learning_rate": 2.388888888888889e-06, "loss": -0.006, "num_tokens": 26104099.0, "reward": 0.5097230076789856, "reward_std": 0.17535501718521118, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.8283358812332153, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.1362336277961731, "step": 114 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6537506367804381, "calib/avg_num_step_conf": 2.5625, "calib/ece": 0.195593725490196, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.39215686274509803, "calib/gap": 0.16332058074375955, "calib/mean_conf": 0.6910729411764706, "calib/mu_c": 0.757682119205298, "calib/mu_w": 0.5943615384615385, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14725490196078425, "calib/std_conf": 0.30040811066629775, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5597662337662338, "calib/step_q_c_n": 385.0, "calib/step_q_gap": 0.08283449944889049, "calib/step_q_w": 0.4769317343173433, "calib/step_q_w_n": 271.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1153.0, "completions/max_terminated_length": 1153.0, "completions/mean_length": 382.875, "completions/mean_terminated_length": 384.3764953613281, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.12266666666666666, "grad_norm": 0.006533203646540642, "kl": 0.0797576904296875, "learning_rate": 2.361111111111111e-06, "loss": 0.0256, "num_tokens": 26307379.0, "reward": 0.42707669734954834, "reward_std": 0.19787093997001648, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7344694137573242, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.1975034773349762, "step": 115 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7142095022908429, "calib/avg_num_step_conf": 2.83203125, "calib/ece": 0.21039370078740158, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3779527559055118, "calib/gap": 0.17867947028180498, "calib/mean_conf": 0.7144094488188977, "calib/mu_c": 0.7939007092198581, "calib/mu_w": 0.6152212389380531, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18484251968503934, "calib/std_conf": 0.28248518473675605, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6328875379939211, "calib/step_q_c_n": 329.0, "calib/step_q_gap": 0.184056729913113, "calib/step_q_w": 0.44883080808080805, "calib/step_q_w_n": 396.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 446.05859375, "completions/mean_terminated_length": 447.807861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 92.0, "epoch": 0.12373333333333333, "grad_norm": 0.006134877912700176, "kl": 0.06531524658203125, "learning_rate": 2.3333333333333336e-06, "loss": -0.019, "num_tokens": 26526090.0, "reward": 0.4186154007911682, "reward_std": 0.2001335471868515, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.730369508266449, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.20173242688179016, "step": 116 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7597947761194029, "calib/avg_num_step_conf": 2.48828125, "calib/ece": 0.2484645669291338, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.32677165354330706, "calib/gap": 0.23092910447761184, "calib/mean_conf": 0.7149212598425199, "calib/mu_c": 0.83675, "calib/mu_w": 0.6058208955223882, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24547244094488183, "calib/std_conf": 0.2675887403413453, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6150537634408603, "calib/step_q_c_n": 279.0, "calib/step_q_gap": 0.11784705953024577, "calib/step_q_w": 0.4972067039106145, "calib/step_q_w_n": 358.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2332.0, "completions/max_terminated_length": 2332.0, "completions/mean_length": 427.9453125, "completions/mean_terminated_length": 427.9453125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.1248, "grad_norm": 0.005825493484735489, "kl": 0.06906890869140625, "learning_rate": 2.305555555555556e-06, "loss": 0.0049, "num_tokens": 26742244.0, "reward": 0.40940552949905396, "reward_std": 0.20684045553207397, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7297269701957703, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.20310339331626892, "step": 117 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.714480182347727, "calib/avg_num_step_conf": 3.05859375, "calib/ece": 0.14988235294117655, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.3843137254901961, "calib/gap": 0.21608079017348358, "calib/mean_conf": 0.7328627450980392, "calib/mu_c": 0.8226845637583894, "calib/mu_w": 0.6066037735849058, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14921568627450987, "calib/std_conf": 0.27010971043428966, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5166139954853274, "calib/step_q_c_n": 443.0, "calib/step_q_gap": 0.05520223077944503, "calib/step_q_w": 0.46141176470588235, "calib/step_q_w_n": 340.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 437.41796875, "completions/mean_terminated_length": 439.13336181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.12586666666666665, "grad_norm": 0.005717393942177296, "kl": 0.069976806640625, "learning_rate": 2.277777777777778e-06, "loss": -0.0478, "num_tokens": 26958231.0, "reward": 0.43301814794540405, "reward_std": 0.18237316608428955, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7607734203338623, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.20958086848258972, "step": 118 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7422657952069718, "calib/avg_num_step_conf": 2.39453125, "calib/ece": 0.2096456692913386, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.421259842519685, "calib/gap": 0.24138748832866486, "calib/mean_conf": 0.708464566929134, "calib/mu_c": 0.8215555555555556, "calib/mu_w": 0.5801680672268907, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19330708661417328, "calib/std_conf": 0.30811658738481257, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6629113924050634, "calib/step_q_c_n": 237.0, "calib/step_q_gap": 0.2757039455965527, "calib/step_q_w": 0.38720744680851066, "calib/step_q_w_n": 376.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 458.72265625, "completions/mean_terminated_length": 460.5216064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 81.0, "epoch": 0.12693333333333334, "grad_norm": 0.005962693598121405, "kl": 0.0665283203125, "learning_rate": 2.25e-06, "loss": 0.0114, "num_tokens": 27180728.0, "reward": 0.43463796377182007, "reward_std": 0.2322581708431244, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7391331791877747, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.17376351356506348, "step": 119 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7753120224146716, "calib/avg_num_step_conf": 2.40625, "calib/ece": 0.15745098039215694, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.43529411764705883, "calib/gap": 0.2925458481915436, "calib/mean_conf": 0.7361176470588237, "calib/mu_c": 0.855430463576159, "calib/mu_w": 0.5628846153846154, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15070588235294125, "calib/std_conf": 0.3018352782210479, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6148901098901098, "calib/step_q_c_n": 364.0, "calib/step_q_gap": 0.12766788766788761, "calib/step_q_w": 0.4872222222222222, "calib/step_q_w_n": 252.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2082.0, "completions/max_terminated_length": 2082.0, "completions/mean_length": 404.77734375, "completions/mean_terminated_length": 404.77734375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.128, "grad_norm": 0.006345272064208984, "kl": 0.071075439453125, "learning_rate": 2.222222222222222e-06, "loss": 0.0323, "num_tokens": 27391039.0, "reward": 0.47319233417510986, "reward_std": 0.17949917912483215, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7807586193084717, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.14999887347221375, "step": 120 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6862980769230769, "calib/avg_num_step_conf": 2.265625, "calib/ece": 0.22480468749999996, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.1558653846153848, "calib/mean_conf": 0.7729296875, "calib/mu_c": 0.83625, "calib/mu_w": 0.6803846153846153, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20199218749999995, "calib/std_conf": 0.2713587248572493, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6637457044673539, "calib/step_q_c_n": 291.0, "calib/step_q_gap": 0.1868252892424404, "calib/step_q_w": 0.4769204152249135, "calib/step_q_w_n": 289.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 426.74609375, "completions/mean_terminated_length": 428.4196472167969, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.12906666666666666, "grad_norm": 0.006708329077810049, "kl": 0.067962646484375, "learning_rate": 2.1944444444444445e-06, "loss": -0.0081, "num_tokens": 27605342.0, "reward": 0.42762500047683716, "reward_std": 0.23545682430267334, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7204293012619019, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.182366743683815, "step": 121 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7024892759651632, "calib/avg_num_step_conf": 2.265625, "calib/ece": 0.18407843137254898, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5764705882352941, "calib/gap": 0.21602495775380204, "calib/mean_conf": 0.7934117647058825, "calib/mu_c": 0.8764331210191082, "calib/mu_w": 0.6604081632653062, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18090196078431367, "calib/std_conf": 0.275026784896692, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6653571428571429, "calib/step_q_c_n": 308.0, "calib/step_q_gap": 0.2497321428571429, "calib/step_q_w": 0.41562499999999997, "calib/step_q_w_n": 272.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2160.0, "completions/max_terminated_length": 2160.0, "completions/mean_length": 409.15234375, "completions/mean_terminated_length": 409.15234375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.13013333333333332, "grad_norm": 0.0067081572487950325, "kl": 0.08014678955078125, "learning_rate": 2.166666666666667e-06, "loss": -0.0015, "num_tokens": 27817429.0, "reward": 0.4447178244590759, "reward_std": 0.26026856899261475, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.748012900352478, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.17888976633548737, "step": 122 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6532738095238096, "calib/avg_num_step_conf": 2.5546875, "calib/ece": 0.25507874015748033, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.421259842519685, "calib/gap": 0.16622519841269845, "calib/mean_conf": 0.7196456692913387, "calib/mu_c": 0.8034126984126985, "calib/mu_w": 0.6371875, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.2393307086614173, "calib/std_conf": 0.29290607528342233, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5598722044728435, "calib/step_q_c_n": 313.0, "calib/step_q_gap": 0.11740886136433909, "calib/step_q_w": 0.4424633431085044, "calib/step_q_w_n": 341.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 486.5703125, "completions/mean_terminated_length": 488.47845458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.1312, "grad_norm": 0.00602144468575716, "kl": 0.06238555908203125, "learning_rate": 2.138888888888889e-06, "loss": -0.0127, "num_tokens": 28047279.0, "reward": 0.38009756803512573, "reward_std": 0.2720668613910675, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6759315729141235, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": -0.2079240083694458, "step": 123 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7034875966924513, "calib/avg_num_step_conf": 2.30859375, "calib/ece": 0.18203921568627454, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4980392156862745, "calib/gap": 0.17056881835156024, "calib/mean_conf": 0.7839215686274511, "calib/mu_c": 0.8454601226993863, "calib/mu_w": 0.6748913043478261, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16337254901960788, "calib/std_conf": 0.2511673017260864, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6243681318681319, "calib/step_q_c_n": 364.0, "calib/step_q_gap": 0.09388355037033458, "calib/step_q_w": 0.5304845814977973, "calib/step_q_w_n": 227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1061.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 403.75390625, "completions/mean_terminated_length": 405.3372802734375, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.13226666666666667, "grad_norm": 0.006141428370028734, "kl": 0.0720672607421875, "learning_rate": 2.1111111111111114e-06, "loss": -0.0239, "num_tokens": 28257456.0, "reward": 0.44975244998931885, "reward_std": 0.19885006546974182, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7609667778015137, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.1880244016647339, "step": 124 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6631318818818819, "calib/avg_num_step_conf": 1.9375, "calib/ece": 0.21301960784313734, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.47058823529411764, "calib/gap": 0.18230480480480493, "calib/mean_conf": 0.7492549019607844, "calib/mu_c": 0.8286111111111112, "calib/mu_w": 0.6463063063063063, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19878431372549027, "calib/std_conf": 0.2917127141110068, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6556554307116105, "calib/step_q_c_n": 267.0, "calib/step_q_gap": 0.09836285429239644, "calib/step_q_w": 0.557292576419214, "calib/step_q_w_n": 229.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 439.15234375, "completions/mean_terminated_length": 440.8745422363281, "completions/min_length": 0.0, "completions/min_terminated_length": 84.0, "epoch": 0.13333333333333333, "grad_norm": 0.006332618184387684, "kl": 0.06296539306640625, "learning_rate": 2.0833333333333334e-06, "loss": 0.0098, "num_tokens": 28474687.0, "reward": 0.4046638011932373, "reward_std": 0.2163892388343811, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7109558582305908, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.21100321412086487, "step": 125 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7924464417631127, "calib/avg_num_step_conf": 2.453125, "calib/ece": 0.20345098039215695, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5098039215686274, "calib/gap": 0.35185360748584105, "calib/mean_conf": 0.7103529411764706, "calib/mu_c": 0.8814503816793895, "calib/mu_w": 0.5295967741935484, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20003921568627459, "calib/std_conf": 0.33382905238585764, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6417880794701988, "calib/step_q_c_n": 302.0, "calib/step_q_gap": 0.23789237394872637, "calib/step_q_w": 0.4038957055214724, "calib/step_q_w_n": 326.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2490.0, "completions/max_terminated_length": 2490.0, "completions/mean_length": 437.0, "completions/mean_terminated_length": 437.0, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.1344, "grad_norm": 0.005750678479671478, "kl": 0.0669403076171875, "learning_rate": 2.0555555555555555e-06, "loss": -0.0252, "num_tokens": 28692023.0, "reward": 0.4464186429977417, "reward_std": 0.19056807458400726, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7728476524353027, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.1815728098154068, "step": 126 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7651209677419355, "calib/avg_num_step_conf": 1.98046875, "calib/ece": 0.250952380952381, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.49603174603174605, "calib/gap": 0.291938004032258, "calib/mean_conf": 0.7237301587301587, "calib/mu_c": 0.872016129032258, "calib/mu_w": 0.580078125, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.24130952380952386, "calib/std_conf": 0.3258486700250864, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6589912280701754, "calib/step_q_c_n": 228.0, "calib/step_q_gap": 0.18368656857196758, "calib/step_q_w": 0.4753046594982078, "calib/step_q_w_n": 279.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 395.6875, "completions/mean_terminated_length": 397.2392272949219, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.13546666666666668, "grad_norm": 0.006842640228569508, "kl": 0.076812744140625, "learning_rate": 2.027777777777778e-06, "loss": 0.0037, "num_tokens": 28896991.0, "reward": 0.40660762786865234, "reward_std": 0.22128602862358093, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7246460914611816, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.20596206188201904, "step": 127 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6736004108885464, "calib/avg_num_step_conf": 1.91015625, "calib/ece": 0.23653999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.44, "calib/gap": 0.2019042116076012, "calib/mean_conf": 0.70758, "calib/mu_c": 0.8028787878787879, "calib/mu_w": 0.6009745762711867, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.20806, "calib/std_conf": 0.31591366478834054, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.6228517110266161, "calib/step_q_c_n": 263.0, "calib/step_q_gap": 0.14678976412396116, "calib/step_q_w": 0.4760619469026549, "calib/step_q_w_n": 226.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2831.0, "completions/max_terminated_length": 2831.0, "completions/mean_length": 444.1015625, "completions/mean_terminated_length": 447.5984191894531, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.13653333333333334, "grad_norm": 0.007062236778438091, "kl": 0.06587600708007812, "learning_rate": 2.0000000000000003e-06, "loss": 0.0319, "num_tokens": 29117345.0, "reward": 0.39006370306015015, "reward_std": 0.22584298253059387, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6938701868057251, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": -0.20983658730983734, "step": 128 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6701629395078832, "calib/avg_num_step_conf": 1.9453125, "calib/ece": 0.197421875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.5625, "calib/gap": 0.14036941750775134, "calib/mean_conf": 0.8085156250000001, "calib/mu_c": 0.8595092024539879, "calib/mu_w": 0.7191397849462365, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18460937500000002, "calib/std_conf": 0.24441125000879024, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.692340425531915, "calib/step_q_c_n": 329.0, "calib/step_q_gap": 0.048612614881027416, "calib/step_q_w": 0.6437278106508876, "calib/step_q_w_n": 169.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1371.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 362.53515625, "completions/mean_terminated_length": 363.9568786621094, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.1376, "grad_norm": 0.006954402197152376, "kl": 0.073272705078125, "learning_rate": 1.9722222222222224e-06, "loss": -0.0052, "num_tokens": 29312538.0, "reward": 0.43850159645080566, "reward_std": 0.20004130899906158, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7443780899047852, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.19471865892410278, "step": 129 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7535458642629905, "calib/avg_num_step_conf": 1.90234375, "calib/ece": 0.19234374999999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.59765625, "calib/gap": 0.25480381760339355, "calib/mean_conf": 0.803125, "calib/mu_c": 0.8946951219512196, "calib/mu_w": 0.639891304347826, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17742187499999995, "calib/std_conf": 0.2773232826954852, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7463197026022306, "calib/step_q_c_n": 269.0, "calib/step_q_gap": 0.22274172095085432, "calib/step_q_w": 0.5235779816513763, "calib/step_q_w_n": 218.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1276.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 374.10546875, "completions/mean_terminated_length": 375.57257080078125, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.13866666666666666, "grad_norm": 0.006909600459039211, "kl": 0.0675201416015625, "learning_rate": 1.944444444444445e-06, "loss": -0.0234, "num_tokens": 29513597.0, "reward": 0.47459420561790466, "reward_std": 0.1551002562046051, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7759792804718018, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.15335342288017273, "step": 130 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7306662515566624, "calib/avg_num_step_conf": 2.4140625, "calib/ece": 0.28429687499999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.3828125, "calib/gap": 0.23855541718555395, "calib/mean_conf": 0.7003125, "calib/mu_c": 0.8363636363636362, "calib/mu_w": 0.5978082191780822, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27746093749999995, "calib/std_conf": 0.29775970151071485, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6309505703422054, "calib/step_q_c_n": 263.0, "calib/step_q_gap": 0.18106324639854338, "calib/step_q_w": 0.449887323943662, "calib/step_q_w_n": 355.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 368.26953125, "completions/mean_terminated_length": 369.7137451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.13973333333333332, "grad_norm": 0.006882078945636749, "kl": 0.076446533203125, "learning_rate": 1.916666666666667e-06, "loss": -0.0246, "num_tokens": 29714082.0, "reward": 0.38815802335739136, "reward_std": 0.21172058582305908, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.7099640369415283, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.21958552300930023, "step": 131 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7970082577501015, "calib/avg_num_step_conf": 2.359375, "calib/ece": 0.1489372549019608, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5450980392156862, "calib/gap": 0.30514627047515885, "calib/mean_conf": 0.773250980392157, "calib/mu_c": 0.8797530120481927, "calib/mu_w": 0.5746067415730338, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1356039215686275, "calib/std_conf": 0.2877999437058285, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7027909090909091, "calib/step_q_c_n": 330.0, "calib/step_q_gap": 0.2830098871930989, "calib/step_q_w": 0.41978102189781025, "calib/step_q_w_n": 274.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1128.0, "completions/max_terminated_length": 1128.0, "completions/mean_length": 421.4296875, "completions/mean_terminated_length": 423.0823669433594, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.1408, "grad_norm": 0.006782933603972197, "kl": 0.067291259765625, "learning_rate": 1.888888888888889e-06, "loss": 0.0141, "num_tokens": 29927560.0, "reward": 0.4930591583251953, "reward_std": 0.19035959243774414, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7980226278305054, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.1369042992591858, "step": 132 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7284993144708961, "calib/avg_num_step_conf": 2.109375, "calib/ece": 0.24494117647058822, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.34901960784313724, "calib/gap": 0.24423656986164777, "calib/mean_conf": 0.6661176470588236, "calib/mu_c": 0.8021238938053097, "calib/mu_w": 0.5578873239436619, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.23396078431372547, "calib/std_conf": 0.3196263533675105, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6283255813953488, "calib/step_q_c_n": 215.0, "calib/step_q_gap": 0.18069481216457967, "calib/step_q_w": 0.4476307692307691, "calib/step_q_w_n": 325.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 488.4453125, "completions/mean_terminated_length": 488.4453125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.14186666666666667, "grad_norm": 0.005817534402012825, "kl": 0.053134918212890625, "learning_rate": 1.8611111111111113e-06, "loss": 0.003, "num_tokens": 30158946.0, "reward": 0.39914774894714355, "reward_std": 0.23423807322978973, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.7104480266571045, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.19730885326862335, "step": 133 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7303877940241577, "calib/avg_num_step_conf": 1.8125, "calib/ece": 0.18675889328063244, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.47035573122529645, "calib/gap": 0.24748251748251726, "calib/mean_conf": 0.7198814229249013, "calib/mu_c": 0.8274825174825173, "calib/mu_w": 0.5800000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1707114624505929, "calib/std_conf": 0.31804054507408136, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6810256410256411, "calib/step_q_c_n": 234.0, "calib/step_q_gap": 0.18351564102564116, "calib/step_q_w": 0.49750999999999995, "calib/step_q_w_n": 230.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3046.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 491.453125, "completions/mean_terminated_length": 493.38043212890625, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.14293333333333333, "grad_norm": 0.005839070305228233, "kl": 0.05161285400390625, "learning_rate": 1.8333333333333333e-06, "loss": 0.0228, "num_tokens": 30393710.0, "reward": 0.4265187382698059, "reward_std": 0.2271723449230194, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7398152351379395, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.19537153840065002, "step": 134 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6970262535772054, "calib/avg_num_step_conf": 2.1328125, "calib/ece": 0.23145098039215684, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4549019607843137, "calib/gap": 0.19408361328854062, "calib/mean_conf": 0.707843137254902, "calib/mu_c": 0.7946099290780142, "calib/mu_w": 0.6005263157894736, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19317647058823526, "calib/std_conf": 0.3222598739751373, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6618283582089552, "calib/step_q_c_n": 268.0, "calib/step_q_gap": 0.18118087619456674, "calib/step_q_w": 0.4806474820143885, "calib/step_q_w_n": 278.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2213.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 442.09375, "completions/mean_terminated_length": 442.09375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.144, "grad_norm": 0.006755627226084471, "kl": 0.06070709228515625, "learning_rate": 1.8055555555555557e-06, "loss": 0.0245, "num_tokens": 30612766.0, "reward": 0.4173700213432312, "reward_std": 0.20016345381736755, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7141886949539185, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.18804235756397247, "step": 135 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8194513099357389, "calib/avg_num_step_conf": 2.6796875, "calib/ece": 0.21752941176470592, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.43529411764705883, "calib/gap": 0.36043067226890735, "calib/mean_conf": 0.6706274509803922, "calib/mu_c": 0.8628571428571427, "calib/mu_w": 0.5024264705882353, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21074509803921573, "calib/std_conf": 0.33725593707091495, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6736032388663967, "calib/step_q_c_n": 247.0, "calib/step_q_gap": 0.30733900196434655, "calib/step_q_w": 0.3662642369020502, "calib/step_q_w_n": 439.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 412.2421875, "completions/mean_terminated_length": 413.8588562011719, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.14506666666666668, "grad_norm": 0.007109965663403273, "kl": 0.06912994384765625, "learning_rate": 1.777777777777778e-06, "loss": 0.0012, "num_tokens": 30826788.0, "reward": 0.4410070478916168, "reward_std": 0.16643013060092926, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7721558809280396, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.1823292225599289, "step": 136 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7205128205128205, "calib/avg_num_step_conf": 2.44921875, "calib/ece": 0.2232539682539683, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5357142857142857, "calib/gap": 0.2512706552706552, "calib/mean_conf": 0.7311904761904762, "calib/mu_c": 0.8478518518518519, "calib/mu_w": 0.5965811965811967, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2093650793650794, "calib/std_conf": 0.32522545816626314, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6836758893280633, "calib/step_q_c_n": 253.0, "calib/step_q_gap": 0.3525528946756569, "calib/step_q_w": 0.3311229946524064, "calib/step_q_w_n": 374.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 441.5546875, "completions/mean_terminated_length": 443.28631591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.14613333333333334, "grad_norm": 0.006158465053886175, "kl": 0.06252288818359375, "learning_rate": 1.75e-06, "loss": -0.0077, "num_tokens": 31046810.0, "reward": 0.4185337424278259, "reward_std": 0.1904926896095276, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7140519618988037, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.17776569724082947, "step": 137 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6729764138300725, "calib/avg_num_step_conf": 2.1953125, "calib/ece": 0.18301960784313717, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5254901960784314, "calib/gap": 0.2131687215223802, "calib/mean_conf": 0.7287450980392157, "calib/mu_c": 0.8048170731707318, "calib/mu_w": 0.5916483516483516, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13431372549019602, "calib/std_conf": 0.31718491193359705, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.601949860724234, "calib/step_q_c_n": 359.0, "calib/step_q_gap": 0.12815675727595804, "calib/step_q_w": 0.4737931034482759, "calib/step_q_w_n": 203.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1935.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 430.37890625, "completions/mean_terminated_length": 430.37890625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.1472, "grad_norm": 0.0068092262372374535, "kl": 0.06418609619140625, "learning_rate": 1.7222222222222224e-06, "loss": 0.0295, "num_tokens": 31261323.0, "reward": 0.4481315016746521, "reward_std": 0.227791890501976, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.753614068031311, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.18391355872154236, "step": 138 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7020090821521948, "calib/avg_num_step_conf": 2.0625, "calib/ece": 0.18509803921568624, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5176470588235295, "calib/gap": 0.20241021054080055, "calib/mean_conf": 0.7465882352941179, "calib/mu_c": 0.8148520710059171, "calib/mu_w": 0.6124418604651165, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13447058823529406, "calib/std_conf": 0.3077465965770035, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6642196531791908, "calib/step_q_c_n": 346.0, "calib/step_q_gap": 0.15026360922314685, "calib/step_q_w": 0.513956043956044, "calib/step_q_w_n": 182.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 381.12109375, "completions/mean_terminated_length": 382.61572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.14826666666666666, "grad_norm": 0.007338706869632006, "kl": 0.07012939453125, "learning_rate": 1.6944444444444446e-06, "loss": -0.0152, "num_tokens": 31461986.0, "reward": 0.46961668133735657, "reward_std": 0.17519153654575348, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7622421979904175, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.15504010021686554, "step": 139 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7917524257571302, "calib/avg_num_step_conf": 2.23828125, "calib/ece": 0.10454901960784307, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5254901960784314, "calib/gap": 0.31926712731549545, "calib/mean_conf": 0.7508235294117648, "calib/mu_c": 0.845977653631285, "calib/mu_w": 0.5267105263157895, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.07670588235294112, "calib/std_conf": 0.3000112877538112, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.661927680798005, "calib/step_q_c_n": 401.0, "calib/step_q_gap": 0.22181140172823754, "calib/step_q_w": 0.44011627906976747, "calib/step_q_w_n": 172.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 421.5234375, "completions/mean_terminated_length": 423.1764831542969, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.14933333333333335, "grad_norm": 0.0073651643469929695, "kl": 0.06573486328125, "learning_rate": 1.6666666666666667e-06, "loss": 0.0359, "num_tokens": 31674912.0, "reward": 0.5161871910095215, "reward_std": 0.1584603190422058, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.8248147964477539, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.12994034588336945, "step": 140 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7992788461538464, "calib/avg_num_step_conf": 2.08984375, "calib/ece": 0.11972440944881882, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5826771653543307, "calib/gap": 0.3176442307692309, "calib/mean_conf": 0.7793307086614174, "calib/mu_c": 0.8768750000000001, "calib/mu_w": 0.5592307692307692, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1030708661417322, "calib/std_conf": 0.30069071548654897, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6873584905660377, "calib/step_q_c_n": 371.0, "calib/step_q_gap": 0.20705361251725718, "calib/step_q_w": 0.4803048780487805, "calib/step_q_w_n": 164.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2909.0, "completions/max_terminated_length": 2909.0, "completions/mean_length": 449.04296875, "completions/mean_terminated_length": 449.04296875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.1504, "grad_norm": 0.006539446301758289, "kl": 0.061206817626953125, "learning_rate": 1.638888888888889e-06, "loss": 0.0566, "num_tokens": 31896963.0, "reward": 0.5025902390480042, "reward_std": 0.19069364666938782, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.814516007900238, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.1444917619228363, "step": 141 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.77997227997228, "calib/avg_num_step_conf": 2.16015625, "calib/ece": 0.17862204724409453, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.3354671454671455, "calib/mean_conf": 0.7025590551181102, "calib/mu_c": 0.8491608391608392, "calib/mu_w": 0.5136936936936937, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15909448818897642, "calib/std_conf": 0.3424531499319575, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6738333333333334, "calib/step_q_c_n": 300.0, "calib/step_q_gap": 0.25288471673254287, "calib/step_q_w": 0.4209486166007905, "calib/step_q_w_n": 253.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 469.6484375, "completions/mean_terminated_length": 469.6484375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.15146666666666667, "grad_norm": 0.006188663654029369, "kl": 0.061065673828125, "learning_rate": 1.6111111111111113e-06, "loss": 0.0125, "num_tokens": 32122353.0, "reward": 0.4513563811779022, "reward_std": 0.18066132068634033, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7761745452880859, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.18361811339855194, "step": 142 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7944169232829027, "calib/avg_num_step_conf": 2.8671875, "calib/ece": 0.12199203187250987, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.398406374501992, "calib/gap": 0.30510242334984594, "calib/mean_conf": 0.6853386454183269, "calib/mu_c": 0.8032467532467532, "calib/mu_w": 0.49814432989690727, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.09689243027888436, "calib/std_conf": 0.30802316759208864, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5960880829015545, "calib/step_q_c_n": 386.0, "calib/step_q_gap": 0.2187604966946579, "calib/step_q_w": 0.37732758620689655, "calib/step_q_w_n": 348.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 487.60546875, "completions/mean_terminated_length": 493.3873596191406, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.15253333333333333, "grad_norm": 0.006679258309304714, "kl": 0.0638275146484375, "learning_rate": 1.5833333333333333e-06, "loss": 0.0334, "num_tokens": 32354516.0, "reward": 0.4849836230278015, "reward_std": 0.16762301325798035, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.787904679775238, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.13356241583824158, "step": 143 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7010775862068965, "calib/avg_num_step_conf": 2.0703125, "calib/ece": 0.16389370078740156, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5393700787401575, "calib/gap": 0.2521537356321839, "calib/mean_conf": 0.7216102362204725, "calib/mu_c": 0.8010287356321839, "calib/mu_w": 0.548875, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10023228346456695, "calib/std_conf": 0.3310463155584519, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6510911854103344, "calib/step_q_c_n": 329.0, "calib/step_q_gap": 0.16780760332078215, "calib/step_q_w": 0.48328358208955224, "calib/step_q_w_n": 201.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 442.734375, "completions/mean_terminated_length": 442.734375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.1536, "grad_norm": 0.007385350298136473, "kl": 0.06850814819335938, "learning_rate": 1.5555555555555558e-06, "loss": -0.0029, "num_tokens": 32571984.0, "reward": 0.4696730971336365, "reward_std": 0.19212749600410461, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7683699131011963, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.16261744499206543, "step": 144 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6778014688462449, "calib/avg_num_step_conf": 2.3671875, "calib/ece": 0.15023437500000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.46484375, "calib/gap": 0.2027323698965492, "calib/mean_conf": 0.739375, "calib/mu_c": 0.7924338624338626, "calib/mu_w": 0.5897014925373134, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07566406250000002, "calib/std_conf": 0.29674865732636435, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.638768472906404, "calib/step_q_c_n": 406.0, "calib/step_q_gap": 0.1725684729064041, "calib/step_q_w": 0.46619999999999995, "calib/step_q_w_n": 200.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 393.6796875, "completions/mean_terminated_length": 395.2235412597656, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.15466666666666667, "grad_norm": 0.007237947080284357, "kl": 0.06774139404296875, "learning_rate": 1.527777777777778e-06, "loss": 0.0243, "num_tokens": 32775470.0, "reward": 0.5008053779602051, "reward_std": 0.16850511729717255, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7931559085845947, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.13842013478279114, "step": 145 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7339015151515151, "calib/avg_num_step_conf": 2.37890625, "calib/ece": 0.24218750000000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4921875, "calib/gap": 0.2860361681329424, "calib/mean_conf": 0.7196093750000001, "calib/mu_c": 0.8670967741935485, "calib/mu_w": 0.581060606060606, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.23871093750000005, "calib/std_conf": 0.3211257112286548, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7208627450980393, "calib/step_q_c_n": 255.0, "calib/step_q_gap": 0.29100398803589234, "calib/step_q_w": 0.42985875706214693, "calib/step_q_w_n": 354.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 444.40234375, "completions/mean_terminated_length": 446.1451110839844, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.15573333333333333, "grad_norm": 0.007386982906609774, "kl": 0.0638885498046875, "learning_rate": 1.5e-06, "loss": 0.0116, "num_tokens": 32996453.0, "reward": 0.40284696221351624, "reward_std": 0.22834259271621704, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.719482421875, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.20753851532936096, "step": 146 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.755403548225887, "calib/avg_num_step_conf": 2.3203125, "calib/ece": 0.22779527559055124, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5078740157480315, "calib/gap": 0.25840329835082454, "calib/mean_conf": 0.6967716535433072, "calib/mu_c": 0.8147826086956522, "calib/mu_w": 0.5563793103448277, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1906299212598426, "calib/std_conf": 0.3387782167456465, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6112996389891696, "calib/step_q_c_n": 277.0, "calib/step_q_gap": 0.13502203646551025, "calib/step_q_w": 0.47627760252365936, "calib/step_q_w_n": 317.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2760.0, "completions/max_terminated_length": 2760.0, "completions/mean_length": 452.94921875, "completions/mean_terminated_length": 454.72552490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.1568, "grad_norm": 0.006861098576337099, "kl": 0.063690185546875, "learning_rate": 1.4722222222222225e-06, "loss": -0.0086, "num_tokens": 33216088.0, "reward": 0.42057937383651733, "reward_std": 0.18396975100040436, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7294925451278687, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.19302131235599518, "step": 147 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.761721666417799, "calib/avg_num_step_conf": 2.10546875, "calib/ece": 0.14207843137254897, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5254901960784314, "calib/gap": 0.3064976855308348, "calib/mean_conf": 0.7328235294117648, "calib/mu_c": 0.8217679558011051, "calib/mu_w": 0.5152702702702703, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.08254901960784308, "calib/std_conf": 0.3262848933113967, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6500506329113923, "calib/step_q_c_n": 395.0, "calib/step_q_gap": 0.11220341068917006, "calib/step_q_w": 0.5378472222222223, "calib/step_q_w_n": 144.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 408.74609375, "completions/mean_terminated_length": 410.3490295410156, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.15786666666666666, "grad_norm": 0.008046685717999935, "kl": 0.06903076171875, "learning_rate": 1.4444444444444445e-06, "loss": 0.0241, "num_tokens": 33425839.0, "reward": 0.4869382381439209, "reward_std": 0.1796838790178299, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7993249893188477, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.16372980177402496, "step": 148 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8624321530111139, "calib/avg_num_step_conf": 2.25, "calib/ece": 0.1641984126984126, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5357142857142857, "calib/gap": 0.4337823727061256, "calib/mean_conf": 0.7339126984126986, "calib/mu_c": 0.916376712328767, "calib/mu_w": 0.4825943396226415, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.15937301587301578, "calib/std_conf": 0.3253357733086017, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7625072992700731, "calib/step_q_c_n": 274.0, "calib/step_q_gap": 0.35601723304490757, "calib/step_q_w": 0.40649006622516554, "calib/step_q_w_n": 302.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1982.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 483.5234375, "completions/mean_terminated_length": 487.3307189941406, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.15893333333333334, "grad_norm": 0.00647739227861166, "kl": 0.05419921875, "learning_rate": 1.4166666666666667e-06, "loss": 0.0281, "num_tokens": 33654077.0, "reward": 0.49852481484413147, "reward_std": 0.21163997054100037, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.8149806261062622, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.12652483582496643, "step": 149 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7663901525287664, "calib/avg_num_step_conf": 2.4375, "calib/ece": 0.20080321285140562, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5783132530120482, "calib/gap": 0.28906609579876885, "calib/mean_conf": 0.7951807228915664, "calib/mu_c": 0.9124324324324323, "calib/mu_w": 0.6233663366336635, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.20080321285140562, "calib/std_conf": 0.2744005285529898, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.6704747774480712, "calib/step_q_c_n": 337.0, "calib/step_q_gap": 0.17747826176862863, "calib/step_q_w": 0.4929965156794425, "calib/step_q_w_n": 287.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2682.0, "completions/max_terminated_length": 2682.0, "completions/mean_length": 441.90234375, "completions/mean_terminated_length": 443.63531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.16, "grad_norm": 0.007934695109724998, "kl": 0.07119369506835938, "learning_rate": 1.3888888888888892e-06, "loss": 0.0276, "num_tokens": 33872164.0, "reward": 0.4670780897140503, "reward_std": 0.20850981771945953, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7612718343734741, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": -0.1372719407081604, "step": 150 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7397745571658615, "calib/avg_num_step_conf": 1.859375, "calib/ece": 0.2719200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.512, "calib/gap": 0.2768953301127214, "calib/mean_conf": 0.70352, "calib/mu_c": 0.8530434782608696, "calib/mu_w": 0.5761481481481482, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.2577200000000001, "calib/std_conf": 0.3358660590175792, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7825128205128207, "calib/step_q_c_n": 195.0, "calib/step_q_gap": 0.28119609453417294, "calib/step_q_w": 0.5013167259786477, "calib/step_q_w_n": 281.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 487.125, "completions/mean_terminated_length": 490.96063232421875, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.16106666666666666, "grad_norm": 0.0062635913491249084, "kl": 0.052555084228515625, "learning_rate": 1.3611111111111112e-06, "loss": 0.0214, "num_tokens": 34103892.0, "reward": 0.3825833797454834, "reward_std": 0.186384379863739, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6929078102111816, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": -0.21133476495742798, "step": 151 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.761882008154944, "calib/avg_num_step_conf": 2.2734375, "calib/ece": 0.15128853754940716, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4189723320158103, "calib/gap": 0.3069249490316004, "calib/mean_conf": 0.698086956521739, "calib/mu_c": 0.8303194444444445, "calib/mu_w": 0.5233944954128441, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.14010276679841902, "calib/std_conf": 0.3178229059605173, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.6928536585365853, "calib/step_q_c_n": 328.0, "calib/step_q_gap": 0.20253869790666407, "calib/step_q_w": 0.49031496062992125, "calib/step_q_w_n": 254.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2254.0, "completions/max_terminated_length": 2254.0, "completions/mean_length": 440.63671875, "completions/mean_terminated_length": 442.36474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.16213333333333332, "grad_norm": 0.0070022461004555225, "kl": 0.06756591796875, "learning_rate": 1.3333333333333334e-06, "loss": 0.0227, "num_tokens": 34322087.0, "reward": 0.45159396529197693, "reward_std": 0.19584518671035767, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7669804096221924, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.1716049760580063, "step": 152 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7111842105263158, "calib/avg_num_step_conf": 2.125, "calib/ece": 0.21464285714285697, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5753968253968254, "calib/gap": 0.206663157894737, "calib/mean_conf": 0.7907539682539684, "calib/mu_c": 0.872763157894737, "calib/mu_w": 0.6661, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.20111111111111096, "calib/std_conf": 0.2823003350173705, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7380727272727273, "calib/step_q_c_n": 275.0, "calib/step_q_gap": 0.3008980060831362, "calib/step_q_w": 0.4371747211895911, "calib/step_q_w_n": 269.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 464.30078125, "completions/mean_terminated_length": 467.9566955566406, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.1632, "grad_norm": 0.006464678328484297, "kl": 0.057842254638671875, "learning_rate": 1.3055555555555556e-06, "loss": -0.0094, "num_tokens": 34548268.0, "reward": 0.4233105182647705, "reward_std": 0.19462406635284424, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7310148477554321, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.1992376148700714, "step": 153 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7771455223880597, "calib/avg_num_step_conf": 1.8671875, "calib/ece": 0.2810236220472442, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5039370078740157, "calib/gap": 0.2784477611940299, "calib/mean_conf": 0.7311023622047245, "calib/mu_c": 0.8780000000000001, "calib/mu_w": 0.5995522388059702, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2698425196850395, "calib/std_conf": 0.3175688364747583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7962146892655367, "calib/step_q_c_n": 177.0, "calib/step_q_gap": 0.30312498826885903, "calib/step_q_w": 0.4930897009966777, "calib/step_q_w_n": 301.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 407.375, "completions/mean_terminated_length": 410.5826721191406, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.16426666666666667, "grad_norm": 0.007672094739973545, "kl": 0.06097412109375, "learning_rate": 1.2777777777777779e-06, "loss": -0.0456, "num_tokens": 34756996.0, "reward": 0.3938148617744446, "reward_std": 0.19781461358070374, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7103534936904907, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.21334879100322723, "step": 154 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6745472837022134, "calib/avg_num_step_conf": 2.25, "calib/ece": 0.21767716535433076, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.49606299212598426, "calib/gap": 0.1960110663983904, "calib/mean_conf": 0.7527952755905514, "calib/mu_c": 0.839225352112676, "calib/mu_w": 0.6432142857142856, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20570866141732289, "calib/std_conf": 0.296939211111955, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6303960396039605, "calib/step_q_c_n": 303.0, "calib/step_q_gap": 0.09116527037319133, "calib/step_q_w": 0.5392307692307692, "calib/step_q_w_n": 273.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2539.0, "completions/max_terminated_length": 2539.0, "completions/mean_length": 405.46484375, "completions/mean_terminated_length": 407.054931640625, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.16533333333333333, "grad_norm": 0.008317645639181137, "kl": 0.066864013671875, "learning_rate": 1.25e-06, "loss": -0.0029, "num_tokens": 34968011.0, "reward": 0.3980780243873596, "reward_std": 0.18083029985427856, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.718758225440979, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.23197712004184723, "step": 155 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6635432283858071, "calib/avg_num_step_conf": 1.953125, "calib/ece": 0.2611417322834647, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.531496062992126, "calib/gap": 0.16840204897551225, "calib/mean_conf": 0.7310629921259844, "calib/mu_c": 0.8079710144927537, "calib/mu_w": 0.6395689655172414, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.22444881889763796, "calib/std_conf": 0.32045005870881077, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6607053941908715, "calib/step_q_c_n": 241.0, "calib/step_q_gap": 0.1683540428395201, "calib/step_q_w": 0.4923513513513514, "calib/step_q_w_n": 259.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 427.13671875, "completions/mean_terminated_length": 427.13671875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.1664, "grad_norm": 0.006734688300639391, "kl": 0.062412261962890625, "learning_rate": 1.2222222222222223e-06, "loss": 0.0044, "num_tokens": 35182118.0, "reward": 0.3882417678833008, "reward_std": 0.21631193161010742, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6920551061630249, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.22182153165340424, "step": 156 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7651593364994149, "calib/avg_num_step_conf": 2.2578125, "calib/ece": 0.1635826771653544, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5236220472440944, "calib/gap": 0.295426388602106, "calib/mean_conf": 0.7294094488188977, "calib/mu_c": 0.8305988023952094, "calib/mu_w": 0.5351724137931034, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.11775590551181109, "calib/std_conf": 0.3333887767249415, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6801794871794872, "calib/step_q_c_n": 390.0, "calib/step_q_gap": 0.20778587015821054, "calib/step_q_w": 0.47239361702127664, "calib/step_q_w_n": 188.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 423.82421875, "completions/mean_terminated_length": 423.82421875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.16746666666666668, "grad_norm": 0.0070098452270030975, "kl": 0.06569671630859375, "learning_rate": 1.1944444444444446e-06, "loss": -0.0197, "num_tokens": 35394345.0, "reward": 0.47161611914634705, "reward_std": 0.17706799507141113, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7853542566299438, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.17180952429771423, "step": 157 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6336452940395592, "calib/avg_num_step_conf": 2.0390625, "calib/ece": 0.22211764705882353, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5686274509803921, "calib/gap": 0.17093787335722854, "calib/mean_conf": 0.7811764705882352, "calib/mu_c": 0.8435185185185188, "calib/mu_w": 0.6725806451612902, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.184, "calib/std_conf": 0.29557950430964497, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7130952380952381, "calib/step_q_c_n": 336.0, "calib/step_q_gap": 0.14083717357910908, "calib/step_q_w": 0.572258064516129, "calib/step_q_w_n": 186.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 428.8984375, "completions/mean_terminated_length": 430.5804138183594, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.16853333333333334, "grad_norm": 0.007019939366728067, "kl": 0.06101226806640625, "learning_rate": 1.1666666666666668e-06, "loss": 0.0395, "num_tokens": 35609383.0, "reward": 0.4132624864578247, "reward_std": 0.20368565618991852, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7299218773841858, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.22761566936969757, "step": 158 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7862674238513164, "calib/avg_num_step_conf": 2.00390625, "calib/ece": 0.1817391304347824, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5731225296442688, "calib/gap": 0.307683273102736, "calib/mean_conf": 0.7633201581027668, "calib/mu_c": 0.8897986577181207, "calib/mu_w": 0.5821153846153847, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.17806324110671917, "calib/std_conf": 0.30821307671940945, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7772161172161173, "calib/step_q_c_n": 273.0, "calib/step_q_gap": 0.2800494505494506, "calib/step_q_w": 0.4971666666666667, "calib/step_q_w_n": 240.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 438.69921875, "completions/mean_terminated_length": 438.69921875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1696, "grad_norm": 0.007189049851149321, "kl": 0.07638168334960938, "learning_rate": 1.138888888888889e-06, "loss": 0.0311, "num_tokens": 35826474.0, "reward": 0.4793606400489807, "reward_std": 0.15734824538230896, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7705523371696472, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.12511231005191803, "step": 159 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7946846341006926, "calib/avg_num_step_conf": 2.0625, "calib/ece": 0.19279527559055118, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4763779527559055, "calib/gap": 0.326921829184603, "calib/mean_conf": 0.7164173228346458, "calib/mu_c": 0.8670072992700731, "calib/mu_w": 0.5400854700854701, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18492125984251967, "calib/std_conf": 0.32399596432173317, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7056153846153845, "calib/step_q_c_n": 260.0, "calib/step_q_gap": 0.22916016073478745, "calib/step_q_w": 0.4764552238805971, "calib/step_q_w_n": 268.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2614.0, "completions/max_terminated_length": 2614.0, "completions/mean_length": 454.5, "completions/mean_terminated_length": 454.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.17066666666666666, "grad_norm": 0.007117413450032473, "kl": 0.05873870849609375, "learning_rate": 1.111111111111111e-06, "loss": 0.0267, "num_tokens": 36047666.0, "reward": 0.46024322509765625, "reward_std": 0.1503116339445114, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7683855295181274, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.15258657932281494, "step": 160 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6931727490386709, "calib/avg_num_step_conf": 2.1015625, "calib/ece": 0.16796875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.4453125, "calib/gap": 0.24856997750852516, "calib/mean_conf": 0.6948437500000001, "calib/mu_c": 0.7696089385474861, "calib/mu_w": 0.521038961038961, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08179687499999999, "calib/std_conf": 0.3302401691737961, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6068948655256724, "calib/step_q_c_n": 409.0, "calib/step_q_gap": 0.11526695854892827, "calib/step_q_w": 0.49162790697674413, "calib/step_q_w_n": 129.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 398.4296875, "completions/mean_terminated_length": 399.9921875, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.17173333333333332, "grad_norm": 0.007734321523457766, "kl": 0.0670166015625, "learning_rate": 1.0833333333333335e-06, "loss": 0.0105, "num_tokens": 36253584.0, "reward": 0.4745808243751526, "reward_std": 0.15935248136520386, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7851648330688477, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.17584697902202606, "step": 161 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7935420743639922, "calib/avg_num_step_conf": 1.83203125, "calib/ece": 0.10458823529411768, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6, "calib/gap": 0.3656977269306033, "calib/mean_conf": 0.7849803921568628, "calib/mu_c": 0.8896703296703294, "calib/mu_w": 0.5239726027397261, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.087921568627451, "calib/std_conf": 0.2960363974837221, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7499999999999999, "calib/step_q_c_n": 341.0, "calib/step_q_gap": 0.22757812499999985, "calib/step_q_w": 0.522421875, "calib/step_q_w_n": 128.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1374.0, "completions/max_terminated_length": 1374.0, "completions/mean_length": 410.00390625, "completions/mean_terminated_length": 411.6117858886719, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.1728, "grad_norm": 0.007531987503170967, "kl": 0.06168365478515625, "learning_rate": 1.0555555555555557e-06, "loss": 0.0069, "num_tokens": 36462689.0, "reward": 0.5377246141433716, "reward_std": 0.17636175453662872, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.8490738272666931, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.11503089964389801, "step": 162 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.783248001998002, "calib/avg_num_step_conf": 2.07421875, "calib/ece": 0.16941176470588232, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4823529411764706, "calib/gap": 0.35326111388611403, "calib/mean_conf": 0.7001568627450981, "calib/mu_c": 0.8553146853146855, "calib/mu_w": 0.5020535714285714, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15439215686274504, "calib/std_conf": 0.3368155003506106, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.773085501858736, "calib/step_q_c_n": 269.0, "calib/step_q_gap": 0.3311389369732399, "calib/step_q_w": 0.44194656488549616, "calib/step_q_w_n": 262.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 428.6875, "completions/mean_terminated_length": 430.36865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 105.0, "epoch": 0.17386666666666667, "grad_norm": 0.0073830727487802505, "kl": 0.06630706787109375, "learning_rate": 1.0277777777777777e-06, "loss": -0.0014, "num_tokens": 36677265.0, "reward": 0.4658210873603821, "reward_std": 0.189020574092865, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7848738431930542, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.16338786482810974, "step": 163 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.774967824967825, "calib/avg_num_step_conf": 1.92578125, "calib/ece": 0.1702292490118577, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.43873517786561267, "calib/gap": 0.3019448519948519, "calib/mean_conf": 0.7031936758893281, "calib/mu_c": 0.8285067567567567, "calib/mu_w": 0.5265619047619048, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.14422134387351776, "calib/std_conf": 0.3259338757223692, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6631146953405018, "calib/step_q_c_n": 279.0, "calib/step_q_gap": 0.1753670317890999, "calib/step_q_w": 0.4877476635514019, "calib/step_q_w_n": 214.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2330.0, "completions/max_terminated_length": 2330.0, "completions/mean_length": 471.390625, "completions/mean_terminated_length": 471.390625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.17493333333333333, "grad_norm": 0.006834862753748894, "kl": 0.0714569091796875, "learning_rate": 1.0000000000000002e-06, "loss": 0.0036, "num_tokens": 36904077.0, "reward": 0.4619291424751282, "reward_std": 0.17501939833164215, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.771399199962616, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.1600409299135208, "step": 164 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6768137528703532, "calib/avg_num_step_conf": 2.203125, "calib/ece": 0.28952755905511807, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5511811023622047, "calib/gap": 0.1804319493576616, "calib/mean_conf": 0.745984251968504, "calib/mu_c": 0.8333587786259543, "calib/mu_w": 0.6529268292682927, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.25988188976377946, "calib/std_conf": 0.3173022599375055, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7048421052631578, "calib/step_q_c_n": 285.0, "calib/step_q_gap": 0.14688511601584608, "calib/step_q_w": 0.5579569892473117, "calib/step_q_w_n": 279.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 481.8046875, "completions/mean_terminated_length": 483.69415283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.176, "grad_norm": 0.007147556636482477, "kl": 0.05928802490234375, "learning_rate": 9.722222222222224e-07, "loss": -0.0406, "num_tokens": 37132995.0, "reward": 0.36501526832580566, "reward_std": 0.20879912376403809, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6746468544006348, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.24383507668972015, "step": 165 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8364864864864865, "calib/avg_num_step_conf": 1.9453125, "calib/ece": 0.10988188976377947, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5275590551181102, "calib/gap": 0.39777327327327316, "calib/mean_conf": 0.719724409448819, "calib/mu_c": 0.835611111111111, "calib/mu_w": 0.4378378378378378, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.060472440944881814, "calib/std_conf": 0.3372893376289778, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7329824561403508, "calib/step_q_c_n": 342.0, "calib/step_q_gap": 0.21009784075573545, "calib/step_q_w": 0.5228846153846154, "calib/step_q_w_n": 156.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2609.0, "completions/max_terminated_length": 2609.0, "completions/mean_length": 483.12109375, "completions/mean_terminated_length": 483.12109375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.17706666666666668, "grad_norm": 0.006450401619076729, "kl": 0.06050872802734375, "learning_rate": 9.444444444444445e-07, "loss": 0.0376, "num_tokens": 37362858.0, "reward": 0.5276123881340027, "reward_std": 0.15655210614204407, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.8373090028762817, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.12114673852920532, "step": 166 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6948282097649187, "calib/avg_num_step_conf": 1.73046875, "calib/ece": 0.17807086614173234, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6496062992125984, "calib/gap": 0.23772224231464734, "calib/mean_conf": 0.8042913385826772, "calib/mu_c": 0.8782285714285715, "calib/mu_w": 0.6405063291139241, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14669291338582682, "calib/std_conf": 0.29233126528843495, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7734219269102991, "calib/step_q_c_n": 301.0, "calib/step_q_gap": 0.19468953254410182, "calib/step_q_w": 0.5787323943661973, "calib/step_q_w_n": 142.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 424.11328125, "completions/mean_terminated_length": 425.7764892578125, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.17813333333333334, "grad_norm": 0.006349243223667145, "kl": 0.05992889404296875, "learning_rate": 9.166666666666666e-07, "loss": -0.0187, "num_tokens": 37577039.0, "reward": 0.46977323293685913, "reward_std": 0.15033377707004547, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7826762199401855, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.17828592658042908, "step": 167 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8598246094820499, "calib/avg_num_step_conf": 2.22265625, "calib/ece": 0.12833992094861651, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5612648221343873, "calib/gap": 0.41263565360372706, "calib/mean_conf": 0.7647826086956522, "calib/mu_c": 0.909939024390244, "calib/mu_w": 0.49730337078651693, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.12245059288537541, "calib/std_conf": 0.30643502263408473, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.733914373088685, "calib/step_q_c_n": 327.0, "calib/step_q_gap": 0.3189970177167842, "calib/step_q_w": 0.41491735537190083, "calib/step_q_w_n": 242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 492.76953125, "completions/mean_terminated_length": 494.7019958496094, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.1792, "grad_norm": 0.0063049341551959515, "kl": 0.05417633056640625, "learning_rate": 8.88888888888889e-07, "loss": 0.0112, "num_tokens": 37807860.0, "reward": 0.5274177193641663, "reward_std": 0.16986939311027527, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.8387695550918579, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.10893408954143524, "step": 168 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7536912751677853, "calib/avg_num_step_conf": 1.78515625, "calib/ece": 0.2051181102362205, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5708661417322834, "calib/gap": 0.26666538830297226, "calib/mean_conf": 0.7738582677165354, "calib/mu_c": 0.8840939597315437, "calib/mu_w": 0.6174285714285714, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19618110236220476, "calib/std_conf": 0.3029670787034485, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.8015637860082304, "calib/step_q_c_n": 243.0, "calib/step_q_gap": 0.2658161224568285, "calib/step_q_w": 0.5357476635514019, "calib/step_q_w_n": 214.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 442.9921875, "completions/mean_terminated_length": 444.72943115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.18026666666666666, "grad_norm": 0.006920646410435438, "kl": 0.0618438720703125, "learning_rate": 8.611111111111112e-07, "loss": -0.0157, "num_tokens": 38025450.0, "reward": 0.46007660031318665, "reward_std": 0.15827137231826782, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7540468573570251, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.1495186686515808, "step": 169 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7508914754760143, "calib/avg_num_step_conf": 2.27734375, "calib/ece": 0.1843749999999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.57421875, "calib/gap": 0.25116329139473836, "calib/mean_conf": 0.7796875000000001, "calib/mu_c": 0.8670059880239519, "calib/mu_w": 0.6158426966292135, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1558593749999999, "calib/std_conf": 0.29746569523854344, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7223076923076923, "calib/step_q_c_n": 325.0, "calib/step_q_gap": 0.272850327966607, "calib/step_q_w": 0.4494573643410853, "calib/step_q_w_n": 258.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1276.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 462.48046875, "completions/mean_terminated_length": 464.2941589355469, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.18133333333333335, "grad_norm": 0.006413524504750967, "kl": 0.06085205078125, "learning_rate": 8.333333333333333e-07, "loss": -0.0082, "num_tokens": 38247997.0, "reward": 0.4750288128852844, "reward_std": 0.20399650931358337, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7785238027572632, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.15815365314483643, "step": 170 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6820796733885933, "calib/avg_num_step_conf": 2.30859375, "calib/ece": 0.2445098039215686, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.47843137254901963, "calib/gap": 0.19062786094271944, "calib/mean_conf": 0.6989411764705883, "calib/mu_c": 0.7871532846715329, "calib/mu_w": 0.5965254237288135, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20309803921568625, "calib/std_conf": 0.33759248038767425, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6557492354740061, "calib/step_q_c_n": 327.0, "calib/step_q_gap": 0.1330977203224909, "calib/step_q_w": 0.5226515151515152, "calib/step_q_w_n": 264.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2336.0, "completions/max_terminated_length": 2336.0, "completions/mean_length": 442.7109375, "completions/mean_terminated_length": 442.7109375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.1824, "grad_norm": 0.007600404787808657, "kl": 0.068572998046875, "learning_rate": 8.055555555555557e-07, "loss": 0.0208, "num_tokens": 38468227.0, "reward": 0.39803436398506165, "reward_std": 0.20574426651000977, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7018972635269165, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.21129731833934784, "step": 171 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7579238754325259, "calib/avg_num_step_conf": 2.13671875, "calib/ece": 0.18980392156862735, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.611764705882353, "calib/gap": 0.21288235294117652, "calib/mean_conf": 0.8309803921568629, "calib/mu_c": 0.9019411764705884, "calib/mu_w": 0.6890588235294118, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.17705882352941169, "calib/std_conf": 0.2571003840111285, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7507894736842107, "calib/step_q_c_n": 342.0, "calib/step_q_gap": 0.21293581514762527, "calib/step_q_w": 0.5378536585365854, "calib/step_q_w_n": 205.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 443.11328125, "completions/mean_terminated_length": 444.85101318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.18346666666666667, "grad_norm": 0.006095860619097948, "kl": 0.0571746826171875, "learning_rate": 7.777777777777779e-07, "loss": 0.0012, "num_tokens": 38685016.0, "reward": 0.4755558669567108, "reward_std": 0.1840173453092575, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7747269868850708, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.15408393740653992, "step": 172 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6584417372152248, "calib/avg_num_step_conf": 2.12109375, "calib/ece": 0.24011811023622043, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6732283464566929, "calib/gap": 0.12509532658820277, "calib/mean_conf": 0.8352362204724411, "calib/mu_c": 0.8780838323353293, "calib/mu_w": 0.7529885057471265, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2089370078740157, "calib/std_conf": 0.2684421067424582, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7461980830670927, "calib/step_q_c_n": 313.0, "calib/step_q_gap": 0.10011112654535348, "calib/step_q_w": 0.6460869565217392, "calib/step_q_w_n": 230.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 496.71484375, "completions/mean_terminated_length": 496.71484375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.18453333333333333, "grad_norm": 0.00640326039865613, "kl": 0.058563232421875, "learning_rate": 7.5e-07, "loss": -0.0174, "num_tokens": 38915335.0, "reward": 0.43277373909950256, "reward_std": 0.16515189409255981, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7180511355400085, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.180628702044487, "step": 173 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6305704099821746, "calib/avg_num_step_conf": 2.37109375, "calib/ece": 0.290398406374502, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4940239043824701, "calib/gap": 0.13328749681690877, "calib/mean_conf": 0.7128685258964144, "calib/mu_c": 0.776060606060606, "calib/mu_w": 0.6427731092436972, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.23868525896414336, "calib/std_conf": 0.3249709436067534, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6553260869565217, "calib/step_q_c_n": 276.0, "calib/step_q_gap": 0.1292445159595429, "calib/step_q_w": 0.5260815709969788, "calib/step_q_w_n": 331.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 544.0703125, "completions/mean_terminated_length": 546.2039794921875, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.1856, "grad_norm": 0.0059709707275033, "kl": 0.05558013916015625, "learning_rate": 7.222222222222222e-07, "loss": 0.0299, "num_tokens": 39158849.0, "reward": 0.3485894203186035, "reward_std": 0.2507731020450592, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6534445285797119, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": -0.2531406879425049, "step": 174 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7570719602977668, "calib/avg_num_step_conf": 2.0078125, "calib/ece": 0.22905511811023632, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.301862282878412, "calib/mean_conf": 0.6925196850393701, "calib/mu_c": 0.8470161290322582, "calib/mu_w": 0.5451538461538462, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21669291338582686, "calib/std_conf": 0.3442119117109115, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6973622047244095, "calib/step_q_c_n": 254.0, "calib/step_q_gap": 0.19455451241671706, "calib/step_q_w": 0.5028076923076924, "calib/step_q_w_n": 260.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 479.45703125, "completions/mean_terminated_length": 481.3372802734375, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.18666666666666668, "grad_norm": 0.006477923132479191, "kl": 0.056880950927734375, "learning_rate": 6.944444444444446e-07, "loss": -0.0068, "num_tokens": 39387414.0, "reward": 0.4128992557525635, "reward_std": 0.1991274058818817, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7348886728286743, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.20362144708633423, "step": 175 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7421730555897942, "calib/avg_num_step_conf": 2.3203125, "calib/ece": 0.2615294117647059, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5450980392156862, "calib/gap": 0.2454147664242574, "calib/mean_conf": 0.7598039215686274, "calib/mu_c": 0.877218045112782, "calib/mu_w": 0.6318032786885246, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.24988235294117647, "calib/std_conf": 0.3098177212116475, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7332653061224491, "calib/step_q_c_n": 294.0, "calib/step_q_gap": 0.20066530612244915, "calib/step_q_w": 0.5326, "calib/step_q_w_n": 300.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 448.6484375, "completions/mean_terminated_length": 450.4078674316406, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.18773333333333334, "grad_norm": 0.009094655513763428, "kl": 0.09540557861328125, "learning_rate": 6.666666666666667e-07, "loss": 0.0078, "num_tokens": 39606332.0, "reward": 0.41096848249435425, "reward_std": 0.2125389277935028, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7137258052825928, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.19413259625434875, "step": 176 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7259706802429001, "calib/avg_num_step_conf": 2.90625, "calib/ece": 0.239375, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5546875, "calib/gap": 0.25054775194749435, "calib/mean_conf": 0.75046875, "calib/mu_c": 0.8669343065693431, "calib/mu_w": 0.6163865546218488, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22734375, "calib/std_conf": 0.31732120796353575, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6893696275071635, "calib/step_q_c_n": 349.0, "calib/step_q_gap": 0.21255950092488496, "calib/step_q_w": 0.4768101265822785, "calib/step_q_w_n": 395.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 480.25, "completions/mean_terminated_length": 482.13336181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.1888, "grad_norm": 0.006366265006363392, "kl": 0.05780792236328125, "learning_rate": 6.388888888888889e-07, "loss": 0.0256, "num_tokens": 39833108.0, "reward": 0.42046934366226196, "reward_std": 0.19948306679725647, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7267382740974426, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.19204957783222198, "step": 177 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7904965404965405, "calib/avg_num_step_conf": 2.52734375, "calib/ece": 0.19743083003952552, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6284584980237155, "calib/gap": 0.2697049247049248, "calib/mean_conf": 0.822806324110672, "calib/mu_c": 0.9198148148148148, "calib/mu_w": 0.65010989010989, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18996047430830024, "calib/std_conf": 0.26390880199470856, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7512957746478874, "calib/step_q_c_n": 355.0, "calib/step_q_gap": 0.3298916650588464, "calib/step_q_w": 0.42140410958904106, "calib/step_q_w_n": 292.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3064.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 452.44140625, "completions/mean_terminated_length": 454.2156982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.18986666666666666, "grad_norm": 0.006662712432444096, "kl": 0.057811737060546875, "learning_rate": 6.111111111111112e-07, "loss": 0.0158, "num_tokens": 40055005.0, "reward": 0.48111921548843384, "reward_std": 0.16879983246326447, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7817012071609497, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.14368149638175964, "step": 178 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7012151996399408, "calib/avg_num_step_conf": 2.328125, "calib/ece": 0.2307086614173229, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5551181102362205, "calib/gap": 0.20123834629974946, "calib/mean_conf": 0.784488188976378, "calib/mu_c": 0.8660927152317881, "calib/mu_w": 0.6648543689320386, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2103543307086615, "calib/std_conf": 0.29155761079361453, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7528892307692308, "calib/step_q_c_n": 325.0, "calib/step_q_gap": 0.22488185069542999, "calib/step_q_w": 0.5280073800738008, "calib/step_q_w_n": 271.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2499.0, "completions/max_terminated_length": 2499.0, "completions/mean_length": 475.6171875, "completions/mean_terminated_length": 475.6171875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.19093333333333334, "grad_norm": 0.006497078109532595, "kl": 0.06073760986328125, "learning_rate": 5.833333333333334e-07, "loss": 0.003, "num_tokens": 40283027.0, "reward": 0.4224216341972351, "reward_std": 0.23153510689735413, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7222155928611755, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.1914348155260086, "step": 179 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7532517957678121, "calib/avg_num_step_conf": 1.9765625, "calib/ece": 0.18440944881889765, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.547244094488189, "calib/gap": 0.29907461334368723, "calib/mean_conf": 0.7678740157480316, "calib/mu_c": 0.886797385620915, "calib/mu_w": 0.5877227722772278, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.17496062992125982, "calib/std_conf": 0.30078409323323846, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7633684210526317, "calib/step_q_c_n": 285.0, "calib/step_q_gap": 0.18762181471779005, "calib/step_q_w": 0.5757466063348416, "calib/step_q_w_n": 221.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 529.6796875, "completions/mean_terminated_length": 529.6796875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.192, "grad_norm": 0.006103991065174341, "kl": 0.0552825927734375, "learning_rate": 5.555555555555555e-07, "loss": 0.0129, "num_tokens": 40522481.0, "reward": 0.47175779938697815, "reward_std": 0.21102026104927063, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.769753098487854, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": -0.14186255633831024, "step": 180 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7538148843026892, "calib/avg_num_step_conf": 2.578125, "calib/ece": 0.2626086956521738, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.47035573122529645, "calib/gap": 0.23111757348342743, "calib/mean_conf": 0.7637154150197629, "calib/mu_c": 0.8760769230769232, "calib/mu_w": 0.6449593495934958, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.25624505928853747, "calib/std_conf": 0.2807662265161187, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7090378006872852, "calib/step_q_c_n": 291.0, "calib/step_q_gap": 0.18971530746235288, "calib/step_q_w": 0.5193224932249323, "calib/step_q_w_n": 369.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 433.6015625, "completions/mean_terminated_length": 437.0157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.19306666666666666, "grad_norm": 0.007144651375710964, "kl": 0.07428741455078125, "learning_rate": 5.277777777777779e-07, "loss": 0.0021, "num_tokens": 40739747.0, "reward": 0.40516453981399536, "reward_std": 0.20815244317054749, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7120101451873779, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.20011860132217407, "step": 181 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7901029787822241, "calib/avg_num_step_conf": 2.1328125, "calib/ece": 0.17932000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.64, "calib/gap": 0.30472942152187443, "calib/mean_conf": 0.8017199999999999, "calib/mu_c": 0.9126415094339624, "calib/mu_w": 0.607912087912088, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.17252000000000006, "calib/std_conf": 0.2873253236315936, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.7957911392405064, "calib/step_q_c_n": 316.0, "calib/step_q_gap": 0.2641824435883324, "calib/step_q_w": 0.531608695652174, "calib/step_q_w_n": 230.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2958.0, "completions/max_terminated_length": 2958.0, "completions/mean_length": 484.37109375, "completions/mean_terminated_length": 490.1146545410156, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.19413333333333332, "grad_norm": 0.005775344092398882, "kl": 0.050815582275390625, "learning_rate": 5.000000000000001e-07, "loss": 0.0152, "num_tokens": 40969906.0, "reward": 0.4684050679206848, "reward_std": 0.21433956921100616, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7808293104171753, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.16355042159557343, "step": 182 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6680239898989898, "calib/avg_num_step_conf": 1.87890625, "calib/ece": 0.2461811023622047, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5433070866141733, "calib/gap": 0.20218939393939417, "calib/mean_conf": 0.7603543307086614, "calib/mu_c": 0.8479166666666668, "calib/mu_w": 0.6457272727272726, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21980314960629915, "calib/std_conf": 0.31335665643925337, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7191319444444445, "calib/step_q_c_n": 288.0, "calib/step_q_gap": 0.09586769573978127, "calib/step_q_w": 0.6232642487046632, "calib/step_q_w_n": 193.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 467.65234375, "completions/mean_terminated_length": 469.4862976074219, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.1952, "grad_norm": 0.006125408224761486, "kl": 0.05516815185546875, "learning_rate": 4.7222222222222226e-07, "loss": -0.0079, "num_tokens": 41196305.0, "reward": 0.39300990104675293, "reward_std": 0.26003241539001465, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7125464677810669, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.23746415972709656, "step": 183 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7377312128864925, "calib/avg_num_step_conf": 2.5390625, "calib/ece": 0.21607142857142864, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.2206832298136645, "calib/mean_conf": 0.8224206349206349, "calib/mu_c": 0.9021118012422361, "calib/mu_w": 0.6814285714285716, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19980158730158734, "calib/std_conf": 0.26944109858361637, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7238983050847457, "calib/step_q_c_n": 354.0, "calib/step_q_gap": 0.27320573751717814, "calib/step_q_w": 0.4506925675675676, "calib/step_q_w_n": 296.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3030.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 482.55859375, "completions/mean_terminated_length": 486.3582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.19626666666666667, "grad_norm": 0.006096668541431427, "kl": 0.062225341796875, "learning_rate": 4.444444444444445e-07, "loss": 0.0204, "num_tokens": 41425120.0, "reward": 0.44166046380996704, "reward_std": 0.2429095208644867, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7461289167404175, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": -0.18390172719955444, "step": 184 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7485714285714284, "calib/avg_num_step_conf": 2.3359375, "calib/ece": 0.26878431372549005, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6, "calib/gap": 0.2520714285714285, "calib/mean_conf": 0.7883921568627452, "calib/mu_c": 0.9020714285714284, "calib/mu_w": 0.6499999999999999, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2540784313725489, "calib/std_conf": 0.3005579177118272, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7370860927152318, "calib/step_q_c_n": 302.0, "calib/step_q_gap": 0.25073474136388046, "calib/step_q_w": 0.4863513513513514, "calib/step_q_w_n": 296.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2062.0, "completions/max_terminated_length": 2062.0, "completions/mean_length": 474.80859375, "completions/mean_terminated_length": 474.80859375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.19733333333333333, "grad_norm": 0.006603894755244255, "kl": 0.07291412353515625, "learning_rate": 4.1666666666666667e-07, "loss": -0.0034, "num_tokens": 41653591.0, "reward": 0.4074620008468628, "reward_std": 0.1967330127954483, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7267429828643799, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.2204127013683319, "step": 185 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7217297084318363, "calib/avg_num_step_conf": 1.9921875, "calib/ece": 0.18781249999999994, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.56640625, "calib/gap": 0.24395849750459675, "calib/mean_conf": 0.7678906250000002, "calib/mu_c": 0.8574691358024693, "calib/mu_w": 0.6135106382978726, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16144531249999994, "calib/std_conf": 0.3046223387854367, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7850961538461538, "calib/step_q_c_n": 312.0, "calib/step_q_gap": 0.1851971639471639, "calib/step_q_w": 0.5998989898989899, "calib/step_q_w_n": 198.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 460.98828125, "completions/mean_terminated_length": 462.7961120605469, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.1984, "grad_norm": 0.005948403850197792, "kl": 0.05864715576171875, "learning_rate": 3.8888888888888895e-07, "loss": 0.0043, "num_tokens": 41876644.0, "reward": 0.45981481671333313, "reward_std": 0.19684401154518127, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7699711322784424, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.17690393328666687, "step": 186 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7147108843537415, "calib/avg_num_step_conf": 2.48828125, "calib/ece": 0.21736220472440942, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5669291338582677, "calib/gap": 0.19568681318681314, "calib/mean_conf": 0.8216141732283464, "calib/mu_c": 0.8971153846153845, "calib/mu_w": 0.7014285714285714, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21240157480314958, "calib/std_conf": 0.24712550757061275, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6770189701897019, "calib/step_q_c_n": 369.0, "calib/step_q_gap": 0.19981747765238855, "calib/step_q_w": 0.47720149253731337, "calib/step_q_w_n": 268.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3023.0, "completions/max_terminated_length": 3023.0, "completions/mean_length": 492.94140625, "completions/mean_terminated_length": 492.94140625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.19946666666666665, "grad_norm": 0.006126835942268372, "kl": 0.057132720947265625, "learning_rate": 3.611111111111111e-07, "loss": 0.0201, "num_tokens": 42104381.0, "reward": 0.45576226711273193, "reward_std": 0.20533543825149536, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7420246601104736, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.1492500901222229, "step": 187 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6952429149797571, "calib/avg_num_step_conf": 1.88671875, "calib/ece": 0.2142629482071714, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5936254980079682, "calib/gap": 0.20449797570850203, "calib/mean_conf": 0.7706772908366535, "calib/mu_c": 0.8480769230769232, "calib/mu_w": 0.6435789473684211, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.18171314741035866, "calib/std_conf": 0.31263808015910155, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.7659322033898307, "calib/step_q_c_n": 295.0, "calib/step_q_gap": 0.1443364587089797, "calib/step_q_w": 0.621595744680851, "calib/step_q_w_n": 188.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 526.2265625, "completions/mean_terminated_length": 528.2902221679688, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.20053333333333334, "grad_norm": 0.005998818203806877, "kl": 0.052219390869140625, "learning_rate": 3.3333333333333335e-07, "loss": 0.0515, "num_tokens": 42343167.0, "reward": 0.40924012660980225, "reward_std": 0.23015396296977997, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7154003977775574, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": -0.21254518628120422, "step": 188 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7343910256410255, "calib/avg_num_step_conf": 1.6171875, "calib/ece": 0.1864453124999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.49609375, "calib/gap": 0.26669487179487195, "calib/mean_conf": 0.7101171875, "calib/mu_c": 0.8142948717948719, "calib/mu_w": 0.5476, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1435937499999999, "calib/std_conf": 0.3301461352826803, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7537130801687764, "calib/step_q_c_n": 237.0, "calib/step_q_gap": 0.24619895587499108, "calib/step_q_w": 0.5075141242937853, "calib/step_q_w_n": 177.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 430.1640625, "completions/mean_terminated_length": 431.85101318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.2016, "grad_norm": 0.006668528076261282, "kl": 0.05899810791015625, "learning_rate": 3.055555555555556e-07, "loss": -0.0135, "num_tokens": 42561057.0, "reward": 0.4447804391384125, "reward_std": 0.2044180929660797, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7697839736938477, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.20209810137748718, "step": 189 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7648008966244726, "calib/avg_num_step_conf": 2.00390625, "calib/ece": 0.19811023622047258, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5866141732283464, "calib/gap": 0.2740479957805906, "calib/mean_conf": 0.7869291338582678, "calib/mu_c": 0.890506329113924, "calib/mu_w": 0.6164583333333334, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18149606299212612, "calib/std_conf": 0.2979682163756999, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7988996763754046, "calib/step_q_c_n": 309.0, "calib/step_q_gap": 0.23551732343422804, "calib/step_q_w": 0.5633823529411766, "calib/step_q_w_n": 204.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1692.0, "completions/max_terminated_length": 1692.0, "completions/mean_length": 510.4453125, "completions/mean_terminated_length": 512.4470825195312, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.20266666666666666, "grad_norm": 0.00550629198551178, "kl": 0.04981231689453125, "learning_rate": 2.7777777777777776e-07, "loss": -0.021, "num_tokens": 42797339.0, "reward": 0.4591637849807739, "reward_std": 0.22176282107830048, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7685238718986511, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.17128996551036835, "step": 190 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6950024801587301, "calib/avg_num_step_conf": 2.30859375, "calib/ece": 0.28672519685039377, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5708661417322834, "calib/gap": 0.23468420138888857, "calib/mean_conf": 0.7456228346456694, "calib/mu_c": 0.8638888888888886, "calib/mu_w": 0.6292046875, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.26814251968503944, "calib/std_conf": 0.33022037921463554, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7361940298507463, "calib/step_q_c_n": 268.0, "calib/step_q_gap": 0.16311848805508067, "calib/step_q_w": 0.5730755417956657, "calib/step_q_w_n": 323.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 469.359375, "completions/mean_terminated_length": 469.359375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.20373333333333332, "grad_norm": 0.006034213118255138, "kl": 0.06505584716796875, "learning_rate": 2.5000000000000004e-07, "loss": 0.0111, "num_tokens": 43021663.0, "reward": 0.373573362827301, "reward_std": 0.21615146100521088, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6873253583908081, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.23549112677574158, "step": 191 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7284951363723059, "calib/avg_num_step_conf": 2.4296875, "calib/ece": 0.2149606299212598, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.531496062992126, "calib/gap": 0.25306122448979596, "calib/mean_conf": 0.7564566929133858, "calib/mu_c": 0.8630612244897959, "calib/mu_w": 0.61, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1963385826771653, "calib/std_conf": 0.3208319096758122, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7225545171339564, "calib/step_q_c_n": 321.0, "calib/step_q_gap": 0.21444820484159755, "calib/step_q_w": 0.5081063122923588, "calib/step_q_w_n": 301.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2836.0, "completions/max_terminated_length": 2836.0, "completions/mean_length": 471.5625, "completions/mean_terminated_length": 473.41180419921875, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.2048, "grad_norm": 0.0063798511400818825, "kl": 0.0574493408203125, "learning_rate": 2.2222222222222224e-07, "loss": 0.0172, "num_tokens": 43247359.0, "reward": 0.433309942483902, "reward_std": 0.24963872134685516, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7353507876396179, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.18123087286949158, "step": 192 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7555152902282408, "calib/avg_num_step_conf": 2.28515625, "calib/ece": 0.200511811023622, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.531496062992126, "calib/gap": 0.2700521330027338, "calib/mean_conf": 0.7575984251968504, "calib/mu_c": 0.871360544217687, "calib/mu_w": 0.6013084112149533, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1896850393700787, "calib/std_conf": 0.31375321963633407, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7516428571428573, "calib/step_q_c_n": 280.0, "calib/step_q_gap": 0.27567564402810313, "calib/step_q_w": 0.47596721311475415, "calib/step_q_w_n": 305.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2612.0, "completions/max_terminated_length": 2612.0, "completions/mean_length": 476.28515625, "completions/mean_terminated_length": 476.28515625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.20586666666666667, "grad_norm": 0.005859904922544956, "kl": 0.055263519287109375, "learning_rate": 1.9444444444444447e-07, "loss": -0.0113, "num_tokens": 43475000.0, "reward": 0.4385659992694855, "reward_std": 0.21207654476165771, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7476226687431335, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": -0.1829906404018402, "step": 193 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7885227485489609, "calib/avg_num_step_conf": 2.046875, "calib/ece": 0.22960937499999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.625, "calib/gap": 0.30555451538413525, "calib/mean_conf": 0.7755468750000001, "calib/mu_c": 0.9056462585034013, "calib/mu_w": 0.600091743119266, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21546875, "calib/std_conf": 0.3153490342901566, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8037246963562752, "calib/step_q_c_n": 247.0, "calib/step_q_gap": 0.26437451585086014, "calib/step_q_w": 0.5393501805054151, "calib/step_q_w_n": 277.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1391.0, "completions/max_terminated_length": 1391.0, "completions/mean_length": 421.5703125, "completions/mean_terminated_length": 423.2235412597656, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.20693333333333333, "grad_norm": 0.006361325271427631, "kl": 0.0576019287109375, "learning_rate": 1.6666666666666668e-07, "loss": -0.0022, "num_tokens": 43688866.0, "reward": 0.4467424154281616, "reward_std": 0.20001167058944702, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7649414539337158, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.1863003522157669, "step": 194 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7467991464390504, "calib/avg_num_step_conf": 2.15625, "calib/ece": 0.1723137254901961, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5333333333333333, "calib/gap": 0.264561883168845, "calib/mean_conf": 0.7535686274509804, "calib/mu_c": 0.8490184049079754, "calib/mu_w": 0.5844565217391304, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14333333333333337, "calib/std_conf": 0.3055785419336132, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.738676923076923, "calib/step_q_c_n": 325.0, "calib/step_q_gap": 0.24198088783463217, "calib/step_q_w": 0.49669603524229083, "calib/step_q_w_n": 227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2406.0, "completions/max_terminated_length": 2406.0, "completions/mean_length": 475.8984375, "completions/mean_terminated_length": 477.7647399902344, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.208, "grad_norm": 0.006363731808960438, "kl": 0.0585174560546875, "learning_rate": 1.3888888888888888e-07, "loss": 0.0067, "num_tokens": 43916680.0, "reward": 0.4543408751487732, "reward_std": 0.1945040225982666, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7743749618530273, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.18991197645664215, "step": 195 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7149387278739545, "calib/avg_num_step_conf": 2.140625, "calib/ece": 0.2534375, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.703125, "calib/gap": 0.18491733125850995, "calib/mean_conf": 0.8580468750000001, "calib/mu_c": 0.9281132075471697, "calib/mu_w": 0.7431958762886598, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24519531249999998, "calib/std_conf": 0.24048635159346232, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7884272997032641, "calib/step_q_c_n": 337.0, "calib/step_q_gap": 0.15117611486914084, "calib/step_q_w": 0.6372511848341232, "calib/step_q_w_n": 211.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 392.52734375, "completions/mean_terminated_length": 394.0666809082031, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.20906666666666668, "grad_norm": 0.006647142581641674, "kl": 0.06627655029296875, "learning_rate": 1.1111111111111112e-07, "loss": -0.0162, "num_tokens": 44119711.0, "reward": 0.41717860102653503, "reward_std": 0.13773755729198456, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7377187609672546, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": -0.22758033871650696, "step": 196 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7500314228255405, "calib/avg_num_step_conf": 1.95703125, "calib/ece": 0.24754940711462442, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5849802371541502, "calib/gap": 0.2424478381096029, "calib/mean_conf": 0.762806324110672, "calib/mu_c": 0.8749264705882354, "calib/mu_w": 0.6324786324786325, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.23640316205533587, "calib/std_conf": 0.31291885769558153, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6909057971014493, "calib/step_q_c_n": 276.0, "calib/step_q_gap": 0.08872801932367147, "calib/step_q_w": 0.6021777777777778, "calib/step_q_w_n": 225.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 484.69140625, "completions/mean_terminated_length": 484.69140625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.21013333333333334, "grad_norm": 0.006134700495749712, "kl": 0.0559844970703125, "learning_rate": 8.333333333333334e-08, "loss": 0.0136, "num_tokens": 44348848.0, "reward": 0.4062767028808594, "reward_std": 0.21886281669139862, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7109090089797974, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.20148061215877533, "step": 197 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7699800531914893, "calib/avg_num_step_conf": 2.16796875, "calib/ece": 0.16625984251968506, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5511811023622047, "calib/gap": 0.29319015957446815, "calib/mean_conf": 0.7625590551181102, "calib/mu_c": 0.8710625000000001, "calib/mu_w": 0.5778723404255319, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1494488188976378, "calib/std_conf": 0.30623852454266404, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7793093093093093, "calib/step_q_c_n": 333.0, "calib/step_q_gap": 0.20872372372372383, "calib/step_q_w": 0.5705855855855855, "calib/step_q_w_n": 222.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2011.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 427.7421875, "completions/mean_terminated_length": 427.7421875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.2112, "grad_norm": 0.006533287465572357, "kl": 0.07459259033203125, "learning_rate": 5.555555555555556e-08, "loss": 0.0232, "num_tokens": 44563734.0, "reward": 0.47105446457862854, "reward_std": 0.19906282424926758, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7860128879547119, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": -0.16734150052070618, "step": 198 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7142857142857143, "calib/avg_num_step_conf": 2.27734375, "calib/ece": 0.234313725490196, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5647058823529412, "calib/gap": 0.16734521575984984, "calib/mean_conf": 0.7745490196078432, "calib/mu_c": 0.8342682926829268, "calib/mu_w": 0.666923076923077, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.18286274509803915, "calib/std_conf": 0.3003091422691147, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6988650306748466, "calib/step_q_c_n": 326.0, "calib/step_q_gap": 0.1422891551884652, "calib/step_q_w": 0.5565758754863814, "calib/step_q_w_n": 257.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1633.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 481.91796875, "completions/mean_terminated_length": 483.807861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.21226666666666666, "grad_norm": 0.006298987660557032, "kl": 0.057342529296875, "learning_rate": 2.777777777777778e-08, "loss": -0.0096, "num_tokens": 44791305.0, "reward": 0.42878156900405884, "reward_std": 0.20725329220294952, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7290300130844116, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": -0.19646695256233215, "step": 199 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8179012345679012, "calib/avg_num_step_conf": 2.08203125, "calib/ece": 0.16921568627450984, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6078431372549019, "calib/gap": 0.3307765830346474, "calib/mean_conf": 0.8012156862745098, "calib/mu_c": 0.9218518518518517, "calib/mu_w": 0.5910752688172043, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16756862745098045, "calib/std_conf": 0.28398096551274105, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7976415094339622, "calib/step_q_c_n": 318.0, "calib/step_q_gap": 0.25587406757349695, "calib/step_q_w": 0.5417674418604652, "calib/step_q_w_n": 215.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1656.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 468.3828125, "completions/mean_terminated_length": 470.2196350097656, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.21333333333333335, "grad_norm": 0.006423901300877333, "kl": 0.0532379150390625, "learning_rate": 0.0, "loss": 0.0273, "num_tokens": 45019259.0, "reward": 0.4917670786380768, "reward_std": 0.1764676719903946, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.8102308511734009, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": -0.15247796475887299, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 0.011295410779421217, "train_runtime": 10938.5614, "train_samples_per_second": 4.681, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 45019259, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }