{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5086206896551725, "calib/avg_num_step_conf": 7.875, "calib/ece": 0.2888991935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0001713264989126051, "calib/mean_conf": 0.9905120967741936, "calib/mu_c": 0.9905632183908043, "calib/mu_w": 0.9903918918918917, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2888991935483871, "calib/std_conf": 0.0021794159006610276, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9119477557027226, "calib/step_q_c_n": 1359.0, "calib/step_q_gap": 0.0056311651395566376, "calib/step_q_w": 0.9063165905631659, "calib/step_q_w_n": 657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 755.49609375, "completions/mean_terminated_length": 776.7349243164062, "completions/min_length": 0.0, "completions/min_terminated_length": 397.0, "epoch": 0.0010666666666666667, "grad_norm": 0.007995938882231712, "kl": 0.0005849599838256836, "learning_rate": 2.5000000000000004e-07, "loss": -0.0584, "num_tokens": 300991.0, "reward": 0.8747976422309875, "reward_std": 0.23758892714977264, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6851503849029541, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7347574234008789, "step": 1 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4872611464968153, "calib/avg_num_step_conf": 7.6953125, "calib/ece": 0.36465737051792824, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024481637078155316, "calib/mean_conf": 0.9901553784860557, "calib/mu_c": 0.990063694267516, "calib/mu_w": 0.9903085106382975, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36465737051792824, "calib/std_conf": 0.001222205307190084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9075405636208369, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": -0.003804868168900244, "calib/step_q_w": 0.9113454317897371, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 840.640625, "completions/mean_terminated_length": 850.6087036132812, "completions/min_length": 0.0, "completions/min_terminated_length": 466.0, "epoch": 0.0021333333333333334, "grad_norm": 0.006802097428590059, "kl": 0.0016820430755615234, "learning_rate": 5.000000000000001e-07, "loss": 0.0262, "num_tokens": 619483.0, "reward": 0.818764328956604, "reward_std": 0.2176743745803833, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6203019618988037, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6984766721725464, "step": 2 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4973198746954403, "calib/avg_num_step_conf": 7.55078125, "calib/ece": 0.3249566929133857, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -4.53184824222852e-05, "calib/mean_conf": 0.9903110236220471, "calib/mu_c": 0.9902958579881658, "calib/mu_w": 0.9903411764705881, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3249566929133857, "calib/std_conf": 0.00172570781308277, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9122644497228821, "calib/step_q_c_n": 1263.0, "calib/step_q_gap": 0.0014062407676580513, "calib/step_q_w": 0.9108582089552241, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 793.03125, "completions/mean_terminated_length": 799.2755737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.0032, "grad_norm": 0.007428816519677639, "kl": 0.0005386471748352051, "learning_rate": 7.5e-07, "loss": 0.0164, "num_tokens": 927755.0, "reward": 0.8603240251541138, "reward_std": 0.21694022417068481, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.6664742231369019, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7237049341201782, "step": 3 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4995388108414928, "calib/avg_num_step_conf": 7.6484375, "calib/ece": 0.30776078431372544, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.4261387824787697e-05, "calib/mean_conf": 0.990113725490196, "calib/mu_c": 0.9901091954022987, "calib/mu_w": 0.9901234567901235, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30776078431372544, "calib/std_conf": 0.0010435647067591877, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9116891891891893, "calib/step_q_c_n": 1332.0, "calib/step_q_gap": 0.003542224332959254, "calib/step_q_w": 0.90814696485623, "calib/step_q_w_n": 626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 788.34375, "completions/mean_terminated_length": 788.34375, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.004266666666666667, "grad_norm": 0.0074525512754917145, "kl": 0.0006039738655090332, "learning_rate": 1.0000000000000002e-06, "loss": 0.019, "num_tokens": 1235739.0, "reward": 0.8774030208587646, "reward_std": 0.20519384741783142, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.685839056968689, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7338106632232666, "step": 4 }, { "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5065074420214607, "calib/avg_num_step_conf": 7.37890625, "calib/ece": 0.4323057851239669, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00013070266528203422, "calib/mean_conf": 0.9901570247933884, "calib/mu_c": 0.9902148148148147, "calib/mu_w": 0.9900841121495326, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4323057851239669, "calib/std_conf": 0.001212934863402139, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9135028680688336, "calib/step_q_c_n": 1046.0, "calib/step_q_gap": 0.008977363916994885, "calib/step_q_w": 0.9045255041518387, "calib/step_q_w_n": 843.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2862.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 837.125, "completions/mean_terminated_length": 853.8008422851562, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 0.005333333333333333, "grad_norm": 0.006662297062575817, "kl": 0.0005676746368408203, "learning_rate": 1.25e-06, "loss": -0.0101, "num_tokens": 1556731.0, "reward": 0.725764811038971, "reward_std": 0.16995424032211304, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5355398058891296, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.6214585304260254, "step": 5 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5138888888888888, "calib/avg_num_step_conf": 7.94921875, "calib/ece": 0.4141600000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002777777777777102, "calib/mean_conf": 0.99016, "calib/mu_c": 0.9902777777777777, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4141600000000001, "calib/std_conf": 0.001254750971308651, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.911831726555653, "calib/step_q_c_n": 1141.0, "calib/step_q_gap": 0.0027265811417828134, "calib/step_q_w": 0.9091051454138702, "calib/step_q_w_n": 894.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 717.16015625, "completions/mean_terminated_length": 731.4462280273438, "completions/min_length": 0.0, "completions/min_terminated_length": 441.0, "epoch": 0.0064, "grad_norm": 0.007342583499848843, "kl": 0.0007786750793457031, "learning_rate": 1.5e-06, "loss": -0.04, "num_tokens": 1846276.0, "reward": 0.7668963670730591, "reward_std": 0.20423705875873566, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5706851482391357, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6552950143814087, "step": 6 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5008736559139785, "calib/avg_num_step_conf": 7.29296875, "calib/ece": 0.35770750988142286, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.7473118279931477e-05, "calib/mean_conf": 0.9901185770750988, "calib/mu_c": 0.9901250000000001, "calib/mu_w": 0.9901075268817202, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.35770750988142286, "calib/std_conf": 0.0010824556472434123, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9110333048676347, "calib/step_q_c_n": 1171.0, "calib/step_q_gap": 0.008289051994071372, "calib/step_q_w": 0.9027442528735633, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1640.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 827.6015625, "completions/mean_terminated_length": 827.6015625, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.007466666666666667, "grad_norm": 0.006901096552610397, "kl": 0.0005282759666442871, "learning_rate": 1.75e-06, "loss": -0.0061, "num_tokens": 2165566.0, "reward": 0.8263709545135498, "reward_std": 0.22447288036346436, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6281062364578247, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7035418748855591, "step": 7 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4971477079796265, "calib/avg_num_step_conf": 7.546875, "calib/ece": 0.37027200000000016, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.365025466874673e-05, "calib/mean_conf": 0.9902720000000002, "calib/mu_c": 0.9902516129032256, "calib/mu_w": 0.9903052631578944, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37027200000000016, "calib/std_conf": 0.0016043740212307118, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9117098976109216, "calib/step_q_c_n": 1172.0, "calib/step_q_gap": 0.0050191081372372714, "calib/step_q_w": 0.9066907894736843, "calib/step_q_w_n": 760.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2546.0, "completions/max_terminated_length": 2546.0, "completions/mean_length": 871.5546875, "completions/mean_terminated_length": 878.4172973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.008533333333333334, "grad_norm": 0.006285594776272774, "kl": 0.0006301999092102051, "learning_rate": 2.0000000000000003e-06, "loss": 0.0275, "num_tokens": 2495196.0, "reward": 0.8215715885162354, "reward_std": 0.18256333470344543, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6125690937042236, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7141676545143127, "step": 8 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5057920981971615, "calib/avg_num_step_conf": 7.5, "calib/ece": 0.31397540983606553, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00011584196394365254, "calib/mean_conf": 0.9902049180327869, "calib/mu_c": 0.9902424242424241, "calib/mu_w": 0.9901265822784805, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31397540983606553, "calib/std_conf": 0.0014167529522494773, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9068453768453769, "calib/step_q_c_n": 1287.0, "calib/step_q_gap": 0.0031265774772885058, "calib/step_q_w": 0.9037187993680884, "calib/step_q_w_n": 633.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 850.46484375, "completions/mean_terminated_length": 860.5494384765625, "completions/min_length": 0.0, "completions/min_terminated_length": 453.0, "epoch": 0.0096, "grad_norm": 0.007337111514061689, "kl": 0.0006710290908813477, "learning_rate": 2.25e-06, "loss": 0.0234, "num_tokens": 2820451.0, "reward": 0.8264840841293335, "reward_std": 0.24139253795146942, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6505316495895386, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6829051375389099, "step": 9 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5063398140321218, "calib/avg_num_step_conf": 7.50390625, "calib/ece": 0.35444939271255055, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.011128205128205115, "calib/mean_conf": 0.9860283400809716, "calib/mu_c": 0.990128205128205, "calib/mu_w": 0.9789999999999999, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35444939271255055, "calib/std_conf": 0.06290809444449404, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.908688524590164, "calib/step_q_c_n": 1159.0, "calib/step_q_gap": 0.010470676821135116, "calib/step_q_w": 0.8982178477690289, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2921.0, "completions/max_terminated_length": 2921.0, "completions/mean_length": 827.86328125, "completions/mean_terminated_length": 847.7320556640625, "completions/min_length": 0.0, "completions/min_terminated_length": 482.0, "epoch": 0.010666666666666666, "grad_norm": 0.007391604594886303, "kl": 0.0006058216094970703, "learning_rate": 2.5e-06, "loss": -0.0199, "num_tokens": 3139184.0, "reward": 0.81383216381073, "reward_std": 0.2799851894378662, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6202203035354614, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6933814883232117, "step": 10 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4997578357434443, "calib/avg_num_step_conf": 7.5234375, "calib/ece": 0.3845121951219511, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -4.84328513128407e-06, "calib/mean_conf": 0.9902032520325202, "calib/mu_c": 0.990201342281879, "calib/mu_w": 0.9902061855670102, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3845121951219511, "calib/std_conf": 0.0014111020290820972, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9101703964757709, "calib/step_q_c_n": 1135.0, "calib/step_q_gap": -0.004267024510322681, "calib/step_q_w": 0.9144374209860936, "calib/step_q_w_n": 791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2958.0, "completions/max_terminated_length": 2958.0, "completions/mean_length": 825.078125, "completions/mean_terminated_length": 841.5139770507812, "completions/min_length": 0.0, "completions/min_terminated_length": 377.0, "epoch": 0.011733333333333333, "grad_norm": 0.007041232660412788, "kl": 0.0005875229835510254, "learning_rate": 2.7500000000000004e-06, "loss": 0.0031, "num_tokens": 3454884.0, "reward": 0.7954146862030029, "reward_std": 0.23855015635490417, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.5893589854240417, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6920953989028931, "step": 11 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5183193277310925, "calib/avg_num_step_conf": 7.8203125, "calib/ece": 0.2702427983539093, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003615126050421358, "calib/mean_conf": 0.9904074074074073, "calib/mu_c": 0.9905085714285713, "calib/mu_w": 0.9901470588235292, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2702427983539093, "calib/std_conf": 0.00196749999782125, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.914101754385965, "calib/step_q_c_n": 1425.0, "calib/step_q_gap": 0.008902447626866317, "calib/step_q_w": 0.9051993067590987, "calib/step_q_w_n": 577.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2138.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 745.87890625, "completions/mean_terminated_length": 776.1991577148438, "completions/min_length": 0.0, "completions/min_terminated_length": 355.0, "epoch": 0.0128, "grad_norm": 0.007004205137491226, "kl": 0.0007008910179138184, "learning_rate": 3e-06, "loss": -0.0444, "num_tokens": 3750005.0, "reward": 0.8753750920295715, "reward_std": 0.21079403162002563, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.688737154006958, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7354505062103271, "step": 12 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5019653179190752, "calib/avg_num_step_conf": 8.00390625, "calib/ece": 0.2925766129032259, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.429672447019527e-05, "calib/mean_conf": 0.9901572580645163, "calib/mu_c": 0.9901676300578034, "calib/mu_w": 0.9901333333333332, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2925766129032259, "calib/std_conf": 0.0012294552548691184, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9127642215568862, "calib/step_q_c_n": 1336.0, "calib/step_q_gap": 0.00033785409545550316, "calib/step_q_w": 0.9124263674614307, "calib/step_q_w_n": 713.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2842.0, "completions/max_terminated_length": 2842.0, "completions/mean_length": 795.58203125, "completions/mean_terminated_length": 817.94775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 333.0, "epoch": 0.013866666666666666, "grad_norm": 0.007008600048720837, "kl": 0.0008458495140075684, "learning_rate": 3.2500000000000002e-06, "loss": -0.0267, "num_tokens": 4058266.0, "reward": 0.8624738454818726, "reward_std": 0.19027510285377502, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6814671754837036, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7145742177963257, "step": 13 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4962748344370861, "calib/avg_num_step_conf": 7.4375, "calib/ece": 0.3788178137651821, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.504966887470708e-05, "calib/mean_conf": 0.990153846153846, "calib/mu_c": 0.9901324503311254, "calib/mu_w": 0.9901875000000001, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3788178137651821, "calib/std_conf": 0.0012041659235641337, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9117429577464791, "calib/step_q_c_n": 1136.0, "calib/step_q_gap": 0.0012351452464790524, "calib/step_q_w": 0.9105078125, "calib/step_q_w_n": 768.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2995.0, "completions/max_terminated_length": 2995.0, "completions/mean_length": 823.8671875, "completions/mean_terminated_length": 847.028076171875, "completions/min_length": 0.0, "completions/min_terminated_length": 458.0, "epoch": 0.014933333333333333, "grad_norm": 0.007082659751176834, "kl": 0.0009881258010864258, "learning_rate": 3.5e-06, "loss": -0.0441, "num_tokens": 4374576.0, "reward": 0.8011032342910767, "reward_std": 0.2523154616355896, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.5971081852912903, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6941607594490051, "step": 14 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.49928506434420905, "calib/avg_num_step_conf": 7.56640625, "calib/ece": 0.37450588235294113, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.9368256856777677e-05, "calib/mean_conf": 0.9901921568627451, "calib/mu_c": 0.9901847133757962, "calib/mu_w": 0.9902040816326529, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37450588235294113, "calib/std_conf": 0.0013599081770119211, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9131745513866233, "calib/step_q_c_n": 1226.0, "calib/step_q_gap": 0.0028932574344432416, "calib/step_q_w": 0.91028129395218, "calib/step_q_w_n": 711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 780.34765625, "completions/mean_terminated_length": 783.4078979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 426.0, "epoch": 0.016, "grad_norm": 0.007479914464056492, "kl": 0.002272963523864746, "learning_rate": 3.7500000000000005e-06, "loss": 0.0059, "num_tokens": 4682225.0, "reward": 0.8200702667236328, "reward_std": 0.15475550293922424, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6206835508346558, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.6975818872451782, "step": 15 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5024885294346372, "calib/avg_num_step_conf": 7.3203125, "calib/ece": 0.30573770491803287, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 4.977058869282125e-05, "calib/mean_conf": 0.9901639344262296, "calib/mu_c": 0.9901796407185626, "calib/mu_w": 0.9901298701298697, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.30573770491803287, "calib/std_conf": 0.0012698306053139086, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.9036963434022257, "calib/step_q_c_n": 1258.0, "calib/step_q_gap": 0.011034005739887887, "calib/step_q_w": 0.8926623376623378, "calib/step_q_w_n": 616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 930.609375, "completions/mean_terminated_length": 949.1474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 429.0, "epoch": 0.017066666666666667, "grad_norm": 0.00610819086432457, "kl": 0.0013129711151123047, "learning_rate": 4.000000000000001e-06, "loss": 0.0022, "num_tokens": 5029309.0, "reward": 0.8363821506500244, "reward_std": 0.19631116092205048, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6579543352127075, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.6960598826408386, "step": 16 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.486911686938128, "calib/avg_num_step_conf": 7.87890625, "calib/ece": 0.24334285714285708, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00024916270051156264, "calib/mean_conf": 0.9902816326530611, "calib/mu_c": 0.9902185792349726, "calib/mu_w": 0.9904677419354841, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24334285714285708, "calib/std_conf": 0.0016432513153697445, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9083475177304965, "calib/step_q_c_n": 1410.0, "calib/step_q_gap": 0.003532031733791463, "calib/step_q_w": 0.904815485996705, "calib/step_q_w_n": 607.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2763.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 807.6953125, "completions/mean_terminated_length": 833.75, "completions/min_length": 0.0, "completions/min_terminated_length": 392.0, "epoch": 0.018133333333333335, "grad_norm": 0.0072814165614545345, "kl": 0.0025234222412109375, "learning_rate": 4.25e-06, "loss": -0.0249, "num_tokens": 5339607.0, "reward": 0.9059667587280273, "reward_std": 0.2583204507827759, "rewards/accuracy_reward_step": 0.71484375, "rewards/final_brier_reward_step": 0.7193679809570312, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.758190393447876, "step": 17 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5074626865671642, "calib/avg_num_step_conf": 7.5, "calib/ece": 0.44724696356275295, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": -0.00044776119402989423, "calib/mean_conf": 0.9897570850202428, "calib/mu_c": 0.9895522388059701, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44724696356275295, "calib/std_conf": 0.005826584718220573, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9136172344689378, "calib/step_q_c_n": 998.0, "calib/step_q_gap": 0.01803155117175781, "calib/step_q_w": 0.89558568329718, "calib/step_q_w_n": 922.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2895.0, "completions/max_terminated_length": 2895.0, "completions/mean_length": 862.61328125, "completions/mean_terminated_length": 879.7968139648438, "completions/min_length": 0.0, "completions/min_terminated_length": 486.0, "epoch": 0.0192, "grad_norm": 0.007226284593343735, "kl": 0.003521442413330078, "learning_rate": 4.5e-06, "loss": -0.0042, "num_tokens": 5671156.0, "reward": 0.7269142270088196, "reward_std": 0.26412057876586914, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5321316719055176, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6240404844284058, "step": 18 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4919809717974856, "calib/avg_num_step_conf": 7.96484375, "calib/ece": 0.43707786885245903, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00015147808358828563, "calib/mean_conf": 0.9903565573770492, "calib/mu_c": 0.9902888888888887, "calib/mu_w": 0.990440366972477, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43707786885245903, "calib/std_conf": 0.0018242217161460114, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9127476099426388, "calib/step_q_c_n": 1046.0, "calib/step_q_gap": 6.281638775462728e-05, "calib/step_q_w": 0.9126847935548842, "calib/step_q_w_n": 993.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 789.6953125, "completions/mean_terminated_length": 815.1693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 471.0, "epoch": 0.020266666666666665, "grad_norm": 0.007534496486186981, "kl": 0.0037870407104492188, "learning_rate": 4.75e-06, "loss": -0.0374, "num_tokens": 5978078.0, "reward": 0.7543637752532959, "reward_std": 0.2398720383644104, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5353925228118896, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6756787300109863, "step": 19 }, { "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.4992291880781089, "calib/avg_num_step_conf": 8.18359375, "calib/ece": 0.40379324894514756, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -8.148583174327761e-06, "calib/mean_conf": 0.9902911392405063, "calib/mu_c": 0.9902877697841725, "calib/mu_w": 0.9902959183673469, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40379324894514756, "calib/std_conf": 0.001669926843953468, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9174864391951008, "calib/step_q_c_n": 1143.0, "calib/step_q_gap": 0.004666061043840264, "calib/step_q_w": 0.9128203781512605, "calib/step_q_w_n": 952.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 785.80859375, "completions/mean_terminated_length": 834.7178955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 446.0, "epoch": 0.021333333333333333, "grad_norm": 0.007198127917945385, "kl": 0.005240440368652344, "learning_rate": 5e-06, "loss": -0.0936, "num_tokens": 6284117.0, "reward": 0.736098051071167, "reward_std": 0.22667299211025238, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.550230860710144, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.6289964914321899, "step": 20 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5605095541401274, "calib/avg_num_step_conf": 8.125, "calib/ece": 0.36771428571428566, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0011719745222932287, "calib/mean_conf": 0.9907301587301587, "calib/mu_c": 0.9911719745222929, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36771428571428566, "calib/std_conf": 0.0025755649662626777, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9162932917316694, "calib/step_q_c_n": 1282.0, "calib/step_q_gap": 0.003348429576281009, "calib/step_q_w": 0.9129448621553884, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 803.81640625, "completions/mean_terminated_length": 816.575439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 394.0, "epoch": 0.0224, "grad_norm": 0.007013499736785889, "kl": 0.006320476531982422, "learning_rate": 4.9722222222222224e-06, "loss": -0.0117, "num_tokens": 6592854.0, "reward": 0.822600245475769, "reward_std": 0.25758790969848633, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6206120252609253, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7050572633743286, "step": 21 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5008818342151675, "calib/avg_num_step_conf": 8.35546875, "calib/ece": 0.3319512195121951, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.7636684303545813e-05, "calib/mean_conf": 0.9904878048780488, "calib/mu_c": 0.9904938271604938, "calib/mu_w": 0.9904761904761903, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3319512195121951, "calib/std_conf": 0.0021540880161775256, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9200967982129561, "calib/step_q_c_n": 1343.0, "calib/step_q_gap": -0.0029975414096853337, "calib/step_q_w": 0.9230943396226414, "calib/step_q_w_n": 795.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 812.3203125, "completions/mean_terminated_length": 831.8160400390625, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.023466666666666667, "grad_norm": 0.007960462011396885, "kl": 0.009766578674316406, "learning_rate": 4.944444444444445e-06, "loss": -0.0187, "num_tokens": 6902624.0, "reward": 0.8345315456390381, "reward_std": 0.2210981249809265, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6388933658599854, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7122009992599487, "step": 22 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.46663230240549824, "calib/avg_num_step_conf": 8.5859375, "calib/ece": 0.3834412955465588, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000667353951890437, "calib/mean_conf": 0.9907287449392713, "calib/mu_c": 0.9904666666666664, "calib/mu_w": 0.9911340206185568, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3834412955465588, "calib/std_conf": 0.0025993037925950667, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9173632653061224, "calib/step_q_c_n": 1225.0, "calib/step_q_gap": -0.0024825723094993313, "calib/step_q_w": 0.9198458376156218, "calib/step_q_w_n": 973.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2516.0, "completions/max_terminated_length": 2516.0, "completions/mean_length": 825.96484375, "completions/mean_terminated_length": 845.7880249023438, "completions/min_length": 0.0, "completions/min_terminated_length": 478.0, "epoch": 0.024533333333333334, "grad_norm": 0.007444644812494516, "kl": 0.010847091674804688, "learning_rate": 4.9166666666666665e-06, "loss": -0.051, "num_tokens": 7218007.0, "reward": 0.7837207913398743, "reward_std": 0.29620790481567383, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.5924890041351318, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6655775308609009, "step": 23 }, { "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.47041190576950204, "calib/avg_num_step_conf": 8.99609375, "calib/ece": 0.4110843706777316, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.995850622406639, "calib/gap": 0.005764893967179274, "calib/mean_conf": 0.9878478561549101, "calib/mu_c": 0.9902877697841725, "calib/mu_w": 0.9845228758169933, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4110843706777316, "calib/std_conf": 0.04231167039487974, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9169182948490231, "calib/step_q_c_n": 1126.0, "calib/step_q_gap": -0.0028540076148682036, "calib/step_q_w": 0.9197723024638913, "calib/step_q_w_n": 1177.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 861.0859375, "completions/mean_terminated_length": 896.0894165039062, "completions/min_length": 0.0, "completions/min_terminated_length": 451.0, "epoch": 0.0256, "grad_norm": 0.006651679985225201, "kl": 0.011853218078613281, "learning_rate": 4.888888888888889e-06, "loss": -0.0175, "num_tokens": 7542957.0, "reward": 0.7452735304832458, "reward_std": 0.3208249807357788, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.553399920463562, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.641053318977356, "step": 24 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5025458978772231, "calib/avg_num_step_conf": 8.69921875, "calib/ece": 0.3214342629482072, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.0917957544527326e-05, "calib/mean_conf": 0.9907569721115538, "calib/mu_c": 0.9907738095238094, "calib/mu_w": 0.9907228915662649, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3214342629482072, "calib/std_conf": 0.0026451303064060315, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9186124641833812, "calib/step_q_c_n": 1396.0, "calib/step_q_gap": -0.0059122048900243085, "calib/step_q_w": 0.9245246690734055, "calib/step_q_w_n": 831.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2872.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 782.078125, "completions/mean_terminated_length": 794.4921264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 471.0, "epoch": 0.02666666666666667, "grad_norm": 0.007144877687096596, "kl": 0.014530181884765625, "learning_rate": 4.861111111111111e-06, "loss": -0.0008, "num_tokens": 7846393.0, "reward": 0.8513222932815552, "reward_std": 0.17412230372428894, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6621749997138977, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7131257653236389, "step": 25 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4729978738483345, "calib/avg_num_step_conf": 8.85546875, "calib/ece": 0.3294860557768925, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00038327427356477095, "calib/mean_conf": 0.9908406374501992, "calib/mu_c": 0.9907108433734938, "calib/mu_w": 0.9910941176470586, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3294860557768925, "calib/std_conf": 0.0026939695810490175, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9192279679533867, "calib/step_q_c_n": 1373.0, "calib/step_q_gap": -0.010238474999633485, "calib/step_q_w": 0.9294664429530202, "calib/step_q_w_n": 894.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 832.20703125, "completions/mean_terminated_length": 848.784912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 522.0, "epoch": 0.027733333333333332, "grad_norm": 0.010776635259389877, "kl": 0.0294952392578125, "learning_rate": 4.833333333333333e-06, "loss": -0.0283, "num_tokens": 8164678.0, "reward": 0.8365595936775208, "reward_std": 0.16926750540733337, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6464504599571228, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7024499177932739, "step": 26 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4965912815470337, "calib/avg_num_step_conf": 8.4375, "calib/ece": 0.44655645161290347, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -6.542117338592668e-05, "calib/mean_conf": 0.9909112903225809, "calib/mu_c": 0.9908814814814814, "calib/mu_w": 0.9909469026548673, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44655645161290347, "calib/std_conf": 0.0028525938868781777, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9200569902048087, "calib/step_q_c_n": 1123.0, "calib/step_q_gap": 0.0002585331170555927, "calib/step_q_w": 0.9197984570877531, "calib/step_q_w_n": 1037.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 850.75390625, "completions/mean_terminated_length": 871.1720581054688, "completions/min_length": 0.0, "completions/min_terminated_length": 481.0, "epoch": 0.0288, "grad_norm": 0.006933312397450209, "kl": 0.018218994140625, "learning_rate": 4.805555555555556e-06, "loss": 0.011, "num_tokens": 8487687.0, "reward": 0.7302448749542236, "reward_std": 0.2871781587600708, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5313421487808228, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6307100057601929, "step": 27 }, { "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.49529190207156315, "calib/avg_num_step_conf": 8.43359375, "calib/ece": 0.2532000000000001, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -8.393866021005891e-05, "calib/mean_conf": 0.9907000000000001, "calib/mu_c": 0.9906779661016949, "calib/mu_w": 0.990761904761905, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2532000000000001, "calib/std_conf": 0.002536730178793167, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9211392405063292, "calib/step_q_c_n": 1422.0, "calib/step_q_gap": -0.008866186902083184, "calib/step_q_w": 0.9300054274084124, "calib/step_q_w_n": 737.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2753.0, "completions/max_terminated_length": 2753.0, "completions/mean_length": 798.93359375, "completions/mean_terminated_length": 828.0445556640625, "completions/min_length": 0.0, "completions/min_terminated_length": 497.0, "epoch": 0.029866666666666666, "grad_norm": 0.006898973137140274, "kl": 0.022180557250976562, "learning_rate": 4.777777777777778e-06, "loss": -0.0357, "num_tokens": 8799158.0, "reward": 0.8773797154426575, "reward_std": 0.19283947348594666, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.6958659887313843, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.733112096786499, "step": 28 }, { "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5181253055807782, "calib/avg_num_step_conf": 8.00390625, "calib/ece": 0.4160082644628099, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006086470629323992, "calib/mean_conf": 0.9903884297520661, "calib/mu_c": 0.9906474820143885, "calib/mu_w": 0.9900388349514561, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4160082644628099, "calib/std_conf": 0.0029243121698192286, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.91996399639964, "calib/step_q_c_n": 1111.0, "calib/step_q_gap": -0.0013899481632597688, "calib/step_q_w": 0.9213539445628998, "calib/step_q_w_n": 938.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 872.84765625, "completions/mean_terminated_length": 912.0366821289062, "completions/min_length": 0.0, "completions/min_terminated_length": 512.0, "epoch": 0.030933333333333334, "grad_norm": 0.006322621833533049, "kl": 0.020404815673828125, "learning_rate": 4.75e-06, "loss": -0.0619, "num_tokens": 9129735.0, "reward": 0.7384533882141113, "reward_std": 0.22164635360240936, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5469831228256226, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.6330485343933105, "step": 29 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.49972766884531594, "calib/avg_num_step_conf": 8.1328125, "calib/ece": 0.43298770491803285, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.0076252722979184e-05, "calib/mean_conf": 0.9903647540983607, "calib/mu_c": 0.9903602941176473, "calib/mu_w": 0.9903703703703702, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.43298770491803285, "calib/std_conf": 0.0018648351630731412, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9210891544117646, "calib/step_q_c_n": 1088.0, "calib/step_q_gap": -0.0020728174192212734, "calib/step_q_w": 0.9231619718309859, "calib/step_q_w_n": 994.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 833.046875, "completions/mean_terminated_length": 863.4008178710938, "completions/min_length": 0.0, "completions/min_terminated_length": 539.0, "epoch": 0.032, "grad_norm": 0.005995790008455515, "kl": 0.026248931884765625, "learning_rate": 4.722222222222222e-06, "loss": -0.0443, "num_tokens": 9449979.0, "reward": 0.7293350696563721, "reward_std": 0.314066082239151, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5392831563949585, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6225119233131409, "step": 30 }, { "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.48967857142857146, "calib/avg_num_step_conf": 9.33203125, "calib/ece": 0.4629113924050632, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002064285714287184, "calib/mean_conf": 0.9903375527426159, "calib/mu_c": 0.9902399999999999, "calib/mu_w": 0.9904464285714286, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4629113924050632, "calib/std_conf": 0.0018059860387369384, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9208267716535432, "calib/step_q_c_n": 1016.0, "calib/step_q_gap": -0.014243148229923785, "calib/step_q_w": 0.935069919883467, "calib/step_q_w_n": 1373.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2765.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 841.4765625, "completions/mean_terminated_length": 893.8506469726562, "completions/min_length": 0.0, "completions/min_terminated_length": 451.0, "epoch": 0.03306666666666667, "grad_norm": 0.006572486367076635, "kl": 0.023813247680664062, "learning_rate": 4.694444444444445e-06, "loss": -0.052, "num_tokens": 9771309.0, "reward": 0.6754677891731262, "reward_std": 0.23591241240501404, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.4964734613895416, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.5724309086799622, "step": 31 }, { "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.49348591549295767, "calib/avg_num_step_conf": 8.703125, "calib/ece": 0.4035991735537189, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -5.788732394373941e-05, "calib/mean_conf": 0.9903760330578512, "calib/mu_c": 0.9903521126760563, "calib/mu_w": 0.99041, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4035991735537189, "calib/std_conf": 0.0018349015238461625, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9203006872852234, "calib/step_q_c_n": 1164.0, "calib/step_q_gap": -0.011173936774927173, "calib/step_q_w": 0.9314746240601506, "calib/step_q_w_n": 1064.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2565.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 821.6484375, "completions/mean_terminated_length": 855.0487670898438, "completions/min_length": 0.0, "completions/min_terminated_length": 452.0, "epoch": 0.034133333333333335, "grad_norm": 0.006107104942202568, "kl": 0.02695465087890625, "learning_rate": 4.666666666666667e-06, "loss": -0.0281, "num_tokens": 10088355.0, "reward": 0.7452229261398315, "reward_std": 0.2125379592180252, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5620889663696289, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.6283566951751709, "step": 32 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.49014844804318486, "calib/avg_num_step_conf": 9.2265625, "calib/ece": 0.45744672131147535, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00018717948717950073, "calib/mean_conf": 0.990233606557377, "calib/mu_c": 0.9901461538461538, "calib/mu_w": 0.9903333333333333, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45744672131147535, "calib/std_conf": 0.0014733763308863153, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9224072072072073, "calib/step_q_c_n": 1110.0, "calib/step_q_gap": -0.011606371067553134, "calib/step_q_w": 0.9340135782747604, "calib/step_q_w_n": 1252.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 799.5703125, "completions/mean_terminated_length": 828.7044677734375, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.0352, "grad_norm": 0.006116094067692757, "kl": 0.03139495849609375, "learning_rate": 4.638888888888889e-06, "loss": 0.0078, "num_tokens": 10399917.0, "reward": 0.7238995432853699, "reward_std": 0.22776684165000916, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5162511467933655, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6401416659355164, "step": 33 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5142857142857142, "calib/avg_num_step_conf": 9.54296875, "calib/ece": 0.41639344262295097, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002857142857141115, "calib/mean_conf": 0.9901639344262296, "calib/mu_c": 0.9902857142857142, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41639344262295097, "calib/std_conf": 0.0012698306053139084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9222103533278553, "calib/step_q_c_n": 1217.0, "calib/step_q_gap": -0.01408654716154123, "calib/step_q_w": 0.9362969004893965, "calib/step_q_w_n": 1226.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2488.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 782.359375, "completions/mean_terminated_length": 810.8663940429688, "completions/min_length": 0.0, "completions/min_terminated_length": 540.0, "epoch": 0.03626666666666667, "grad_norm": 0.006968450732529163, "kl": 0.0370941162109375, "learning_rate": 4.611111111111112e-06, "loss": -0.0311, "num_tokens": 10705313.0, "reward": 0.7707833051681519, "reward_std": 0.2937692701816559, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5549062490463257, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6866602897644043, "step": 34 }, { "calib/answer_extract_rate": 0.87890625, "calib/auroc": 0.5092366536458333, "calib/avg_num_step_conf": 13.3203125, "calib/ece": 0.4188750000000002, "calib/final_conf_rate": 0.875, "calib/format_rate": 0.87109375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.000184895833333365, "calib/mean_conf": 0.9903035714285716, "calib/mu_c": 0.9903828125, "calib/mu_w": 0.9901979166666667, "calib/nonempty_final_conf_rate": 0.875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4188750000000002, "calib/std_conf": 0.0016920997401489812, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9237611806797854, "calib/step_q_c_n": 1118.0, "calib/step_q_gap": -0.03533567795895809, "calib/step_q_w": 0.9590968586387435, "calib/step_q_w_n": 2292.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 3057.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 789.234375, "completions/mean_terminated_length": 890.0616455078125, "completions/min_length": 0.0, "completions/min_terminated_length": 542.0, "epoch": 0.037333333333333336, "grad_norm": 0.00620591826736927, "kl": 0.0377349853515625, "learning_rate": 4.583333333333333e-06, "loss": -0.1299, "num_tokens": 11016613.0, "reward": 0.6767075061798096, "reward_std": 0.30802637338638306, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.50336092710495, "rewards/format_reward_step": 0.87109375, "rewards/step_l2_reward": 0.5758352279663086, "step": 35 }, { "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.5135869565217391, "calib/avg_num_step_conf": 12.32421875, "calib/ece": 0.2105508474576271, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00027173913043465614, "calib/mean_conf": 0.9902118644067797, "calib/mu_c": 0.9902717391304349, "calib/mu_w": 0.9900000000000002, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2105508474576271, "calib/std_conf": 0.0014400547006751224, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9302077484559237, "calib/step_q_c_n": 1781.0, "calib/step_q_gap": -0.02971219331991337, "calib/step_q_w": 0.9599199417758371, "calib/step_q_w_n": 1374.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2718.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 786.6640625, "completions/mean_terminated_length": 832.1735229492188, "completions/min_length": 0.0, "completions/min_terminated_length": 486.0, "epoch": 0.0384, "grad_norm": 0.00611078180372715, "kl": 0.04564666748046875, "learning_rate": 4.555555555555556e-06, "loss": -0.0689, "num_tokens": 11320711.0, "reward": 0.903594970703125, "reward_std": 0.24698996543884277, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7227222919464111, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7563426494598389, "step": 36 }, { "calib/answer_extract_rate": 0.859375, "calib/auroc": 0.5020491803278688, "calib/avg_num_step_conf": 13.91015625, "calib/ece": 0.5447136363636363, "calib/final_conf_rate": 0.859375, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.7935764469688706e-05, "calib/mean_conf": 0.9901681818181818, "calib/mu_c": 0.9901836734693877, "calib/mu_w": 0.990155737704918, "calib/nonempty_final_conf_rate": 0.859375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.5447136363636363, "calib/std_conf": 0.001562016867455304, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9325173501577287, "calib/step_q_c_n": 951.0, "calib/step_q_gap": -0.023144718807788567, "calib/step_q_w": 0.9556620689655173, "calib/step_q_w_n": 2610.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 3057.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 784.1953125, "completions/mean_terminated_length": 880.5, "completions/min_length": 0.0, "completions/min_terminated_length": 514.0, "epoch": 0.039466666666666664, "grad_norm": 0.018121197819709778, "kl": 0.07901382446289062, "learning_rate": 4.527777777777778e-06, "loss": -0.0981, "num_tokens": 11628561.0, "reward": 0.5711500644683838, "reward_std": 0.2837553024291992, "rewards/accuracy_reward_step": 0.3828125, "rewards/final_brier_reward_step": 0.3919546604156494, "rewards/format_reward_step": 0.8515625, "rewards/step_l2_reward": 0.5034705400466919, "step": 37 }, { "calib/answer_extract_rate": 0.86328125, "calib/auroc": 0.4744897959183674, "calib/avg_num_step_conf": 15.390625, "calib/ece": 0.43365158371040713, "calib/final_conf_rate": 0.86328125, "calib/format_rate": 0.859375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0004795918367347385, "calib/mean_conf": 0.9902126696832578, "calib/mu_c": 0.99, "calib/mu_w": 0.9904795918367347, "calib/nonempty_final_conf_rate": 0.86328125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43365158371040713, "calib/std_conf": 0.00139974872484343, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9298717948717949, "calib/step_q_c_n": 1170.0, "calib/step_q_gap": -0.03245275386466717, "calib/step_q_w": 0.9623245487364621, "calib/step_q_w_n": 2770.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 804.42578125, "completions/mean_terminated_length": 903.2149047851562, "completions/min_length": 0.0, "completions/min_terminated_length": 517.0, "epoch": 0.04053333333333333, "grad_norm": 0.006702395621687174, "kl": 0.04339599609375, "learning_rate": 4.5e-06, "loss": -0.121, "num_tokens": 11941382.0, "reward": 0.6503283977508545, "reward_std": 0.3093435764312744, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.48759567737579346, "rewards/format_reward_step": 0.859375, "rewards/step_l2_reward": 0.5450923442840576, "step": 38 }, { "calib/answer_extract_rate": 0.859375, "calib/auroc": 0.4895833333333333, "calib/avg_num_step_conf": 16.78125, "calib/ece": 0.4264495454545454, "calib/final_conf_rate": 0.859375, "calib/format_rate": 0.85546875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00019687500000009628, "calib/mean_conf": 0.990085909090909, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.990196875, "calib/nonempty_final_conf_rate": 0.859375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4264495454545454, "calib/std_conf": 0.000897942897004589, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9336143572621035, "calib/step_q_c_n": 1198.0, "calib/step_q_gap": -0.030313467140737127, "calib/step_q_w": 0.9639278244028406, "calib/step_q_w_n": 3098.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 787.33203125, "completions/mean_terminated_length": 899.80810546875, "completions/min_length": 0.0, "completions/min_terminated_length": 503.0, "epoch": 0.0416, "grad_norm": 0.006064797751605511, "kl": 0.04642486572265625, "learning_rate": 4.472222222222223e-06, "loss": -0.151, "num_tokens": 12249027.0, "reward": 0.6602429747581482, "reward_std": 0.30667102336883545, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.487736314535141, "rewards/format_reward_step": 0.85546875, "rewards/step_l2_reward": 0.5647808313369751, "step": 39 }, { "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.49993055073268977, "calib/avg_num_step_conf": 10.2734375, "calib/ece": 0.48591666666666666, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.388985346073568e-06, "calib/mean_conf": 0.9900833333333333, "calib/mu_c": 0.9900826446280991, "calib/mu_w": 0.9900840336134452, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.48591666666666666, "calib/std_conf": 0.0009090593428863104, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9249478672985783, "calib/step_q_c_n": 1055.0, "calib/step_q_gap": -0.016734672383961535, "calib/step_q_w": 0.9416825396825398, "calib/step_q_w_n": 1575.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2198.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 830.140625, "completions/mean_terminated_length": 881.8092041015625, "completions/min_length": 0.0, "completions/min_terminated_length": 539.0, "epoch": 0.042666666666666665, "grad_norm": 0.006083718501031399, "kl": 0.037357330322265625, "learning_rate": 4.444444444444444e-06, "loss": -0.0745, "num_tokens": 12568303.0, "reward": 0.6597652435302734, "reward_std": 0.2658485174179077, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.4817820191383362, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.5557171702384949, "step": 40 }, { "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.4937741312741312, "calib/avg_num_step_conf": 10.5703125, "calib/ece": 0.22244813278008302, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001245173745172412, "calib/mean_conf": 0.9900829875518673, "calib/mu_c": 0.9900540540540542, "calib/mu_w": 0.9901785714285715, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22244813278008302, "calib/std_conf": 0.0009071871829491895, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9267544396815676, "calib/step_q_c_n": 1633.0, "calib/step_q_gap": -0.028557769078917272, "calib/step_q_w": 0.9553122087604848, "calib/step_q_w_n": 1073.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2687.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 810.3046875, "completions/mean_terminated_length": 850.1557006835938, "completions/min_length": 0.0, "completions/min_terminated_length": 504.0, "epoch": 0.04373333333333333, "grad_norm": 0.0065657938830554485, "kl": 0.037689208984375, "learning_rate": 4.416666666666667e-06, "loss": -0.0424, "num_tokens": 12882989.0, "reward": 0.9010114669799805, "reward_std": 0.29251277446746826, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7268597483634949, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7423505783081055, "step": 41 }, { "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.49528301886792453, "calib/avg_num_step_conf": 9.12890625, "calib/ece": 0.4298713692946059, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -8.49056603774434e-05, "calib/mean_conf": 0.9900373443983403, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9900849056603773, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4298713692946059, "calib/std_conf": 0.0005785369313836811, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9261904761904762, "calib/step_q_c_n": 1155.0, "calib/step_q_gap": -0.006054870679236157, "calib/step_q_w": 0.9322453468697124, "calib/step_q_w_n": 1182.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 775.38671875, "completions/mean_terminated_length": 816.8682861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 471.0, "epoch": 0.0448, "grad_norm": 0.006504183169454336, "kl": 0.038684844970703125, "learning_rate": 4.388888888888889e-06, "loss": -0.0481, "num_tokens": 13185856.0, "reward": 0.7230309247970581, "reward_std": 0.2163492739200592, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5354609489440918, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.6168508529663086, "step": 42 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5117647058823529, "calib/avg_num_step_conf": 8.66796875, "calib/ece": 0.31285258964143436, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00021176470588246676, "calib/mean_conf": 0.9901434262948208, "calib/mu_c": 0.9902117647058825, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31285258964143436, "calib/std_conf": 0.001127060580155502, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9222005629838143, "calib/step_q_c_n": 1421.0, "calib/step_q_gap": -0.00909016383322847, "calib/step_q_w": 0.9312907268170427, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2842.0, "completions/max_terminated_length": 2842.0, "completions/mean_length": 857.0234375, "completions/mean_terminated_length": 870.6270141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 529.0, "epoch": 0.04586666666666667, "grad_norm": 0.006459683179855347, "kl": 0.035755157470703125, "learning_rate": 4.361111111111112e-06, "loss": -0.001, "num_tokens": 13510478.0, "reward": 0.8659541606903076, "reward_std": 0.21939551830291748, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6702163815498352, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7335669994354248, "step": 43 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49528301886792453, "calib/avg_num_step_conf": 8.02734375, "calib/ece": 0.41573895582329323, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -8.490566037755443e-05, "calib/mean_conf": 0.9900361445783133, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9900849056603773, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41573895582329323, "calib/std_conf": 0.0005692053884827821, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9229799666110183, "calib/step_q_c_n": 1198.0, "calib/step_q_gap": 0.004788601383480473, "calib/step_q_w": 0.9181913652275379, "calib/step_q_w_n": 857.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2285.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 892.921875, "completions/mean_terminated_length": 914.35205078125, "completions/min_length": 0.0, "completions/min_terminated_length": 537.0, "epoch": 0.046933333333333334, "grad_norm": 0.006249780301004648, "kl": 0.0361480712890625, "learning_rate": 4.333333333333334e-06, "loss": -0.0469, "num_tokens": 13845386.0, "reward": 0.7539075613021851, "reward_std": 0.22332961857318878, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5667077898979187, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.634857177734375, "step": 44 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.4968466835810076, "calib/avg_num_step_conf": 9.203125, "calib/ece": 0.3528073770491804, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9918032786885246, "calib/gap": 0.0037451975353388622, "calib/mean_conf": 0.9880532786885247, "calib/mu_c": 0.9894193548387098, "calib/mu_w": 0.9856741573033709, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3528073770491804, "calib/std_conf": 0.02555132310079111, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9200397140587767, "calib/step_q_c_n": 1259.0, "calib/step_q_gap": -0.011259283206492277, "calib/step_q_w": 0.931298997265269, "calib/step_q_w_n": 1097.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1978.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 842.14453125, "completions/mean_terminated_length": 879.955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 436.0, "epoch": 0.048, "grad_norm": 0.006733404938131571, "kl": 0.040283203125, "learning_rate": 4.305555555555556e-06, "loss": -0.0575, "num_tokens": 14166023.0, "reward": 0.7999862432479858, "reward_std": 0.21311859786510468, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6146713495254517, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6735823154449463, "step": 45 }, { "calib/answer_extract_rate": 0.89453125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.84375, "calib/ece": 0.45965217391304347, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.9956521739130435, "calib/gap": 0.008990825688073745, "calib/mean_conf": 0.9857391304347825, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9810091743119264, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45965217391304347, "calib/std_conf": 0.06514279792105178, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9217916260954235, "calib/step_q_c_n": 1027.0, "calib/step_q_gap": -0.0023959244300413296, "calib/step_q_w": 0.9241875505254649, "calib/step_q_w_n": 1237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 861.7578125, "completions/mean_terminated_length": 938.7659301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 499.0, "epoch": 0.04906666666666667, "grad_norm": 0.006302679888904095, "kl": 0.042236328125, "learning_rate": 4.277777777777778e-06, "loss": -0.0863, "num_tokens": 14491401.0, "reward": 0.6648539304733276, "reward_std": 0.2813977301120758, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.4846773147583008, "rewards/format_reward_step": 0.890625, "rewards/step_l2_reward": 0.5723742842674255, "step": 46 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.495, "calib/avg_num_step_conf": 7.890625, "calib/ece": 0.40155967078189303, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -9.000000000014552e-05, "calib/mean_conf": 0.990037037037037, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9900899999999999, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40155967078189303, "calib/std_conf": 0.0005761610809668171, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9216000000000001, "calib/step_q_c_n": 1175.0, "calib/step_q_gap": 0.0062106508875739985, "calib/step_q_w": 0.9153893491124261, "calib/step_q_w_n": 845.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 913.67578125, "completions/mean_terminated_length": 943.149169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 548.0, "epoch": 0.050133333333333335, "grad_norm": 0.006680261809378862, "kl": 0.044597625732421875, "learning_rate": 4.25e-06, "loss": -0.038, "num_tokens": 14831278.0, "reward": 0.775047242641449, "reward_std": 0.2337433397769928, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5662413835525513, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6822904944419861, "step": 47 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5078740157480315, "calib/avg_num_step_conf": 8.765625, "calib/ece": 0.4779717741935485, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00013385826771616927, "calib/mean_conf": 0.9900685483870969, "calib/mu_c": 0.9901338582677163, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4779717741935485, "calib/std_conf": 0.0007720800283904093, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9241362038664322, "calib/step_q_c_n": 1138.0, "calib/step_q_gap": 0.0015051007922911586, "calib/step_q_w": 0.9226311030741411, "calib/step_q_w_n": 1106.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 867.58203125, "completions/mean_terminated_length": 891.9718627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 447.0, "epoch": 0.0512, "grad_norm": 0.006924331653863192, "kl": 0.048736572265625, "learning_rate": 4.222222222222223e-06, "loss": -0.0387, "num_tokens": 15157067.0, "reward": 0.6999430656433105, "reward_std": 0.2746431231498718, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5054507255554199, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.6014666557312012, "step": 48 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.4765625, "calib/ece": 0.38591836734693874, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38591836734693874, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9225803921568627, "calib/step_q_c_n": 1275.0, "calib/step_q_gap": -0.0004587139883887037, "calib/step_q_w": 0.9230391061452514, "calib/step_q_w_n": 895.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 878.05078125, "completions/mean_terminated_length": 910.0445556640625, "completions/min_length": 0.0, "completions/min_terminated_length": 541.0, "epoch": 0.05226666666666667, "grad_norm": 0.006520978640764952, "kl": 0.0503082275390625, "learning_rate": 4.194444444444445e-06, "loss": -0.0512, "num_tokens": 15486384.0, "reward": 0.7783027291297913, "reward_std": 0.2613601088523865, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5855296850204468, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.6648257374763489, "step": 49 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.48828125, "calib/ece": 0.33262948207171317, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33262948207171317, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.921271551724138, "calib/step_q_c_n": 1392.0, "calib/step_q_gap": -0.002300791425669857, "calib/step_q_w": 0.9235723431498079, "calib/step_q_w_n": 781.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2690.0, "completions/max_terminated_length": 2690.0, "completions/mean_length": 871.23828125, "completions/mean_terminated_length": 888.5936279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 439.0, "epoch": 0.05333333333333334, "grad_norm": 0.0065833828411996365, "kl": 0.0535430908203125, "learning_rate": 4.166666666666667e-06, "loss": -0.0335, "num_tokens": 15814781.0, "reward": 0.8323620557785034, "reward_std": 0.23785735666751862, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6511518955230713, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6885721683502197, "step": 50 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.0234375, "calib/ece": 0.3669841269841271, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 2.220446049250313e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9899999999999997, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3669841269841271, "calib/std_conf": 1.1102230246251565e-16, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.919637904468413, "calib/step_q_c_n": 1298.0, "calib/step_q_gap": 0.005563830394338942, "calib/step_q_w": 0.914074074074074, "calib/step_q_w_n": 756.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 856.62890625, "completions/mean_terminated_length": 870.2262573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 522.0, "epoch": 0.0544, "grad_norm": 0.006755459122359753, "kl": 0.06745147705078125, "learning_rate": 4.138888888888889e-06, "loss": -0.0184, "num_tokens": 16143374.0, "reward": 0.8147128820419312, "reward_std": 0.24777701497077942, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.620526909828186, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6901488304138184, "step": 51 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5030120481927711, "calib/avg_num_step_conf": 7.671875, "calib/ece": 0.326036, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 5.421686747009158e-05, "calib/mean_conf": 0.990036, "calib/mu_c": 0.9900542168674699, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.326036, "calib/std_conf": 0.0005680704181701424, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9202721804511278, "calib/step_q_c_n": 1330.0, "calib/step_q_gap": 0.007653883921159288, "calib/step_q_w": 0.9126182965299685, "calib/step_q_w_n": 634.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 851.52734375, "completions/mean_terminated_length": 861.62451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 511.0, "epoch": 0.055466666666666664, "grad_norm": 0.007119378075003624, "kl": 0.048519134521484375, "learning_rate": 4.111111111111111e-06, "loss": 0.0232, "num_tokens": 16469317.0, "reward": 0.8668615221977234, "reward_std": 0.2582418918609619, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6549026966094971, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7530391216278076, "step": 52 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.49390243902439024, "calib/avg_num_step_conf": 7.72265625, "calib/ece": 0.3116039215686275, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00010975609756103122, "calib/mean_conf": 0.9900352941176471, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9901097560975609, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3116039215686275, "calib/std_conf": 0.0005624956747238556, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9183547169811322, "calib/step_q_c_n": 1325.0, "calib/step_q_gap": 6.94409075125968e-05, "calib/step_q_w": 0.9182852760736196, "calib/step_q_w_n": 652.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 875.05078125, "completions/mean_terminated_length": 878.482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 442.0, "epoch": 0.05653333333333333, "grad_norm": 0.008037308230996132, "kl": 0.0555572509765625, "learning_rate": 4.083333333333334e-06, "loss": -0.0018, "num_tokens": 16799154.0, "reward": 0.8707619905471802, "reward_std": 0.21929433941841125, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6780343651771545, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7306770086288452, "step": 53 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.91796875, "calib/ece": 0.25086956521739134, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9900000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25086956521739134, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9221901639344263, "calib/step_q_c_n": 1525.0, "calib/step_q_gap": 0.008285781464306696, "calib/step_q_w": 0.9139043824701196, "calib/step_q_w_n": 502.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 810.4140625, "completions/mean_terminated_length": 813.5922241210938, "completions/min_length": 0.0, "completions/min_terminated_length": 462.0, "epoch": 0.0576, "grad_norm": 0.006980069447308779, "kl": 0.05690765380859375, "learning_rate": 4.055555555555556e-06, "loss": -0.0126, "num_tokens": 17112852.0, "reward": 0.9220144152641296, "reward_std": 0.18860012292861938, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7316203117370605, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7694397568702698, "step": 54 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5203287197231834, "calib/avg_num_step_conf": 7.828125, "calib/ece": 0.4505450980392157, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9803921568627451, "calib/gap": 0.013275210084033873, "calib/mean_conf": 0.9838784313725489, "calib/mu_c": 0.9900735294117647, "calib/mu_w": 0.9767983193277309, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4505450980392157, "calib/std_conf": 0.06449882903246397, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9266344463971881, "calib/step_q_c_n": 1138.0, "calib/step_q_gap": 0.016680635773631547, "calib/step_q_w": 0.9099538106235565, "calib/step_q_w_n": 866.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2746.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 816.03515625, "completions/mean_terminated_length": 819.2353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.058666666666666666, "grad_norm": 0.007960349321365356, "kl": 0.056209564208984375, "learning_rate": 4.027777777777779e-06, "loss": -0.0151, "num_tokens": 17429581.0, "reward": 0.7558916211128235, "reward_std": 0.2477262318134308, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.54833984375, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.658755898475647, "step": 55 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5043859649122807, "calib/avg_num_step_conf": 7.6171875, "calib/ece": 0.4474698795180724, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.00078947368421034, "calib/mean_conf": 0.9896385542168675, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9892105263157895, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4474698795180724, "calib/std_conf": 0.005692053884827813, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9263611615245009, "calib/step_q_c_n": 1102.0, "calib/step_q_gap": 0.0115852181282744, "calib/step_q_w": 0.9147759433962265, "calib/step_q_w_n": 848.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 834.3125, "completions/mean_terminated_length": 847.5556030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 488.0, "epoch": 0.05973333333333333, "grad_norm": 0.006656978279352188, "kl": 0.050594329833984375, "learning_rate": 4.000000000000001e-06, "loss": 0.0016, "num_tokens": 17750005.0, "reward": 0.7507243156433105, "reward_std": 0.216395765542984, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.536817193031311, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6646314859390259, "step": 56 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 7.81640625, "calib/ece": 0.3445816733067729, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3445816733067729, "calib/std_conf": 0.0, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.929010989010989, "calib/step_q_c_n": 1365.0, "calib/step_q_gap": 0.013303441841177488, "calib/step_q_w": 0.9157075471698115, "calib/step_q_w_n": 636.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1889.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 796.5234375, "completions/mean_terminated_length": 809.1666870117188, "completions/min_length": 0.0, "completions/min_terminated_length": 506.0, "epoch": 0.0608, "grad_norm": 0.006946193054318428, "kl": 0.054744720458984375, "learning_rate": 3.972222222222223e-06, "loss": -0.0144, "num_tokens": 18060707.0, "reward": 0.8400901556015015, "reward_std": 0.26458460092544556, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6396675109863281, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7178565263748169, "step": 57 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5138888888888888, "calib/avg_num_step_conf": 7.59375, "calib/ece": 0.4167063492063493, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9880952380952381, "calib/gap": 0.00435185185185194, "calib/mean_conf": 0.9881349206349207, "calib/mu_c": 0.99, "calib/mu_w": 0.985648148148148, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4167063492063493, "calib/std_conf": 0.017753091916980233, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9240925589836662, "calib/step_q_c_n": 1102.0, "calib/step_q_gap": 0.0058146492449488996, "calib/step_q_w": 0.9182779097387173, "calib/step_q_w_n": 842.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 832.734375, "completions/mean_terminated_length": 845.9524536132812, "completions/min_length": 0.0, "completions/min_terminated_length": 423.0, "epoch": 0.06186666666666667, "grad_norm": 0.015161849558353424, "kl": 0.05941009521484375, "learning_rate": 3.944444444444445e-06, "loss": -0.0325, "num_tokens": 18380207.0, "reward": 0.7666366100311279, "reward_std": 0.2728399634361267, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5740827918052673, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6505966186523438, "step": 58 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.51953125, "calib/ece": 0.3923904382470119, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3923904382470119, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.927784760408484, "calib/step_q_c_n": 1273.0, "calib/step_q_gap": -0.0012460765959212416, "calib/step_q_w": 0.9290308370044053, "calib/step_q_w_n": 908.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1868.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 788.6171875, "completions/mean_terminated_length": 804.3267211914062, "completions/min_length": 0.0, "completions/min_terminated_length": 420.0, "epoch": 0.06293333333333333, "grad_norm": 0.006924844346940517, "kl": 0.0535430908203125, "learning_rate": 3.916666666666667e-06, "loss": -0.0334, "num_tokens": 18688341.0, "reward": 0.79398512840271, "reward_std": 0.28158038854599, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.5937300324440002, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6809588670730591, "step": 59 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.2265625, "calib/ece": 0.4445454545454546, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4445454545454546, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9294406779661017, "calib/step_q_c_n": 1180.0, "calib/step_q_gap": 0.006881282717721704, "calib/step_q_w": 0.92255939524838, "calib/step_q_w_n": 926.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 786.98828125, "completions/mean_terminated_length": 796.3201904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 505.0, "epoch": 0.064, "grad_norm": 0.006995832081884146, "kl": 0.05194854736328125, "learning_rate": 3.88888888888889e-06, "loss": -0.0219, "num_tokens": 18998666.0, "reward": 0.7526835203170776, "reward_std": 0.20571644604206085, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5479480028152466, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.6519501209259033, "step": 60 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4935064935064935, "calib/avg_num_step_conf": 8.74609375, "calib/ece": 0.29559523809523813, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.000129870129869869, "calib/mean_conf": 0.9900396825396826, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9901298701298697, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29559523809523813, "calib/std_conf": 0.0006286896634029713, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9310614886731392, "calib/step_q_c_n": 1545.0, "calib/step_q_gap": 0.0012776270016693347, "calib/step_q_w": 0.9297838616714699, "calib/step_q_w_n": 694.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 779.7734375, "completions/mean_terminated_length": 789.019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 446.0, "epoch": 0.06506666666666666, "grad_norm": 0.00675692455843091, "kl": 0.0540924072265625, "learning_rate": 3.861111111111112e-06, "loss": -0.0246, "num_tokens": 19302352.0, "reward": 0.8764443397521973, "reward_std": 0.1781359612941742, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.6894331574440002, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7298616766929626, "step": 61 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.62109375, "calib/ece": 0.4165612648221343, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": -0.0005517241379311422, "calib/mean_conf": 0.9896837944664031, "calib/mu_c": 0.9894482758620687, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4165612648221343, "calib/std_conf": 0.005684283706343185, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9326629123089302, "calib/step_q_c_n": 1243.0, "calib/step_q_gap": 0.0033060658359009842, "calib/step_q_w": 0.9293568464730292, "calib/step_q_w_n": 964.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 811.25390625, "completions/mean_terminated_length": 814.4353637695312, "completions/min_length": 0.0, "completions/min_terminated_length": 511.0, "epoch": 0.06613333333333334, "grad_norm": 0.006315034814178944, "kl": 0.0572509765625, "learning_rate": 3.833333333333334e-06, "loss": -0.0008, "num_tokens": 19617113.0, "reward": 0.7650270462036133, "reward_std": 0.26451075077056885, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5747065544128418, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.6444100141525269, "step": 62 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.49814471243042674, "calib/avg_num_step_conf": 8.21875, "calib/ece": 0.37896825396825395, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -3.710575139170835e-05, "calib/mean_conf": 0.9900793650793651, "calib/mu_c": 0.9900649350649349, "calib/mu_w": 0.9901020408163266, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.37896825396825395, "calib/std_conf": 0.0008873285624999172, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9344600591715977, "calib/step_q_c_n": 1352.0, "calib/step_q_gap": 0.007167505980108291, "calib/step_q_w": 0.9272925531914894, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 825.93359375, "completions/mean_terminated_length": 835.727294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 463.0, "epoch": 0.0672, "grad_norm": 0.005823509301990271, "kl": 0.061920166015625, "learning_rate": 3.8055555555555556e-06, "loss": -0.0249, "num_tokens": 19937192.0, "reward": 0.8004246950149536, "reward_std": 0.2571212351322174, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6011534929275513, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6848522424697876, "step": 63 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5066666666666666, "calib/avg_num_step_conf": 9.46875, "calib/ece": 0.28844621513944235, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.001200000000000312, "calib/mean_conf": 0.9896414342629484, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9887999999999998, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28844621513944235, "calib/std_conf": 0.005669422099903468, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9379964114832539, "calib/step_q_c_n": 1672.0, "calib/step_q_gap": 0.0023182199938921455, "calib/step_q_w": 0.9356781914893617, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2690.0, "completions/max_terminated_length": 2690.0, "completions/mean_length": 805.45703125, "completions/mean_terminated_length": 815.0079345703125, "completions/min_length": 0.0, "completions/min_terminated_length": 476.0, "epoch": 0.06826666666666667, "grad_norm": 0.006717866752296686, "kl": 0.0677490234375, "learning_rate": 3.777777777777778e-06, "loss": -0.024, "num_tokens": 20247165.0, "reward": 0.8886719346046448, "reward_std": 0.2663307189941406, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.6939257383346558, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7498241662979126, "step": 64 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5, "calib/avg_num_step_conf": 10.87890625, "calib/ece": 0.3482677165354331, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3482677165354331, "calib/std_conf": 0.0008873565094161146, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9431381466742468, "calib/step_q_c_n": 1759.0, "calib/step_q_gap": -0.0001562003043107385, "calib/step_q_w": 0.9432943469785575, "calib/step_q_w_n": 1026.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 783.87890625, "completions/mean_terminated_length": 786.9530029296875, "completions/min_length": 0.0, "completions/min_terminated_length": 453.0, "epoch": 0.06933333333333333, "grad_norm": 0.006531869061291218, "kl": 0.07587432861328125, "learning_rate": 3.7500000000000005e-06, "loss": -0.0152, "num_tokens": 20552862.0, "reward": 0.8248828053474426, "reward_std": 0.11450773477554321, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6437281370162964, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.6802562475204468, "step": 65 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5038461538461538, "calib/avg_num_step_conf": 11.0, "calib/ece": 0.4594285714285714, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 7.692307692319655e-05, "calib/mean_conf": 0.9900408163265306, "calib/mu_c": 0.9900769230769232, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4594285714285714, "calib/std_conf": 0.0006375714021148296, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9444337016574585, "calib/step_q_c_n": 1448.0, "calib/step_q_gap": -0.00038354980452970366, "calib/step_q_w": 0.9448172514619883, "calib/step_q_w_n": 1368.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2753.0, "completions/max_terminated_length": 2753.0, "completions/mean_length": 906.8125, "completions/mean_terminated_length": 932.30517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 605.0, "epoch": 0.0704, "grad_norm": 0.00575052946805954, "kl": 0.0717010498046875, "learning_rate": 3.7222222222222225e-06, "loss": -0.0435, "num_tokens": 20891358.0, "reward": 0.71103835105896, "reward_std": 0.2630349397659302, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5167015194892883, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6124064326286316, "step": 66 }, { "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.5, "calib/avg_num_step_conf": 14.45703125, "calib/ece": 0.3578619246861924, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.99581589958159, "calib/gap": 0.0009204545454541968, "calib/mean_conf": 0.9896610878661087, "calib/mu_c": 0.9899999999999997, "calib/mu_w": 0.9890795454545455, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3578619246861924, "calib/std_conf": 0.0058408273462339086, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9500792811839325, "calib/step_q_c_n": 1892.0, "calib/step_q_gap": -0.011177766908936682, "calib/step_q_w": 0.9612570480928692, "calib/step_q_w_n": 1809.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 880.5703125, "completions/mean_terminated_length": 931.5123291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 0.07146666666666666, "grad_norm": 0.004531295504420996, "kl": 0.08599090576171875, "learning_rate": 3.694444444444445e-06, "loss": -0.0804, "num_tokens": 21221792.0, "reward": 0.7681565880775452, "reward_std": 0.2734624147415161, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.597142219543457, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.6352647542953491, "step": 67 }, { "calib/answer_extract_rate": 0.890625, "calib/auroc": 0.5011929247223366, "calib/avg_num_step_conf": 17.26171875, "calib/ece": 0.3629342105263158, "calib/final_conf_rate": 0.890625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.397778691904829e-05, "calib/mean_conf": 0.9901271929824561, "calib/mu_c": 0.99013986013986, "calib/mu_w": 0.990105882352941, "calib/nonempty_final_conf_rate": 0.890625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3629342105263158, "calib/std_conf": 0.001102849983254654, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9531779016725798, "calib/step_q_c_n": 1973.0, "calib/step_q_gap": -0.013561673143446584, "calib/step_q_w": 0.9667395748160263, "calib/step_q_w_n": 2446.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 871.30078125, "completions/mean_terminated_length": 953.218017578125, "completions/min_length": 0.0, "completions/min_terminated_length": 581.0, "epoch": 0.07253333333333334, "grad_norm": 0.004538587294518948, "kl": 0.09210205078125, "learning_rate": 3.6666666666666666e-06, "loss": -0.1512, "num_tokens": 21548933.0, "reward": 0.7421213388442993, "reward_std": 0.2593652009963989, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5650761723518372, "rewards/format_reward_step": 0.890625, "rewards/step_l2_reward": 0.6293228268623352, "step": 68 }, { "calib/answer_extract_rate": 0.80859375, "calib/auroc": 0.5199183832102603, "calib/avg_num_step_conf": 22.0234375, "calib/ece": 0.39082125603864726, "calib/final_conf_rate": 0.80859375, "calib/format_rate": 0.8046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007636999611347894, "calib/mean_conf": 0.989855072463768, "calib/mu_c": 0.9901612903225805, "calib/mu_w": 0.9893975903614457, "calib/nonempty_final_conf_rate": 0.80859375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39082125603864726, "calib/std_conf": 0.0030261757996842198, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9506164383561644, "calib/step_q_c_n": 1606.0, "calib/step_q_gap": -0.019599335453359457, "calib/step_q_w": 0.9702157738095238, "calib/step_q_w_n": 4032.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.18359375, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 812.484375, "completions/mean_terminated_length": 995.1961059570312, "completions/min_length": 0.0, "completions/min_terminated_length": 633.0, "epoch": 0.0736, "grad_norm": 0.00564149022102356, "kl": 0.0941619873046875, "learning_rate": 3.638888888888889e-06, "loss": -0.2617, "num_tokens": 21861425.0, "reward": 0.6458406448364258, "reward_std": 0.35272741317749023, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.4910816252231598, "rewards/format_reward_step": 0.8046875, "rewards/step_l2_reward": 0.5427870154380798, "step": 69 }, { "calib/answer_extract_rate": 0.73046875, "calib/auroc": 0.5061728395061729, "calib/avg_num_step_conf": 26.5390625, "calib/ece": 0.4226737967914438, "calib/final_conf_rate": 0.73046875, "calib/format_rate": 0.73046875, "calib/frac_conf_gt_0.9": 0.9946524064171123, "calib/gap": 0.0011111111111111738, "calib/mean_conf": 0.9895187165775401, "calib/mu_c": 0.99, "calib/mu_w": 0.9888888888888888, "calib/nonempty_final_conf_rate": 0.73046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4226737967914438, "calib/std_conf": 0.006644801953800668, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.960535424553813, "calib/step_q_c_n": 1849.0, "calib/step_q_gap": -0.01294890304982721, "calib/step_q_w": 0.9734843276036402, "calib/step_q_w_n": 4945.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23828125, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 799.0703125, "completions/mean_terminated_length": 1049.035888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 607.0, "epoch": 0.07466666666666667, "grad_norm": 0.005061451345682144, "kl": 0.09099578857421875, "learning_rate": 3.6111111111111115e-06, "loss": -0.3298, "num_tokens": 22172979.0, "reward": 0.5672096014022827, "reward_std": 0.36951565742492676, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.420981228351593, "rewards/format_reward_step": 0.73046875, "rewards/step_l2_reward": 0.4845317602157593, "step": 70 }, { "calib/answer_extract_rate": 0.83203125, "calib/auroc": 0.505050505050505, "calib/avg_num_step_conf": 23.44921875, "calib/ece": 0.45422065727699523, "calib/final_conf_rate": 0.83203125, "calib/format_rate": 0.828125, "calib/frac_conf_gt_0.9": 0.9953051643192489, "calib/gap": 0.0012222222222222356, "calib/mean_conf": 0.989431924882629, "calib/mu_c": 0.99, "calib/mu_w": 0.9887777777777778, "calib/nonempty_final_conf_rate": 0.83203125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45422065727699523, "calib/std_conf": 0.006821762645431273, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9538452309538094, "calib/step_q_c_n": 1667.0, "calib/step_q_gap": -0.011642315171651929, "calib/step_q_w": 0.9654875461254613, "calib/step_q_w_n": 4336.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16015625, "completions/max_length": 2909.0, "completions/max_terminated_length": 2909.0, "completions/mean_length": 844.984375, "completions/mean_terminated_length": 1006.1209106445312, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.07573333333333333, "grad_norm": 0.005491245537996292, "kl": 0.1047210693359375, "learning_rate": 3.5833333333333335e-06, "loss": -0.1971, "num_tokens": 22493703.0, "reward": 0.6107375621795654, "reward_std": 0.42661625146865845, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.44995468854904175, "rewards/format_reward_step": 0.828125, "rewards/step_l2_reward": 0.5168329477310181, "step": 71 }, { "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.49159663865546216, "calib/avg_num_step_conf": 18.8046875, "calib/ece": 0.49646382978723397, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00015966386554600032, "calib/mean_conf": 0.9900808510638297, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9901596638655459, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.49646382978723397, "calib/std_conf": 0.0008738855024744603, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9564070891514501, "calib/step_q_c_n": 1862.0, "calib/step_q_gap": -0.009698263152073006, "calib/step_q_w": 0.9661053523035231, "calib/step_q_w_n": 2952.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2156.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 863.10546875, "completions/mean_terminated_length": 932.299560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 671.0, "epoch": 0.0768, "grad_norm": 0.004759664181619883, "kl": 0.114837646484375, "learning_rate": 3.555555555555556e-06, "loss": -0.1401, "num_tokens": 22819066.0, "reward": 0.6763616800308228, "reward_std": 0.28232789039611816, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.4621824026107788, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.6163221597671509, "step": 72 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.48417207792207795, "calib/avg_num_step_conf": 14.89453125, "calib/ece": 0.2747560975609755, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00031493506493485146, "calib/mean_conf": 0.9902032520325202, "calib/mu_c": 0.9901136363636364, "calib/mu_w": 0.9904285714285712, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2747560975609755, "calib/std_conf": 0.0016745796686216258, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9499241466498104, "calib/step_q_c_n": 2373.0, "calib/step_q_gap": -0.010520297794634148, "calib/step_q_w": 0.9604444444444445, "calib/step_q_w_n": 1440.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1931.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 886.0625, "completions/mean_terminated_length": 910.9718627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 550.0, "epoch": 0.07786666666666667, "grad_norm": 0.004779706243425608, "kl": 0.1116943359375, "learning_rate": 3.5277777777777784e-06, "loss": -0.0236, "num_tokens": 23152930.0, "reward": 0.8753884434700012, "reward_std": 0.3158838748931885, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.6926394104957581, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7276686429977417, "step": 73 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.49664249261348375, "calib/avg_num_step_conf": 13.62109375, "calib/ece": 0.3999596774193548, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9838709677419355, "calib/gap": 0.0004042438893367972, "calib/mean_conf": 0.9886693548387097, "calib/mu_c": 0.9888356164383562, "calib/mu_w": 0.9884313725490194, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3999596774193548, "calib/std_conf": 0.011405447039218237, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9459709241952233, "calib/step_q_c_n": 1926.0, "calib/step_q_gap": 0.004125953022897755, "calib/step_q_w": 0.9418449711723256, "calib/step_q_w_n": 1561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2496.0, "completions/max_terminated_length": 2496.0, "completions/mean_length": 872.28125, "completions/mean_terminated_length": 896.8031616210938, "completions/min_length": 0.0, "completions/min_terminated_length": 545.0, "epoch": 0.07893333333333333, "grad_norm": 0.005274313036352396, "kl": 0.1123809814453125, "learning_rate": 3.5e-06, "loss": -0.0221, "num_tokens": 23480162.0, "reward": 0.7923901081085205, "reward_std": 0.24717894196510315, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5792031288146973, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6985458135604858, "step": 74 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5061356537260152, "calib/avg_num_step_conf": 13.1484375, "calib/ece": 0.3163832658569502, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9878542510121457, "calib/gap": 0.009314294213892738, "calib/mean_conf": 0.9830499325236167, "calib/mu_c": 0.9861044176706827, "calib/mu_w": 0.97679012345679, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3136842105263159, "calib/std_conf": 0.07549244973758543, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9461661341853036, "calib/step_q_c_n": 2191.0, "calib/step_q_gap": 0.0032725171640271, "calib/step_q_w": 0.9428936170212765, "calib/step_q_w_n": 1175.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 826.328125, "completions/mean_terminated_length": 849.5582275390625, "completions/min_length": 0.0, "completions/min_terminated_length": 522.0, "epoch": 0.08, "grad_norm": 0.004727415274828672, "kl": 0.1128387451171875, "learning_rate": 3.4722222222222224e-06, "loss": -0.0462, "num_tokens": 23796454.0, "reward": 0.8472334146499634, "reward_std": 0.25355038046836853, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6573490500450134, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7136803269386292, "step": 75 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4852607709750566, "calib/avg_num_step_conf": 11.3203125, "calib/ece": 0.32279761904761917, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": -0.0011607142857139596, "calib/mean_conf": 0.9894642857142858, "calib/mu_c": 0.9890773809523811, "calib/mu_w": 0.990238095238095, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32279761904761917, "calib/std_conf": 0.006770816980618662, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.941431401684002, "calib/step_q_c_n": 2019.0, "calib/step_q_gap": 0.00519704445988356, "calib/step_q_w": 0.9362343572241184, "calib/step_q_w_n": 879.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 870.45703125, "completions/mean_terminated_length": 877.31103515625, "completions/min_length": 0.0, "completions/min_terminated_length": 587.0, "epoch": 0.08106666666666666, "grad_norm": 0.005026756785809994, "kl": 0.1142425537109375, "learning_rate": 3.444444444444445e-06, "loss": 0.0037, "num_tokens": 24122347.0, "reward": 0.8486015796661377, "reward_std": 0.19657891988754272, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6625018119812012, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7065763473510742, "step": 76 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5090935544885785, "calib/avg_num_step_conf": 10.78515625, "calib/ece": 0.3018181818181819, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": 0.00010403026334959709, "calib/mean_conf": 0.9895652173913044, "calib/mu_c": 0.9895977011494252, "calib/mu_w": 0.9894936708860756, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3018181818181819, "calib/std_conf": 0.006240304388872749, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.9381355068772287, "calib/step_q_c_n": 1963.0, "calib/step_q_gap": 0.008110444220587154, "calib/step_q_w": 0.9300250626566415, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 866.44140625, "completions/mean_terminated_length": 869.8392944335938, "completions/min_length": 0.0, "completions/min_terminated_length": 467.0, "epoch": 0.08213333333333334, "grad_norm": 0.005225683096796274, "kl": 0.09732818603515625, "learning_rate": 3.416666666666667e-06, "loss": 0.025, "num_tokens": 24448820.0, "reward": 0.8727861046791077, "reward_std": 0.287240207195282, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6780972480773926, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7370061874389648, "step": 77 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4882682724252492, "calib/avg_num_step_conf": 9.40234375, "calib/ece": 0.3271653543307087, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": -0.0007364341085266402, "calib/mean_conf": 0.9885826771653544, "calib/mu_c": 0.9883333333333335, "calib/mu_w": 0.9890697674418601, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3271653543307087, "calib/std_conf": 0.011240002343207283, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9324408468244084, "calib/step_q_c_n": 1606.0, "calib/step_q_gap": 0.004425865551000108, "calib/step_q_w": 0.9280149812734083, "calib/step_q_w_n": 801.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 838.62109375, "completions/mean_terminated_length": 841.9098510742188, "completions/min_length": 0.0, "completions/min_terminated_length": 541.0, "epoch": 0.0832, "grad_norm": 0.005123285576701164, "kl": 0.1076202392578125, "learning_rate": 3.3888888888888893e-06, "loss": -0.0214, "num_tokens": 24771531.0, "reward": 0.8536770343780518, "reward_std": 0.20458891987800598, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6632612943649292, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.715186357498169, "step": 78 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4995257452574525, "calib/avg_num_step_conf": 9.51953125, "calib/ece": 0.343267716535433, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.0016233062330622827, "calib/mean_conf": 0.9889370078740157, "calib/mu_c": 0.9895121951219511, "calib/mu_w": 0.9878888888888888, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.343267716535433, "calib/std_conf": 0.01242504964834014, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9310630630630632, "calib/step_q_c_n": 1665.0, "calib/step_q_gap": 0.009158917985342963, "calib/step_q_w": 0.9219041450777202, "calib/step_q_w_n": 772.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 812.6875, "completions/mean_terminated_length": 819.0866088867188, "completions/min_length": 0.0, "completions/min_terminated_length": 500.0, "epoch": 0.08426666666666667, "grad_norm": 0.012989317066967487, "kl": 0.1119384765625, "learning_rate": 3.3611111111111117e-06, "loss": 0.0019, "num_tokens": 25085955.0, "reward": 0.8487727642059326, "reward_std": 0.17182764410972595, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6488659977912903, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7221168279647827, "step": 79 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5034555507056598, "calib/avg_num_step_conf": 9.63671875, "calib/ece": 0.3015415019762845, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.0006219991270192127, "calib/mean_conf": 0.989288537549407, "calib/mu_c": 0.9894827586206896, "calib/mu_w": 0.9888607594936704, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3015415019762845, "calib/std_conf": 0.00801972291097068, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9293405963302752, "calib/step_q_c_n": 1744.0, "calib/step_q_gap": 0.005384856357937728, "calib/step_q_w": 0.9239557399723375, "calib/step_q_w_n": 723.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 762.69140625, "completions/mean_terminated_length": 768.6968383789062, "completions/min_length": 0.0, "completions/min_terminated_length": 477.0, "epoch": 0.08533333333333333, "grad_norm": 0.005170705262571573, "kl": 0.1083984375, "learning_rate": 3.3333333333333333e-06, "loss": -0.0104, "num_tokens": 25383364.0, "reward": 0.8736566305160522, "reward_std": 0.22426272928714752, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6863855123519897, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7273339033126831, "step": 80 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5181405895691611, "calib/avg_num_step_conf": 8.78515625, "calib/ece": 0.32202380952380966, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.0033928571428575305, "calib/mean_conf": 0.9886904761904762, "calib/mu_c": 0.9898214285714287, "calib/mu_w": 0.9864285714285712, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32202380952380966, "calib/std_conf": 0.010959495618763079, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9301127819548872, "calib/step_q_c_n": 1596.0, "calib/step_q_gap": 0.01467633478796515, "calib/step_q_w": 0.9154364471669221, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1724.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 774.3515625, "completions/mean_terminated_length": 780.4487915039062, "completions/min_length": 0.0, "completions/min_terminated_length": 437.0, "epoch": 0.0864, "grad_norm": 0.005284500773996115, "kl": 0.1020355224609375, "learning_rate": 3.3055555555555558e-06, "loss": 0.0019, "num_tokens": 25687846.0, "reward": 0.8641499280929565, "reward_std": 0.2041071206331253, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6649121046066284, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7352626323699951, "step": 81 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.505049504950495, "calib/avg_num_step_conf": 8.4375, "calib/ece": 0.3915139442231076, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.0011841584158415186, "calib/mean_conf": 0.9891235059760957, "calib/mu_c": 0.9895999999999998, "calib/mu_w": 0.9884158415841583, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3915139442231076, "calib/std_conf": 0.0069738188006505395, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9257049891540132, "calib/step_q_c_n": 1383.0, "calib/step_q_gap": 0.015833689282713337, "calib/step_q_w": 0.9098712998712999, "calib/step_q_w_n": 777.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 731.98828125, "completions/mean_terminated_length": 743.607177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 455.0, "epoch": 0.08746666666666666, "grad_norm": 0.0052449824288487434, "kl": 0.1048736572265625, "learning_rate": 3.277777777777778e-06, "loss": -0.0292, "num_tokens": 25980787.0, "reward": 0.7984567880630493, "reward_std": 0.25271064043045044, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.594914436340332, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6887179613113403, "step": 82 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5063323201621075, "calib/avg_num_step_conf": 7.796875, "calib/ece": 0.42782213438735195, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.008908371327254083, "calib/mean_conf": 0.9851343873517787, "calib/mu_c": 0.989078014184397, "calib/mu_w": 0.9801696428571429, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42782213438735195, "calib/std_conf": 0.06251296879930535, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9238847736625514, "calib/step_q_c_n": 1215.0, "calib/step_q_gap": 0.011024338323242677, "calib/step_q_w": 0.9128604353393087, "calib/step_q_w_n": 781.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2128.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 783.79296875, "completions/mean_terminated_length": 789.9645385742188, "completions/min_length": 0.0, "completions/min_terminated_length": 458.0, "epoch": 0.08853333333333334, "grad_norm": 0.004777262452989817, "kl": 0.09439849853515625, "learning_rate": 3.2500000000000002e-06, "loss": -0.0205, "num_tokens": 26288702.0, "reward": 0.7667571306228638, "reward_std": 0.1584586352109909, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5640523433685303, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.6616493463516235, "step": 83 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5064144736842104, "calib/avg_num_step_conf": 8.28125, "calib/ece": 0.36104575163398694, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9921568627450981, "calib/gap": 0.00638267543859683, "calib/mean_conf": 0.9832679738562092, "calib/mu_c": 0.9856458333333332, "calib/mu_w": 0.9792631578947364, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35843137254901963, "calib/std_conf": 0.07412904662712728, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9221022727272729, "calib/step_q_c_n": 1408.0, "calib/step_q_gap": 0.008970250255362577, "calib/step_q_w": 0.9131320224719103, "calib/step_q_w_n": 712.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 716.78125, "completions/mean_terminated_length": 719.5922241210938, "completions/min_length": 0.0, "completions/min_terminated_length": 469.0, "epoch": 0.0896, "grad_norm": 0.005447674542665482, "kl": 0.10772705078125, "learning_rate": 3.2222222222222227e-06, "loss": -0.0154, "num_tokens": 26578118.0, "reward": 0.8506171107292175, "reward_std": 0.21902960538864136, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6346380710601807, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7423772811889648, "step": 84 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5159602302459445, "calib/avg_num_step_conf": 7.7890625, "calib/ece": 0.403386454183267, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920318725099602, "calib/gap": 0.002143511250653818, "calib/mean_conf": 0.9890438247011953, "calib/mu_c": 0.9899319727891155, "calib/mu_w": 0.9877884615384617, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.403386454183267, "calib/std_conf": 0.0083189693298092, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9260125588697019, "calib/step_q_c_n": 1274.0, "calib/step_q_gap": 0.020234781091924003, "calib/step_q_w": 0.9057777777777779, "calib/step_q_w_n": 720.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 763.03515625, "completions/mean_terminated_length": 775.1468505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 452.0, "epoch": 0.09066666666666667, "grad_norm": 0.004860911518335342, "kl": 0.1051177978515625, "learning_rate": 3.1944444444444443e-06, "loss": -0.0159, "num_tokens": 26881279.0, "reward": 0.7905886173248291, "reward_std": 0.21564725041389465, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.5839550495147705, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6862846612930298, "step": 85 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5000319284802044, "calib/avg_num_step_conf": 8.0546875, "calib/ece": 0.4165612648221343, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": 0.0007407407407409305, "calib/mean_conf": 0.9896837944664031, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9892592592592592, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4165612648221343, "calib/std_conf": 0.005753398568515022, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9202557755775579, "calib/step_q_c_n": 1212.0, "calib/step_q_gap": 0.008785187342263745, "calib/step_q_w": 0.9114705882352941, "calib/step_q_w_n": 850.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1894.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 766.453125, "completions/mean_terminated_length": 772.4881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.09173333333333333, "grad_norm": 0.004800785332918167, "kl": 0.1052093505859375, "learning_rate": 3.1666666666666667e-06, "loss": -0.0242, "num_tokens": 27183003.0, "reward": 0.7796474695205688, "reward_std": 0.21087849140167236, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5753308534622192, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.6730266809463501, "step": 86 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5027027027027028, "calib/avg_num_step_conf": 7.91015625, "calib/ece": 0.24003643724696355, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9919028340080972, "calib/gap": 0.0006353966870092931, "calib/mean_conf": 0.9890242914979757, "calib/mu_c": 0.9891837837837838, "calib/mu_w": 0.9885483870967745, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24003643724696355, "calib/std_conf": 0.00866512724465377, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9225045811518324, "calib/step_q_c_n": 1528.0, "calib/step_q_gap": 0.00765548658442794, "calib/step_q_w": 0.9148490945674045, "calib/step_q_w_n": 497.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 724.14453125, "completions/mean_terminated_length": 747.5040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 360.0, "epoch": 0.0928, "grad_norm": 0.005489662755280733, "kl": 0.115203857421875, "learning_rate": 3.138888888888889e-06, "loss": -0.0351, "num_tokens": 27473880.0, "reward": 0.919964075088501, "reward_std": 0.21015943586826324, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7202027440071106, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7837879657745361, "step": 87 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5306094654409261, "calib/avg_num_step_conf": 7.78125, "calib/ece": 0.33861811023622057, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": 0.003856656452161933, "calib/mean_conf": 0.988224409448819, "calib/mu_c": 0.9895757575757574, "calib/mu_w": 0.9857191011235955, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33861811023622057, "calib/std_conf": 0.011785519050806314, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9261018181818182, "calib/step_q_c_n": 1375.0, "calib/step_q_gap": 0.01381008398408723, "calib/step_q_w": 0.912291734197731, "calib/step_q_w_n": 617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1623.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 782.9765625, "completions/mean_terminated_length": 789.1417236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 415.0, "epoch": 0.09386666666666667, "grad_norm": 0.004990643355995417, "kl": 0.1031494140625, "learning_rate": 3.1111111111111116e-06, "loss": -0.0136, "num_tokens": 27784170.0, "reward": 0.861653208732605, "reward_std": 0.20241209864616394, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6541855335235596, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7417771816253662, "step": 88 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5184742587688134, "calib/avg_num_step_conf": 8.140625, "calib/ece": 0.44112000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": 0.0014624378270138294, "calib/mean_conf": 0.9891200000000001, "calib/mu_c": 0.9897810218978104, "calib/mu_w": 0.9883185840707965, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44112000000000007, "calib/std_conf": 0.006872088474401363, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.927760834670947, "calib/step_q_c_n": 1246.0, "calib/step_q_gap": 0.01241477261844115, "calib/step_q_w": 0.9153460620525059, "calib/step_q_w_n": 838.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2848.0, "completions/max_terminated_length": 2848.0, "completions/mean_length": 811.27734375, "completions/mean_terminated_length": 824.15478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.09493333333333333, "grad_norm": 0.005238137673586607, "kl": 0.1026458740234375, "learning_rate": 3.0833333333333336e-06, "loss": -0.0025, "num_tokens": 28100745.0, "reward": 0.760799765586853, "reward_std": 0.1826324462890625, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5453069806098938, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6739487648010254, "step": 89 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5133941713276421, "calib/avg_num_step_conf": 8.58203125, "calib/ece": 0.30326693227091645, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9880478087649402, "calib/gap": 0.0037599352369739325, "calib/mean_conf": 0.9885258964143426, "calib/mu_c": 0.9897093023255812, "calib/mu_w": 0.9859493670886073, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30326693227091645, "calib/std_conf": 0.012166602722453094, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9313706207744316, "calib/step_q_c_n": 1627.0, "calib/step_q_gap": 0.017265357616536914, "calib/step_q_w": 0.9141052631578946, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 774.16796875, "completions/mean_terminated_length": 783.3478393554688, "completions/min_length": 0.0, "completions/min_terminated_length": 419.0, "epoch": 0.096, "grad_norm": 0.004702141508460045, "kl": 0.104705810546875, "learning_rate": 3.055555555555556e-06, "loss": -0.01, "num_tokens": 28402252.0, "reward": 0.8856731653213501, "reward_std": 0.2053285539150238, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.68027263879776, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.760604739189148, "step": 90 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5, "calib/avg_num_step_conf": 8.91796875, "calib/ece": 0.3154117647058823, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00011627906976741098, "calib/mean_conf": 0.989921568627451, "calib/mu_c": 0.9898837209302325, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3154117647058823, "calib/std_conf": 0.0021678862684055317, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9328927680798005, "calib/step_q_c_n": 1604.0, "calib/step_q_gap": 0.010138718006162728, "calib/step_q_w": 0.9227540500736378, "calib/step_q_w_n": 679.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 821.66015625, "completions/mean_terminated_length": 821.66015625, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.09706666666666666, "grad_norm": 0.004904957953840494, "kl": 0.11090087890625, "learning_rate": 3.0277777777777776e-06, "loss": 0.0295, "num_tokens": 28720309.0, "reward": 0.8835835456848145, "reward_std": 0.2481040209531784, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6782535314559937, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7553197741508484, "step": 91 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5059171597633136, "calib/avg_num_step_conf": 9.84375, "calib/ece": 0.3166932270916335, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3166932270916335, "calib/std_conf": 0.002186521551237013, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9379087232813402, "calib/step_q_c_n": 1731.0, "calib/step_q_gap": 0.007122918465117145, "calib/step_q_w": 0.9307858048162231, "calib/step_q_w_n": 789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 779.34765625, "completions/mean_terminated_length": 791.7183227539062, "completions/min_length": 0.0, "completions/min_terminated_length": 502.0, "epoch": 0.09813333333333334, "grad_norm": 0.004945567809045315, "kl": 0.119110107421875, "learning_rate": 3e-06, "loss": -0.0093, "num_tokens": 29026542.0, "reward": 0.8768991231918335, "reward_std": 0.20641082525253296, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.6625539064407349, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.763900637626648, "step": 92 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5044438459466739, "calib/avg_num_step_conf": 9.76953125, "calib/ece": 0.3598148148148149, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.007563515575904622, "calib/mean_conf": 0.9867989417989419, "calib/mu_c": 0.9896202531645568, "calib/mu_w": 0.9820567375886522, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3598148148148149, "calib/std_conf": 0.0417251458271229, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9367091681845781, "calib/step_q_c_n": 1647.0, "calib/step_q_gap": 0.007727903547575687, "calib/step_q_w": 0.9289812646370024, "calib/step_q_w_n": 854.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2542.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 814.578125, "completions/mean_terminated_length": 824.2371826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 475.0, "epoch": 0.0992, "grad_norm": 0.005267489701509476, "kl": 0.107086181640625, "learning_rate": 2.9722222222222225e-06, "loss": 0.0001, "num_tokens": 29340850.0, "reward": 0.8196635842323303, "reward_std": 0.29242652654647827, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.628401517868042, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6913944482803345, "step": 93 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5263157894736843, "calib/avg_num_step_conf": 9.03515625, "calib/ece": 0.3673306772908368, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.00305263157894764, "calib/mean_conf": 0.9888446215139441, "calib/mu_c": 0.99, "calib/mu_w": 0.9869473684210524, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3673306772908368, "calib/std_conf": 0.012491421795866407, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9362795275590551, "calib/step_q_c_n": 1524.0, "calib/step_q_gap": 0.0105000598784466, "calib/step_q_w": 0.9257794676806085, "calib/step_q_w_n": 789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2863.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 835.72265625, "completions/mean_terminated_length": 839.0000610351562, "completions/min_length": 0.0, "completions/min_terminated_length": 547.0, "epoch": 0.10026666666666667, "grad_norm": 0.004830800928175449, "kl": 0.10498046875, "learning_rate": 2.944444444444445e-06, "loss": 0.0379, "num_tokens": 29663475.0, "reward": 0.8131207227706909, "reward_std": 0.24699178338050842, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6187874674797058, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6894851326942444, "step": 94 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5205969937606352, "calib/avg_num_step_conf": 10.1796875, "calib/ece": 0.3141955380577427, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9881889763779528, "calib/gap": -0.0008797976933258012, "calib/mean_conf": 0.9861115485564305, "calib/mu_c": 0.9858275193798449, "calib/mu_w": 0.9867073170731707, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31157086614173224, "calib/std_conf": 0.04326495996495406, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9403603174603175, "calib/step_q_c_n": 1890.0, "calib/step_q_gap": 0.014592161035736462, "calib/step_q_w": 0.925768156424581, "calib/step_q_w_n": 716.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 814.76171875, "completions/mean_terminated_length": 817.9569091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 392.0, "epoch": 0.10133333333333333, "grad_norm": 0.004907902330160141, "kl": 0.113800048828125, "learning_rate": 2.916666666666667e-06, "loss": -0.0125, "num_tokens": 29978182.0, "reward": 0.8689944744110107, "reward_std": 0.21350151300430298, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6782642602920532, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7269122004508972, "step": 95 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5010555946516538, "calib/avg_num_step_conf": 10.89453125, "calib/ece": 0.2180314960629921, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00026213933849383864, "calib/mean_conf": 0.9896850393700787, "calib/mu_c": 0.9897448979591837, "calib/mu_w": 0.9894827586206899, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2180314960629921, "calib/std_conf": 0.0027883356235631164, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9454087494476359, "calib/step_q_c_n": 2263.0, "calib/step_q_gap": 0.015579852109232739, "calib/step_q_w": 0.9298288973384031, "calib/step_q_w_n": 526.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 788.76171875, "completions/mean_terminated_length": 791.8549194335938, "completions/min_length": 0.0, "completions/min_terminated_length": 507.0, "epoch": 0.1024, "grad_norm": 0.005191153381019831, "kl": 0.12872314453125, "learning_rate": 2.888888888888889e-06, "loss": -0.02, "num_tokens": 30285921.0, "reward": 0.9609732031822205, "reward_std": 0.18121843039989471, "rewards/accuracy_reward_step": 0.765625, "rewards/final_brier_reward_step": 0.766371488571167, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8047937154769897, "step": 96 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5103092783505154, "calib/avg_num_step_conf": 10.6171875, "calib/ece": 0.3804032258064517, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9919354838709677, "calib/gap": 0.0018556701030921774, "calib/mean_conf": 0.9892741935483872, "calib/mu_c": 0.9899999999999997, "calib/mu_w": 0.9881443298969075, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3804032258064517, "calib/std_conf": 0.00804958294820038, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9442, "calib/step_q_c_n": 1700.0, "calib/step_q_gap": 0.009190176817288731, "calib/step_q_w": 0.9350098231827113, "calib/step_q_w_n": 1018.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2471.0, "completions/max_terminated_length": 2471.0, "completions/mean_length": 838.90625, "completions/mean_terminated_length": 848.853759765625, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.10346666666666667, "grad_norm": 0.1059708297252655, "kl": 0.172393798828125, "learning_rate": 2.861111111111111e-06, "loss": -0.0033, "num_tokens": 30605753.0, "reward": 0.7861987352371216, "reward_std": 0.2624107599258423, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.5986539125442505, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.662024736404419, "step": 97 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5114182692307693, "calib/avg_num_step_conf": 10.76953125, "calib/ece": 0.3707896825396825, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004102564102561601, "calib/mean_conf": 0.9898373015873015, "calib/mu_c": 0.9899935897435895, "calib/mu_w": 0.9895833333333334, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3707896825396825, "calib/std_conf": 0.003435534064140613, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9448157602663707, "calib/step_q_c_n": 1802.0, "calib/step_q_gap": 0.005758168643334005, "calib/step_q_w": 0.9390575916230367, "calib/step_q_w_n": 955.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2872.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 877.87109375, "completions/mean_terminated_length": 881.3137817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.10453333333333334, "grad_norm": 0.0050354162231087685, "kl": 0.1168670654296875, "learning_rate": 2.8333333333333335e-06, "loss": 0.0023, "num_tokens": 30936672.0, "reward": 0.8122867345809937, "reward_std": 0.24614278972148895, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6166933178901672, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6899112462997437, "step": 98 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5104166666666667, "calib/avg_num_step_conf": 11.515625, "calib/ece": 0.5651200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": 0.0015277777777777946, "calib/mean_conf": 0.9891200000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.9884722222222222, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5651200000000001, "calib/std_conf": 0.008391996186843746, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.947223459539718, "calib/step_q_c_n": 1347.0, "calib/step_q_gap": 0.011442697515982703, "calib/step_q_w": 0.9357807620237353, "calib/step_q_w_n": 1601.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 893.859375, "completions/mean_terminated_length": 908.0476684570312, "completions/min_length": 0.0, "completions/min_terminated_length": 563.0, "epoch": 0.1056, "grad_norm": 0.0054698484018445015, "kl": 0.101318359375, "learning_rate": 2.805555555555556e-06, "loss": -0.0197, "num_tokens": 31271300.0, "reward": 0.6134436726570129, "reward_std": 0.32211142778396606, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.42684686183929443, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.5219154953956604, "step": 99 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5085255066387142, "calib/avg_num_step_conf": 11.02734375, "calib/ece": 0.3512329317269077, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001419287211741338, "calib/mean_conf": 0.9897871485943776, "calib/mu_c": 0.9897358490566036, "calib/mu_w": 0.9898777777777777, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3512329317269077, "calib/std_conf": 0.004122481646010663, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9469012023000523, "calib/step_q_c_n": 1913.0, "calib/step_q_gap": 0.010049553948403789, "calib/step_q_w": 0.9368516483516485, "calib/step_q_w_n": 910.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 911.13671875, "completions/mean_terminated_length": 914.7098388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 579.0, "epoch": 0.10666666666666667, "grad_norm": 0.00461991410702467, "kl": 0.1067962646484375, "learning_rate": 2.7777777777777783e-06, "loss": 0.0178, "num_tokens": 31611959.0, "reward": 0.8233909606933594, "reward_std": 0.24361805617809296, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6280152201652527, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7000167369842529, "step": 100 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5078730158730159, "calib/avg_num_step_conf": 11.29296875, "calib/ece": 0.4919123505976095, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00015873015873013596, "calib/mean_conf": 0.9899203187250996, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9898412698412697, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4919123505976095, "calib/std_conf": 0.0015440495916395314, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9490190915075707, "calib/step_q_c_n": 1519.0, "calib/step_q_gap": 0.00790393115771626, "calib/step_q_w": 0.9411151603498544, "calib/step_q_w_n": 1372.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1807.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 920.953125, "completions/mean_terminated_length": 931.87353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 614.0, "epoch": 0.10773333333333333, "grad_norm": 0.004821383859962225, "kl": 0.1078033447265625, "learning_rate": 2.7500000000000004e-06, "loss": -0.0131, "num_tokens": 31954715.0, "reward": 0.6957967281341553, "reward_std": 0.26600438356399536, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.49817925691604614, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.598882794380188, "step": 101 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5146694214876032, "calib/avg_num_step_conf": 12.94140625, "calib/ece": 0.3379011857707509, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002893939393935341, "calib/mean_conf": 0.9900750988142292, "calib/mu_c": 0.9901757575757574, "calib/mu_w": 0.9898863636363638, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3379011857707509, "calib/std_conf": 0.0012248628583871356, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9498866948754114, "calib/step_q_c_n": 2127.0, "calib/step_q_gap": 0.006657352548261275, "calib/step_q_w": 0.9432293423271502, "calib/step_q_w_n": 1186.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 859.8671875, "completions/mean_terminated_length": 866.6378173828125, "completions/min_length": 0.0, "completions/min_terminated_length": 506.0, "epoch": 0.1088, "grad_norm": 0.004850052297115326, "kl": 0.1102294921875, "learning_rate": 2.7222222222222224e-06, "loss": -0.024, "num_tokens": 32281537.0, "reward": 0.8422617316246033, "reward_std": 0.1845608949661255, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6513855457305908, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.706575334072113, "step": 102 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5133822363203806, "calib/avg_num_step_conf": 11.2421875, "calib/ece": 0.3730830039525691, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0009919376156488635, "calib/mean_conf": 0.9896837944664031, "calib/mu_c": 0.9900641025641025, "calib/mu_w": 0.9890721649484536, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3730830039525691, "calib/std_conf": 0.003963620912797372, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9504706477732794, "calib/step_q_c_n": 1976.0, "calib/step_q_gap": 0.0138520224961175, "calib/step_q_w": 0.9366186252771619, "calib/step_q_w_n": 902.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2467.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 957.0859375, "completions/mean_terminated_length": 957.0859375, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 0.10986666666666667, "grad_norm": 0.004068789537996054, "kl": 0.09625244140625, "learning_rate": 2.6944444444444444e-06, "loss": 0.0008, "num_tokens": 32631103.0, "reward": 0.8233869671821594, "reward_std": 0.1791730672121048, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6171547174453735, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7108692526817322, "step": 103 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.49331516095534783, "calib/avg_num_step_conf": 11.32421875, "calib/ece": 0.41605139442231076, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": -0.0004063862928348483, "calib/mean_conf": 0.9897565737051793, "calib/mu_c": 0.9895833333333334, "calib/mu_w": 0.9899897196261682, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41605139442231076, "calib/std_conf": 0.00648754143248373, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9483588002263724, "calib/step_q_c_n": 1767.0, "calib/step_q_gap": 0.006842545809411349, "calib/step_q_w": 0.9415162544169611, "calib/step_q_w_n": 1132.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2256.0, "completions/max_terminated_length": 2256.0, "completions/mean_length": 889.6953125, "completions/mean_terminated_length": 893.1843872070312, "completions/min_length": 0.0, "completions/min_terminated_length": 575.0, "epoch": 0.11093333333333333, "grad_norm": 0.00504771014675498, "kl": 0.1142120361328125, "learning_rate": 2.666666666666667e-06, "loss": -0.0056, "num_tokens": 32965545.0, "reward": 0.7778856754302979, "reward_std": 0.17713813483715057, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5707237720489502, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6764536499977112, "step": 104 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5201628171147292, "calib/avg_num_step_conf": 11.73828125, "calib/ece": 0.4395252964426877, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.002687025116748676, "calib/mean_conf": 0.9889324110671938, "calib/mu_c": 0.9901431654676258, "calib/mu_w": 0.9874561403508771, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4395252964426877, "calib/std_conf": 0.013218990745179534, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9507614293507911, "calib/step_q_c_n": 1833.0, "calib/step_q_gap": 0.011537879862736466, "calib/step_q_w": 0.9392235494880546, "calib/step_q_w_n": 1172.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1930.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 925.80078125, "completions/mean_terminated_length": 933.090576171875, "completions/min_length": 0.0, "completions/min_terminated_length": 612.0, "epoch": 0.112, "grad_norm": 0.0046814256347715855, "kl": 0.1015777587890625, "learning_rate": 2.6388888888888893e-06, "loss": -0.0109, "num_tokens": 33308310.0, "reward": 0.7485441565513611, "reward_std": 0.23401093482971191, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5538468360900879, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.6369913816452026, "step": 105 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4966216216216216, "calib/avg_num_step_conf": 12.3984375, "calib/ece": 0.3883333333333333, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -6.756756756765014e-05, "calib/mean_conf": 0.989959349593496, "calib/mu_c": 0.9899324324324323, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3883333333333333, "calib/std_conf": 0.0006362795057926237, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9537047898338222, "calib/step_q_c_n": 2046.0, "calib/step_q_gap": 0.012516846571410833, "calib/step_q_w": 0.9411879432624114, "calib/step_q_w_n": 1128.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2167.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 886.078125, "completions/mean_terminated_length": 903.7291259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.11306666666666666, "grad_norm": 0.004771126434206963, "kl": 0.11895751953125, "learning_rate": 2.6111111111111113e-06, "loss": -0.0239, "num_tokens": 33639730.0, "reward": 0.7789354920387268, "reward_std": 0.21612492203712463, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5856839418411255, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6635932922363281, "step": 106 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5052083333333334, "calib/avg_num_step_conf": 13.2890625, "calib/ece": 0.23638823529411768, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9921568627450981, "calib/gap": -0.0008906250000003668, "calib/mean_conf": 0.9893294117647059, "calib/mu_c": 0.989109375, "calib/mu_w": 0.9900000000000003, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23638823529411768, "calib/std_conf": 0.008440409201994679, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9495932475884246, "calib/step_q_c_n": 2488.0, "calib/step_q_gap": -0.0028356364378337995, "calib/step_q_w": 0.9524288840262584, "calib/step_q_w_n": 914.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2400.0, "completions/max_terminated_length": 2400.0, "completions/mean_length": 907.25, "completions/mean_terminated_length": 910.8079223632812, "completions/min_length": 0.0, "completions/min_terminated_length": 593.0, "epoch": 0.11413333333333334, "grad_norm": 0.0047623030841350555, "kl": 0.1190643310546875, "learning_rate": 2.5833333333333337e-06, "loss": 0.0302, "num_tokens": 33976602.0, "reward": 0.9565872550010681, "reward_std": 0.20818129181861877, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7547374963760376, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8092182278633118, "step": 107 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.48958333333333337, "calib/avg_num_step_conf": 13.1328125, "calib/ece": 0.2266002656042498, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": -0.004930555555555771, "calib/mean_conf": 0.9862284196547145, "calib/mu_c": 0.9850694444444444, "calib/mu_w": 0.9900000000000002, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22394422310756987, "calib/std_conf": 0.04247993112842338, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9534159159159159, "calib/step_q_c_n": 2664.0, "calib/step_q_gap": 0.00844456921104475, "calib/step_q_w": 0.9449713467048712, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2929.0, "completions/max_terminated_length": 2929.0, "completions/mean_length": 952.60546875, "completions/mean_terminated_length": 963.9012451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 535.0, "epoch": 0.1152, "grad_norm": 0.004174269735813141, "kl": 0.11817169189453125, "learning_rate": 2.5555555555555557e-06, "loss": -0.0043, "num_tokens": 34323701.0, "reward": 0.9475168585777283, "reward_std": 0.17167723178863525, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.7526540756225586, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7962858080863953, "step": 108 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5160697887970614, "calib/avg_num_step_conf": 13.15625, "calib/ece": 0.47979757085020236, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004893086711266204, "calib/mean_conf": 0.9899190283400809, "calib/mu_c": 0.99015873015873, "calib/mu_w": 0.9896694214876034, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.47979757085020236, "calib/std_conf": 0.0022026672888640414, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9556503422854135, "calib/step_q_c_n": 1899.0, "calib/step_q_gap": 0.017243262639395796, "calib/step_q_w": 0.9384070796460177, "calib/step_q_w_n": 1469.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 939.6484375, "completions/mean_terminated_length": 954.5635375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 638.0, "epoch": 0.11626666666666667, "grad_norm": 0.0042589870281517506, "kl": 0.104736328125, "learning_rate": 2.5277777777777778e-06, "loss": -0.02, "num_tokens": 34668851.0, "reward": 0.7005583047866821, "reward_std": 0.1663442850112915, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5018503665924072, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6070786118507385, "step": 109 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.491990456714383, "calib/avg_num_step_conf": 13.57421875, "calib/ece": 0.3458458498023713, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00014976141785960628, "calib/mean_conf": 0.9901146245059287, "calib/mu_c": 0.9900613496932515, "calib/mu_w": 0.9902111111111112, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3458458498023713, "calib/std_conf": 0.001047632167986198, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9524288840262582, "calib/step_q_c_n": 2285.0, "calib/step_q_gap": 0.004791909236342207, "calib/step_q_w": 0.947636974789916, "calib/step_q_w_n": 1190.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 884.55078125, "completions/mean_terminated_length": 891.5157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 516.0, "epoch": 0.11733333333333333, "grad_norm": 0.005009052809327841, "kl": 0.119873046875, "learning_rate": 2.5e-06, "loss": -0.0159, "num_tokens": 35000216.0, "reward": 0.8415005207061768, "reward_std": 0.22774638235569, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6434261202812195, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7153560519218445, "step": 110 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5189348320620902, "calib/avg_num_step_conf": 12.75390625, "calib/ece": 0.35889459815546776, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9920948616600791, "calib/gap": 0.006768633748159947, "calib/mean_conf": 0.9873530961791831, "calib/mu_c": 0.9898679245283016, "calib/mu_w": 0.9830992907801417, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35889459815546776, "calib/std_conf": 0.04162146222796388, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9532540909090912, "calib/step_q_c_n": 2200.0, "calib/step_q_gap": 0.010693527528809477, "calib/step_q_w": 0.9425605633802817, "calib/step_q_w_n": 1065.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2644.0, "completions/max_terminated_length": 2644.0, "completions/mean_length": 965.71484375, "completions/mean_terminated_length": 965.71484375, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 0.1184, "grad_norm": 0.004538996610790491, "kl": 0.10672760009765625, "learning_rate": 2.4722222222222226e-06, "loss": -0.0, "num_tokens": 35354847.0, "reward": 0.8264460563659668, "reward_std": 0.2635301351547241, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6276901960372925, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7048894166946411, "step": 111 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5, "calib/avg_num_step_conf": 12.359375, "calib/ece": 0.36959183673469387, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.3306690738754696e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36959183673469387, "calib/std_conf": 0.0, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9527737556561087, "calib/step_q_c_n": 2210.0, "calib/step_q_gap": 0.019063063832209348, "calib/step_q_w": 0.9337106918238993, "calib/step_q_w_n": 954.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2660.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 918.04296875, "completions/mean_terminated_length": 947.6572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 588.0, "epoch": 0.11946666666666667, "grad_norm": 0.004856268409639597, "kl": 0.115386962890625, "learning_rate": 2.4444444444444447e-06, "loss": -0.045, "num_tokens": 35697786.0, "reward": 0.7879830598831177, "reward_std": 0.25175368785858154, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6009199023246765, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.664889931678772, "step": 112 }, { "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5095182595182595, "calib/avg_num_step_conf": 13.1953125, "calib/ece": 0.37834901960784306, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002024087024085075, "calib/mean_conf": 0.990113725490196, "calib/mu_c": 0.9901923076923075, "calib/mu_w": 0.989989898989899, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37834901960784306, "calib/std_conf": 0.001630231501405179, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9541280353200885, "calib/step_q_c_n": 2265.0, "calib/step_q_gap": 0.012846813397356938, "calib/step_q_w": 0.9412812219227316, "calib/step_q_w_n": 1113.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 898.83984375, "completions/mean_terminated_length": 902.36474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 555.0, "epoch": 0.12053333333333334, "grad_norm": 0.004978550132364035, "kl": 0.1248626708984375, "learning_rate": 2.4166666666666667e-06, "loss": 0.0074, "num_tokens": 36033089.0, "reward": 0.8190068006515503, "reward_std": 0.26815420389175415, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6170171499252319, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.6991214752197266, "step": 113 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.514314100677068, "calib/avg_num_step_conf": 13.48046875, "calib/ece": 0.3048964143426295, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00041146599941155326, "calib/mean_conf": 0.9901553784860557, "calib/mu_c": 0.9902848837209303, "calib/mu_w": 0.9898734177215187, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3048964143426295, "calib/std_conf": 0.001970843489457132, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9548995815899584, "calib/step_q_c_n": 2629.0, "calib/step_q_gap": 0.017819289619155354, "calib/step_q_w": 0.937080291970803, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 916.390625, "completions/mean_terminated_length": 927.2569580078125, "completions/min_length": 0.0, "completions/min_terminated_length": 240.0, "epoch": 0.1216, "grad_norm": 0.004546697251498699, "kl": 0.1204681396484375, "learning_rate": 2.388888888888889e-06, "loss": -0.0201, "num_tokens": 36372709.0, "reward": 0.8801324963569641, "reward_std": 0.24433541297912598, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6780261397361755, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.75177001953125, "step": 114 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5166752016387146, "calib/avg_num_step_conf": 13.17578125, "calib/ece": 0.4128458498023715, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0008347202662911668, "calib/mean_conf": 0.9899209486166007, "calib/mu_c": 0.9902739726027396, "calib/mu_w": 0.9894392523364485, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4128458498023715, "calib/std_conf": 0.004263281489721373, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9502658670664669, "calib/step_q_c_n": 2001.0, "calib/step_q_gap": 0.004660910798245377, "calib/step_q_w": 0.9456049562682215, "calib/step_q_w_n": 1372.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 909.41015625, "completions/mean_terminated_length": 916.5708618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 621.0, "epoch": 0.12266666666666666, "grad_norm": 0.00454685278236866, "kl": 0.1140899658203125, "learning_rate": 2.361111111111111e-06, "loss": -0.0097, "num_tokens": 36710782.0, "reward": 0.7695689797401428, "reward_std": 0.2374623566865921, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5790222883224487, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.648396909236908, "step": 115 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5144851885684457, "calib/avg_num_step_conf": 12.67578125, "calib/ece": 0.37035294117647044, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00010309278350495443, "calib/mean_conf": 0.9899607843137254, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9898969072164948, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37035294117647044, "calib/std_conf": 0.002869452255674782, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.948682771194166, "calib/step_q_c_n": 2194.0, "calib/step_q_gap": 0.010186101355916577, "calib/step_q_w": 0.9384966698382494, "calib/step_q_w_n": 1051.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2186.0, "completions/max_terminated_length": 2186.0, "completions/mean_length": 962.65625, "completions/mean_terminated_length": 966.4314575195312, "completions/min_length": 0.0, "completions/min_terminated_length": 543.0, "epoch": 0.12373333333333333, "grad_norm": 0.004456583876162767, "kl": 0.1026153564453125, "learning_rate": 2.3333333333333336e-06, "loss": -0.018, "num_tokens": 37061742.0, "reward": 0.8288036584854126, "reward_std": 0.20433485507965088, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6247351169586182, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7102158069610596, "step": 116 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5039081885856079, "calib/avg_num_step_conf": 12.64453125, "calib/ece": 0.47751968503937003, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.00011042183622844703, "calib/mean_conf": 0.9893307086614173, "calib/mu_c": 0.9893846153846154, "calib/mu_w": 0.989274193548387, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.47751968503937003, "calib/std_conf": 0.00698316887446488, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9513340807174888, "calib/step_q_c_n": 1784.0, "calib/step_q_gap": 0.008092511550248482, "calib/step_q_w": 0.9432415691672403, "calib/step_q_w_n": 1453.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 935.56640625, "completions/mean_terminated_length": 935.56640625, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.1248, "grad_norm": 0.004536793567240238, "kl": 0.1120758056640625, "learning_rate": 2.305555555555556e-06, "loss": 0.0073, "num_tokens": 37407847.0, "reward": 0.7191907167434692, "reward_std": 0.26438361406326294, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5140968561172485, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.6258469820022583, "step": 117 }, { "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.49671052631578944, "calib/avg_num_step_conf": 13.03515625, "calib/ece": 0.36946938775510196, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00019736842105244623, "calib/mean_conf": 0.9898775510204081, "calib/mu_c": 0.9898026315789473, "calib/mu_w": 0.9899999999999998, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36946938775510196, "calib/std_conf": 0.002300245355690674, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9498751115075824, "calib/step_q_c_n": 2242.0, "calib/step_q_gap": 0.0121856137906875, "calib/step_q_w": 0.9376894977168949, "calib/step_q_w_n": 1095.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 938.55078125, "completions/mean_terminated_length": 968.8265991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 443.0, "epoch": 0.12586666666666665, "grad_norm": 0.004124051425606012, "kl": 0.1169281005859375, "learning_rate": 2.277777777777778e-06, "loss": -0.0438, "num_tokens": 37752124.0, "reward": 0.7813794016838074, "reward_std": 0.20420008897781372, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6007570028305054, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.653407871723175, "step": 118 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4985562718237981, "calib/avg_num_step_conf": 12.57421875, "calib/ece": 0.4038131720430107, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9919354838709677, "calib/gap": -0.005013564329841236, "calib/mean_conf": 0.9871465053763442, "calib/mu_c": 0.9850844748858448, "calib/mu_w": 0.9900980392156861, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40112499999999995, "calib/std_conf": 0.04200963733912335, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9516280120481928, "calib/step_q_c_n": 1992.0, "calib/step_q_gap": 0.013111304631729936, "calib/step_q_w": 0.9385167074164629, "calib/step_q_w_n": 1227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2920.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 988.0, "completions/mean_terminated_length": 999.7154541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 491.0, "epoch": 0.12693333333333334, "grad_norm": 0.004427317064255476, "kl": 0.1075897216796875, "learning_rate": 2.25e-06, "loss": -0.0173, "num_tokens": 38110116.0, "reward": 0.7786606550216675, "reward_std": 0.25082796812057495, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.5724661946296692, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6770426034927368, "step": 119 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5176470588235295, "calib/avg_num_step_conf": 12.2109375, "calib/ece": 0.3263849205952381, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.003410003640014203, "calib/mean_conf": 0.9890833332936507, "calib/mu_c": 0.9902335329341315, "calib/mu_w": 0.9868235292941173, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3263849205952381, "calib/std_conf": 0.01328402337062744, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9498171052631579, "calib/step_q_c_n": 2280.0, "calib/step_q_gap": 0.015936458809257092, "calib/step_q_w": 0.9338806464539008, "calib/step_q_w_n": 846.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2519.0, "completions/max_terminated_length": 2519.0, "completions/mean_length": 929.75, "completions/mean_terminated_length": 944.5079956054688, "completions/min_length": 0.0, "completions/min_terminated_length": 578.0, "epoch": 0.128, "grad_norm": 0.004550413694232702, "kl": 0.109375, "learning_rate": 2.222222222222222e-06, "loss": -0.0381, "num_tokens": 38454820.0, "reward": 0.8674514293670654, "reward_std": 0.22823333740234375, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6608027219772339, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7467562556266785, "step": 120 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5083153784002155, "calib/avg_num_step_conf": 11.96875, "calib/ece": 0.36289682539682544, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.019929975760615e-05, "calib/mean_conf": 0.9898809523809524, "calib/mu_c": 0.9898734177215188, "calib/mu_w": 0.9898936170212764, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.36289682539682544, "calib/std_conf": 0.0030187434126442807, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9457457457457458, "calib/step_q_c_n": 1998.0, "calib/step_q_gap": 0.008663194150999032, "calib/step_q_w": 0.9370825515947467, "calib/step_q_w_n": 1066.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 963.42578125, "completions/mean_terminated_length": 974.849853515625, "completions/min_length": 0.0, "completions/min_terminated_length": 632.0, "epoch": 0.12906666666666666, "grad_norm": 0.0043608820997178555, "kl": 0.102081298828125, "learning_rate": 2.1944444444444445e-06, "loss": -0.0092, "num_tokens": 38806513.0, "reward": 0.8166075348854065, "reward_std": 0.30062606930732727, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6166195273399353, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.698626697063446, "step": 121 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.4998512568793693, "calib/avg_num_step_conf": 12.5703125, "calib/ece": 0.3179311740890688, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00037974118696992853, "calib/mean_conf": 0.989995951417004, "calib/mu_c": 0.9901204819277106, "calib/mu_w": 0.9897407407407407, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3179311740890688, "calib/std_conf": 0.0028320004456135767, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9506187797490264, "calib/step_q_c_n": 2311.0, "calib/step_q_gap": 0.013770929693899592, "calib/step_q_w": 0.9368478500551268, "calib/step_q_w_n": 907.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2733.0, "completions/max_terminated_length": 2733.0, "completions/mean_length": 993.77734375, "completions/mean_terminated_length": 1001.6023559570312, "completions/min_length": 0.0, "completions/min_terminated_length": 597.0, "epoch": 0.13013333333333332, "grad_norm": 0.0043544284999370575, "kl": 0.10748291015625, "learning_rate": 2.166666666666667e-06, "loss": 0.0054, "num_tokens": 39168264.0, "reward": 0.8525604009628296, "reward_std": 0.1859087347984314, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6548253893852234, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7276390790939331, "step": 122 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4936641323195945, "calib/avg_num_step_conf": 11.6796875, "calib/ece": 0.39937751004016075, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.0006372549019606488, "calib/mean_conf": 0.9897389558232933, "calib/mu_c": 0.9899999999999998, "calib/mu_w": 0.9893627450980391, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39937751004016075, "calib/std_conf": 0.00604801143213114, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9483242655059848, "calib/step_q_c_n": 1838.0, "calib/step_q_gap": 0.011423223839318042, "calib/step_q_w": 0.9369010416666668, "calib/step_q_w_n": 1152.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 1024.8359375, "completions/mean_terminated_length": 1045.2509765625, "completions/min_length": 0.0, "completions/min_terminated_length": 537.0, "epoch": 0.1312, "grad_norm": 0.004120643250644207, "kl": 0.09653472900390625, "learning_rate": 2.138888888888889e-06, "loss": -0.015, "num_tokens": 39535910.0, "reward": 0.7791215777397156, "reward_std": 0.2641041874885559, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.5824018120765686, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6680288314819336, "step": 123 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4970588235294118, "calib/avg_num_step_conf": 13.15625, "calib/ece": 0.31820948616600797, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00010510276399688934, "calib/mean_conf": 0.9901462450592886, "calib/mu_c": 0.9901117647058825, "calib/mu_w": 0.9902168674698794, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31820948616600797, "calib/std_conf": 0.001925067559829116, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9502967959527826, "calib/step_q_c_n": 2372.0, "calib/step_q_gap": 0.007676314025071629, "calib/step_q_w": 0.942620481927711, "calib/step_q_w_n": 996.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1678.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 965.64453125, "completions/mean_terminated_length": 977.0949096679688, "completions/min_length": 0.0, "completions/min_terminated_length": 621.0, "epoch": 0.13226666666666667, "grad_norm": 0.00459699472412467, "kl": 0.1124114990234375, "learning_rate": 2.1111111111111114e-06, "loss": -0.0273, "num_tokens": 39889931.0, "reward": 0.8741840124130249, "reward_std": 0.21621385216712952, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6703066229820251, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7475927472114563, "step": 124 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5106905130101527, "calib/avg_num_step_conf": 11.94140625, "calib/ece": 0.4249512195117886, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004983527206278149, "calib/mean_conf": 0.9899918699182927, "calib/mu_c": 0.990208633093525, "calib/mu_w": 0.9897102803728972, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4249512195117886, "calib/std_conf": 0.002964549169742379, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9500343698854338, "calib/step_q_c_n": 1833.0, "calib/step_q_gap": 0.01098789203502526, "calib/step_q_w": 0.9390464778504085, "calib/step_q_w_n": 1224.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2815.0, "completions/max_terminated_length": 2815.0, "completions/mean_length": 1007.0703125, "completions/mean_terminated_length": 1023.0556030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 626.0, "epoch": 0.13333333333333333, "grad_norm": 0.004465122241526842, "kl": 0.0935516357421875, "learning_rate": 2.0833333333333334e-06, "loss": 0.0012, "num_tokens": 40252549.0, "reward": 0.7530326247215271, "reward_std": 0.2753220200538635, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.551465630531311, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6538183689117432, "step": 125 }, { "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.5084134615384616, "calib/avg_num_step_conf": 11.56640625, "calib/ece": 0.4230833333333335, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007466063348414709, "calib/mean_conf": 0.9897500000000001, "calib/mu_c": 0.9900735294117647, "calib/mu_w": 0.9893269230769233, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4230833333333335, "calib/std_conf": 0.0037555514464145856, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9510777479892761, "calib/step_q_c_n": 1865.0, "calib/step_q_gap": 0.020895266237451193, "calib/step_q_w": 0.9301824817518249, "calib/step_q_w_n": 1096.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 945.94921875, "completions/mean_terminated_length": 992.4712524414062, "completions/min_length": 0.0, "completions/min_terminated_length": 484.0, "epoch": 0.1344, "grad_norm": 0.004414580762386322, "kl": 0.0979461669921875, "learning_rate": 2.0555555555555555e-06, "loss": -0.0474, "num_tokens": 40600176.0, "reward": 0.7389653325080872, "reward_std": 0.26135197281837463, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5397324562072754, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.6452294588088989, "step": 126 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5186850519584333, "calib/avg_num_step_conf": 12.03515625, "calib/ece": 0.4272388663967611, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006437516653343689, "calib/mean_conf": 0.9899919028340081, "calib/mu_c": 0.990273381294964, "calib/mu_w": 0.9896296296296296, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4272388663967611, "calib/std_conf": 0.0029585420181484465, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9504181913225301, "calib/step_q_c_n": 1913.0, "calib/step_q_gap": 0.014262369404721809, "calib/step_q_w": 0.9361558219178083, "calib/step_q_w_n": 1168.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2733.0, "completions/max_terminated_length": 2733.0, "completions/mean_length": 962.21875, "completions/mean_terminated_length": 981.386474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 632.0, "epoch": 0.13546666666666668, "grad_norm": 0.004797594156116247, "kl": 0.1098480224609375, "learning_rate": 2.027777777777778e-06, "loss": -0.0181, "num_tokens": 40950176.0, "reward": 0.7391723394393921, "reward_std": 0.260296493768692, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5512328147888184, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6255492568016052, "step": 127 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.4902569235902569, "calib/avg_num_step_conf": 11.58203125, "calib/ece": 0.4413739837398373, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001781781781780012, "calib/mean_conf": 0.9901544715447154, "calib/mu_c": 0.9900740740740739, "calib/mu_w": 0.9902522522522519, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4413739837398373, "calib/std_conf": 0.001203197098284558, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9495187465025182, "calib/step_q_c_n": 1787.0, "calib/step_q_gap": 0.01348394174869827, "calib/step_q_w": 0.93603480475382, "calib/step_q_w_n": 1178.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 975.77734375, "completions/mean_terminated_length": 1007.2540283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 627.0, "epoch": 0.13653333333333334, "grad_norm": 0.004459763877093792, "kl": 0.0992889404296875, "learning_rate": 2.0000000000000003e-06, "loss": -0.0426, "num_tokens": 41306639.0, "reward": 0.7325985431671143, "reward_std": 0.2545444369316101, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5357023477554321, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6318385601043701, "step": 128 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5032876522702104, "calib/avg_num_step_conf": 12.66796875, "calib/ece": 0.32861811023622045, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00010465116279034792, "calib/mean_conf": 0.9900354330708662, "calib/mu_c": 0.99, "calib/mu_w": 0.9901046511627903, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.32861811023622045, "calib/std_conf": 0.0028621072452505655, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9488309922972361, "calib/step_q_c_n": 2207.0, "calib/step_q_gap": 0.008247015463259233, "calib/step_q_w": 0.9405839768339769, "calib/step_q_w_n": 1036.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2156.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 944.11328125, "completions/mean_terminated_length": 947.8157348632812, "completions/min_length": 0.0, "completions/min_terminated_length": 504.0, "epoch": 0.1376, "grad_norm": 0.00469581875950098, "kl": 0.11328125, "learning_rate": 1.9722222222222224e-06, "loss": -0.0036, "num_tokens": 41650716.0, "reward": 0.8614320755004883, "reward_std": 0.17951692640781403, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.6627917885780334, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7296035885810852, "step": 129 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5093904788112273, "calib/avg_num_step_conf": 14.2734375, "calib/ece": 0.3578, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": 0.0010594386351130414, "calib/mean_conf": 0.9898, "calib/mu_c": 0.9901898734177215, "calib/mu_w": 0.9891304347826084, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3578, "calib/std_conf": 0.005827520913733384, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9547496896979727, "calib/step_q_c_n": 2417.0, "calib/step_q_gap": 0.006220991233946704, "calib/step_q_w": 0.948528698464026, "calib/step_q_w_n": 1237.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2905.0, "completions/max_terminated_length": 2905.0, "completions/mean_length": 956.38671875, "completions/mean_terminated_length": 967.727294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 419.0, "epoch": 0.13866666666666666, "grad_norm": 0.00415900768712163, "kl": 0.1133270263671875, "learning_rate": 1.944444444444445e-06, "loss": -0.0157, "num_tokens": 42000839.0, "reward": 0.8460250496864319, "reward_std": 0.20391161739826202, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6248651742935181, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7484347820281982, "step": 130 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5093816773504274, "calib/avg_num_step_conf": 11.81640625, "calib/ece": 0.5708696236559141, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9798387096774194, "calib/gap": -0.002829594017093928, "calib/mean_conf": 0.98484811827957, "calib/mu_c": 0.9832051282051282, "calib/mu_w": 0.9860347222222221, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5681814516129033, "calib/std_conf": 0.04766354373603634, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9534522992450241, "calib/step_q_c_n": 1457.0, "calib/step_q_gap": 0.016302426796044434, "calib/step_q_w": 0.9371498724489796, "calib/step_q_w_n": 1568.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2524.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 939.6015625, "completions/mean_terminated_length": 958.3187255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 627.0, "epoch": 0.13973333333333332, "grad_norm": 0.00463265273720026, "kl": 0.09664154052734375, "learning_rate": 1.916666666666667e-06, "loss": -0.0152, "num_tokens": 42347585.0, "reward": 0.6049246788024902, "reward_std": 0.19286221265792847, "rewards/accuracy_reward_step": 0.40625, "rewards/final_brier_reward_step": 0.4194599688053131, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.5161706209182739, "step": 131 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5387709497206703, "calib/avg_num_step_conf": 14.140625, "calib/ece": 0.28515747637795275, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0017243891992551053, "calib/mean_conf": 0.9898818858267716, "calib/mu_c": 0.9903910558659217, "calib/mu_w": 0.9886666666666666, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28515747637795275, "calib/std_conf": 0.00439059322740553, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.954889878990349, "calib/step_q_c_n": 2694.0, "calib/step_q_gap": 0.01234128287803804, "calib/step_q_w": 0.942548596112311, "calib/step_q_w_n": 926.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1903.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 948.578125, "completions/mean_terminated_length": 956.0472412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 634.0, "epoch": 0.1408, "grad_norm": 0.0045576319098472595, "kl": 0.116363525390625, "learning_rate": 1.888888888888889e-06, "loss": -0.0068, "num_tokens": 42696013.0, "reward": 0.9042675495147705, "reward_std": 0.2530564069747925, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7057386636734009, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7645150423049927, "step": 132 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49615901670827733, "calib/avg_num_step_conf": 12.51171875, "calib/ece": 0.498, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00016004097048860633, "calib/mean_conf": 0.99, "calib/mu_c": 0.990081300813008, "calib/mu_w": 0.9899212598425194, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.498, "calib/std_conf": 0.002828427124746193, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.952016036655212, "calib/step_q_c_n": 1746.0, "calib/step_q_gap": 0.013505398357339549, "calib/step_q_w": 0.9385106382978724, "calib/step_q_w_n": 1457.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2227.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 1006.171875, "completions/mean_terminated_length": 1022.1429443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 629.0, "epoch": 0.14186666666666667, "grad_norm": 0.004659958649426699, "kl": 0.0912628173828125, "learning_rate": 1.8611111111111113e-06, "loss": -0.002, "num_tokens": 43059937.0, "reward": 0.7004643678665161, "reward_std": 0.3030105233192444, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.4902855455875397, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6200181841850281, "step": 133 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5173036511688993, "calib/avg_num_step_conf": 11.88671875, "calib/ece": 0.42309236947791173, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.0016449960598895474, "calib/mean_conf": 0.9893574297188756, "calib/mu_c": 0.9900709219858154, "calib/mu_w": 0.9884259259259258, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42309236947791173, "calib/std_conf": 0.00679510476853313, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9480401479133651, "calib/step_q_c_n": 1893.0, "calib/step_q_gap": 0.012022756609017193, "calib/step_q_w": 0.9360173913043479, "calib/step_q_w_n": 1150.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2675.0, "completions/max_terminated_length": 2675.0, "completions/mean_length": 996.48046875, "completions/mean_terminated_length": 1000.3883056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 482.0, "epoch": 0.14293333333333333, "grad_norm": 0.004632034804672003, "kl": 0.0955963134765625, "learning_rate": 1.8333333333333333e-06, "loss": 0.0257, "num_tokens": 43423988.0, "reward": 0.7609483003616333, "reward_std": 0.23016947507858276, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5603917837142944, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6568171381950378, "step": 134 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5080032012805124, "calib/avg_num_step_conf": 12.71484375, "calib/ece": 0.38031872509960163, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": -0.0003634787248232296, "calib/mean_conf": 0.9898804780876495, "calib/mu_c": 0.9897385620915033, "calib/mu_w": 0.9901020408163266, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38031872509960163, "calib/std_conf": 0.005886176101183293, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9522067307692308, "calib/step_q_c_n": 2080.0, "calib/step_q_gap": 0.008998220130932988, "calib/step_q_w": 0.9432085106382978, "calib/step_q_w_n": 1175.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2870.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 971.7734375, "completions/mean_terminated_length": 975.5843505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 497.0, "epoch": 0.144, "grad_norm": 0.004621756263077259, "kl": 0.102294921875, "learning_rate": 1.8055555555555557e-06, "loss": 0.0198, "num_tokens": 43778642.0, "reward": 0.8092292547225952, "reward_std": 0.2588251531124115, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6050999760627747, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6977335214614868, "step": 135 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.532553441084463, "calib/avg_num_step_conf": 11.44921875, "calib/ece": 0.44004016064257057, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0009247914494263654, "calib/mean_conf": 0.9902409638554219, "calib/mu_c": 0.9906569343065693, "calib/mu_w": 0.989732142857143, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.44004016064257057, "calib/std_conf": 0.0032223754419144616, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9512581344902385, "calib/step_q_c_n": 1844.0, "calib/step_q_gap": 0.018240655189410537, "calib/step_q_w": 0.933017479300828, "calib/step_q_w_n": 1087.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2219.0, "completions/max_terminated_length": 2219.0, "completions/mean_length": 917.66015625, "completions/mean_terminated_length": 935.9402465820312, "completions/min_length": 0.0, "completions/min_terminated_length": 566.0, "epoch": 0.14506666666666668, "grad_norm": 0.0050376043654978275, "kl": 0.1031646728515625, "learning_rate": 1.777777777777778e-06, "loss": -0.0162, "num_tokens": 44122051.0, "reward": 0.7339471578598022, "reward_std": 0.2788407802581787, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5438823699951172, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6240118145942688, "step": 136 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5068493150684932, "calib/avg_num_step_conf": 12.3828125, "calib/ece": 0.408406374501992, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00013698630136993462, "calib/mean_conf": 0.9900796812749004, "calib/mu_c": 0.9901369863013699, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.408406374501992, "calib/std_conf": 0.0008890802232837218, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9520726172465962, "calib/step_q_c_n": 1983.0, "calib/step_q_gap": 0.01182830385148248, "calib/step_q_w": 0.9402443133951137, "calib/step_q_w_n": 1187.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2332.0, "completions/max_terminated_length": 2332.0, "completions/mean_length": 948.15234375, "completions/mean_terminated_length": 959.3953247070312, "completions/min_length": 0.0, "completions/min_terminated_length": 528.0, "epoch": 0.14613333333333334, "grad_norm": 0.004428972490131855, "kl": 0.10677337646484375, "learning_rate": 1.75e-06, "loss": -0.0326, "num_tokens": 44471762.0, "reward": 0.7847567200660706, "reward_std": 0.23171323537826538, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5745124816894531, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6856259107589722, "step": 137 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5050662251655629, "calib/avg_num_step_conf": 12.26171875, "calib/ece": 0.38665338645418335, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": 0.0007430463576159063, "calib/mean_conf": 0.9882470119521913, "calib/mu_c": 0.9885430463576158, "calib/mu_w": 0.9877999999999999, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.38665338645418335, "calib/std_conf": 0.011813404038462101, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9536979166666667, "calib/step_q_c_n": 2112.0, "calib/step_q_gap": 0.016297721924699804, "calib/step_q_w": 0.9374001947419669, "calib/step_q_w_n": 1027.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1721.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 946.546875, "completions/mean_terminated_length": 961.5714721679688, "completions/min_length": 0.0, "completions/min_terminated_length": 540.0, "epoch": 0.1472, "grad_norm": 0.004463068209588528, "kl": 0.101226806640625, "learning_rate": 1.7222222222222224e-06, "loss": -0.0321, "num_tokens": 44818414.0, "reward": 0.8052668571472168, "reward_std": 0.24942906200885773, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.5951968431472778, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7020555734634399, "step": 138 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5087719298245614, "calib/avg_num_step_conf": 14.5, "calib/ece": 0.314110671936759, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 1.1102230246251565e-16, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.99, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.314110671936759, "calib/std_conf": 0.0028116077855776687, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9570648215586308, "calib/step_q_c_n": 2746.0, "calib/step_q_gap": 0.013617616589686743, "calib/step_q_w": 0.9434472049689441, "calib/step_q_w_n": 966.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 912.30078125, "completions/mean_terminated_length": 923.1185913085938, "completions/min_length": 0.0, "completions/min_terminated_length": 561.0, "epoch": 0.14826666666666666, "grad_norm": 0.004617730155587196, "kl": 0.115936279296875, "learning_rate": 1.6944444444444446e-06, "loss": -0.0425, "num_tokens": 45155059.0, "reward": 0.883562445640564, "reward_std": 0.176340714097023, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.6703718304634094, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7662842273712158, "step": 139 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.49999999999999994, "calib/avg_num_step_conf": 13.98046875, "calib/ece": 0.3235714285714286, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 3.3306690738754696e-16, "calib/mean_conf": 0.9902380952380953, "calib/mu_c": 0.9902380952380954, "calib/mu_w": 0.990238095238095, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3235714285714286, "calib/std_conf": 0.0015245533898649653, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9547047166072138, "calib/step_q_c_n": 2523.0, "calib/step_q_gap": 0.010585398425395631, "calib/step_q_w": 0.9441193181818182, "calib/step_q_w_n": 1056.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2022.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 962.09375, "completions/mean_terminated_length": 973.5020141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 557.0, "epoch": 0.14933333333333335, "grad_norm": 0.004488389939069748, "kl": 0.1055145263671875, "learning_rate": 1.6666666666666667e-06, "loss": -0.0251, "num_tokens": 45506371.0, "reward": 0.8630602955818176, "reward_std": 0.2330675721168518, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6625601649284363, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.735435426235199, "step": 140 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5184154338874485, "calib/avg_num_step_conf": 13.10546875, "calib/ece": 0.31175510204081625, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9918367346938776, "calib/gap": 0.0010309592801588385, "calib/mean_conf": 0.9893061224489795, "calib/mu_c": 0.9896385542168673, "calib/mu_w": 0.9886075949367085, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.31175510204081625, "calib/std_conf": 0.008276359452133807, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9547808604744672, "calib/step_q_c_n": 2487.0, "calib/step_q_gap": 0.014608049414559288, "calib/step_q_w": 0.9401728110599079, "calib/step_q_w_n": 868.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2702.0, "completions/max_terminated_length": 2702.0, "completions/mean_length": 988.75390625, "completions/mean_terminated_length": 996.5393676757812, "completions/min_length": 0.0, "completions/min_terminated_length": 420.0, "epoch": 0.1504, "grad_norm": 0.004703023005276918, "kl": 0.1005096435546875, "learning_rate": 1.638888888888889e-06, "loss": -0.0096, "num_tokens": 45866588.0, "reward": 0.8537777066230774, "reward_std": 0.19736658036708832, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6552159786224365, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7320269346237183, "step": 141 }, { "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.5028449502133713, "calib/avg_num_step_conf": 14.265625, "calib/ece": 0.3771604938271606, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9958847736625515, "calib/gap": 0.010375533428165329, "calib/mean_conf": 0.9862139917695474, "calib/mu_c": 0.9902702702702701, "calib/mu_w": 0.9798947368421048, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3771604938271606, "calib/std_conf": 0.06341829494277257, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9550871408384363, "calib/step_q_c_n": 2123.0, "calib/step_q_gap": 0.0014573174244402187, "calib/step_q_w": 0.953629823413996, "calib/step_q_w_n": 1529.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1807.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 965.5078125, "completions/mean_terminated_length": 992.6505737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 497.0, "epoch": 0.15146666666666667, "grad_norm": 0.004101898986846209, "kl": 0.1012115478515625, "learning_rate": 1.6111111111111113e-06, "loss": -0.0434, "num_tokens": 46218918.0, "reward": 0.769848108291626, "reward_std": 0.26609960198402405, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5889710783958435, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.6460375785827637, "step": 142 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5184210526315789, "calib/avg_num_step_conf": 14.80078125, "calib/ece": 0.38094650205761316, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.9958847736625515, "calib/gap": -0.00017283072546192546, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899324324324323, "calib/mu_w": 0.9901052631578943, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.38094650205761316, "calib/std_conf": 0.006085806194501844, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.9591739302694136, "calib/step_q_c_n": 2524.0, "calib/step_q_gap": 0.010762863075737705, "calib/step_q_w": 0.9484110671936758, "calib/step_q_w_n": 1265.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 965.2578125, "completions/mean_terminated_length": 992.3935546875, "completions/min_length": 0.0, "completions/min_terminated_length": 527.0, "epoch": 0.15253333333333333, "grad_norm": 0.0043958332389593124, "kl": 0.1049346923828125, "learning_rate": 1.5833333333333333e-06, "loss": -0.047, "num_tokens": 46573360.0, "reward": 0.769221842288971, "reward_std": 0.22690513730049133, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5851054191589355, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.6502132415771484, "step": 143 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.517992277992278, "calib/avg_num_step_conf": 13.42578125, "calib/ece": 0.2871807228915665, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007769111969112563, "calib/mean_conf": 0.9899919678714861, "calib/mu_c": 0.9902228571428571, "calib/mu_w": 0.9894459459459458, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2871807228915665, "calib/std_conf": 0.0029466364568107696, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9516439066551428, "calib/step_q_c_n": 2314.0, "calib/step_q_gap": 0.004944886174287899, "calib/step_q_w": 0.9466990204808549, "calib/step_q_w_n": 1123.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2126.0, "completions/max_terminated_length": 2126.0, "completions/mean_length": 932.8046875, "completions/mean_terminated_length": 955.1920166015625, "completions/min_length": 0.0, "completions/min_terminated_length": 576.0, "epoch": 0.1536, "grad_norm": 0.004761853720992804, "kl": 0.104034423828125, "learning_rate": 1.5555555555555558e-06, "loss": -0.0377, "num_tokens": 46916286.0, "reward": 0.8835018873214722, "reward_std": 0.24617859721183777, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.6895893812179565, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7461644411087036, "step": 144 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.515655981493721, "calib/avg_num_step_conf": 14.859375, "calib/ece": 0.2625203252032519, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959349593495935, "calib/gap": 0.014727362855254444, "calib/mean_conf": 0.9860975609756097, "calib/mu_c": 0.9901685393258427, "calib/mu_w": 0.9754411764705883, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2625203252032519, "calib/std_conf": 0.06301547295267676, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9569343850842595, "calib/step_q_c_n": 2789.0, "calib/step_q_gap": 0.005978720059628961, "calib/step_q_w": 0.9509556650246306, "calib/step_q_w_n": 1015.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2529.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 936.41015625, "completions/mean_terminated_length": 970.5303955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 585.0, "epoch": 0.15466666666666667, "grad_norm": 0.00459950789809227, "kl": 0.1096038818359375, "learning_rate": 1.527777777777778e-06, "loss": -0.0683, "num_tokens": 47258711.0, "reward": 0.8716181516647339, "reward_std": 0.2779850959777832, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7042800784111023, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7084875106811523, "step": 145 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5163977436704709, "calib/avg_num_step_conf": 13.4921875, "calib/ece": 0.5000400809716596, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0008075626393809765, "calib/mean_conf": 0.9899186234817812, "calib/mu_c": 0.9903305785123967, "calib/mu_w": 0.9895230158730157, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5000400809716596, "calib/std_conf": 0.003920451066744246, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9545647321428572, "calib/step_q_c_n": 1792.0, "calib/step_q_gap": 0.008870688821557504, "calib/step_q_w": 0.9456940433212997, "calib/step_q_w_n": 1662.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2509.0, "completions/max_terminated_length": 2509.0, "completions/mean_length": 960.92578125, "completions/mean_terminated_length": 987.939697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 481.0, "epoch": 0.15573333333333333, "grad_norm": 0.004820052534341812, "kl": 0.092498779296875, "learning_rate": 1.5e-06, "loss": -0.0324, "num_tokens": 47611924.0, "reward": 0.675005316734314, "reward_std": 0.19536207616329193, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.47887301445007324, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.5852000713348389, "step": 146 }, { "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5038683451350643, "calib/avg_num_step_conf": 14.109375, "calib/ece": 0.4877732793522267, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.0007303960136373355, "calib/mean_conf": 0.9897975708502024, "calib/mu_c": 0.9901612903225805, "calib/mu_w": 0.9894308943089432, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4877732793522267, "calib/std_conf": 0.005931415893704436, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9564390756302522, "calib/step_q_c_n": 1904.0, "calib/step_q_gap": 0.006158045185287397, "calib/step_q_w": 0.9502810304449648, "calib/step_q_w_n": 1708.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2865.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 981.27734375, "completions/mean_terminated_length": 992.9130859375, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.1568, "grad_norm": 0.00475219776853919, "kl": 0.09954833984375, "learning_rate": 1.4722222222222225e-06, "loss": -0.0076, "num_tokens": 47966811.0, "reward": 0.6794040203094482, "reward_std": 0.20438922941684723, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.4943191409111023, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.5754263401031494, "step": 147 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.517607158320104, "calib/avg_num_step_conf": 15.58203125, "calib/ece": 0.31673306772908383, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": -0.00012195121951219523, "calib/mean_conf": 0.9900398406374503, "calib/mu_c": 0.99, "calib/mu_w": 0.9901219512195122, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.31673306772908383, "calib/std_conf": 0.006021079039780895, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9592242595204515, "calib/step_q_c_n": 2836.0, "calib/step_q_gap": 0.008946722660087225, "calib/step_q_w": 0.9502775368603643, "calib/step_q_w_n": 1153.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 949.7265625, "completions/mean_terminated_length": 960.9881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 495.0, "epoch": 0.15786666666666666, "grad_norm": 0.004767869599163532, "kl": 0.111907958984375, "learning_rate": 1.4444444444444445e-06, "loss": -0.0482, "num_tokens": 48315053.0, "reward": 0.8523479700088501, "reward_std": 0.2650758624076843, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.6624456644058228, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7141252160072327, "step": 148 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.48687467430953624, "calib/avg_num_step_conf": 13.9375, "calib/ece": 0.3918445322793149, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9960474308300395, "calib/gap": -0.0042211655376062085, "calib/mean_conf": 0.9873649538866931, "calib/mu_c": 0.9856798245614035, "calib/mu_w": 0.9899009900990097, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3892094861660079, "calib/std_conf": 0.04129113896377587, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9571142369991477, "calib/step_q_c_n": 2346.0, "calib/step_q_gap": 0.010518492318296602, "calib/step_q_w": 0.9465957446808511, "calib/step_q_w_n": 1222.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 980.69140625, "completions/mean_terminated_length": 984.5372924804688, "completions/min_length": 0.0, "completions/min_terminated_length": 457.0, "epoch": 0.15893333333333334, "grad_norm": 0.004556480795145035, "kl": 0.1001739501953125, "learning_rate": 1.4166666666666667e-06, "loss": -0.0033, "num_tokens": 48670566.0, "reward": 0.7927764058113098, "reward_std": 0.24334917962551117, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.5996427536010742, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6718475222587585, "step": 149 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.4903381642512077, "calib/avg_num_step_conf": 14.71875, "calib/ece": 0.36851700680272104, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": -0.004421947522970537, "calib/mean_conf": 0.9875646258503401, "calib/mu_c": 0.985904139433551, "calib/mu_w": 0.9903260869565216, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3657959183673469, "calib/std_conf": 0.041911374158060344, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9582697711762345, "calib/step_q_c_n": 2491.0, "calib/step_q_gap": 0.009099841653916418, "calib/step_q_w": 0.949169929522318, "calib/step_q_w_n": 1277.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2497.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 954.30078125, "completions/mean_terminated_length": 958.043212890625, "completions/min_length": 0.0, "completions/min_terminated_length": 447.0, "epoch": 0.16, "grad_norm": 0.004849243443459272, "kl": 0.11444091796875, "learning_rate": 1.3888888888888892e-06, "loss": -0.011, "num_tokens": 49019827.0, "reward": 0.8090563416481018, "reward_std": 0.19644436240196228, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6027803421020508, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7043949365615845, "step": 150 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5105932203389831, "calib/avg_num_step_conf": 13.16015625, "calib/ece": 0.4653588709677419, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": 0.0017490221642766368, "calib/mean_conf": 0.9895524193548388, "calib/mu_c": 0.9903846153846154, "calib/mu_w": 0.9886355932203388, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4653588709677419, "calib/std_conf": 0.012219649457181642, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9550517711171662, "calib/step_q_c_n": 1835.0, "calib/step_q_gap": 0.0066945351328114855, "calib/step_q_w": 0.9483572359843547, "calib/step_q_w_n": 1534.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 3034.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 977.14453125, "completions/mean_terminated_length": 1000.5960693359375, "completions/min_length": 0.0, "completions/min_terminated_length": 515.0, "epoch": 0.16106666666666666, "grad_norm": 0.0045675914734601974, "kl": 0.0902557373046875, "learning_rate": 1.3611111111111112e-06, "loss": -0.0119, "num_tokens": 49377000.0, "reward": 0.7184681296348572, "reward_std": 0.2886176109313965, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5139772891998291, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6299901008605957, "step": 151 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5214285714285715, "calib/avg_num_step_conf": 13.2734375, "calib/ece": 0.42798795180722904, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004214285714286836, "calib/mean_conf": 0.9902369477911648, "calib/mu_c": 0.9904214285714286, "calib/mu_w": 0.9899999999999999, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42798795180722904, "calib/std_conf": 0.0015090358105742643, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9583191780821918, "calib/step_q_c_n": 2190.0, "calib/step_q_gap": 0.018087390002721415, "calib/step_q_w": 0.9402317880794704, "calib/step_q_w_n": 1208.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2767.0, "completions/max_terminated_length": 2767.0, "completions/mean_length": 964.16015625, "completions/mean_terminated_length": 967.9412231445312, "completions/min_length": 0.0, "completions/min_terminated_length": 496.0, "epoch": 0.16213333333333332, "grad_norm": 0.005005593877285719, "kl": 0.099517822265625, "learning_rate": 1.3333333333333334e-06, "loss": 0.0024, "num_tokens": 49729217.0, "reward": 0.7581484317779541, "reward_std": 0.28604838252067566, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.555062472820282, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6596717834472656, "step": 152 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.49483043837882545, "calib/avg_num_step_conf": 13.63671875, "calib/ece": 0.36300401606425714, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959839357429718, "calib/gap": 0.00045368072787443925, "calib/mean_conf": 0.9895100401606427, "calib/mu_c": 0.9896794871794871, "calib/mu_w": 0.9892258064516126, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.36300401606425714, "calib/std_conf": 0.0066773683882772735, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9549181818181817, "calib/step_q_c_n": 2200.0, "calib/step_q_gap": 0.001990218998661919, "calib/step_q_w": 0.9529279628195197, "calib/step_q_w_n": 1291.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 960.0703125, "completions/mean_terminated_length": 979.1952514648438, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.1632, "grad_norm": 0.00486392667517066, "kl": 0.0968475341796875, "learning_rate": 1.3055555555555556e-06, "loss": 0.003, "num_tokens": 50082315.0, "reward": 0.804978609085083, "reward_std": 0.297091543674469, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6160004138946533, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6806756258010864, "step": 153 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4842529604434366, "calib/avg_num_step_conf": 14.48828125, "calib/ece": 0.49030952380952386, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0003015873015872472, "calib/mean_conf": 0.9903095238095239, "calib/mu_c": 0.99015873015873, "calib/mu_w": 0.9904603174603173, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.49030952380952386, "calib/std_conf": 0.0017111412960791395, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9582002022244692, "calib/step_q_c_n": 1978.0, "calib/step_q_gap": 0.007431282524873684, "calib/step_q_w": 0.9507689196995955, "calib/step_q_w_n": 1731.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 934.92578125, "completions/mean_terminated_length": 946.0119018554688, "completions/min_length": 0.0, "completions/min_terminated_length": 566.0, "epoch": 0.16426666666666667, "grad_norm": 0.00515020452439785, "kl": 0.1073150634765625, "learning_rate": 1.2777777777777779e-06, "loss": -0.0098, "num_tokens": 50426096.0, "reward": 0.7015895247459412, "reward_std": 0.24179303646087646, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.5014827847480774, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.606383740901947, "step": 154 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5233639029259468, "calib/avg_num_step_conf": 13.515625, "calib/ece": 0.4463346456692914, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9921259842519685, "calib/gap": 0.009784328404766396, "calib/mean_conf": 0.9857047244094488, "calib/mu_c": 0.9902116788321167, "calib/mu_w": 0.9804273504273503, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4463346456692914, "calib/std_conf": 0.0622920115431297, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9539157949790796, "calib/step_q_c_n": 1912.0, "calib/step_q_gap": 0.0029597226276585342, "calib/step_q_w": 0.9509560723514211, "calib/step_q_w_n": 1548.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2569.0, "completions/max_terminated_length": 2569.0, "completions/mean_length": 922.296875, "completions/mean_terminated_length": 929.55908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 469.0, "epoch": 0.16533333333333333, "grad_norm": 0.005125963129103184, "kl": 0.1002655029296875, "learning_rate": 1.25e-06, "loss": -0.0105, "num_tokens": 50769420.0, "reward": 0.7513651847839355, "reward_std": 0.2288527935743332, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5489941239356995, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.6482675075531006, "step": 155 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5151798979394097, "calib/avg_num_step_conf": 12.71875, "calib/ece": 0.43792000000000025, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": 0.009511013500419718, "calib/mean_conf": 0.98592, "calib/mu_c": 0.9902189781021897, "calib/mu_w": 0.98070796460177, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43792000000000025, "calib/std_conf": 0.06148945925929094, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9517195325542571, "calib/step_q_c_n": 1797.0, "calib/step_q_gap": 0.003494721039520976, "calib/step_q_w": 0.9482248115147361, "calib/step_q_w_n": 1459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2575.0, "completions/max_terminated_length": 2575.0, "completions/mean_length": 943.8203125, "completions/mean_terminated_length": 955.0119018554688, "completions/min_length": 0.0, "completions/min_terminated_length": 458.0, "epoch": 0.1664, "grad_norm": 0.004947419743984938, "kl": 0.09226226806640625, "learning_rate": 1.2222222222222223e-06, "loss": -0.0022, "num_tokens": 51115798.0, "reward": 0.75379478931427, "reward_std": 0.2840968072414398, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5483015179634094, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.656944215297699, "step": 156 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5139285714285713, "calib/avg_num_step_conf": 14.21875, "calib/ece": 0.3035294117647058, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.0015357142857138628, "calib/mean_conf": 0.9898039215686274, "calib/mu_c": 0.990285714285714, "calib/mu_w": 0.9887500000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3035294117647058, "calib/std_conf": 0.006164476371519724, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9552984524686811, "calib/step_q_c_n": 2714.0, "calib/step_q_gap": 0.011097588537795411, "calib/step_q_w": 0.9442008639308856, "calib/step_q_w_n": 926.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 941.65625, "completions/mean_terminated_length": 945.3490600585938, "completions/min_length": 0.0, "completions/min_terminated_length": 510.0, "epoch": 0.16746666666666668, "grad_norm": 0.010280285961925983, "kl": 0.130157470703125, "learning_rate": 1.1944444444444446e-06, "loss": -0.016, "num_tokens": 51460590.0, "reward": 0.9033623337745667, "reward_std": 0.23318590223789215, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.690483570098877, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7803035974502563, "step": 157 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5135525591634562, "calib/avg_num_step_conf": 13.1796875, "calib/ece": 0.35828000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00027105118326919975, "calib/mean_conf": 0.99028, "calib/mu_c": 0.9903797468354428, "calib/mu_w": 0.9901086956521736, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35828000000000004, "calib/std_conf": 0.001649727250184104, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9547580645161292, "calib/step_q_c_n": 2232.0, "calib/step_q_gap": 0.009565420032766614, "calib/step_q_w": 0.9451926444833626, "calib/step_q_w_n": 1142.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2469.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 914.89453125, "completions/mean_terminated_length": 922.0984497070312, "completions/min_length": 0.0, "completions/min_terminated_length": 525.0, "epoch": 0.16853333333333334, "grad_norm": 0.0054487925954163074, "kl": 0.1057281494140625, "learning_rate": 1.1666666666666668e-06, "loss": -0.0167, "num_tokens": 51800043.0, "reward": 0.8083155155181885, "reward_std": 0.3000306487083435, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6241241693496704, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6745381355285645, "step": 158 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.508700917899031, "calib/avg_num_step_conf": 13.28125, "calib/ece": 0.4072834645669291, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002562468128504136, "calib/mean_conf": 0.9899606299212598, "calib/mu_c": 0.9900675675675674, "calib/mu_w": 0.989811320754717, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4072834645669291, "calib/std_conf": 0.003918262460014826, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9550254511800095, "calib/step_q_c_n": 2161.0, "calib/step_q_gap": 0.009335378540784256, "calib/step_q_w": 0.9456900726392252, "calib/step_q_w_n": 1239.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 896.63671875, "completions/mean_terminated_length": 900.1530151367188, "completions/min_length": 0.0, "completions/min_terminated_length": 480.0, "epoch": 0.1696, "grad_norm": 0.005623841192573309, "kl": 0.112762451171875, "learning_rate": 1.138888888888889e-06, "loss": -0.0077, "num_tokens": 52134366.0, "reward": 0.8077690005302429, "reward_std": 0.24357198178768158, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5825507640838623, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7197059392929077, "step": 159 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5033221476510068, "calib/avg_num_step_conf": 14.25, "calib/ece": 0.3910000000000001, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9919678714859438, "calib/gap": 0.00034020134228185395, "calib/mean_conf": 0.9893935742971889, "calib/mu_c": 0.9895302013422816, "calib/mu_w": 0.9891899999999998, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3910000000000001, "calib/std_conf": 0.008162365817485504, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9559621451104104, "calib/step_q_c_n": 2219.0, "calib/step_q_gap": 0.005789296964854085, "calib/step_q_w": 0.9501728481455564, "calib/step_q_w_n": 1429.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 918.8125, "completions/mean_terminated_length": 933.3968505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.17066666666666666, "grad_norm": 0.005115674342960119, "kl": 0.10626220703125, "learning_rate": 1.111111111111111e-06, "loss": -0.0216, "num_tokens": 52474422.0, "reward": 0.7822360396385193, "reward_std": 0.25444185733795166, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5863964557647705, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.6679192781448364, "step": 160 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5150440495744362, "calib/avg_num_step_conf": 13.00390625, "calib/ece": 0.2804313725490195, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007122592205466294, "calib/mean_conf": 0.990235294117647, "calib/mu_c": 0.9904419889502761, "calib/mu_w": 0.9897297297297295, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2804313725490195, "calib/std_conf": 0.00318444894912342, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.952004707728521, "calib/step_q_c_n": 2549.0, "calib/step_q_gap": 0.010312400036213254, "calib/step_q_w": 0.9416923076923077, "calib/step_q_w_n": 780.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 894.79296875, "completions/mean_terminated_length": 898.302001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 371.0, "epoch": 0.17173333333333332, "grad_norm": 0.005225729662925005, "kl": 0.10350799560546875, "learning_rate": 1.0833333333333335e-06, "loss": -0.034, "num_tokens": 52807409.0, "reward": 0.9206104278564453, "reward_std": 0.17202883958816528, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7128636240959167, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7877320647239685, "step": 161 }, { "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.508955909369259, "calib/avg_num_step_conf": 14.1796875, "calib/ece": 0.26870196078431363, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00017980710349085882, "calib/mean_conf": 0.990270588235294, "calib/mu_c": 0.990320652173913, "calib/mu_w": 0.9901408450704221, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26870196078431363, "calib/std_conf": 0.0018389355528959507, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9571017130620985, "calib/step_q_c_n": 2802.0, "calib/step_q_gap": 0.01148577103311288, "calib/step_q_w": 0.9456159420289856, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 936.41015625, "completions/mean_terminated_length": 940.0823974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 472.0, "epoch": 0.1728, "grad_norm": 0.004868402611464262, "kl": 0.1038055419921875, "learning_rate": 1.0555555555555557e-06, "loss": -0.0271, "num_tokens": 53151274.0, "reward": 0.9340248107910156, "reward_std": 0.231055349111557, "rewards/accuracy_reward_step": 0.71875, "rewards/final_brier_reward_step": 0.7241210341453552, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8009597659111023, "step": 162 }, { "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.5189472952301488, "calib/avg_num_step_conf": 12.5390625, "calib/ece": 0.40584362139917707, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.9958847736625515, "calib/gap": 0.009989570296203731, "calib/mean_conf": 0.9860905349794239, "calib/mu_c": 0.9902836879432624, "calib/mu_w": 0.9802941176470586, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40584362139917707, "calib/std_conf": 0.06340106865914173, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.955283926852743, "calib/step_q_c_n": 2078.0, "calib/step_q_gap": 0.0167061883368419, "calib/step_q_w": 0.9385777385159011, "calib/step_q_w_n": 1132.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 970.22265625, "completions/mean_terminated_length": 985.623046875, "completions/min_length": 0.0, "completions/min_terminated_length": 498.0, "epoch": 0.17386666666666667, "grad_norm": 0.004681031219661236, "kl": 0.0843353271484375, "learning_rate": 1.0277777777777777e-06, "loss": -0.0279, "num_tokens": 53504483.0, "reward": 0.7564293742179871, "reward_std": 0.2422761768102646, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.562407374382019, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.6512324810028076, "step": 163 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5289280760848027, "calib/avg_num_step_conf": 11.8671875, "calib/ece": 0.4004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.988, "calib/gap": 0.0027276930189552573, "calib/mean_conf": 0.9884000000000001, "calib/mu_c": 0.9895238095238094, "calib/mu_w": 0.9867961165048541, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4004, "calib/std_conf": 0.012768711759609894, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9509455292908531, "calib/step_q_c_n": 1946.0, "calib/step_q_gap": 0.01104626189158553, "calib/step_q_w": 0.9398992673992675, "calib/step_q_w_n": 1092.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 931.7578125, "completions/mean_terminated_length": 950.3187255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.17493333333333333, "grad_norm": 0.005017735995352268, "kl": 0.09310150146484375, "learning_rate": 1.0000000000000002e-06, "loss": -0.0558, "num_tokens": 53849149.0, "reward": 0.7908472418785095, "reward_std": 0.2797871232032776, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.5834690928459167, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6888503432273865, "step": 164 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.513768115942029, "calib/avg_num_step_conf": 12.18359375, "calib/ece": 0.4447826086956521, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00027536231884039974, "calib/mean_conf": 0.9902371541501975, "calib/mu_c": 0.9903623188405796, "calib/mu_w": 0.9900869565217392, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4447826086956521, "calib/std_conf": 0.0015216107948553496, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9512252541466025, "calib/step_q_c_n": 1869.0, "calib/step_q_gap": 0.009857254146602323, "calib/step_q_w": 0.9413680000000002, "calib/step_q_w_n": 1250.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2077.0, "completions/max_terminated_length": 2077.0, "completions/mean_length": 937.046875, "completions/mean_terminated_length": 944.4251708984375, "completions/min_length": 0.0, "completions/min_terminated_length": 497.0, "epoch": 0.176, "grad_norm": 0.00506335124373436, "kl": 0.0945892333984375, "learning_rate": 9.722222222222224e-07, "loss": -0.0143, "num_tokens": 54194609.0, "reward": 0.7570241093635559, "reward_std": 0.20329761505126953, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.5478722453117371, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.6607071757316589, "step": 165 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5084269662921348, "calib/avg_num_step_conf": 12.55859375, "calib/ece": 0.278, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1102230246251565e-16, "calib/mean_conf": 0.99, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.278, "calib/std_conf": 0.0028284271247461927, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9528125000000002, "calib/step_q_c_n": 2464.0, "calib/step_q_gap": 0.015875083222370412, "calib/step_q_w": 0.9369374167776298, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2773.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 957.59375, "completions/mean_terminated_length": 968.9486694335938, "completions/min_length": 0.0, "completions/min_terminated_length": 513.0, "epoch": 0.17706666666666668, "grad_norm": 0.004678467288613319, "kl": 0.095672607421875, "learning_rate": 9.444444444444445e-07, "loss": -0.0133, "num_tokens": 54545937.0, "reward": 0.9066218137741089, "reward_std": 0.21096336841583252, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7008320093154907, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7780365943908691, "step": 166 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5018316387559809, "calib/avg_num_step_conf": 13.55078125, "calib/ece": 0.29162698412698407, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001315789473679274, "calib/mean_conf": 0.9900396825396824, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.990131578947368, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29162698412698407, "calib/std_conf": 0.00302082926082036, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9527062169849655, "calib/step_q_c_n": 2461.0, "calib/step_q_gap": 0.0030435185722670344, "calib/step_q_w": 0.9496626984126985, "calib/step_q_w_n": 1008.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2447.0, "completions/max_terminated_length": 2447.0, "completions/mean_length": 944.98828125, "completions/mean_terminated_length": 956.1937255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 521.0, "epoch": 0.17813333333333334, "grad_norm": 0.004688553512096405, "kl": 0.1010894775390625, "learning_rate": 9.166666666666666e-07, "loss": -0.0489, "num_tokens": 54893462.0, "reward": 0.8880767822265625, "reward_std": 0.16871461272239685, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.693252682685852, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7485257387161255, "step": 167 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5121659949246157, "calib/avg_num_step_conf": 11.69921875, "calib/ece": 0.296812749003984, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0002448126586057642, "calib/mean_conf": 0.9900398406374502, "calib/mu_c": 0.9901149425287354, "calib/mu_w": 0.9898701298701297, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.296812749003984, "calib/std_conf": 0.001092534597673474, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9473900709219858, "calib/step_q_c_n": 2115.0, "calib/step_q_gap": 0.007560525467440349, "calib/step_q_w": 0.9398295454545454, "calib/step_q_w_n": 880.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 949.01171875, "completions/mean_terminated_length": 960.2648315429688, "completions/min_length": 0.0, "completions/min_terminated_length": 602.0, "epoch": 0.1792, "grad_norm": 0.00453649228438735, "kl": 0.08453369140625, "learning_rate": 8.88888888888889e-07, "loss": -0.0161, "num_tokens": 55241081.0, "reward": 0.882291316986084, "reward_std": 0.26900747418403625, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.6856827735900879, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7468684911727905, "step": 168 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.48243464052287577, "calib/avg_num_step_conf": 13.6953125, "calib/ece": 0.37561847389558234, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -2.8390522875954005e-05, "calib/mean_conf": 0.9900763052208835, "calib/mu_c": 0.9900653594771242, "calib/mu_w": 0.9900937500000001, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37561847389558234, "calib/std_conf": 0.002958583049002384, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9515526191599812, "calib/step_q_c_n": 2119.0, "calib/step_q_gap": 0.0003810257930020322, "calib/step_q_w": 0.9511715933669792, "calib/step_q_w_n": 1387.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2884.0, "completions/max_terminated_length": 2884.0, "completions/mean_length": 953.50390625, "completions/mean_terminated_length": 972.498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 504.0, "epoch": 0.18026666666666666, "grad_norm": 0.004947459790855646, "kl": 0.0988616943359375, "learning_rate": 8.611111111111112e-07, "loss": -0.0335, "num_tokens": 55589362.0, "reward": 0.8120761513710022, "reward_std": 0.17349588871002197, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6049816012382507, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7051081657409668, "step": 169 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5292098627066405, "calib/avg_num_step_conf": 13.80859375, "calib/ece": 0.33047619047619037, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9880952380952381, "calib/gap": 0.0019725413281033166, "calib/mean_conf": 0.9892063492063491, "calib/mu_c": 0.9898795180722891, "calib/mu_w": 0.9879069767441858, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33047619047619037, "calib/std_conf": 0.009928568256505396, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9552207062600321, "calib/step_q_c_n": 2492.0, "calib/step_q_gap": 0.011069220162237103, "calib/step_q_w": 0.944151486097795, "calib/step_q_w_n": 1043.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2447.0, "completions/max_terminated_length": 2447.0, "completions/mean_length": 976.74609375, "completions/mean_terminated_length": 984.43701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 586.0, "epoch": 0.18133333333333335, "grad_norm": 0.004628858063369989, "kl": 0.100830078125, "learning_rate": 8.333333333333333e-07, "loss": -0.0247, "num_tokens": 55943561.0, "reward": 0.8508857488632202, "reward_std": 0.2125195860862732, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6563507318496704, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7188581228256226, "step": 170 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.527501909854851, "calib/avg_num_step_conf": 13.5234375, "calib/ece": 0.4582602921646746, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9721115537848606, "calib/gap": 0.011525974025974328, "calib/mean_conf": 0.9841567065073041, "calib/mu_c": 0.9896212121212121, "calib/mu_w": 0.9780952380952378, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4582602921646746, "calib/std_conf": 0.04960966954172482, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9554654654654656, "calib/step_q_c_n": 1998.0, "calib/step_q_gap": 0.011572022842514684, "calib/step_q_w": 0.9438934426229509, "calib/step_q_w_n": 1464.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2060.0, "completions/max_terminated_length": 2060.0, "completions/mean_length": 913.53515625, "completions/mean_terminated_length": 928.0357666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 556.0, "epoch": 0.1824, "grad_norm": 0.004907242488116026, "kl": 0.09726715087890625, "learning_rate": 8.055555555555557e-07, "loss": -0.0188, "num_tokens": 56284322.0, "reward": 0.7263688445091248, "reward_std": 0.24334657192230225, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5333303809165955, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6201885938644409, "step": 171 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5148400119617225, "calib/avg_num_step_conf": 14.078125, "calib/ece": 0.2910277777777778, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9920634920634921, "calib/gap": 0.002401614832536203, "calib/mean_conf": 0.9894404761904761, "calib/mu_c": 0.9901647727272728, "calib/mu_w": 0.9877631578947366, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2910277777777778, "calib/std_conf": 0.00809225260074072, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9562340186915887, "calib/step_q_c_n": 2675.0, "calib/step_q_gap": 0.01201442773356931, "calib/step_q_w": 0.9442195909580194, "calib/step_q_w_n": 929.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2755.0, "completions/max_terminated_length": 2755.0, "completions/mean_length": 964.921875, "completions/mean_terminated_length": 976.3636474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 496.0, "epoch": 0.18346666666666667, "grad_norm": 0.004751425236463547, "kl": 0.1040191650390625, "learning_rate": 7.777777777777779e-07, "loss": -0.0471, "num_tokens": 56634694.0, "reward": 0.8968729376792908, "reward_std": 0.21206118166446686, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.6945913434028625, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7647793292999268, "step": 172 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.49613108784706417, "calib/avg_num_step_conf": 14.22265625, "calib/ece": 0.30610886639676105, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -7.452662721851855e-05, "calib/mean_conf": 0.9903193927125505, "calib/mu_c": 0.9902958579881658, "calib/mu_w": 0.9903703846153843, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.30610886639676105, "calib/std_conf": 0.0017467299800211816, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.9537875197472353, "calib/step_q_c_n": 2532.0, "calib/step_q_gap": 0.007905743372122354, "calib/step_q_w": 0.945881776375113, "calib/step_q_w_n": 1109.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2844.0, "completions/max_terminated_length": 2844.0, "completions/mean_length": 980.7265625, "completions/mean_terminated_length": 992.3557739257812, "completions/min_length": 0.0, "completions/min_terminated_length": 476.0, "epoch": 0.18453333333333333, "grad_norm": 0.004599784966558218, "kl": 0.093719482421875, "learning_rate": 7.5e-07, "loss": -0.0323, "num_tokens": 56988920.0, "reward": 0.8514807820320129, "reward_std": 0.21351027488708496, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.6619473099708557, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7175766825675964, "step": 173 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.49652777777777785, "calib/avg_num_step_conf": 13.5546875, "calib/ece": 0.3965020576131686, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.9917695473251029, "calib/gap": 0.00017676767676777505, "calib/mean_conf": 0.9890946502057612, "calib/mu_c": 0.9891666666666666, "calib/mu_w": 0.9889898989898989, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3965020576131686, "calib/std_conf": 0.008842662915043422, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9503653653653654, "calib/step_q_c_n": 1998.0, "calib/step_q_gap": 0.000195528408843626, "calib/step_q_w": 0.9501698369565218, "calib/step_q_w_n": 1472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2364.0, "completions/max_terminated_length": 2364.0, "completions/mean_length": 1044.88671875, "completions/mean_terminated_length": 1074.260986328125, "completions/min_length": 0.0, "completions/min_terminated_length": 563.0, "epoch": 0.1856, "grad_norm": 0.004376276396214962, "kl": 0.07926177978515625, "learning_rate": 7.222222222222222e-07, "loss": -0.0519, "num_tokens": 57360643.0, "reward": 0.7660025358200073, "reward_std": 0.2759450078010559, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5708284974098206, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6588327288627625, "step": 174 }, { "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.5083781108597285, "calib/avg_num_step_conf": 14.19921875, "calib/ece": 0.5559583333333334, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.9916666666666667, "calib/gap": 0.0014196832579184138, "calib/mean_conf": 0.9892916666666668, "calib/mu_c": 0.9900961538461538, "calib/mu_w": 0.9886764705882354, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5559583333333334, "calib/std_conf": 0.008361514848133413, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9566890243902441, "calib/step_q_c_n": 1640.0, "calib/step_q_gap": 0.010954688550645075, "calib/step_q_w": 0.945734335839599, "calib/step_q_w_n": 1995.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 986.93359375, "completions/mean_terminated_length": 1022.894775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 553.0, "epoch": 0.18666666666666668, "grad_norm": 0.0047067576088011265, "kl": 0.0843353271484375, "learning_rate": 6.944444444444446e-07, "loss": -0.0407, "num_tokens": 57719122.0, "reward": 0.6393700838088989, "reward_std": 0.2753363847732544, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.41810816526412964, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.5911006331443787, "step": 175 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5051958433253397, "calib/avg_num_step_conf": 14.890625, "calib/ece": 0.4273684210526316, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0001132427391418167, "calib/mean_conf": 0.9901214574898786, "calib/mu_c": 0.9900719424460432, "calib/mu_w": 0.9901851851851851, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4273684210526316, "calib/std_conf": 0.0030490964184231176, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9543837602706622, "calib/step_q_c_n": 2069.0, "calib/step_q_gap": 0.0010504269373288189, "calib/step_q_w": 0.9533333333333334, "calib/step_q_w_n": 1743.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2247.0, "completions/max_terminated_length": 2247.0, "completions/mean_length": 971.59375, "completions/mean_terminated_length": 994.9120483398438, "completions/min_length": 0.0, "completions/min_terminated_length": 510.0, "epoch": 0.18773333333333334, "grad_norm": 0.004404004197567701, "kl": 0.09282684326171875, "learning_rate": 6.666666666666667e-07, "loss": -0.0371, "num_tokens": 58071914.0, "reward": 0.7828360795974731, "reward_std": 0.26098161935806274, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5511468648910522, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7129628658294678, "step": 176 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5127388535031847, "calib/avg_num_step_conf": 14.05859375, "calib/ece": 0.3720472440944882, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00025477707006349437, "calib/mean_conf": 0.9901574803149606, "calib/mu_c": 0.9902547770700635, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3720472440944882, "calib/std_conf": 0.0012449912047907017, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9573438704703162, "calib/step_q_c_n": 2594.0, "calib/step_q_gap": 0.019990636639470316, "calib/step_q_w": 0.9373532338308459, "calib/step_q_w_n": 1005.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 967.453125, "completions/mean_terminated_length": 971.2471313476562, "completions/min_length": 0.0, "completions/min_terminated_length": 470.0, "epoch": 0.1888, "grad_norm": 0.005024405661970377, "kl": 0.0969696044921875, "learning_rate": 6.388888888888889e-07, "loss": -0.0264, "num_tokens": 58423414.0, "reward": 0.8256941437721252, "reward_std": 0.20266000926494598, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6207616925239563, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7095328569412231, "step": 177 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5178435260027452, "calib/avg_num_step_conf": 16.0859375, "calib/ece": 0.3122040816326531, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00017462254079603223, "calib/mean_conf": 0.9897551020408164, "calib/mu_c": 0.9896987951807227, "calib/mu_w": 0.9898734177215187, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3122040816326531, "calib/std_conf": 0.005490364889717323, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9559977578475336, "calib/step_q_c_n": 2676.0, "calib/step_q_gap": -0.000895446035961589, "calib/step_q_w": 0.9568932038834952, "calib/step_q_w_n": 1442.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1873.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 936.54296875, "completions/mean_terminated_length": 978.591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 574.0, "epoch": 0.18986666666666666, "grad_norm": 0.004361789207905531, "kl": 0.09478759765625, "learning_rate": 6.111111111111112e-07, "loss": -0.0853, "num_tokens": 58769241.0, "reward": 0.8469783663749695, "reward_std": 0.26387226581573486, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6544034481048584, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7192407846450806, "step": 178 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5247173723994916, "calib/avg_num_step_conf": 14.43359375, "calib/ece": 0.3858760000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": 0.001299819385911949, "calib/mean_conf": 0.9898760000000001, "calib/mu_c": 0.9903907284768211, "calib/mu_w": 0.9890909090909091, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3858760000000001, "calib/std_conf": 0.005891402549478348, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9584415124064593, "calib/step_q_c_n": 2539.0, "calib/step_q_gap": 0.022611062579469543, "calib/step_q_w": 0.9358304498269897, "calib/step_q_w_n": 1156.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2491.0, "completions/max_terminated_length": 2491.0, "completions/mean_length": 1000.64453125, "completions/mean_terminated_length": 1016.52783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 564.0, "epoch": 0.19093333333333334, "grad_norm": 0.0045681241899728775, "kl": 0.09053802490234375, "learning_rate": 5.833333333333334e-07, "loss": -0.0256, "num_tokens": 59131670.0, "reward": 0.8032589554786682, "reward_std": 0.23741401731967926, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.5981472134590149, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6950893402099609, "step": 179 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5119186046511628, "calib/avg_num_step_conf": 14.18359375, "calib/ece": 0.3071825396825396, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.996031746031746, "calib/gap": 0.0012412790697673426, "calib/mean_conf": 0.9897222222222222, "calib/mu_c": 0.9901162790697673, "calib/mu_w": 0.988875, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3071825396825396, "calib/std_conf": 0.0058691267689480325, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9542917304747323, "calib/step_q_c_n": 2612.0, "calib/step_q_gap": 0.007510572476694954, "calib/step_q_w": 0.9467811579980373, "calib/step_q_w_n": 1019.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 1042.140625, "completions/mean_terminated_length": 1054.498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 73.0, "epoch": 0.192, "grad_norm": 0.00463694566860795, "kl": 0.09368133544921875, "learning_rate": 5.555555555555555e-07, "loss": -0.0686, "num_tokens": 59502314.0, "reward": 0.8891960978507996, "reward_std": 0.19617441296577454, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6786901950836182, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7684519290924072, "step": 180 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5021220159151194, "calib/avg_num_step_conf": 14.42578125, "calib/ece": 0.40779116465863463, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 4.1777188328873116e-05, "calib/mean_conf": 0.990120481927711, "calib/mu_c": 0.9901379310344827, "calib/mu_w": 0.9900961538461538, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.40779116465863463, "calib/std_conf": 0.001411919343875258, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9558508771929826, "calib/step_q_c_n": 2280.0, "calib/step_q_gap": 0.00989192460982613, "calib/step_q_w": 0.9459589525831564, "calib/step_q_w_n": 1413.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2407.0, "completions/max_terminated_length": 2407.0, "completions/mean_length": 969.40625, "completions/mean_terminated_length": 980.9012451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 524.0, "epoch": 0.19306666666666666, "grad_norm": 0.005453117191791534, "kl": 0.0947723388671875, "learning_rate": 5.277777777777779e-07, "loss": -0.0037, "num_tokens": 59856746.0, "reward": 0.774029016494751, "reward_std": 0.23932379484176636, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5704503655433655, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.6705763339996338, "step": 181 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5138689130881446, "calib/avg_num_step_conf": 13.86328125, "calib/ece": 0.36184000000000016, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006013286761182846, "calib/mean_conf": 0.9898400000000002, "calib/mu_c": 0.990063694267516, "calib/mu_w": 0.9894623655913977, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36184000000000016, "calib/std_conf": 0.0026785070468453146, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9526551418439718, "calib/step_q_c_n": 2256.0, "calib/step_q_gap": 0.005083602787513963, "calib/step_q_w": 0.9475715390564579, "calib/step_q_w_n": 1293.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2556.0, "completions/max_terminated_length": 2556.0, "completions/mean_length": 982.8828125, "completions/mean_terminated_length": 1002.462158203125, "completions/min_length": 0.0, "completions/min_terminated_length": 580.0, "epoch": 0.19413333333333332, "grad_norm": 0.004719921387732029, "kl": 0.09320831298828125, "learning_rate": 5.000000000000001e-07, "loss": -0.0668, "num_tokens": 60214524.0, "reward": 0.8014435768127441, "reward_std": 0.21979346871376038, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6169238090515137, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6687757968902588, "step": 182 }, { "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5206188782831119, "calib/avg_num_step_conf": 12.875, "calib/ece": 0.44996062992125974, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.0016114542391916231, "calib/mean_conf": 0.9893307086614173, "calib/mu_c": 0.99007299270073, "calib/mu_w": 0.9884615384615384, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44996062992125974, "calib/std_conf": 0.006695344625579726, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9549951124144673, "calib/step_q_c_n": 2046.0, "calib/step_q_gap": 0.014995112414467382, "calib/step_q_w": 0.94, "calib/step_q_w_n": 1250.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2109.0, "completions/max_terminated_length": 2109.0, "completions/mean_length": 1005.203125, "completions/mean_terminated_length": 1013.1181030273438, "completions/min_length": 0.0, "completions/min_terminated_length": 545.0, "epoch": 0.1952, "grad_norm": 0.004615194629877806, "kl": 0.0834503173828125, "learning_rate": 4.7222222222222226e-07, "loss": -0.0152, "num_tokens": 60578536.0, "reward": 0.7566883563995361, "reward_std": 0.2806575894355774, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5451648235321045, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.6635243892669678, "step": 183 }, { "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5048271671617607, "calib/avg_num_step_conf": 14.27734375, "calib/ece": 0.35390909090909084, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00010403726708074768, "calib/mean_conf": 0.9902727272727272, "calib/mu_c": 0.9903105590062111, "calib/mu_w": 0.9902065217391304, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35390909090909084, "calib/std_conf": 0.0016178131658055673, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9555344317701732, "calib/step_q_c_n": 2367.0, "calib/step_q_gap": 0.008435052888185579, "calib/step_q_w": 0.9470993788819876, "calib/step_q_w_n": 1288.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2886.0, "completions/max_terminated_length": 2886.0, "completions/mean_length": 1023.61328125, "completions/mean_terminated_length": 1031.6732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 464.0, "epoch": 0.19626666666666667, "grad_norm": 0.004508560057729483, "kl": 0.090850830078125, "learning_rate": 4.444444444444445e-07, "loss": -0.0372, "num_tokens": 60945861.0, "reward": 0.8435696363449097, "reward_std": 0.24632905423641205, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6358492374420166, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.727852463722229, "step": 184 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5033112582781457, "calib/avg_num_step_conf": 14.9765625, "calib/ece": 0.3733469387755102, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959183673469387, "calib/gap": 0.0010236719740737144, "calib/mean_conf": 0.989673469387755, "calib/mu_c": 0.9900662251655628, "calib/mu_w": 0.989042553191489, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3733469387755102, "calib/std_conf": 0.006052112989429228, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9567959595959596, "calib/step_q_c_n": 2475.0, "calib/step_q_gap": 0.012976975048498263, "calib/step_q_w": 0.9438189845474614, "calib/step_q_w_n": 1359.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 1003.09765625, "completions/mean_terminated_length": 1023.0797119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 550.0, "epoch": 0.19733333333333333, "grad_norm": 0.004620170220732689, "kl": 0.08580780029296875, "learning_rate": 4.1666666666666667e-07, "loss": -0.0247, "num_tokens": 61309574.0, "reward": 0.793574333190918, "reward_std": 0.2635893225669861, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.5977535247802734, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6800200939178467, "step": 185 }, { "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5205140377051404, "calib/avg_num_step_conf": 15.13671875, "calib/ece": 0.3988178137651821, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.0012060897870610487, "calib/mean_conf": 0.989910931174089, "calib/mu_c": 0.990404109589041, "calib/mu_w": 0.98919801980198, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3988178137651821, "calib/std_conf": 0.005955312834333486, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.95630730265935, "calib/step_q_c_n": 2369.0, "calib/step_q_gap": 0.004109427493347417, "calib/step_q_w": 0.9521978751660026, "calib/step_q_w_n": 1506.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2763.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 1023.27734375, "completions/mean_terminated_length": 1056.2862548828125, "completions/min_length": 0.0, "completions/min_terminated_length": 581.0, "epoch": 0.1984, "grad_norm": 0.004492494743317366, "kl": 0.0882720947265625, "learning_rate": 3.8888888888888895e-07, "loss": -0.0492, "num_tokens": 61676573.0, "reward": 0.786389172077179, "reward_std": 0.2791849374771118, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5787035226821899, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6870434880256653, "step": 186 }, { "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.49942881622161933, "calib/avg_num_step_conf": 15.3671875, "calib/ece": 0.37703703703703706, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.1423675567501235e-05, "calib/mean_conf": 0.9902057613168724, "calib/mu_c": 0.990201342281879, "calib/mu_w": 0.9902127659574465, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37703703703703706, "calib/std_conf": 0.001419603976186038, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9549226690234202, "calib/step_q_c_n": 2263.0, "calib/step_q_gap": -0.0017918731668851162, "calib/step_q_w": 0.9567145421903053, "calib/step_q_w_n": 1671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 1031.1953125, "completions/mean_terminated_length": 1068.769287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 632.0, "epoch": 0.19946666666666665, "grad_norm": 0.004322603810578585, "kl": 0.088409423828125, "learning_rate": 3.611111111111111e-07, "loss": -0.0475, "num_tokens": 62042103.0, "reward": 0.7957758903503418, "reward_std": 0.2533106207847595, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5891257524490356, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.6961760520935059, "step": 187 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4968436440959532, "calib/avg_num_step_conf": 14.23828125, "calib/ece": 0.3938400000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": 0.0007282875938601219, "calib/mean_conf": 0.98984, "calib/mu_c": 0.9901342281879193, "calib/mu_w": 0.9894059405940592, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3938400000000001, "calib/std_conf": 0.005930800957712203, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9524339106654514, "calib/step_q_c_n": 2194.0, "calib/step_q_gap": 0.004963889990055037, "calib/step_q_w": 0.9474700206753963, "calib/step_q_w_n": 1451.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 1047.0078125, "completions/mean_terminated_length": 1059.4229736328125, "completions/min_length": 0.0, "completions/min_terminated_length": 436.0, "epoch": 0.20053333333333334, "grad_norm": 0.004464924335479736, "kl": 0.08341217041015625, "learning_rate": 3.3333333333333335e-07, "loss": -0.0392, "num_tokens": 62414209.0, "reward": 0.7849684953689575, "reward_std": 0.29570263624191284, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.5902554392814636, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6679627299308777, "step": 188 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5136424731182796, "calib/avg_num_step_conf": 14.80078125, "calib/ece": 0.3724262948207173, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.0011098118279568858, "calib/mean_conf": 0.989956175298805, "calib/mu_c": 0.9903806451612902, "calib/mu_w": 0.9892708333333333, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3724262948207173, "calib/std_conf": 0.006014762028882856, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9537346221441124, "calib/step_q_c_n": 2276.0, "calib/step_q_gap": -0.0004094624560198046, "calib/step_q_w": 0.9541440846001322, "calib/step_q_w_n": 1513.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1918.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 976.66796875, "completions/mean_terminated_length": 996.12353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 539.0, "epoch": 0.2016, "grad_norm": 0.004789835307747126, "kl": 0.0922698974609375, "learning_rate": 3.055555555555556e-07, "loss": -0.0509, "num_tokens": 62772004.0, "reward": 0.810811460018158, "reward_std": 0.25243812799453735, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6133812665939331, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6910542249679565, "step": 189 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.48040992749581707, "calib/avg_num_step_conf": 14.265625, "calib/ece": 0.3406693227091634, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0005720161740099261, "calib/mean_conf": 0.9900717131474104, "calib/mu_c": 0.9898711656441718, "calib/mu_w": 0.9904431818181817, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3406693227091634, "calib/std_conf": 0.0029340080165571433, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.954537369519833, "calib/step_q_c_n": 2395.0, "calib/step_q_gap": 0.006061633640755959, "calib/step_q_w": 0.9484757358790771, "calib/step_q_w_n": 1257.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1010.9765625, "completions/mean_terminated_length": 1027.02392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 540.0, "epoch": 0.20266666666666666, "grad_norm": 0.004535827320069075, "kl": 0.081787109375, "learning_rate": 2.7777777777777776e-07, "loss": -0.0114, "num_tokens": 63136422.0, "reward": 0.8389154076576233, "reward_std": 0.18770283460617065, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6431839466094971, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7112092971801758, "step": 190 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.494969040247678, "calib/avg_num_step_conf": 15.87890625, "calib/ece": 0.44600000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.00032249742002055104, "calib/mean_conf": 0.9900000000000001, "calib/mu_c": 0.9898529411764706, "calib/mu_w": 0.9901754385964912, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44600000000000006, "calib/std_conf": 0.0028284271247461927, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9571325842696629, "calib/step_q_c_n": 2225.0, "calib/step_q_gap": 0.0028488886174888872, "calib/step_q_w": 0.954283695652174, "calib/step_q_w_n": 1840.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2323.0, "completions/max_terminated_length": 2323.0, "completions/mean_length": 1007.81640625, "completions/mean_terminated_length": 1032.0040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 642.0, "epoch": 0.20373333333333332, "grad_norm": 0.004224616568535566, "kl": 0.09447479248046875, "learning_rate": 2.5000000000000004e-07, "loss": -0.0385, "num_tokens": 63498591.0, "reward": 0.7333133220672607, "reward_std": 0.21269133687019348, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5398167371749878, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.6260286569595337, "step": 191 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5209741404642962, "calib/avg_num_step_conf": 15.7734375, "calib/ece": 0.3256275303643722, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.0017491918895095049, "calib/mean_conf": 0.9895951417004046, "calib/mu_c": 0.9901829268292682, "calib/mu_w": 0.9884337349397587, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3256275303643722, "calib/std_conf": 0.006349954308242158, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.9575814814814816, "calib/step_q_c_n": 2700.0, "calib/step_q_gap": 0.005765337983723806, "calib/step_q_w": 0.9518161434977578, "calib/step_q_w_n": 1338.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2912.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 1027.3203125, "completions/mean_terminated_length": 1051.97607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 616.0, "epoch": 0.2048, "grad_norm": 0.004591282922774553, "kl": 0.08702850341796875, "learning_rate": 2.2222222222222224e-07, "loss": -0.0471, "num_tokens": 63866561.0, "reward": 0.8294618725776672, "reward_std": 0.26131471991539, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6397891044616699, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7003844976425171, "step": 192 }, { "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5260966494000394, "calib/avg_num_step_conf": 13.76953125, "calib/ece": 0.38884920634920633, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9841269841269841, "calib/gap": 0.0038600747491965848, "calib/mean_conf": 0.9880555555555556, "calib/mu_c": 0.9896026490066222, "calib/mu_w": 0.9857425742574256, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38884920634920633, "calib/std_conf": 0.01622598619085944, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9574805531547105, "calib/step_q_c_n": 2314.0, "calib/step_q_gap": 0.014342650594842521, "calib/step_q_w": 0.943137902559868, "calib/step_q_w_n": 1211.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2240.0, "completions/max_terminated_length": 2240.0, "completions/mean_length": 1029.44140625, "completions/mean_terminated_length": 1041.6483154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 624.0, "epoch": 0.20586666666666667, "grad_norm": 0.0044668312184512615, "kl": 0.08332061767578125, "learning_rate": 1.9444444444444447e-07, "loss": -0.0367, "num_tokens": 64235810.0, "reward": 0.8036348223686218, "reward_std": 0.25816673040390015, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6006941199302673, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.6917317509651184, "step": 193 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4989662747124952, "calib/avg_num_step_conf": 14.11328125, "calib/ece": 0.42433864541832667, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -1.214627212819952e-05, "calib/mean_conf": 0.9900756972111554, "calib/mu_c": 0.990070422535211, "calib/mu_w": 0.9900825688073392, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42433864541832667, "calib/std_conf": 0.0008458046287836837, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9559910313901346, "calib/step_q_c_n": 2230.0, "calib/step_q_gap": 0.011313518736483119, "calib/step_q_w": 0.9446775126536515, "calib/step_q_w_n": 1383.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 1002.42578125, "completions/mean_terminated_length": 1018.3373413085938, "completions/min_length": 0.0, "completions/min_terminated_length": 552.0, "epoch": 0.20693333333333333, "grad_norm": 0.004665324464440346, "kl": 0.0855712890625, "learning_rate": 1.6666666666666668e-07, "loss": -0.0341, "num_tokens": 64598375.0, "reward": 0.7752304673194885, "reward_std": 0.2633058726787567, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5630354881286621, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6803940534591675, "step": 194 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5226934523809524, "calib/avg_num_step_conf": 14.0859375, "calib/ece": 0.37383200000000016, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": 0.00025148809523778404, "calib/mean_conf": 0.9898320000000002, "calib/mu_c": 0.9899285714285713, "calib/mu_w": 0.9896770833333335, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37383200000000016, "calib/std_conf": 0.006497674660984497, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.957998063516654, "calib/step_q_c_n": 2582.0, "calib/step_q_gap": 0.019829118204153895, "calib/step_q_w": 0.9381689453125001, "calib/step_q_w_n": 1024.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3046.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 1012.9765625, "completions/mean_terminated_length": 1024.9881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 610.0, "epoch": 0.208, "grad_norm": 0.004747307859361172, "kl": 0.08518218994140625, "learning_rate": 1.3888888888888888e-07, "loss": -0.0138, "num_tokens": 64963681.0, "reward": 0.8059474229812622, "reward_std": 0.1763087809085846, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.609162449836731, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.6871071457862854, "step": 195 }, { "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5233547127718907, "calib/avg_num_step_conf": 15.98828125, "calib/ece": 0.3395577689243028, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.003665853318460588, "calib/mean_conf": 0.9889601593625499, "calib/mu_c": 0.9902453987730061, "calib/mu_w": 0.9865795454545455, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3395577689243028, "calib/std_conf": 0.015500880529368611, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9559278737791136, "calib/step_q_c_n": 2662.0, "calib/step_q_gap": 0.00360851668617157, "calib/step_q_w": 0.952319357092942, "calib/step_q_w_n": 1431.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1949.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 959.38671875, "completions/mean_terminated_length": 970.7628784179688, "completions/min_length": 0.0, "completions/min_terminated_length": 230.0, "epoch": 0.20906666666666668, "grad_norm": 0.004649278707802296, "kl": 0.09981536865234375, "learning_rate": 1.1111111111111112e-07, "loss": -0.0537, "num_tokens": 65311828.0, "reward": 0.8333547711372375, "reward_std": 0.2155287116765976, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.6455902457237244, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.6976817846298218, "step": 196 }, { "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5071428571428571, "calib/avg_num_step_conf": 15.37109375, "calib/ece": 0.4209756097560976, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00014285714285722229, "calib/mean_conf": 0.9900813008130082, "calib/mu_c": 0.9901428571428572, "calib/mu_w": 0.99, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4209756097560976, "calib/std_conf": 0.0008979968306656318, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.9579148936170214, "calib/step_q_c_n": 2350.0, "calib/step_q_gap": 0.006564735888314743, "calib/step_q_w": 0.9513501577287067, "calib/step_q_w_n": 1585.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2866.0, "completions/max_terminated_length": 2866.0, "completions/mean_length": 997.734375, "completions/mean_terminated_length": 1017.609619140625, "completions/min_length": 0.0, "completions/min_terminated_length": 510.0, "epoch": 0.21013333333333334, "grad_norm": 0.004834165330976248, "kl": 0.09095001220703125, "learning_rate": 8.333333333333334e-08, "loss": -0.0356, "num_tokens": 65672304.0, "reward": 0.7570745348930359, "reward_std": 0.2978481650352478, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5549831986427307, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.6583846211433411, "step": 197 }, { "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5098461321287407, "calib/avg_num_step_conf": 13.91796875, "calib/ece": 0.34365461847389556, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00019692264257475944, "calib/mean_conf": 0.9902409638554217, "calib/mu_c": 0.9903105590062111, "calib/mu_w": 0.9901136363636364, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34365461847389556, "calib/std_conf": 0.001533484585705767, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.95375, "calib/step_q_c_n": 2456.0, "calib/step_q_gap": 0.010353432700993537, "calib/step_q_w": 0.9433965672990064, "calib/step_q_w_n": 1107.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 974.3046875, "completions/mean_terminated_length": 993.7131958007812, "completions/min_length": 0.0, "completions/min_terminated_length": 551.0, "epoch": 0.2112, "grad_norm": 0.0046066842041909695, "kl": 0.08931732177734375, "learning_rate": 5.555555555555556e-08, "loss": -0.0364, "num_tokens": 66027110.0, "reward": 0.8316128849983215, "reward_std": 0.23040342330932617, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6356081962585449, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7065237760543823, "step": 198 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5073781743308168, "calib/avg_num_step_conf": 14.921875, "calib/ece": 0.3677068273092371, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00014522992450238803, "calib/mean_conf": 0.9901967871485945, "calib/mu_c": 0.9902516129032256, "calib/mu_w": 0.9901063829787232, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3677068273092371, "calib/std_conf": 0.0016420154010191998, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.9583099460292983, "calib/step_q_c_n": 2594.0, "calib/step_q_gap": 0.014460027595366576, "calib/step_q_w": 0.9438499184339317, "calib/step_q_w_n": 1226.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2828.0, "completions/max_terminated_length": 2828.0, "completions/mean_length": 1036.42578125, "completions/mean_terminated_length": 1057.07177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 505.0, "epoch": 0.21226666666666666, "grad_norm": 0.004693194758147001, "kl": 0.0880126953125, "learning_rate": 2.777777777777778e-08, "loss": -0.0373, "num_tokens": 66396635.0, "reward": 0.8157464265823364, "reward_std": 0.2801714539527893, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6126382350921631, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7032294869422913, "step": 199 }, { "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5088183421516754, "calib/avg_num_step_conf": 14.90234375, "calib/ece": 0.31534136546184754, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00042548500881822715, "calib/mean_conf": 0.9900401606425704, "calib/mu_c": 0.9901785714285714, "calib/mu_w": 0.9897530864197531, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31534136546184754, "calib/std_conf": 0.002284572313787615, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.959687174139729, "calib/step_q_c_n": 2877.0, "calib/step_q_gap": 0.020625340451029595, "calib/step_q_w": 0.9390618336886994, "calib/step_q_w_n": 938.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2569.0, "completions/max_terminated_length": 2569.0, "completions/mean_length": 1016.55078125, "completions/mean_terminated_length": 1040.947998046875, "completions/min_length": 0.0, "completions/min_terminated_length": 519.0, "epoch": 0.21333333333333335, "grad_norm": 0.004448637366294861, "kl": 0.08927154541015625, "learning_rate": 0.0, "loss": -0.0612, "num_tokens": 66764920.0, "reward": 0.8657457232475281, "reward_std": 0.20025767385959625, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.6626328229904175, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7430773973464966, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.029105301516246983, "train_runtime": 21859.3656, "train_samples_per_second": 2.342, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 66764920, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }