{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7490277290344238, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.5698086470140988, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9343300461769104, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04297444224357605, "learning_rate": 2.5000000000000004e-07, "loss": -0.0135, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 0.8933746814727783, "reward_std": 0.19672557711601257, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420004606246948, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.7698483467102051, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.5081617139634353, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9345317482948303, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.0404808484017849, "learning_rate": 5.000000000000001e-07, "loss": -0.0158, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 0.8337589502334595, "reward_std": 0.1928534209728241, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291916012763977, "step": 2 }, { "adv/mean_abs_final_conf": 0.7833774089813232, "adv/mean_abs_reasoning": 0.44472596049308777, "adv/mean_abs_step_conf": 0.7741047143936157, "adv/ratio_final_to_reasoning": 1.761483427036185, "adv/ratio_step_to_reasoning": 1.7406330710609537, "adv/std_final_conf": 0.9287951588630676, "adv/std_reasoning": 0.7013906240463257, "adv/std_step_conf": 0.9333337545394897, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4731928514270472, "calib/avg_num_step_conf": 5.0703125, "calib/ece": 0.2416862745098039, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": -0.0023299546545747507, "calib/mean_conf": 0.8809019607843137, "calib/mu_c": 0.8800613496932514, "calib/mu_w": 0.8823913043478262, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2416862745098039, "calib/std_conf": 0.04315464889957421, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.798605577689243, "calib/step_q_c_n": 753.0, "calib/step_q_gap": 0.042623926313096194, "calib/step_q_w": 0.7559816513761468, "calib/step_q_w_n": 545.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2204.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 501.80078125, "completions/mean_terminated_length": 501.80078125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.05636599287390709, "learning_rate": 7.5e-07, "loss": 0.0098, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03296242654323578, "mask/share_reasoning": 0.855893075466156, "mask/share_step_conf": 0.11114451289176941, "num_tokens": 692378.0, "reward": 0.8860100507736206, "reward_std": 0.17941723763942719, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7003523707389832, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.746667742729187, "step": 3 }, { "adv/mean_abs_final_conf": 0.7777718305587769, "adv/mean_abs_reasoning": 0.4602840542793274, "adv/mean_abs_step_conf": 0.7568343877792358, "adv/ratio_final_to_reasoning": 1.689764881767508, "adv/ratio_step_to_reasoning": 1.6442767911311225, "adv/std_final_conf": 0.930294394493103, "adv/std_reasoning": 0.720619261264801, "adv/std_step_conf": 0.9336671233177185, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.44653713983610893, "calib/avg_num_step_conf": 4.9296875, "calib/ece": 0.261897233201581, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.25296442687747034, "calib/gap": -0.00561260904044425, "calib/mean_conf": 0.878498023715415, "calib/mu_c": 0.8763461538461537, "calib/mu_w": 0.8819587628865979, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.261897233201581, "calib/std_conf": 0.04157137055533821, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7959510869565216, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.017224851215076775, "calib/step_q_w": 0.7787262357414448, "calib/step_q_w_n": 526.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 498.515625, "completions/mean_terminated_length": 500.4706115722656, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.004266666666666667, "grad_norm": 0.04587812349200249, "learning_rate": 1.0000000000000002e-06, "loss": 0.0175, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.033491168171167374, "mask/share_reasoning": 0.8485321998596191, "mask/share_step_conf": 0.1140703409910202, "num_tokens": 926166.0, "reward": 0.8646347522735596, "reward_std": 0.1897309273481369, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6825304627418518, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7272076606750488, "step": 4 }, { "adv/mean_abs_final_conf": 0.7645877599716187, "adv/mean_abs_reasoning": 0.4408244490623474, "adv/mean_abs_step_conf": 0.773873507976532, "adv/ratio_final_to_reasoning": 1.7344495333639722, "adv/ratio_step_to_reasoning": 1.755514036534485, "adv/std_final_conf": 0.9312360286712646, "adv/std_reasoning": 0.7204880714416504, "adv/std_step_conf": 0.9340111017227173, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.46828609986504716, "calib/avg_num_step_conf": 4.95703125, "calib/ece": 0.3475200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.292, "calib/gap": 0.0034355118565644327, "calib/mean_conf": 0.87952, "calib/mu_c": 0.8811278195488723, "calib/mu_w": 0.8776923076923079, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.3475200000000001, "calib/std_conf": 0.05802559435283711, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.7969736842105262, "calib/step_q_c_n": 684.0, "calib/step_q_gap": 0.03234120557804765, "calib/step_q_w": 0.7646324786324785, "calib/step_q_w_n": 585.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 515.2890625, "completions/mean_terminated_length": 517.309814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.005333333333333333, "grad_norm": 0.037411727011203766, "learning_rate": 1.25e-06, "loss": -0.0572, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03343050926923752, "mask/share_reasoning": 0.850313663482666, "mask/share_step_conf": 0.11234962195158005, "num_tokens": 1164768.0, "reward": 0.7938537001609802, "reward_std": 0.17242825031280518, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6070008277893066, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6846128106117249, "step": 5 }, { "adv/mean_abs_final_conf": 0.767395555973053, "adv/mean_abs_reasoning": 0.43761974573135376, "adv/mean_abs_step_conf": 0.7543257474899292, "adv/ratio_final_to_reasoning": 1.753567025844721, "adv/ratio_step_to_reasoning": 1.7237013522534128, "adv/std_final_conf": 0.9321431517601013, "adv/std_reasoning": 0.7205691337585449, "adv/std_step_conf": 0.934429407119751, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4851431138493133, "calib/avg_num_step_conf": 5.29296875, "calib/ece": 0.29980237154150197, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3438735177865613, "calib/gap": -0.002955333076626765, "calib/mean_conf": 0.8808300395256917, "calib/mu_c": 0.879591836734694, "calib/mu_w": 0.8825471698113208, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.29980237154150197, "calib/std_conf": 0.04231099296039837, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7867022696929239, "calib/step_q_c_n": 749.0, "calib/step_q_gap": -0.009205321066152128, "calib/step_q_w": 0.795907590759076, "calib/step_q_w_n": 606.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 459.4765625, "completions/mean_terminated_length": 461.2784729003906, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.0064, "grad_norm": 0.04331325367093086, "learning_rate": 1.5e-06, "loss": -0.0298, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03640598803758621, "mask/share_reasoning": 0.8320022821426392, "mask/share_step_conf": 0.1276855170726776, "num_tokens": 1388346.0, "reward": 0.8372479677200317, "reward_std": 0.19077688455581665, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6518738269805908, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7109032273292542, "step": 6 }, { "adv/mean_abs_final_conf": 0.7665672898292542, "adv/mean_abs_reasoning": 0.511370062828064, "adv/mean_abs_step_conf": 0.7519409656524658, "adv/ratio_final_to_reasoning": 1.4990460833586072, "adv/ratio_step_to_reasoning": 1.4704438533103728, "adv/std_final_conf": 0.9309672713279724, "adv/std_reasoning": 0.7575966119766235, "adv/std_step_conf": 0.9342412352561951, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.45781045751633986, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.28166007905138346, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3359683794466403, "calib/gap": -0.00788823529411764, "calib/mean_conf": 0.8825296442687747, "calib/mu_c": 0.8794117647058824, "calib/mu_w": 0.8873000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27972332015810286, "calib/std_conf": 0.04346553892207595, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7818596059113301, "calib/step_q_c_n": 812.0, "calib/step_q_gap": 0.009118226600985158, "calib/step_q_w": 0.7727413793103449, "calib/step_q_w_n": 580.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 518.484375, "completions/mean_terminated_length": 522.5669555664062, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.007466666666666667, "grad_norm": 0.04959415644407272, "learning_rate": 1.75e-06, "loss": 0.0708, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.031138010323047638, "mask/share_reasoning": 0.8528178334236145, "mask/share_step_conf": 0.10823164880275726, "num_tokens": 1628502.0, "reward": 0.8639912605285645, "reward_std": 0.1998172402381897, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6701984405517578, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7405965328216553, "step": 7 }, { "adv/mean_abs_final_conf": 0.7736358046531677, "adv/mean_abs_reasoning": 0.48989439010620117, "adv/mean_abs_step_conf": 0.7445034384727478, "adv/ratio_final_to_reasoning": 1.5791889441425446, "adv/ratio_step_to_reasoning": 1.5197223187457842, "adv/std_final_conf": 0.9313343167304993, "adv/std_reasoning": 0.7574949264526367, "adv/std_step_conf": 0.9349920153617859, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.3862290862290862, "calib/avg_num_step_conf": 4.91015625, "calib/ece": 0.3273306772908367, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.2549800796812749, "calib/gap": -0.019002574002574102, "calib/mean_conf": 0.8776892430278885, "calib/mu_c": 0.8692857142857142, "calib/mu_w": 0.8882882882882883, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.32362549800796814, "calib/std_conf": 0.05307514634165364, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7813258785942492, "calib/step_q_c_n": 626.0, "calib/step_q_gap": -0.005805658648223022, "calib/step_q_w": 0.7871315372424722, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 531.32421875, "completions/mean_terminated_length": 533.4078979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.008533333333333334, "grad_norm": 0.036499813199043274, "learning_rate": 2.0000000000000003e-06, "loss": 0.0265, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03259691596031189, "mask/share_reasoning": 0.8604029417037964, "mask/share_step_conf": 0.10309390723705292, "num_tokens": 1871033.0, "reward": 0.8223322033882141, "reward_std": 0.1828649342060089, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6219609379768372, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7187970876693726, "step": 8 }, { "adv/mean_abs_final_conf": 0.8049112558364868, "adv/mean_abs_reasoning": 0.442771852016449, "adv/mean_abs_step_conf": 0.7661532163619995, "adv/ratio_final_to_reasoning": 1.817891657228889, "adv/ratio_step_to_reasoning": 1.73035664501441, "adv/std_final_conf": 0.9294659495353699, "adv/std_reasoning": 0.7015198469161987, "adv/std_step_conf": 0.9343773126602173, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4814764183185236, "calib/avg_num_step_conf": 5.03125, "calib/ece": 0.26160642570281123, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.2931726907630522, "calib/gap": 0.000470266575529954, "calib/mean_conf": 0.8800803212851405, "calib/mu_c": 0.8802597402597403, "calib/mu_w": 0.8797894736842103, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.26160642570281123, "calib/std_conf": 0.04436062482015925, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7749525101763906, "calib/step_q_c_n": 737.0, "calib/step_q_gap": 0.05956231053936689, "calib/step_q_w": 0.7153901996370237, "calib/step_q_w_n": 551.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2953.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 541.3359375, "completions/mean_terminated_length": 543.4588623046875, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.0096, "grad_norm": 0.05957135930657387, "learning_rate": 2.25e-06, "loss": 0.0745, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03317445516586304, "mask/share_reasoning": 0.8594812154769897, "mask/share_step_conf": 0.10343807935714722, "num_tokens": 2117151.0, "reward": 0.8370152115821838, "reward_std": 0.21288388967514038, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6695988178253174, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.6911502480506897, "step": 9 }, { "adv/mean_abs_final_conf": 0.785231351852417, "adv/mean_abs_reasoning": 0.4677700400352478, "adv/mean_abs_step_conf": 0.7750120162963867, "adv/ratio_final_to_reasoning": 1.6786696125156875, "adv/ratio_step_to_reasoning": 1.656822690563909, "adv/std_final_conf": 0.9293057918548584, "adv/std_reasoning": 0.720548152923584, "adv/std_step_conf": 0.9337735772132874, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.48350653704324503, "calib/avg_num_step_conf": 5.09765625, "calib/ece": 0.2613888888888889, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.34523809523809523, "calib/gap": -0.0022225947033186477, "calib/mean_conf": 0.8844047619047619, "calib/mu_c": 0.8835668789808919, "calib/mu_w": 0.8857894736842106, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2613888888888889, "calib/std_conf": 0.0453387039135699, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7862021857923497, "calib/step_q_c_n": 732.0, "calib/step_q_gap": 0.00939590306983662, "calib/step_q_w": 0.7768062827225131, "calib/step_q_w_n": 573.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2854.0, "completions/max_terminated_length": 2854.0, "completions/mean_length": 509.8984375, "completions/mean_terminated_length": 511.8980712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.010666666666666666, "grad_norm": 0.04875704273581505, "learning_rate": 2.5e-06, "loss": 0.0244, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03302618861198425, "mask/share_reasoning": 0.8515468835830688, "mask/share_step_conf": 0.11152061820030212, "num_tokens": 2354485.0, "reward": 0.869525671005249, "reward_std": 0.17988115549087524, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6828699111938477, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7366502285003662, "step": 10 }, { "adv/mean_abs_final_conf": 0.7646981477737427, "adv/mean_abs_reasoning": 0.4427228271961212, "adv/mean_abs_step_conf": 0.7652941942214966, "adv/ratio_final_to_reasoning": 1.727261620135503, "adv/ratio_step_to_reasoning": 1.728607939799047, "adv/std_final_conf": 0.9307788014411926, "adv/std_reasoning": 0.7205932140350342, "adv/std_step_conf": 0.9334892630577087, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.4243208692873123, "calib/avg_num_step_conf": 5.44921875, "calib/ece": 0.320748031496063, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3937007874015748, "calib/gap": -0.02912815596037066, "calib/mean_conf": 0.8799606299212598, "calib/mu_c": 0.8679194630872483, "calib/mu_w": 0.897047619047619, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30704724409448825, "calib/std_conf": 0.09864235099908569, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7714735516372797, "calib/step_q_c_n": 794.0, "calib/step_q_gap": -0.011438262006646993, "calib/step_q_w": 0.7829118136439267, "calib/step_q_w_n": 601.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 527.08203125, "completions/mean_terminated_length": 529.1490478515625, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.011733333333333333, "grad_norm": 0.04273128882050514, "learning_rate": 2.7500000000000004e-06, "loss": -0.0182, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03175712376832962, "mask/share_reasoning": 0.848095178604126, "mask/share_step_conf": 0.11624141037464142, "num_tokens": 2593898.0, "reward": 0.8353488445281982, "reward_std": 0.18423444032669067, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6413179636001587, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7153171300888062, "step": 11 }, { "adv/mean_abs_final_conf": 0.7719697952270508, "adv/mean_abs_reasoning": 0.4082704186439514, "adv/mean_abs_step_conf": 0.762624979019165, "adv/ratio_final_to_reasoning": 1.890829606982322, "adv/ratio_step_to_reasoning": 1.8679408161683217, "adv/std_final_conf": 0.9258157014846802, "adv/std_reasoning": 0.681792676448822, "adv/std_step_conf": 0.9339790940284729, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.471031746031746, "calib/avg_num_step_conf": 5.62890625, "calib/ece": 0.24112449799196795, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.42971887550200805, "calib/gap": -0.00808441558441575, "calib/mean_conf": 0.8933333333333333, "calib/mu_c": 0.8906060606060605, "calib/mu_w": 0.8986904761904763, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.23590361445783142, "calib/std_conf": 0.0485423399560318, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7722249690976515, "calib/step_q_c_n": 809.0, "calib/step_q_gap": 0.020690158971069295, "calib/step_q_w": 0.7515348101265822, "calib/step_q_w_n": 632.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 482.05078125, "completions/mean_terminated_length": 485.8464660644531, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.0128, "grad_norm": 0.03738216683268547, "learning_rate": 3e-06, "loss": 0.0016, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03594135120511055, "mask/share_reasoning": 0.8288105726242065, "mask/share_step_conf": 0.1274355798959732, "num_tokens": 2821479.0, "reward": 0.8925774097442627, "reward_std": 0.18727068603038788, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.6938386559486389, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7686598300933838, "step": 12 }, { "adv/mean_abs_final_conf": 0.7644088268280029, "adv/mean_abs_reasoning": 0.40752342343330383, "adv/mean_abs_step_conf": 0.7455885410308838, "adv/ratio_final_to_reasoning": 1.8757420625985386, "adv/ratio_step_to_reasoning": 1.8295599667608025, "adv/std_final_conf": 0.9257137179374695, "adv/std_reasoning": 0.6815844178199768, "adv/std_step_conf": 0.9348680973052979, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5062893081761006, "calib/avg_num_step_conf": 4.67578125, "calib/ece": 0.2684705882352941, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.4745098039215686, "calib/gap": 0.010391116352201357, "calib/mean_conf": 0.892, "calib/mu_c": 0.8959119496855346, "calib/mu_w": 0.8855208333333332, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2684705882352941, "calib/std_conf": 0.05516748118585945, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7643037974683544, "calib/step_q_c_n": 711.0, "calib/step_q_gap": 0.022390217221440967, "calib/step_q_w": 0.7419135802469135, "calib/step_q_w_n": 486.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 474.00390625, "completions/mean_terminated_length": 474.00390625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.013866666666666666, "grad_norm": 0.04036780446767807, "learning_rate": 3.2500000000000002e-06, "loss": 0.0452, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035814374685287476, "mask/share_reasoning": 0.8521537780761719, "mask/share_step_conf": 0.11203181743621826, "num_tokens": 3047416.0, "reward": 0.890767514705658, "reward_std": 0.1727554202079773, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6923031210899353, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7657943964004517, "step": 13 }, { "adv/mean_abs_final_conf": 0.7648312449455261, "adv/mean_abs_reasoning": 0.4855183959007263, "adv/mean_abs_step_conf": 0.7825720906257629, "adv/ratio_final_to_reasoning": 1.575287880753978, "adv/ratio_step_to_reasoning": 1.61182788795046, "adv/std_final_conf": 0.9267758131027222, "adv/std_reasoning": 0.7393484115600586, "adv/std_step_conf": 0.9343113899230957, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.451698717948718, "calib/avg_num_step_conf": 5.6640625, "calib/ece": 0.39339999999999997, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.616, "calib/gap": -0.004999999999999893, "calib/mean_conf": 0.9134, "calib/mu_c": 0.911, "calib/mu_w": 0.9159999999999999, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.39339999999999997, "calib/std_conf": 0.03798999868386414, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7223162274618585, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.021849835143614382, "calib/step_q_w": 0.7004663923182441, "calib/step_q_w_n": 729.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2203.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 535.94921875, "completions/mean_terminated_length": 542.3043823242188, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.014933333333333333, "grad_norm": 0.046021830290555954, "learning_rate": 3.5e-06, "loss": -0.0558, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.0320635549724102, "mask/share_reasoning": 0.840599775314331, "mask/share_step_conf": 0.11561790108680725, "num_tokens": 3290019.0, "reward": 0.8165234923362732, "reward_std": 0.19196359813213348, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5778292417526245, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7583426237106323, "step": 14 }, { "adv/mean_abs_final_conf": 0.764102578163147, "adv/mean_abs_reasoning": 0.351371169090271, "adv/mean_abs_step_conf": 0.7570402026176453, "adv/ratio_final_to_reasoning": 2.174630833091604, "adv/ratio_step_to_reasoning": 2.1545313594672124, "adv/std_final_conf": 0.9198284149169922, "adv/std_reasoning": 0.6185474991798401, "adv/std_step_conf": 0.9339830279350281, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5321258758758759, "calib/avg_num_step_conf": 4.71484375, "calib/ece": 0.3382812499999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.66015625, "calib/gap": 0.007077077077077054, "calib/mean_conf": 0.9164062499999999, "calib/mu_c": 0.9193918918918919, "calib/mu_w": 0.9123148148148148, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3382812499999999, "calib/std_conf": 0.04166897780048725, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6848280802292264, "calib/step_q_c_n": 698.0, "calib/step_q_gap": -0.021380171244250956, "calib/step_q_w": 0.7062082514734773, "calib/step_q_w_n": 509.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 449.796875, "completions/mean_terminated_length": 451.5608215332031, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.016, "grad_norm": 0.035176970064640045, "learning_rate": 3.7500000000000005e-06, "loss": -0.0255, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03521725535392761, "mask/share_reasoning": 0.8471869230270386, "mask/share_step_conf": 0.11368949711322784, "num_tokens": 3513047.0, "reward": 0.8684121370315552, "reward_std": 0.153379887342453, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.643385112285614, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.7778139114379883, "step": 15 }, { "adv/mean_abs_final_conf": 0.776970624923706, "adv/mean_abs_reasoning": 0.40734297037124634, "adv/mean_abs_step_conf": 0.7721484303474426, "adv/ratio_final_to_reasoning": 1.907411398840605, "adv/ratio_step_to_reasoning": 1.8955732306948074, "adv/std_final_conf": 0.9214528203010559, "adv/std_reasoning": 0.6816308498382568, "adv/std_step_conf": 0.9345147609710693, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.548993288590604, "calib/avg_num_step_conf": 6.21484375, "calib/ece": 0.33036144578313253, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7710843373493976, "calib/gap": 0.0064422818791946, "calib/mean_conf": 0.9287550200803213, "calib/mu_c": 0.9313422818791945, "calib/mu_w": 0.9248999999999999, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.33036144578313253, "calib/std_conf": 0.038854793847339794, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6253367346938775, "calib/step_q_c_n": 980.0, "calib/step_q_gap": -0.005105163833127424, "calib/step_q_w": 0.6304418985270049, "calib/step_q_w_n": 611.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2467.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 623.08203125, "completions/mean_terminated_length": 627.9881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 214.0, "epoch": 0.017066666666666667, "grad_norm": 0.042664941400289536, "learning_rate": 4.000000000000001e-06, "loss": -0.0754, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0263189896941185, "mask/share_reasoning": 0.8583731651306152, "mask/share_step_conf": 0.10749533772468567, "num_tokens": 3781404.0, "reward": 0.8671663999557495, "reward_std": 0.16728359460830688, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6342976689338684, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7890976667404175, "step": 16 }, { "adv/mean_abs_final_conf": 0.7509101629257202, "adv/mean_abs_reasoning": 0.4348292946815491, "adv/mean_abs_step_conf": 0.7716478109359741, "adv/ratio_final_to_reasoning": 1.726907943209428, "adv/ratio_step_to_reasoning": 1.774599412629494, "adv/std_final_conf": 0.9179096221923828, "adv/std_reasoning": 0.7013534307479858, "adv/std_step_conf": 0.934508740901947, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6060933448573897, "calib/avg_num_step_conf": 5.61328125, "calib/ece": 0.23671874999999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8046875, "calib/gap": 0.01822529530394701, "calib/mean_conf": 0.93203125, "calib/mu_c": 0.9375842696629213, "calib/mu_w": 0.9193589743589743, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.23671874999999998, "calib/std_conf": 0.04428881375062442, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6054375614552606, "calib/step_q_c_n": 1017.0, "calib/step_q_gap": 0.014270894788594002, "calib/step_q_w": 0.5911666666666666, "calib/step_q_w_n": 420.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 501.57421875, "completions/mean_terminated_length": 503.54119873046875, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.018133333333333335, "grad_norm": 0.049540191888809204, "learning_rate": 4.25e-06, "loss": 0.0029, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03329847380518913, "mask/share_reasoning": 0.840636134147644, "mask/share_step_conf": 0.12215914577245712, "num_tokens": 4013335.0, "reward": 0.9493035078048706, "reward_std": 0.1720583438873291, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.72954922914505, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8323389291763306, "step": 17 }, { "adv/mean_abs_final_conf": 0.7602880001068115, "adv/mean_abs_reasoning": 0.4945884346961975, "adv/mean_abs_step_conf": 0.7385061979293823, "adv/ratio_final_to_reasoning": 1.5372134622877318, "adv/ratio_step_to_reasoning": 1.4931732044705253, "adv/std_final_conf": 0.918209433555603, "adv/std_reasoning": 0.7575898766517639, "adv/std_step_conf": 0.9352489709854126, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.46421066456527454, "calib/avg_num_step_conf": 5.13671875, "calib/ece": 0.37911646586345377, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.891566265060241, "calib/gap": -0.0016154452324664836, "calib/mean_conf": 0.9453815261044177, "calib/mu_c": 0.9446808510638297, "calib/mu_w": 0.9462962962962962, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.37911646586345377, "calib/std_conf": 0.0393822149644015, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6216381766381767, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.044199351189563196, "calib/step_q_w": 0.5774388254486135, "calib/step_q_w_n": 613.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2924.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 518.8203125, "completions/mean_terminated_length": 522.905517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.0192, "grad_norm": 0.027683550491929054, "learning_rate": 4.5e-06, "loss": -0.0408, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0321136899292469, "mask/share_reasoning": 0.8539448976516724, "mask/share_step_conf": 0.10612886399030685, "num_tokens": 4256873.0, "reward": 0.8278531432151794, "reward_std": 0.2118024230003357, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.5909234285354614, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7616577744483948, "step": 18 }, { "adv/mean_abs_final_conf": 0.7561960816383362, "adv/mean_abs_reasoning": 0.4372809827327728, "adv/mean_abs_step_conf": 0.7717266082763672, "adv/ratio_final_to_reasoning": 1.7293138999837454, "adv/ratio_step_to_reasoning": 1.7648300263448176, "adv/std_final_conf": 0.9090594053268433, "adv/std_reasoning": 0.6816985607147217, "adv/std_step_conf": 0.9350162148475647, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5666752395036327, "calib/avg_num_step_conf": 4.6328125, "calib/ece": 0.3575590551181102, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9291338582677166, "calib/gap": 0.012099273452066961, "calib/mean_conf": 0.9520472440944882, "calib/mu_c": 0.9569536423841057, "calib/mu_w": 0.9448543689320388, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3575590551181102, "calib/std_conf": 0.043927975816489856, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5789985486211902, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.013404987252980893, "calib/step_q_w": 0.5655935613682093, "calib/step_q_w_n": 497.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 495.1953125, "completions/mean_terminated_length": 495.1953125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.020266666666666665, "grad_norm": 0.029574958607554436, "learning_rate": 4.75e-06, "loss": 0.012, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.031995534896850586, "mask/share_reasoning": 0.8613891005516052, "mask/share_step_conf": 0.10661540180444717, "num_tokens": 4488403.0, "reward": 0.8745803833007812, "reward_std": 0.1895061880350113, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6300226449966431, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8027318716049194, "step": 19 }, { "adv/mean_abs_final_conf": 0.7200021743774414, "adv/mean_abs_reasoning": 0.45562490820884705, "adv/mean_abs_step_conf": 0.7624484300613403, "adv/ratio_final_to_reasoning": 1.5802520042372452, "adv/ratio_step_to_reasoning": 1.673412529307667, "adv/std_final_conf": 0.8881762623786926, "adv/std_reasoning": 0.7392408847808838, "adv/std_step_conf": 0.9350560903549194, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.49484369964550434, "calib/avg_num_step_conf": 5.5546875, "calib/ece": 0.39174603174603184, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9841269841269841, "calib/gap": -0.000243635191749636, "calib/mean_conf": 0.9650000000000002, "calib/mu_c": 0.9648965517241381, "calib/mu_w": 0.9651401869158878, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3906746031746033, "calib/std_conf": 0.028984122637736764, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5484782608695652, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.012501584484725492, "calib/step_q_w": 0.5359766763848397, "calib/step_q_w_n": 686.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2861.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 494.953125, "completions/mean_terminated_length": 494.953125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.021333333333333333, "grad_norm": 0.027805835008621216, "learning_rate": 5e-06, "loss": 0.0766, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03540211543440819, "mask/share_reasoning": 0.8361475467681885, "mask/share_step_conf": 0.12845034897327423, "num_tokens": 4719983.0, "reward": 0.8451118469238281, "reward_std": 0.19244298338890076, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5935140252113342, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7865532040596008, "step": 20 }, { "adv/mean_abs_final_conf": 0.730783224105835, "adv/mean_abs_reasoning": 0.5323715209960938, "adv/mean_abs_step_conf": 0.7476698756217957, "adv/ratio_final_to_reasoning": 1.3726940591008756, "adv/ratio_step_to_reasoning": 1.4044137338955847, "adv/std_final_conf": 0.9159253835678101, "adv/std_reasoning": 0.7927213907241821, "adv/std_step_conf": 0.935133695602417, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5736628088833282, "calib/avg_num_step_conf": 5.69921875, "calib/ece": 0.4265748031496064, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": 0.004594932749452663, "calib/mean_conf": 0.9738188976377953, "calib/mu_c": 0.9758992805755397, "calib/mu_w": 0.971304347826087, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4265748031496064, "calib/std_conf": 0.0204087820076742, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5161962365591397, "calib/step_q_c_n": 744.0, "calib/step_q_gap": 0.00516127152417456, "calib/step_q_w": 0.5110349650349651, "calib/step_q_w_n": 715.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 500.796875, "completions/mean_terminated_length": 500.796875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0224, "grad_norm": 0.027695661410689354, "learning_rate": 4.9722222222222224e-06, "loss": 0.0169, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033906854689121246, "mask/share_reasoning": 0.8413227796554565, "mask/share_step_conf": 0.12477035075426102, "num_tokens": 4951147.0, "reward": 0.8373119831085205, "reward_std": 0.213922381401062, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5676566362380981, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.7999359965324402, "step": 21 }, { "adv/mean_abs_final_conf": 0.6912540197372437, "adv/mean_abs_reasoning": 0.38511404395103455, "adv/mean_abs_step_conf": 0.7476215362548828, "adv/ratio_final_to_reasoning": 1.7949332946817524, "adv/ratio_step_to_reasoning": 1.9412990723078887, "adv/std_final_conf": 0.8591064214706421, "adv/std_reasoning": 0.6612229943275452, "adv/std_step_conf": 0.934601366519928, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.48827392120075047, "calib/avg_num_step_conf": 5.75390625, "calib/ece": 0.33431372549019595, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9921568627450981, "calib/gap": -0.00037523452157595116, "calib/mean_conf": 0.9774509803921568, "calib/mu_c": 0.9773170731707316, "calib/mu_w": 0.9776923076923075, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33431372549019595, "calib/std_conf": 0.018916030887293125, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5184591194968553, "calib/step_q_c_n": 954.0, "calib/step_q_gap": 0.014644090595121173, "calib/step_q_w": 0.5038150289017341, "calib/step_q_w_n": 519.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 485.66796875, "completions/mean_terminated_length": 485.66796875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.023466666666666667, "grad_norm": 0.03960420563817024, "learning_rate": 4.944444444444445e-06, "loss": -0.0118, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03314411640167236, "mask/share_reasoning": 0.8393899202346802, "mask/share_step_conf": 0.12746594846248627, "num_tokens": 5177294.0, "reward": 0.899250328540802, "reward_std": 0.16604021191596985, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.6556215286254883, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8155354261398315, "step": 22 }, { "adv/mean_abs_final_conf": 0.6920627355575562, "adv/mean_abs_reasoning": 0.4975743293762207, "adv/mean_abs_step_conf": 0.7552143335342407, "adv/ratio_final_to_reasoning": 1.3908730710146442, "adv/ratio_step_to_reasoning": 1.517791993974062, "adv/std_final_conf": 0.8740708231925964, "adv/std_reasoning": 0.7574687600135803, "adv/std_step_conf": 0.9346686005592346, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.47294472114616, "calib/avg_num_step_conf": 5.5625, "calib/ece": 0.44109374999999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.99609375, "calib/gap": -0.0029650126053004655, "calib/mean_conf": 0.9825781250000002, "calib/mu_c": 0.9812230215827337, "calib/mu_w": 0.9841880341880341, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44035156249999996, "calib/std_conf": 0.015499299064292392, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5508701298701298, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.03620652130743862, "calib/step_q_w": 0.5146636085626912, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1808.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 502.3828125, "completions/mean_terminated_length": 504.35296630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.024533333333333334, "grad_norm": 0.031366974115371704, "learning_rate": 4.9166666666666665e-06, "loss": -0.0028, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034005146473646164, "mask/share_reasoning": 0.8374161720275879, "mask/share_step_conf": 0.12467247992753983, "num_tokens": 5409840.0, "reward": 0.8348518013954163, "reward_std": 0.1989361047744751, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5568780899047852, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8042316436767578, "step": 23 }, { "adv/mean_abs_final_conf": 0.7143040895462036, "adv/mean_abs_reasoning": 0.5833674669265747, "adv/mean_abs_step_conf": 0.765878438949585, "adv/ratio_final_to_reasoning": 1.2244496480228118, "adv/ratio_step_to_reasoning": 1.3128576452584078, "adv/std_final_conf": 0.9137648940086365, "adv/std_reasoning": 0.8265848159790039, "adv/std_step_conf": 0.9351579546928406, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5699480138629699, "calib/avg_num_step_conf": 6.0859375, "calib/ece": 0.4768979591836736, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.003844974673420265, "calib/mean_conf": 0.9830204081632654, "calib/mu_c": 0.9849193548387096, "calib/mu_w": 0.9810743801652894, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.4768979591836736, "calib/std_conf": 0.011457979655793888, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5617678100263852, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.03691368502638526, "calib/step_q_w": 0.524854125, "calib/step_q_w_n": 800.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2596.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 612.45703125, "completions/mean_terminated_length": 612.45703125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.0256, "grad_norm": 0.036034706979990005, "learning_rate": 4.888888888888889e-06, "loss": 0.0124, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03095119819045067, "mask/share_reasoning": 0.8476190567016602, "mask/share_step_conf": 0.12142970412969589, "num_tokens": 5671141.0, "reward": 0.7759683728218079, "reward_std": 0.24071954190731049, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.5017863512039185, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7626503705978394, "step": 24 }, { "adv/mean_abs_final_conf": 0.6553305983543396, "adv/mean_abs_reasoning": 0.42382392287254333, "adv/mean_abs_step_conf": 0.7621168494224548, "adv/ratio_final_to_reasoning": 1.5462331477485223, "adv/ratio_step_to_reasoning": 1.798192146061671, "adv/std_final_conf": 0.8428783416748047, "adv/std_reasoning": 0.7013589143753052, "adv/std_step_conf": 0.9344834089279175, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5387245961438248, "calib/avg_num_step_conf": 5.44921875, "calib/ece": 0.3846245059288538, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007731891610215369, "calib/mean_conf": 0.9854150197628458, "calib/mu_c": 0.9857236842105263, "calib/mu_w": 0.9849504950495047, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3846245059288538, "calib/std_conf": 0.009257284905080909, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5470713391739674, "calib/step_q_c_n": 799.0, "calib/step_q_gap": 0.01965523179141715, "calib/step_q_w": 0.5274161073825503, "calib/step_q_w_n": 596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2119.0, "completions/max_terminated_length": 2119.0, "completions/mean_length": 486.20703125, "completions/mean_terminated_length": 488.11376953125, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.02666666666666667, "grad_norm": 0.046743251383304596, "learning_rate": 4.861111111111111e-06, "loss": -0.03, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03313041105866432, "mask/share_reasoning": 0.8396614789962769, "mask/share_step_conf": 0.12330187857151031, "num_tokens": 5898834.0, "reward": 0.8585910797119141, "reward_std": 0.18948256969451904, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6053300499916077, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7954457402229309, "step": 25 }, { "adv/mean_abs_final_conf": 0.5678051710128784, "adv/mean_abs_reasoning": 0.40700194239616394, "adv/mean_abs_step_conf": 0.7359786033630371, "adv/ratio_final_to_reasoning": 1.3950920422394282, "adv/ratio_step_to_reasoning": 1.808292606738119, "adv/std_final_conf": 0.7967524528503418, "adv/std_reasoning": 0.6817211508750916, "adv/std_step_conf": 0.9349690675735474, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5277665043290043, "calib/avg_num_step_conf": 5.4609375, "calib/ece": 0.37139999999999995, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0006831709956709453, "calib/mean_conf": 0.9873999999999999, "calib/mu_c": 0.9876623376623378, "calib/mu_w": 0.9869791666666669, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.37139999999999995, "calib/std_conf": 0.007158212067269318, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5790322580645161, "calib/step_q_c_n": 744.0, "calib/step_q_gap": 0.07202919996054047, "calib/step_q_w": 0.5070030581039756, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2547.0, "completions/max_terminated_length": 2547.0, "completions/mean_length": 517.3203125, "completions/mean_terminated_length": 521.3936767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 221.0, "epoch": 0.027733333333333332, "grad_norm": 0.04296651855111122, "learning_rate": 4.833333333333333e-06, "loss": 0.0238, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030154578387737274, "mask/share_reasoning": 0.8507611751556396, "mask/share_step_conf": 0.1112716943025589, "num_tokens": 6136508.0, "reward": 0.862993597984314, "reward_std": 0.19204775989055634, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6108921766281128, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8002512454986572, "step": 26 }, { "adv/mean_abs_final_conf": 0.5804013013839722, "adv/mean_abs_reasoning": 0.4634118974208832, "adv/mean_abs_step_conf": 0.7674976587295532, "adv/ratio_final_to_reasoning": 1.2524523099518872, "adv/ratio_step_to_reasoning": 1.656188939043339, "adv/std_final_conf": 0.8160413503646851, "adv/std_reasoning": 0.7392897605895996, "adv/std_step_conf": 0.9348019957542419, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5466738881101059, "calib/avg_num_step_conf": 6.0234375, "calib/ece": 0.4584462151394423, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.001320249776984861, "calib/mean_conf": 0.9883266932270917, "calib/mu_c": 0.9889473684210525, "calib/mu_w": 0.9876271186440676, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.4584462151394423, "calib/std_conf": 0.006708860513081049, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5329224904701397, "calib/step_q_c_n": 787.0, "calib/step_q_gap": -0.0029583042318469532, "calib/step_q_w": 0.5358807947019867, "calib/step_q_w_n": 755.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3051.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 521.30859375, "completions/mean_terminated_length": 525.4133911132812, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.0288, "grad_norm": 0.03022969514131546, "learning_rate": 4.805555555555556e-06, "loss": 0.0168, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032159917056560516, "mask/share_reasoning": 0.8321437835693359, "mask/share_step_conf": 0.12788382172584534, "num_tokens": 6375179.0, "reward": 0.8111745119094849, "reward_std": 0.2072419822216034, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5307597517967224, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7915892004966736, "step": 27 }, { "adv/mean_abs_final_conf": 0.6053963899612427, "adv/mean_abs_reasoning": 0.4087706208229065, "adv/mean_abs_step_conf": 0.7480028867721558, "adv/ratio_final_to_reasoning": 1.4810173704326006, "adv/ratio_step_to_reasoning": 1.82988416649497, "adv/std_final_conf": 0.7858245968818665, "adv/std_reasoning": 0.6818886995315552, "adv/std_step_conf": 0.9349893927574158, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5225119381490184, "calib/avg_num_step_conf": 5.0625, "calib/ece": 0.3102032520325204, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.00048737967103795743, "calib/mean_conf": 0.9890650406504066, "calib/mu_c": 0.9892215568862275, "calib/mu_w": 0.9887341772151895, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.3102032520325204, "calib/std_conf": 0.007066743178062233, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5377925211097708, "calib/step_q_c_n": 829.0, "calib/step_q_gap": 0.03231072239456745, "calib/step_q_w": 0.5054817987152034, "calib/step_q_w_n": 467.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2992.0, "completions/max_terminated_length": 2992.0, "completions/mean_length": 573.41015625, "completions/mean_terminated_length": 573.41015625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.029866666666666666, "grad_norm": 0.04057210683822632, "learning_rate": 4.777777777777778e-06, "loss": 0.0274, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.02998589724302292, "mask/share_reasoning": 0.865739107131958, "mask/share_step_conf": 0.10427501797676086, "num_tokens": 6628916.0, "reward": 0.8797591924667358, "reward_std": 0.2143753170967102, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6552281379699707, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7824151515960693, "step": 28 }, { "adv/mean_abs_final_conf": 0.6272884607315063, "adv/mean_abs_reasoning": 0.4339601397514343, "adv/mean_abs_step_conf": 0.7357267141342163, "adv/ratio_final_to_reasoning": 1.445497877041904, "adv/ratio_step_to_reasoning": 1.6953785537898232, "adv/std_final_conf": 0.8206400275230408, "adv/std_reasoning": 0.7014197111129761, "adv/std_step_conf": 0.9344742894172668, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5213920817369093, "calib/avg_num_step_conf": 5.9765625, "calib/ece": 0.45179282868525916, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0010964240102171452, "calib/mean_conf": 0.9896414342629484, "calib/mu_c": 0.9901481481481481, "calib/mu_w": 0.989051724137931, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45179282868525916, "calib/std_conf": 0.006991095384555814, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5233561643835616, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.06595616438356156, "calib/step_q_w": 0.45740000000000003, "calib/step_q_w_n": 800.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2913.0, "completions/max_terminated_length": 2913.0, "completions/mean_length": 621.78125, "completions/mean_terminated_length": 624.2196655273438, "completions/min_length": 0.0, "completions/min_terminated_length": 207.0, "epoch": 0.030933333333333334, "grad_norm": 0.050297658890485764, "learning_rate": 4.75e-06, "loss": -0.0194, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.027649879455566406, "mask/share_reasoning": 0.8580390214920044, "mask/share_step_conf": 0.11040481925010681, "num_tokens": 6895220.0, "reward": 0.831365704536438, "reward_std": 0.18563544750213623, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5371124744415283, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8240563869476318, "step": 29 }, { "adv/mean_abs_final_conf": 0.7028975486755371, "adv/mean_abs_reasoning": 0.5859472155570984, "adv/mean_abs_step_conf": 0.7569370269775391, "adv/ratio_final_to_reasoning": 1.199591925711681, "adv/ratio_step_to_reasoning": 1.2918177727969395, "adv/std_final_conf": 0.86234050989151, "adv/std_reasoning": 0.8099966645240784, "adv/std_step_conf": 0.9351039528846741, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6026850507982584, "calib/avg_num_step_conf": 6.09765625, "calib/ece": 0.41361445783132533, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0034272331442141146, "calib/mean_conf": 0.9879116465863454, "calib/mu_c": 0.9893706293706291, "calib/mu_w": 0.985943396226415, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.41361445783132533, "calib/std_conf": 0.007089235072647791, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.48403614457831323, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.028659947587889145, "calib/step_q_w": 0.4553761969904241, "calib/step_q_w_n": 731.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 637.62890625, "completions/mean_terminated_length": 637.62890625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.032, "grad_norm": 0.037932589650154114, "learning_rate": 4.722222222222222e-06, "loss": 0.034, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.027265505865216255, "mask/share_reasoning": 0.8616552352905273, "mask/share_step_conf": 0.11107931286096573, "num_tokens": 7165437.0, "reward": 0.829302966594696, "reward_std": 0.25291186571121216, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5660597681999207, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7878586053848267, "step": 30 }, { "adv/mean_abs_final_conf": 0.6084252595901489, "adv/mean_abs_reasoning": 0.4179837703704834, "adv/mean_abs_step_conf": 0.7556965351104736, "adv/ratio_final_to_reasoning": 1.4556193391213876, "adv/ratio_step_to_reasoning": 1.8079566449210593, "adv/std_final_conf": 0.8313267827033997, "adv/std_reasoning": 0.7013306617736816, "adv/std_step_conf": 0.9342578053474426, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.47851465474416294, "calib/avg_num_step_conf": 6.32421875, "calib/ece": 0.5090157480314961, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0002893691008444277, "calib/mean_conf": 0.9893307086614174, "calib/mu_c": 0.9891803278688525, "calib/mu_w": 0.9894696969696969, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5090157480314961, "calib/std_conf": 0.005677077985938627, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4703526093088857, "calib/step_q_c_n": 709.0, "calib/step_q_gap": 0.05286909282536928, "calib/step_q_w": 0.41748351648351645, "calib/step_q_w_n": 910.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 589.484375, "completions/mean_terminated_length": 589.484375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.03306666666666667, "grad_norm": 0.06749963015317917, "learning_rate": 4.694444444444445e-06, "loss": -0.0608, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.028801191598176956, "mask/share_reasoning": 0.8561272621154785, "mask/share_step_conf": 0.11507159471511841, "num_tokens": 7422257.0, "reward": 0.803888201713562, "reward_std": 0.1765424907207489, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.48727697134017944, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8267495036125183, "step": 31 }, { "adv/mean_abs_final_conf": 0.6257480382919312, "adv/mean_abs_reasoning": 0.44627708196640015, "adv/mean_abs_step_conf": 0.7526584267616272, "adv/ratio_final_to_reasoning": 1.4021514067779157, "adv/ratio_step_to_reasoning": 1.6865271759984624, "adv/std_final_conf": 0.8275420665740967, "adv/std_reasoning": 0.7205285429954529, "adv/std_step_conf": 0.9348967671394348, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5414493293591655, "calib/avg_num_step_conf": 5.81640625, "calib/ece": 0.46748031496063003, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.001327620466964552, "calib/mean_conf": 0.9871653543307087, "calib/mu_c": 0.9878030303030302, "calib/mu_w": 0.9864754098360656, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46748031496063003, "calib/std_conf": 0.006568552830858001, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5145, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.11222014475271408, "calib/step_q_w": 0.4022798552472859, "calib/step_q_w_n": 829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 551.62890625, "completions/mean_terminated_length": 551.62890625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.034133333333333335, "grad_norm": 0.04086047038435936, "learning_rate": 4.666666666666667e-06, "loss": 0.033, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.030038587749004364, "mask/share_reasoning": 0.8557687401771545, "mask/share_step_conf": 0.11419267952442169, "num_tokens": 7670178.0, "reward": 0.8285809755325317, "reward_std": 0.192602276802063, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5283093452453613, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8272901773452759, "step": 32 }, { "adv/mean_abs_final_conf": 0.6068408489227295, "adv/mean_abs_reasoning": 0.4078954756259918, "adv/mean_abs_step_conf": 0.7853239178657532, "adv/ratio_final_to_reasoning": 1.4877361608177164, "adv/ratio_step_to_reasoning": 1.925306763112601, "adv/std_final_conf": 0.798178493976593, "adv/std_reasoning": 0.6613113284111023, "adv/std_step_conf": 0.9336666464805603, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5599961479198767, "calib/avg_num_step_conf": 5.72265625, "calib/ece": 0.5142000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0017398561890086262, "calib/mean_conf": 0.9862000000000001, "calib/mu_c": 0.9871186440677965, "calib/mu_w": 0.9853787878787879, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5142000000000001, "calib/std_conf": 0.007124605252222757, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.509986013986014, "calib/step_q_c_n": 715.0, "calib/step_q_gap": 0.03103934731934732, "calib/step_q_w": 0.47894666666666663, "calib/step_q_w_n": 750.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2742.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 576.09375, "completions/mean_terminated_length": 578.3529663085938, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.0352, "grad_norm": 0.06224135309457779, "learning_rate": 4.638888888888889e-06, "loss": 0.0111, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.029109252616763115, "mask/share_reasoning": 0.8572879433631897, "mask/share_step_conf": 0.10969658195972443, "num_tokens": 7924530.0, "reward": 0.777721643447876, "reward_std": 0.16657274961471558, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.47578006982803345, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7921631336212158, "step": 33 }, { "adv/mean_abs_final_conf": 0.7188222408294678, "adv/mean_abs_reasoning": 0.48082005977630615, "adv/mean_abs_step_conf": 0.7605365514755249, "adv/ratio_final_to_reasoning": 1.4949922038691321, "adv/ratio_step_to_reasoning": 1.5817487977297626, "adv/std_final_conf": 0.8647890686988831, "adv/std_reasoning": 0.7206701040267944, "adv/std_step_conf": 0.93436199426651, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4588960657662948, "calib/avg_num_step_conf": 5.96875, "calib/ece": 0.4540322580645162, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.0015502055196713327, "calib/mean_conf": 0.9822580645161291, "calib/mu_c": 0.9815267175572517, "calib/mu_w": 0.9830769230769231, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4540322580645162, "calib/std_conf": 0.009361788297375138, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5020348058902275, "calib/step_q_c_n": 747.0, "calib/step_q_gap": 0.04005017080700091, "calib/step_q_w": 0.4619846350832266, "calib/step_q_w_n": 781.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 525.140625, "completions/mean_terminated_length": 529.2755737304688, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.03626666666666667, "grad_norm": 0.06206882745027542, "learning_rate": 4.611111111111112e-06, "loss": 0.0014, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03148376941680908, "mask/share_reasoning": 0.8310195207595825, "mask/share_step_conf": 0.1296842396259308, "num_tokens": 8164078.0, "reward": 0.8166903257369995, "reward_std": 0.2020808607339859, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5267976522445679, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8097079396247864, "step": 34 }, { "adv/mean_abs_final_conf": 0.641136884689331, "adv/mean_abs_reasoning": 0.4908233880996704, "adv/mean_abs_step_conf": 0.7527388334274292, "adv/ratio_final_to_reasoning": 1.306247624367763, "adv/ratio_step_to_reasoning": 1.5336246227829962, "adv/std_final_conf": 0.8167731761932373, "adv/std_reasoning": 0.7394091486930847, "adv/std_step_conf": 0.9343294501304626, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5396810506566605, "calib/avg_num_step_conf": 5.1796875, "calib/ece": 0.4589723320158102, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0011894934333956941, "calib/mean_conf": 0.9728063241106719, "calib/mu_c": 0.9733846153846153, "calib/mu_w": 0.9721951219512196, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4589723320158102, "calib/std_conf": 0.007252506689845704, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5793015873015873, "calib/step_q_c_n": 630.0, "calib/step_q_gap": 0.09160043787629996, "calib/step_q_w": 0.48770114942528736, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2625.0, "completions/max_terminated_length": 2625.0, "completions/mean_length": 572.7890625, "completions/mean_terminated_length": 575.0353393554688, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.037333333333333336, "grad_norm": 0.06816098839044571, "learning_rate": 4.583333333333333e-06, "loss": -0.0431, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.02892582304775715, "mask/share_reasoning": 0.868759274482727, "mask/share_step_conf": 0.09840866923332214, "num_tokens": 8419968.0, "reward": 0.8196977376937866, "reward_std": 0.2199438214302063, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.53374844789505, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8064281940460205, "step": 35 }, { "adv/mean_abs_final_conf": 0.4820092022418976, "adv/mean_abs_reasoning": 0.343949556350708, "adv/mean_abs_step_conf": 0.747795820236206, "adv/ratio_final_to_reasoning": 1.4013950398889805, "adv/ratio_step_to_reasoning": 2.1741438720558093, "adv/std_final_conf": 0.7245348691940308, "adv/std_reasoning": 0.640119731426239, "adv/std_step_conf": 0.9329690337181091, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5227504244482173, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.21769841269841272, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0007130730050936007, "calib/mean_conf": 0.9716666666666667, "calib/mu_c": 0.971842105263158, "calib/mu_w": 0.9711290322580644, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21769841269841272, "calib/std_conf": 0.0053079754308590415, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6222383419689119, "calib/step_q_c_n": 965.0, "calib/step_q_gap": 0.021569737317749138, "calib/step_q_w": 0.6006686046511628, "calib/step_q_w_n": 344.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 513.18359375, "completions/mean_terminated_length": 513.18359375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.0384, "grad_norm": 0.05701802670955658, "learning_rate": 4.555555555555556e-06, "loss": 0.0635, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03495626896619797, "mask/share_reasoning": 0.8420665264129639, "mask/share_step_conf": 0.12297721952199936, "num_tokens": 8654055.0, "reward": 0.9756142497062683, "reward_std": 0.15324629843235016, "rewards/accuracy_reward_step": 0.74609375, "rewards/final_brier_reward_step": 0.7553539276123047, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8497809171676636, "step": 36 }, { "adv/mean_abs_final_conf": 0.5717979073524475, "adv/mean_abs_reasoning": 0.3663122057914734, "adv/mean_abs_step_conf": 0.7703781127929688, "adv/ratio_final_to_reasoning": 1.5609578340885226, "adv/ratio_step_to_reasoning": 2.103064273079433, "adv/std_final_conf": 0.7795237302780151, "adv/std_reasoning": 0.6403769254684448, "adv/std_step_conf": 0.9331871271133423, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5274299344066786, "calib/avg_num_step_conf": 5.15234375, "calib/ece": 0.4965040650406506, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0005326972768836535, "calib/mean_conf": 0.9721138211382115, "calib/mu_c": 0.9723931623931625, "calib/mu_w": 0.9718604651162789, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4965040650406506, "calib/std_conf": 0.005808884773299599, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6934782608695652, "calib/step_q_c_n": 506.0, "calib/step_q_gap": 0.14440077009465746, "calib/step_q_w": 0.5490774907749078, "calib/step_q_w_n": 813.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 512.2578125, "completions/mean_terminated_length": 518.33203125, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.039466666666666664, "grad_norm": 0.05277741327881813, "learning_rate": 4.527777777777778e-06, "loss": -0.0848, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03219691663980484, "mask/share_reasoning": 0.8508118391036987, "mask/share_step_conf": 0.10527247190475464, "num_tokens": 8892289.0, "reward": 0.7643100619316101, "reward_std": 0.17383795976638794, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.48461097478866577, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7604154348373413, "step": 37 }, { "adv/mean_abs_final_conf": 0.6224730610847473, "adv/mean_abs_reasoning": 0.40885424613952637, "adv/mean_abs_step_conf": 0.7654492855072021, "adv/ratio_final_to_reasoning": 1.5224815859496321, "adv/ratio_step_to_reasoning": 1.872181328027552, "adv/std_final_conf": 0.8149917721748352, "adv/std_reasoning": 0.7014630436897278, "adv/std_step_conf": 0.9333143830299377, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5132510013351135, "calib/avg_num_step_conf": 4.8359375, "calib/ece": 0.40668016194332, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0003718291054739531, "calib/mean_conf": 0.9734817813765184, "calib/mu_c": 0.9736428571428571, "calib/mu_w": 0.9732710280373832, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.40668016194332, "calib/std_conf": 0.007311924711082845, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.670029542097489, "calib/step_q_c_n": 677.0, "calib/step_q_gap": 0.056250575965581584, "calib/step_q_w": 0.6137789661319074, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 536.30078125, "completions/mean_terminated_length": 540.5236206054688, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.04053333333333333, "grad_norm": 0.06288687139749527, "learning_rate": 4.5e-06, "loss": 0.035, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03290407732129097, "mask/share_reasoning": 0.8492652773857117, "mask/share_step_conf": 0.11001814901828766, "num_tokens": 9136470.0, "reward": 0.8197071552276611, "reward_std": 0.1978277713060379, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.5682578086853027, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7695938348770142, "step": 38 }, { "adv/mean_abs_final_conf": 0.70289146900177, "adv/mean_abs_reasoning": 0.47861048579216003, "adv/mean_abs_step_conf": 0.7290204763412476, "adv/ratio_final_to_reasoning": 1.4686085864550105, "adv/ratio_step_to_reasoning": 1.5232020567510713, "adv/std_final_conf": 0.862726628780365, "adv/std_reasoning": 0.7575022578239441, "adv/std_step_conf": 0.9335668087005615, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5760536398467433, "calib/avg_num_step_conf": 5.33984375, "calib/ece": 0.4377290836653387, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0031577266922091507, "calib/mean_conf": 0.9755776892430279, "calib/mu_c": 0.9770370370370368, "calib/mu_w": 0.9738793103448277, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4377290836653387, "calib/std_conf": 0.00905741646755723, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.644745508982036, "calib/step_q_c_n": 668.0, "calib/step_q_gap": 0.0755895719291032, "calib/step_q_w": 0.5691559370529328, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2262.0, "completions/max_terminated_length": 2262.0, "completions/mean_length": 502.2421875, "completions/mean_terminated_length": 506.19683837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.0416, "grad_norm": 0.0513564832508564, "learning_rate": 4.472222222222223e-06, "loss": -0.0513, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03278471902012825, "mask/share_reasoning": 0.8430602550506592, "mask/share_step_conf": 0.11634252965450287, "num_tokens": 9371132.0, "reward": 0.8304038047790527, "reward_std": 0.20517049729824066, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5503504276275635, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8088946342468262, "step": 39 }, { "adv/mean_abs_final_conf": 0.7009073495864868, "adv/mean_abs_reasoning": 0.5143710374832153, "adv/mean_abs_step_conf": 0.7778093814849854, "adv/ratio_final_to_reasoning": 1.3626493299777953, "adv/ratio_step_to_reasoning": 1.5121562545409963, "adv/std_final_conf": 0.8636792898178101, "adv/std_reasoning": 0.7575141787528992, "adv/std_step_conf": 0.9339827299118042, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5441338390017127, "calib/avg_num_step_conf": 5.22265625, "calib/ece": 0.5020312500000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0015904086126746764, "calib/mean_conf": 0.9785937500000002, "calib/mu_c": 0.9794262295081969, "calib/mu_w": 0.9778358208955222, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5020312500000002, "calib/std_conf": 0.009373697826231661, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5497952218430033, "calib/step_q_c_n": 586.0, "calib/step_q_gap": 0.02492171984566649, "calib/step_q_w": 0.5248735019973368, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 531.94140625, "completions/mean_terminated_length": 534.0274658203125, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.042666666666666665, "grad_norm": 0.05533498525619507, "learning_rate": 4.444444444444444e-06, "loss": -0.0268, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.032788656651973724, "mask/share_reasoning": 0.8522244691848755, "mask/share_step_conf": 0.11108061671257019, "num_tokens": 9614069.0, "reward": 0.7950161099433899, "reward_std": 0.20596131682395935, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.49921953678131104, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.7955002188682556, "step": 40 }, { "adv/mean_abs_final_conf": 0.5990781784057617, "adv/mean_abs_reasoning": 0.3729305863380432, "adv/mean_abs_step_conf": 0.7478874921798706, "adv/ratio_final_to_reasoning": 1.606406662130756, "adv/ratio_step_to_reasoning": 2.0054335031183186, "adv/std_final_conf": 0.7746619582176208, "adv/std_reasoning": 0.6612075567245483, "adv/std_step_conf": 0.9326661229133606, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7066062176165804, "calib/avg_num_step_conf": 4.95703125, "calib/ece": 0.21948616600790516, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.008082037996545943, "calib/mean_conf": 0.9823320158102767, "calib/mu_c": 0.9842487046632125, "calib/mu_w": 0.9761666666666665, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21948616600790516, "calib/std_conf": 0.009642648733238994, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5260993852459016, "calib/step_q_c_n": 976.0, "calib/step_q_gap": 0.0586591122083589, "calib/step_q_w": 0.4674402730375427, "calib/step_q_w_n": 293.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 469.05859375, "completions/mean_terminated_length": 472.751953125, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.04373333333333333, "grad_norm": 0.0854325145483017, "learning_rate": 4.416666666666667e-06, "loss": -0.0355, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034799594432115555, "mask/share_reasoning": 0.837127149105072, "mask/share_step_conf": 0.12026076018810272, "num_tokens": 9841396.0, "reward": 0.973730742931366, "reward_std": 0.15697798132896423, "rewards/accuracy_reward_step": 0.75390625, "rewards/final_brier_reward_step": 0.7607718706130981, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8390333652496338, "step": 41 }, { "adv/mean_abs_final_conf": 0.47649919986724854, "adv/mean_abs_reasoning": 0.34694188833236694, "adv/mean_abs_step_conf": 0.7612089514732361, "adv/ratio_final_to_reasoning": 1.3734265474763512, "adv/ratio_step_to_reasoning": 2.1940531745305005, "adv/std_final_conf": 0.7270616292953491, "adv/std_reasoning": 0.6401383876800537, "adv/std_step_conf": 0.9325536489486694, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5395939397656874, "calib/avg_num_step_conf": 5.6640625, "calib/ece": 0.4519140625000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0014347052689687168, "calib/mean_conf": 0.9870703125000001, "calib/mu_c": 0.9877372262773723, "calib/mu_w": 0.9863025210084035, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4519140625000001, "calib/std_conf": 0.006988476668941797, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46700947225981054, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.05923169448203286, "calib/step_q_w": 0.4077777777777777, "calib/step_q_w_n": 711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 442.296875, "completions/mean_terminated_length": 444.0314025878906, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.0448, "grad_norm": 0.04096578061580658, "learning_rate": 4.388888888888889e-06, "loss": -0.0075, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03530528023838997, "mask/share_reasoning": 0.8296844959259033, "mask/share_step_conf": 0.1311040222644806, "num_tokens": 10058992.0, "reward": 0.8483582735061646, "reward_std": 0.14226624369621277, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5476745963096619, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8420105576515198, "step": 42 }, { "adv/mean_abs_final_conf": 0.6245371103286743, "adv/mean_abs_reasoning": 0.5628688335418701, "adv/mean_abs_step_conf": 0.7604486346244812, "adv/ratio_final_to_reasoning": 1.1095606526990571, "adv/ratio_step_to_reasoning": 1.3510228126139687, "adv/std_final_conf": 0.8431703448295593, "adv/std_reasoning": 0.8098235726356506, "adv/std_step_conf": 0.9338532090187073, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5481167608286253, "calib/avg_num_step_conf": 5.484375, "calib/ece": 0.45549407114624507, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0016315128688011438, "calib/mean_conf": 0.9890909090909091, "calib/mu_c": 0.9898518518518519, "calib/mu_w": 0.9882203389830507, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.45549407114624507, "calib/std_conf": 0.003921617946185762, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4100944669365722, "calib/step_q_c_n": 741.0, "calib/step_q_gap": 0.02415178217035796, "calib/step_q_w": 0.38594268476621424, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 527.6796875, "completions/mean_terminated_length": 529.7490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.04586666666666667, "grad_norm": 0.045065395534038544, "learning_rate": 4.361111111111112e-06, "loss": -0.0102, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03222114220261574, "mask/share_reasoning": 0.8443077206611633, "mask/share_step_conf": 0.11956489086151123, "num_tokens": 10299302.0, "reward": 0.8342925310134888, "reward_std": 0.21557238698005676, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.5379925966262817, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8282486200332642, "step": 43 }, { "adv/mean_abs_final_conf": 0.554121732711792, "adv/mean_abs_reasoning": 0.42493686079978943, "adv/mean_abs_step_conf": 0.7594295740127563, "adv/ratio_final_to_reasoning": 1.3040095690189335, "adv/ratio_step_to_reasoning": 1.7871586206557977, "adv/std_final_conf": 0.7965189218521118, "adv/std_reasoning": 0.7013244032859802, "adv/std_step_conf": 0.9335551261901855, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5689793154395469, "calib/avg_num_step_conf": 6.0859375, "calib/ece": 0.501921568627451, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.003040507264220671, "calib/mean_conf": 0.9881960784313726, "calib/mu_c": 0.989758064516129, "calib/mu_w": 0.9867175572519084, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.501921568627451, "calib/std_conf": 0.007186642314638623, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.40267929634641403, "calib/step_q_c_n": 739.0, "calib/step_q_gap": 0.05079895446607208, "calib/step_q_w": 0.35188034188034195, "calib/step_q_w_n": 819.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2372.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 560.40234375, "completions/mean_terminated_length": 560.40234375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.046933333333333334, "grad_norm": 0.038281019777059555, "learning_rate": 4.333333333333334e-06, "loss": -0.0563, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.028894424438476562, "mask/share_reasoning": 0.8506920337677002, "mask/share_step_conf": 0.12041356414556503, "num_tokens": 10549085.0, "reward": 0.8303428888320923, "reward_std": 0.16250118613243103, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.4977785050868988, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8668135404586792, "step": 44 }, { "adv/mean_abs_final_conf": 0.5395326018333435, "adv/mean_abs_reasoning": 0.43404123187065125, "adv/mean_abs_step_conf": 0.7650111317634583, "adv/ratio_final_to_reasoning": 1.243044582442181, "adv/ratio_step_to_reasoning": 1.7625310122413427, "adv/std_final_conf": 0.7722030282020569, "adv/std_reasoning": 0.681717038154602, "adv/std_step_conf": 0.9333689212799072, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5348484848484848, "calib/avg_num_step_conf": 6.06640625, "calib/ece": 0.4596428571428573, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9841269841269841, "calib/gap": 0.007606060606060505, "calib/mean_conf": 0.9811507936507936, "calib/mu_c": 0.9847727272727274, "calib/mu_w": 0.9771666666666668, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4584920634920636, "calib/std_conf": 0.06814917897237072, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.40513907284768214, "calib/step_q_c_n": 755.0, "calib/step_q_gap": 0.049236817208584416, "calib/step_q_w": 0.3559022556390977, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 521.2734375, "completions/mean_terminated_length": 523.3176879882812, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.048, "grad_norm": 0.04909267649054527, "learning_rate": 4.305555555555556e-06, "loss": -0.0485, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03378835693001747, "mask/share_reasoning": 0.8306214809417725, "mask/share_step_conf": 0.13168391585350037, "num_tokens": 10787579.0, "reward": 0.8356795310974121, "reward_std": 0.16655045747756958, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5321097373962402, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8392492532730103, "step": 45 }, { "adv/mean_abs_final_conf": 0.4849938750267029, "adv/mean_abs_reasoning": 0.3777017593383789, "adv/mean_abs_step_conf": 0.7518752813339233, "adv/ratio_final_to_reasoning": 1.2840657027287028, "adv/ratio_step_to_reasoning": 1.9906586684980898, "adv/std_final_conf": 0.728912353515625, "adv/std_reasoning": 0.6612749099731445, "adv/std_step_conf": 0.9323993921279907, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5283769902413971, "calib/avg_num_step_conf": 6.5703125, "calib/ece": 0.46016, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0010092449922959323, "calib/mean_conf": 0.98816, "calib/mu_c": 0.9886363636363639, "calib/mu_w": 0.9876271186440679, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46016, "calib/std_conf": 0.005423504402137058, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38947630922693266, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.05152176377238721, "calib/step_q_w": 0.33795454545454545, "calib/step_q_w_n": 880.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2474.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 558.0078125, "completions/mean_terminated_length": 562.4015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.04906666666666667, "grad_norm": 0.048358555883169174, "learning_rate": 4.277777777777778e-06, "loss": -0.067, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03367871418595314, "mask/share_reasoning": 0.8230705261230469, "mask/share_step_conf": 0.1354382336139679, "num_tokens": 11035197.0, "reward": 0.8295394778251648, "reward_std": 0.1529095470905304, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5268656015396118, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8337757587432861, "step": 46 }, { "adv/mean_abs_final_conf": 0.5534183979034424, "adv/mean_abs_reasoning": 0.4260653257369995, "adv/mean_abs_step_conf": 0.7502398490905762, "adv/ratio_final_to_reasoning": 1.2989050374990032, "adv/ratio_step_to_reasoning": 1.7608563846232403, "adv/std_final_conf": 0.7891755700111389, "adv/std_reasoning": 0.7013570070266724, "adv/std_step_conf": 0.9326286315917969, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6437258687258687, "calib/avg_num_step_conf": 6.5390625, "calib/ece": 0.39592885375494075, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": 0.01943307593307575, "calib/mean_conf": 0.9772727272727274, "calib/mu_c": 0.9853378378378377, "calib/mu_w": 0.9659047619047619, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.39411067193675897, "calib/std_conf": 0.0814157869391294, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.37110948081264106, "calib/step_q_c_n": 886.0, "calib/step_q_gap": 0.048685622944620754, "calib/step_q_w": 0.3224238578680203, "calib/step_q_w_n": 788.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 579.1171875, "completions/mean_terminated_length": 579.1171875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.050133333333333335, "grad_norm": 0.04179922118782997, "learning_rate": 4.25e-06, "loss": 0.0534, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030462728813290596, "mask/share_reasoning": 0.8413854837417603, "mask/share_step_conf": 0.1281517744064331, "num_tokens": 11289427.0, "reward": 0.8785368800163269, "reward_std": 0.16066467761993408, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5990324020385742, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8447600603103638, "step": 47 }, { "adv/mean_abs_final_conf": 0.7094321250915527, "adv/mean_abs_reasoning": 0.4535606801509857, "adv/mean_abs_step_conf": 0.7399888038635254, "adv/ratio_final_to_reasoning": 1.5641393889245205, "adv/ratio_step_to_reasoning": 1.6315100409876595, "adv/std_final_conf": 0.8601517081260681, "adv/std_reasoning": 0.7014860510826111, "adv/std_step_conf": 0.9333368539810181, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6279370078740159, "calib/avg_num_step_conf": 6.4140625, "calib/ece": 0.459920634920635, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9126984126984127, "calib/gap": 0.03109165354330712, "calib/mean_conf": 0.9486507936507936, "calib/mu_c": 0.9643200000000001, "calib/mu_w": 0.933228346456693, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4562698412698414, "calib/std_conf": 0.12007836842188596, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.36418791946308726, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.005369636296978009, "calib/step_q_w": 0.35881828316610925, "calib/step_q_w_n": 897.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2258.0, "completions/max_terminated_length": 2258.0, "completions/mean_length": 506.65234375, "completions/mean_terminated_length": 512.6600952148438, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.0512, "grad_norm": 0.05462892726063728, "learning_rate": 4.222222222222223e-06, "loss": -0.1157, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03352171555161476, "mask/share_reasoning": 0.8146562576293945, "mask/share_step_conf": 0.14010323584079742, "num_tokens": 11522818.0, "reward": 0.8315355777740479, "reward_std": 0.1631363332271576, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5375875234603882, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8317336440086365, "step": 48 }, { "adv/mean_abs_final_conf": 0.6704999208450317, "adv/mean_abs_reasoning": 0.4311797022819519, "adv/mean_abs_step_conf": 0.7662805318832397, "adv/ratio_final_to_reasoning": 1.555035910309587, "adv/ratio_step_to_reasoning": 1.7771720881753443, "adv/std_final_conf": 0.840907096862793, "adv/std_reasoning": 0.7014207243919373, "adv/std_step_conf": 0.9324670433998108, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6658229491173417, "calib/avg_num_step_conf": 6.609375, "calib/ece": 0.38549800796812755, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9362549800796812, "calib/gap": 0.037870586708203424, "calib/mean_conf": 0.959203187250996, "calib/mu_c": 0.9753472222222221, "calib/mu_w": 0.9374766355140187, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.38549800796812755, "calib/std_conf": 0.07276663122148436, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3687181510710259, "calib/step_q_c_n": 887.0, "calib/step_q_gap": 0.037103244238727795, "calib/step_q_w": 0.3316149068322981, "calib/step_q_w_n": 805.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2860.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 518.890625, "completions/mean_terminated_length": 520.925537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.05226666666666667, "grad_norm": 0.07325834035873413, "learning_rate": 4.194444444444445e-06, "loss": -0.0203, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03269638121128082, "mask/share_reasoning": 0.8214113116264343, "mask/share_step_conf": 0.14198604226112366, "num_tokens": 11760190.0, "reward": 0.877612292766571, "reward_std": 0.1599796712398529, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6075612902641296, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8398506045341492, "step": 49 }, { "adv/mean_abs_final_conf": 0.7164367437362671, "adv/mean_abs_reasoning": 0.42041462659835815, "adv/mean_abs_step_conf": 0.7300196290016174, "adv/ratio_final_to_reasoning": 1.7041194535334585, "adv/ratio_step_to_reasoning": 1.7364277615846118, "adv/std_final_conf": 0.8789020776748657, "adv/std_reasoning": 0.6817552447319031, "adv/std_step_conf": 0.9324972629547119, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.723894389438944, "calib/avg_num_step_conf": 6.44921875, "calib/ece": 0.3033067729083666, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7569721115537849, "calib/gap": 0.09033597359735956, "calib/mean_conf": 0.9009163346613547, "calib/mu_c": 0.9372666666666665, "calib/mu_w": 0.8469306930693069, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3033067729083666, "calib/std_conf": 0.13599990102861245, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.35842902208201893, "calib/step_q_c_n": 951.0, "calib/step_q_gap": 0.007071879224876065, "calib/step_q_w": 0.35135714285714287, "calib/step_q_w_n": 700.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2378.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 553.734375, "completions/mean_terminated_length": 553.734375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.05333333333333334, "grad_norm": 0.07642224431037903, "learning_rate": 4.166666666666667e-06, "loss": -0.0541, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.031201042234897614, "mask/share_reasoning": 0.8356307744979858, "mask/share_step_conf": 0.13316819071769714, "num_tokens": 12007306.0, "reward": 0.916384220123291, "reward_std": 0.16534239053726196, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6789581775665283, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8405289053916931, "step": 50 }, { "adv/mean_abs_final_conf": 0.7365193367004395, "adv/mean_abs_reasoning": 0.38996899127960205, "adv/mean_abs_step_conf": 0.7523127794265747, "adv/ratio_final_to_reasoning": 1.8886612863338303, "adv/ratio_step_to_reasoning": 1.929160513398814, "adv/std_final_conf": 0.9118251800537109, "adv/std_reasoning": 0.6815586686134338, "adv/std_step_conf": 0.9332376718521118, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6966089466089466, "calib/avg_num_step_conf": 6.86328125, "calib/ece": 0.21509881422924906, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.48221343873517786, "calib/gap": 0.10533910533910529, "calib/mean_conf": 0.8164426877470355, "calib/mu_c": 0.8576623376623378, "calib/mu_w": 0.7523232323232325, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2114229249011858, "calib/std_conf": 0.17825015085694332, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3530892678034102, "calib/step_q_c_n": 997.0, "calib/step_q_gap": 0.016365583592883926, "calib/step_q_w": 0.3367236842105263, "calib/step_q_w_n": 760.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2800.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 562.5859375, "completions/mean_terminated_length": 564.7921752929688, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.0544, "grad_norm": 0.14050939679145813, "learning_rate": 4.138888888888889e-06, "loss": -0.0199, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030356088653206825, "mask/share_reasoning": 0.8331259489059448, "mask/share_step_conf": 0.1326117217540741, "num_tokens": 12260624.0, "reward": 0.9410929679870605, "reward_std": 0.12373049557209015, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7284257411956787, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8357914686203003, "step": 51 }, { "adv/mean_abs_final_conf": 0.6990074515342712, "adv/mean_abs_reasoning": 0.37972062826156616, "adv/mean_abs_step_conf": 0.7402435541152954, "adv/ratio_final_to_reasoning": 1.840846663333676, "adv/ratio_step_to_reasoning": 1.9494425612437025, "adv/std_final_conf": 0.8831400871276855, "adv/std_reasoning": 0.6612834930419922, "adv/std_step_conf": 0.9324589371681213, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7261456628477906, "calib/avg_num_step_conf": 6.12109375, "calib/ece": 0.0904347826086957, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4980237154150198, "calib/gap": 0.17239607201309315, "calib/mean_conf": 0.7810276679841898, "calib/mu_c": 0.8253191489361702, "calib/mu_w": 0.652923076923077, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06418972332015814, "calib/std_conf": 0.22313020008731313, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37573489630297563, "calib/step_q_c_n": 1109.0, "calib/step_q_gap": 0.024424852634853333, "calib/step_q_w": 0.3513100436681223, "calib/step_q_w_n": 458.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2378.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 527.47265625, "completions/mean_terminated_length": 527.47265625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.055466666666666664, "grad_norm": 0.1133529543876648, "learning_rate": 4.111111111111111e-06, "loss": 0.0067, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03268003463745117, "mask/share_reasoning": 0.8374246954917908, "mask/share_step_conf": 0.12989526987075806, "num_tokens": 12503609.0, "reward": 0.9939903616905212, "reward_std": 0.13481810688972473, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.8108843564987183, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8325650691986084, "step": 52 }, { "adv/mean_abs_final_conf": 0.6250590085983276, "adv/mean_abs_reasoning": 0.4454294443130493, "adv/mean_abs_step_conf": 0.7550910711288452, "adv/ratio_final_to_reasoning": 1.4032727664923608, "adv/ratio_step_to_reasoning": 1.695197928132844, "adv/std_final_conf": 0.8168118596076965, "adv/std_reasoning": 0.7014402151107788, "adv/std_step_conf": 0.9312266111373901, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7757667331061533, "calib/avg_num_step_conf": 6.66015625, "calib/ece": 0.28917322834645665, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7125984251968503, "calib/gap": 0.14546454060309888, "calib/mean_conf": 0.8836614173228348, "calib/mu_c": 0.9426490066225164, "calib/mu_w": 0.7971844660194175, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28917322834645665, "calib/std_conf": 0.17132984812295177, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3879324055666004, "calib/step_q_c_n": 1006.0, "calib/step_q_gap": 0.039262877669604734, "calib/step_q_w": 0.3486695278969957, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3036.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 528.0078125, "completions/mean_terminated_length": 528.0078125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.05653333333333333, "grad_norm": 0.07778654247522354, "learning_rate": 4.083333333333334e-06, "loss": 0.0237, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03123798780143261, "mask/share_reasoning": 0.8346073627471924, "mask/share_step_conf": 0.13415467739105225, "num_tokens": 12744603.0, "reward": 0.93428635597229, "reward_std": 0.14244474470615387, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7104933261871338, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8416730165481567, "step": 53 }, { "adv/mean_abs_final_conf": 0.41671222448349, "adv/mean_abs_reasoning": 0.2910095453262329, "adv/mean_abs_step_conf": 0.7577744722366333, "adv/ratio_final_to_reasoning": 1.4319538007467747, "adv/ratio_step_to_reasoning": 2.6039505727798, "adv/std_final_conf": 0.6897390484809875, "adv/std_reasoning": 0.5959850549697876, "adv/std_step_conf": 0.9312926530838013, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7402930402930403, "calib/avg_num_step_conf": 6.12109375, "calib/ece": 0.20330708661417324, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.905511811023622, "calib/gap": 0.13131298331298313, "calib/mean_conf": 0.9474015748031497, "calib/mu_c": 0.981005291005291, "calib/mu_w": 0.8496923076923079, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20330708661417324, "calib/std_conf": 0.13252967338761779, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4237527372262774, "calib/step_q_c_n": 1096.0, "calib/step_q_gap": 0.05451706843646842, "calib/step_q_w": 0.36923566878980896, "calib/step_q_w_n": 471.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1939.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 462.609375, "completions/mean_terminated_length": 464.4235534667969, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.0576, "grad_norm": 0.05564220994710922, "learning_rate": 4.055555555555556e-06, "loss": -0.0125, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.036595359444618225, "mask/share_reasoning": 0.818428635597229, "mask/share_step_conf": 0.14106974005699158, "num_tokens": 12969263.0, "reward": 0.9893507957458496, "reward_std": 0.10805167257785797, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7944375276565552, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8381702899932861, "step": 54 }, { "adv/mean_abs_final_conf": 0.5934556722640991, "adv/mean_abs_reasoning": 0.49654126167297363, "adv/mean_abs_step_conf": 0.7678463459014893, "adv/ratio_final_to_reasoning": 1.19517896713074, "adv/ratio_step_to_reasoning": 1.5463898071922964, "adv/std_final_conf": 0.8131335377693176, "adv/std_reasoning": 0.739345371723175, "adv/std_step_conf": 0.9309093952178955, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6096491228070176, "calib/avg_num_step_conf": 6.3203125, "calib/ece": 0.3576377952755906, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9015748031496063, "calib/gap": 0.05930727554179582, "calib/mean_conf": 0.9451968503937008, "calib/mu_c": 0.969013157894737, "calib/mu_w": 0.9097058823529411, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35220472440944883, "calib/std_conf": 0.13621630779669894, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.43842369263607256, "calib/step_q_c_n": 937.0, "calib/step_q_gap": 0.025516203649288416, "calib/step_q_w": 0.41290748898678414, "calib/step_q_w_n": 681.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2064.0, "completions/max_terminated_length": 2064.0, "completions/mean_length": 504.35546875, "completions/mean_terminated_length": 506.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.058666666666666666, "grad_norm": 0.057675618678331375, "learning_rate": 4.027777777777779e-06, "loss": -0.0774, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03339093178510666, "mask/share_reasoning": 0.8275409936904907, "mask/share_step_conf": 0.13516178727149963, "num_tokens": 13206202.0, "reward": 0.9117587804794312, "reward_std": 0.19418179988861084, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6443132758140564, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8620166778564453, "step": 55 }, { "adv/mean_abs_final_conf": 0.49654895067214966, "adv/mean_abs_reasoning": 0.4492354691028595, "adv/mean_abs_step_conf": 0.7373233437538147, "adv/ratio_final_to_reasoning": 1.1053200043704852, "adv/ratio_step_to_reasoning": 1.6412847926417693, "adv/std_final_conf": 0.7745915651321411, "adv/std_reasoning": 0.7391836047172546, "adv/std_step_conf": 0.9308094382286072, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6026096505393708, "calib/avg_num_step_conf": 6.95703125, "calib/ece": 0.4257600000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.936, "calib/gap": 0.008552419094373875, "calib/mean_conf": 0.9692000000000001, "calib/mu_c": 0.9730656934306572, "calib/mu_w": 0.9645132743362833, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4234800000000001, "calib/std_conf": 0.07442687686582046, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4720138089758343, "calib/step_q_c_n": 869.0, "calib/step_q_gap": 0.04145459844951854, "calib/step_q_w": 0.43055921052631574, "calib/step_q_w_n": 912.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2246.0, "completions/max_terminated_length": 2246.0, "completions/mean_length": 494.6875, "completions/mean_terminated_length": 502.5397033691406, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.05973333333333333, "grad_norm": 0.03915965557098389, "learning_rate": 4.000000000000001e-06, "loss": -0.098, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03251350671052933, "mask/share_reasoning": 0.8159323930740356, "mask/share_step_conf": 0.13592907786369324, "num_tokens": 13439682.0, "reward": 0.8552824854850769, "reward_std": 0.1606539785861969, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5601484775543213, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8480727672576904, "step": 56 }, { "adv/mean_abs_final_conf": 0.5040005445480347, "adv/mean_abs_reasoning": 0.4262211322784424, "adv/mean_abs_step_conf": 0.7493640184402466, "adv/ratio_final_to_reasoning": 1.1824860533165222, "adv/ratio_step_to_reasoning": 1.7581578239314068, "adv/std_final_conf": 0.7608724236488342, "adv/std_reasoning": 0.7013793587684631, "adv/std_step_conf": 0.9318333268165588, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5604005167958657, "calib/avg_num_step_conf": 6.65625, "calib/ece": 0.2947430830039526, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9604743083003953, "calib/gap": 0.026538185472293807, "calib/mean_conf": 0.9745849802371542, "calib/mu_c": 0.9830813953488371, "calib/mu_w": 0.9565432098765433, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2947430830039526, "calib/std_conf": 0.07132515931620358, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48574823943661966, "calib/step_q_c_n": 1136.0, "calib/step_q_gap": 0.05451584507042245, "calib/step_q_w": 0.4312323943661972, "calib/step_q_w_n": 568.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2502.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 516.07421875, "completions/mean_terminated_length": 516.07421875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.0608, "grad_norm": 0.03392103314399719, "learning_rate": 3.972222222222223e-06, "loss": -0.0328, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03265305608510971, "mask/share_reasoning": 0.8259487152099609, "mask/share_step_conf": 0.14139823615550995, "num_tokens": 13678589.0, "reward": 0.9377506971359253, "reward_std": 0.17583677172660828, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.6937090158462524, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8489797115325928, "step": 57 }, { "adv/mean_abs_final_conf": 0.5565872192382812, "adv/mean_abs_reasoning": 0.49917328357696533, "adv/mean_abs_step_conf": 0.7326595783233643, "adv/ratio_final_to_reasoning": 1.115018045937676, "adv/ratio_step_to_reasoning": 1.467745976053221, "adv/std_final_conf": 0.7942405343055725, "adv/std_reasoning": 0.7575743198394775, "adv/std_step_conf": 0.9322904348373413, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5544391315585422, "calib/avg_num_step_conf": 7.35546875, "calib/ece": 0.381984126984127, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9206349206349206, "calib/gap": 0.023688291548203844, "calib/mean_conf": 0.9547619047619048, "calib/mu_c": 0.9647260273972604, "calib/mu_w": 0.9410377358490566, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3786904761904762, "calib/std_conf": 0.13093506379005287, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.49481312670920685, "calib/step_q_c_n": 1097.0, "calib/step_q_gap": 0.05720498421556813, "calib/step_q_w": 0.4376081424936387, "calib/step_q_w_n": 786.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2708.0, "completions/max_terminated_length": 2708.0, "completions/mean_length": 579.21484375, "completions/mean_terminated_length": 581.486328125, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.06186666666666667, "grad_norm": 0.04124533385038376, "learning_rate": 3.944444444444445e-06, "loss": 0.0056, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030935410410165787, "mask/share_reasoning": 0.827335000038147, "mask/share_step_conf": 0.13782331347465515, "num_tokens": 13933188.0, "reward": 0.871442437171936, "reward_std": 0.19315657019615173, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5968499779701233, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8358785510063171, "step": 58 }, { "adv/mean_abs_final_conf": 0.47267991304397583, "adv/mean_abs_reasoning": 0.4489648938179016, "adv/mean_abs_step_conf": 0.7724525332450867, "adv/ratio_final_to_reasoning": 1.0528215447412976, "adv/ratio_step_to_reasoning": 1.7205187841666534, "adv/std_final_conf": 0.7284379601478577, "adv/std_reasoning": 0.7206330895423889, "adv/std_step_conf": 0.9324396848678589, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5234210526315789, "calib/avg_num_step_conf": 6.140625, "calib/ece": 0.38547619047619064, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9880952380952381, "calib/gap": 0.0003500000000001835, "calib/mean_conf": 0.9861111111111113, "calib/mu_c": 0.9862500000000002, "calib/mu_w": 0.9859, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3842063492063494, "calib/std_conf": 0.02774521901392222, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5309918319719953, "calib/step_q_c_n": 857.0, "calib/step_q_gap": 0.008837985818149097, "calib/step_q_w": 0.5221538461538462, "calib/step_q_w_n": 715.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2661.0, "completions/max_terminated_length": 2661.0, "completions/mean_length": 531.265625, "completions/mean_terminated_length": 533.3490600585938, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.06293333333333333, "grad_norm": 0.037306271493434906, "learning_rate": 3.916666666666667e-06, "loss": -0.0795, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03339418023824692, "mask/share_reasoning": 0.836950421333313, "mask/share_step_conf": 0.12574917078018188, "num_tokens": 14175440.0, "reward": 0.8658956289291382, "reward_std": 0.1877882331609726, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.5960062742233276, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8217225074768066, "step": 59 }, { "adv/mean_abs_final_conf": 0.5334903597831726, "adv/mean_abs_reasoning": 0.5123406648635864, "adv/mean_abs_step_conf": 0.735711932182312, "adv/ratio_final_to_reasoning": 1.0412805314315963, "adv/ratio_step_to_reasoning": 1.4359819210879923, "adv/std_final_conf": 0.7582880854606628, "adv/std_reasoning": 0.7577045559883118, "adv/std_step_conf": 0.932331919670105, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5634365634365635, "calib/avg_num_step_conf": 6.55078125, "calib/ece": 0.4020564516129033, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.967741935483871, "calib/gap": 0.026596736596736337, "calib/mean_conf": 0.9786693548387099, "calib/mu_c": 0.9899300699300697, "calib/mu_w": 0.9633333333333334, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4020564516129033, "calib/std_conf": 0.06951041019437462, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5287763157894737, "calib/step_q_c_n": 760.0, "calib/step_q_gap": 0.06046661022785976, "calib/step_q_w": 0.46830970556161394, "calib/step_q_w_n": 917.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 496.81640625, "completions/mean_terminated_length": 502.70751953125, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.064, "grad_norm": 0.03891824930906296, "learning_rate": 3.88888888888889e-06, "loss": -0.0029, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034683212637901306, "mask/share_reasoning": 0.8176734447479248, "mask/share_step_conf": 0.1359245479106903, "num_tokens": 14411481.0, "reward": 0.86177659034729, "reward_std": 0.2099033147096634, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5835503935813904, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8345340490341187, "step": 60 }, { "adv/mean_abs_final_conf": 0.3984984755516052, "adv/mean_abs_reasoning": 0.3724913001060486, "adv/mean_abs_step_conf": 0.7677532434463501, "adv/ratio_final_to_reasoning": 1.0698195513241582, "adv/ratio_step_to_reasoning": 2.0611306713143906, "adv/std_final_conf": 0.6604509353637695, "adv/std_reasoning": 0.64032381772995, "adv/std_step_conf": 0.9302454590797424, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5178571428571428, "calib/avg_num_step_conf": 6.25, "calib/ece": 0.32440000000000013, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.996, "calib/gap": 0.004761904761904745, "calib/mean_conf": 0.9884000000000001, "calib/mu_c": 0.9899999999999999, "calib/mu_w": 0.9852380952380951, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.32440000000000013, "calib/std_conf": 0.022800000000000004, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5489644351464434, "calib/step_q_c_n": 956.0, "calib/step_q_gap": 0.09121598794147445, "calib/step_q_w": 0.457748447204969, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2342.0, "completions/max_terminated_length": 2342.0, "completions/mean_length": 456.85546875, "completions/mean_terminated_length": 456.85546875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.06506666666666666, "grad_norm": 0.04500626400113106, "learning_rate": 3.861111111111112e-06, "loss": 0.0342, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03986175358295441, "mask/share_reasoning": 0.8108029365539551, "mask/share_step_conf": 0.14933526515960693, "num_tokens": 14632500.0, "reward": 0.9083384275436401, "reward_std": 0.1528836041688919, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6574859619140625, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.834190845489502, "step": 61 }, { "adv/mean_abs_final_conf": 0.5679522752761841, "adv/mean_abs_reasoning": 0.5531355738639832, "adv/mean_abs_step_conf": 0.7585821151733398, "adv/ratio_final_to_reasoning": 1.0267867447191967, "adv/ratio_step_to_reasoning": 1.371421674932584, "adv/std_final_conf": 0.7788091897964478, "adv/std_reasoning": 0.7755100727081299, "adv/std_step_conf": 0.9326937794685364, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.49162539103232533, "calib/avg_num_step_conf": 6.45703125, "calib/ece": 0.4342570281124499, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9357429718875502, "calib/gap": -0.0028480187695517634, "calib/mean_conf": 0.9600401606425704, "calib/mu_c": 0.9587591240875912, "calib/mu_w": 0.961607142857143, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42204819277108446, "calib/std_conf": 0.11564902585972572, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5245710784313725, "calib/step_q_c_n": 816.0, "calib/step_q_gap": 0.06565829467987905, "calib/step_q_w": 0.45891278375149347, "calib/step_q_w_n": 837.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2756.0, "completions/max_terminated_length": 2756.0, "completions/mean_length": 512.97265625, "completions/mean_terminated_length": 517.0117797851562, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.06613333333333334, "grad_norm": 0.045611463487148285, "learning_rate": 3.833333333333334e-06, "loss": -0.0846, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03405851125717163, "mask/share_reasoning": 0.8291573524475098, "mask/share_step_conf": 0.12897160649299622, "num_tokens": 14870901.0, "reward": 0.8375222086906433, "reward_std": 0.21086472272872925, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5541878938674927, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8192939162254333, "step": 62 }, { "adv/mean_abs_final_conf": 0.4292864501476288, "adv/mean_abs_reasoning": 0.41687119007110596, "adv/mean_abs_step_conf": 0.7617143392562866, "adv/ratio_final_to_reasoning": 1.0297820055024793, "adv/ratio_step_to_reasoning": 1.8272175132235944, "adv/std_final_conf": 0.7025664448738098, "adv/std_reasoning": 0.6816080808639526, "adv/std_step_conf": 0.9308570027351379, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5537826595606395, "calib/avg_num_step_conf": 6.74609375, "calib/ece": 0.3613333333333332, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9686274509803922, "calib/gap": 0.022671259586637182, "calib/mean_conf": 0.9770196078431372, "calib/mu_c": 0.9857324840764331, "calib/mu_w": 0.9630612244897959, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3613333333333332, "calib/std_conf": 0.07232104084473495, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5041708043694141, "calib/step_q_c_n": 1007.0, "calib/step_q_gap": 0.03707358214719175, "calib/step_q_w": 0.46709722222222233, "calib/step_q_w_n": 720.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2460.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 576.10546875, "completions/mean_terminated_length": 576.10546875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.0672, "grad_norm": 0.039514731615781784, "learning_rate": 3.8055555555555556e-06, "loss": 0.0053, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03206784278154373, "mask/share_reasoning": 0.8391179442405701, "mask/share_step_conf": 0.12881425023078918, "num_tokens": 15127024.0, "reward": 0.9113430976867676, "reward_std": 0.15366439521312714, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6358265280723572, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8649846315383911, "step": 63 }, { "adv/mean_abs_final_conf": 0.5828859210014343, "adv/mean_abs_reasoning": 0.5493597984313965, "adv/mean_abs_step_conf": 0.744002103805542, "adv/ratio_final_to_reasoning": 1.0610276228179891, "adv/ratio_step_to_reasoning": 1.3543075156389557, "adv/std_final_conf": 0.8123847842216492, "adv/std_reasoning": 0.7929263710975647, "adv/std_step_conf": 0.9302653670310974, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5185117569738572, "calib/avg_num_step_conf": 6.1796875, "calib/ece": 0.3153815261044177, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9879518072289156, "calib/gap": 0.005405286987001601, "calib/mean_conf": 0.9860642570281126, "calib/mu_c": 0.9878443113772454, "calib/mu_w": 0.9824390243902438, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3153815261044177, "calib/std_conf": 0.03338608106961565, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48400000000000004, "calib/step_q_c_n": 1015.0, "calib/step_q_gap": 0.022465608465608522, "calib/step_q_w": 0.4615343915343915, "calib/step_q_w_n": 567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 505.39453125, "completions/mean_terminated_length": 509.3740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.06826666666666667, "grad_norm": 0.08964262902736664, "learning_rate": 3.777777777777778e-06, "loss": 0.0616, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0351257249712944, "mask/share_reasoning": 0.8196390867233276, "mask/share_step_conf": 0.13742271065711975, "num_tokens": 15360181.0, "reward": 0.9109358787536621, "reward_std": 0.21645523607730865, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.6623206734657288, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8345509767532349, "step": 64 }, { "adv/mean_abs_final_conf": 0.2017216831445694, "adv/mean_abs_reasoning": 0.19740091264247894, "adv/mean_abs_step_conf": 0.7495631575584412, "adv/ratio_final_to_reasoning": 1.021888300536462, "adv/ratio_step_to_reasoning": 3.797161560828274, "adv/std_final_conf": 0.4967176914215088, "adv/std_reasoning": 0.4958440661430359, "adv/std_step_conf": 0.9305339455604553, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5045454545454545, "calib/avg_num_step_conf": 5.8359375, "calib/ece": 0.41956862745098045, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.0041818181818185085, "calib/mean_conf": 0.9881960784313726, "calib/mu_c": 0.9900000000000001, "calib/mu_w": 0.9858181818181816, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.41956862745098045, "calib/std_conf": 0.028749778930330372, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5030480656506449, "calib/step_q_c_n": 853.0, "calib/step_q_gap": 0.05794666159448253, "calib/step_q_w": 0.44510140405616233, "calib/step_q_w_n": 641.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1432.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 418.296875, "completions/mean_terminated_length": 419.9372863769531, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.06933333333333333, "grad_norm": 0.03971258923411369, "learning_rate": 3.7500000000000005e-06, "loss": -0.0452, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03916507214307785, "mask/share_reasoning": 0.8115283250808716, "mask/share_step_conf": 0.14540034532546997, "num_tokens": 15572289.0, "reward": 0.8780311346054077, "reward_std": 0.0837891697883606, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.5776315927505493, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8659305572509766, "step": 65 }, { "adv/mean_abs_final_conf": 0.4690343737602234, "adv/mean_abs_reasoning": 0.43525969982147217, "adv/mean_abs_step_conf": 0.7657510042190552, "adv/ratio_final_to_reasoning": 1.0775966025630317, "adv/ratio_step_to_reasoning": 1.7592968164365748, "adv/std_final_conf": 0.7395740747451782, "adv/std_reasoning": 0.7205032706260681, "adv/std_step_conf": 0.9319112300872803, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.555878084179971, "calib/avg_num_step_conf": 6.92578125, "calib/ece": 0.4014457831325302, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.963855421686747, "calib/gap": 0.026591238949729368, "calib/mean_conf": 0.9757429718875502, "calib/mu_c": 0.9870629370629369, "calib/mu_w": 0.9604716981132075, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4014457831325302, "calib/std_conf": 0.07292752495723935, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45279886363636374, "calib/step_q_c_n": 880.0, "calib/step_q_gap": 0.05453458592079824, "calib/step_q_w": 0.3982642777155655, "calib/step_q_w_n": 893.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 577.3984375, "completions/mean_terminated_length": 579.6627807617188, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.0704, "grad_norm": 0.045602548867464066, "learning_rate": 3.7222222222222225e-06, "loss": -0.005, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03310517594218254, "mask/share_reasoning": 0.8324896097183228, "mask/share_step_conf": 0.13049902021884918, "num_tokens": 15826455.0, "reward": 0.8676662445068359, "reward_std": 0.17554137110710144, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.5855827927589417, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8434996008872986, "step": 66 }, { "adv/mean_abs_final_conf": 0.34227991104125977, "adv/mean_abs_reasoning": 0.23182302713394165, "adv/mean_abs_step_conf": 0.7574501037597656, "adv/ratio_final_to_reasoning": 1.4764707167916449, "adv/ratio_step_to_reasoning": 3.2673635277919546, "adv/std_final_conf": 0.621044933795929, "adv/std_reasoning": 0.5227372050285339, "adv/std_step_conf": 0.9281366467475891, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5851394211708736, "calib/avg_num_step_conf": 6.390625, "calib/ece": 0.3375686274509805, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9450980392156862, "calib/gap": 0.05679067001453708, "calib/mean_conf": 0.9689411764705883, "calib/mu_c": 0.9898757763975156, "calib/mu_w": 0.9330851063829785, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3375686274509805, "calib/std_conf": 0.09115842958767131, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46348416289592764, "calib/step_q_c_n": 884.0, "calib/step_q_gap": 0.06419825864060846, "calib/step_q_w": 0.3992859042553192, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2858.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 504.78125, "completions/mean_terminated_length": 504.78125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.07146666666666666, "grad_norm": 0.04864136129617691, "learning_rate": 3.694444444444445e-06, "loss": 0.0039, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03638178110122681, "mask/share_reasoning": 0.8302202820777893, "mask/share_step_conf": 0.13339796662330627, "num_tokens": 16060687.0, "reward": 0.9319067001342773, "reward_std": 0.10852377116680145, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.6688085794448853, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8700047135353088, "step": 67 }, { "adv/mean_abs_final_conf": 0.4381498098373413, "adv/mean_abs_reasoning": 0.43346601724624634, "adv/mean_abs_step_conf": 0.7439643740653992, "adv/ratio_final_to_reasoning": 1.01080544357514, "adv/ratio_step_to_reasoning": 1.7163153383781014, "adv/std_final_conf": 0.7203687429428101, "adv/std_reasoning": 0.7204791903495789, "adv/std_step_conf": 0.9302870631217957, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5455629139072847, "calib/avg_num_step_conf": 6.4609375, "calib/ece": 0.36521912350597613, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9561752988047809, "calib/gap": 0.04423708609271493, "calib/mean_conf": 0.9668127490039842, "calib/mu_c": 0.9844370860927151, "calib/mu_w": 0.9402000000000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36521912350597613, "calib/std_conf": 0.11096193308714032, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4352251184834124, "calib/step_q_c_n": 844.0, "calib/step_q_gap": 0.08008931601427655, "calib/step_q_w": 0.35513580246913584, "calib/step_q_w_n": 810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2978.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 510.81640625, "completions/mean_terminated_length": 510.81640625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.07253333333333334, "grad_norm": 0.04457082226872444, "learning_rate": 3.6666666666666666e-06, "loss": 0.0289, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036559879779815674, "mask/share_reasoning": 0.8241531848907471, "mask/share_step_conf": 0.13928695023059845, "num_tokens": 16295544.0, "reward": 0.8986823558807373, "reward_std": 0.1547134965658188, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6234105229377747, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8591104745864868, "step": 68 }, { "adv/mean_abs_final_conf": 0.5651110410690308, "adv/mean_abs_reasoning": 0.531714677810669, "adv/mean_abs_step_conf": 0.753547191619873, "adv/ratio_final_to_reasoning": 1.0628088045187525, "adv/ratio_step_to_reasoning": 1.417202163240251, "adv/std_final_conf": 0.7929477691650391, "adv/std_reasoning": 0.7928342819213867, "adv/std_step_conf": 0.93269282579422, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5532371794871795, "calib/avg_num_step_conf": 6.75, "calib/ece": 0.49424000000000007, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.964, "calib/gap": 0.025339743589743446, "calib/mean_conf": 0.9742400000000002, "calib/mu_c": 0.9874166666666667, "calib/mu_w": 0.9620769230769233, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.49424000000000007, "calib/std_conf": 0.08760606371707381, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4182626538987688, "calib/step_q_c_n": 731.0, "calib/step_q_gap": 0.05224459973628126, "calib/step_q_w": 0.36601805416248756, "calib/step_q_w_n": 997.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2212.0, "completions/max_terminated_length": 2212.0, "completions/mean_length": 570.68359375, "completions/mean_terminated_length": 582.0518188476562, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0736, "grad_norm": 0.04093863442540169, "learning_rate": 3.638888888888889e-06, "loss": -0.1506, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.031352706253528595, "mask/share_reasoning": 0.8293837904930115, "mask/share_step_conf": 0.11973226070404053, "num_tokens": 16546135.0, "reward": 0.8139256238937378, "reward_std": 0.19919061660766602, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.4989679753780365, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8406020402908325, "step": 69 }, { "adv/mean_abs_final_conf": 0.5300338268280029, "adv/mean_abs_reasoning": 0.46458911895751953, "adv/mean_abs_step_conf": 0.7436332702636719, "adv/ratio_final_to_reasoning": 1.14086577838356, "adv/ratio_step_to_reasoning": 1.6006256709849187, "adv/std_final_conf": 0.7751262784004211, "adv/std_reasoning": 0.7392660975456238, "adv/std_step_conf": 0.9322122931480408, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.646482577251808, "calib/avg_num_step_conf": 6.6171875, "calib/ece": 0.3817408906882591, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.854251012145749, "calib/gap": 0.15967521367521398, "calib/mean_conf": 0.9080566801619432, "calib/mu_c": 0.9836923076923079, "calib/mu_w": 0.8240170940170939, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3817408906882591, "calib/std_conf": 0.22664841243704695, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.411068493150685, "calib/step_q_c_n": 730.0, "calib/step_q_gap": 0.10200210310919117, "calib/step_q_w": 0.30906639004149383, "calib/step_q_w_n": 964.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2810.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 535.23046875, "completions/mean_terminated_length": 543.7261962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.07466666666666667, "grad_norm": 0.08670484274625778, "learning_rate": 3.6111111111111115e-06, "loss": -0.1004, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03494434431195259, "mask/share_reasoning": 0.8160851001739502, "mask/share_step_conf": 0.1333456039428711, "num_tokens": 16790146.0, "reward": 0.8777591586112976, "reward_std": 0.1863366961479187, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6109519004821777, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8492538928985596, "step": 70 }, { "adv/mean_abs_final_conf": 0.5125067234039307, "adv/mean_abs_reasoning": 0.44174620509147644, "adv/mean_abs_step_conf": 0.7183400988578796, "adv/ratio_final_to_reasoning": 1.160183647299927, "adv/ratio_step_to_reasoning": 1.6261375662732096, "adv/std_final_conf": 0.7567806243896484, "adv/std_reasoning": 0.7014018893241882, "adv/std_step_conf": 0.9308453798294067, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5626856803327391, "calib/avg_num_step_conf": 7.6796875, "calib/ece": 0.32253968253968246, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8452380952380952, "calib/gap": 0.046179441473559124, "calib/mean_conf": 0.9123809523809525, "calib/mu_c": 0.9305228758169934, "calib/mu_w": 0.8843434343434343, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.31388888888888883, "calib/std_conf": 0.19549054526493564, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.38961165048543694, "calib/step_q_c_n": 1030.0, "calib/step_q_gap": 0.07593109492988137, "calib/step_q_w": 0.3136805555555556, "calib/step_q_w_n": 936.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 525.77734375, "completions/mean_terminated_length": 529.9172973632812, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.07573333333333333, "grad_norm": 0.04918983578681946, "learning_rate": 3.5833333333333335e-06, "loss": -0.0623, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03495267033576965, "mask/share_reasoning": 0.8089569211006165, "mask/share_step_conf": 0.14827793836593628, "num_tokens": 17029153.0, "reward": 0.9064410924911499, "reward_std": 0.15928587317466736, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6419327855110168, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8545430302619934, "step": 71 }, { "adv/mean_abs_final_conf": 0.49696022272109985, "adv/mean_abs_reasoning": 0.3554950952529907, "adv/mean_abs_step_conf": 0.7681937217712402, "adv/ratio_final_to_reasoning": 1.3979383382699961, "adv/ratio_step_to_reasoning": 2.1609122939503553, "adv/std_final_conf": 0.7404859066009521, "adv/std_reasoning": 0.6185715794563293, "adv/std_step_conf": 0.9309424757957458, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6303114400246685, "calib/avg_num_step_conf": 5.97265625, "calib/ece": 0.3733203124999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.83984375, "calib/gap": 0.10738390379278451, "calib/mean_conf": 0.9241015625000001, "calib/mu_c": 0.9723404255319148, "calib/mu_w": 0.8649565217391303, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3733203124999999, "calib/std_conf": 0.1616177173457743, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4070370370370371, "calib/step_q_c_n": 810.0, "calib/step_q_gap": 0.06840004120949883, "calib/step_q_w": 0.33863699582753826, "calib/step_q_w_n": 719.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 456.06640625, "completions/mean_terminated_length": 457.85491943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.0768, "grad_norm": 0.0759405568242073, "learning_rate": 3.555555555555556e-06, "loss": -0.0303, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03548864275217056, "mask/share_reasoning": 0.8234069347381592, "mask/share_step_conf": 0.13719822466373444, "num_tokens": 17250314.0, "reward": 0.9098681807518005, "reward_std": 0.13719965517520905, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6402285099029541, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8693515062332153, "step": 72 }, { "adv/mean_abs_final_conf": 0.5311991572380066, "adv/mean_abs_reasoning": 0.4925178289413452, "adv/mean_abs_step_conf": 0.7761179208755493, "adv/ratio_final_to_reasoning": 1.0785379249717841, "adv/ratio_step_to_reasoning": 1.575816905032241, "adv/std_final_conf": 0.7763379812240601, "adv/std_reasoning": 0.7574392557144165, "adv/std_step_conf": 0.9311206936836243, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6871885382059801, "calib/avg_num_step_conf": 5.421875, "calib/ece": 0.23546875000000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.78515625, "calib/gap": 0.185753045404208, "calib/mean_conf": 0.8849218750000001, "calib/mu_c": 0.9458720930232558, "calib/mu_w": 0.7601190476190478, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22425781250000004, "calib/std_conf": 0.21694611980048037, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4082613510520487, "calib/step_q_c_n": 903.0, "calib/step_q_gap": 0.05640568094895598, "calib/step_q_w": 0.3518556701030927, "calib/step_q_w_n": 485.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 430.1015625, "completions/mean_terminated_length": 431.78826904296875, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.07786666666666667, "grad_norm": 0.08444429188966751, "learning_rate": 3.5277777777777784e-06, "loss": -0.057, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03705844283103943, "mask/share_reasoning": 0.8286069631576538, "mask/share_step_conf": 0.13042829930782318, "num_tokens": 17467452.0, "reward": 0.9845550060272217, "reward_std": 0.14298292994499207, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7689882516860962, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8657466173171997, "step": 73 }, { "adv/mean_abs_final_conf": 0.5872247219085693, "adv/mean_abs_reasoning": 0.41632869839668274, "adv/mean_abs_step_conf": 0.775867223739624, "adv/ratio_final_to_reasoning": 1.4104834093110126, "adv/ratio_step_to_reasoning": 1.8635929416529649, "adv/std_final_conf": 0.8109135031700134, "adv/std_reasoning": 0.6815744638442993, "adv/std_step_conf": 0.931576669216156, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6481874999999999, "calib/avg_num_step_conf": 6.21484375, "calib/ece": 0.2945059288537548, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6245059288537549, "calib/gap": 0.15327250000000014, "calib/mean_conf": 0.7945849802371543, "calib/mu_c": 0.8703125, "calib/mu_w": 0.7170399999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.29158102766798405, "calib/std_conf": 0.276334572824399, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.37592087312414735, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.07019826240153665, "calib/step_q_w": 0.3057226107226107, "calib/step_q_w_n": 858.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 501.54296875, "completions/mean_terminated_length": 503.50982666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.07893333333333333, "grad_norm": 0.0911051332950592, "learning_rate": 3.5e-06, "loss": -0.0789, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03497898578643799, "mask/share_reasoning": 0.8209805488586426, "mask/share_step_conf": 0.14013421535491943, "num_tokens": 17699775.0, "reward": 0.9126991033554077, "reward_std": 0.14510872960090637, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6552554368972778, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.873267650604248, "step": 74 }, { "adv/mean_abs_final_conf": 0.41595423221588135, "adv/mean_abs_reasoning": 0.33379027247428894, "adv/mean_abs_step_conf": 0.7602725625038147, "adv/ratio_final_to_reasoning": 1.2461544464209073, "adv/ratio_step_to_reasoning": 2.2776953829964492, "adv/std_final_conf": 0.6846178770065308, "adv/std_reasoning": 0.6185281872749329, "adv/std_step_conf": 0.9292907118797302, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8232046332046332, "calib/avg_num_step_conf": 6.3046875, "calib/ece": 0.10133333333333334, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6705882352941176, "calib/gap": 0.3401737451737451, "calib/mean_conf": 0.8240784313725492, "calib/mu_c": 0.9174594594594595, "calib/mu_w": 0.5772857142857144, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0999607843137255, "calib/std_conf": 0.25405386514977374, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39514535714285715, "calib/step_q_c_n": 1120.0, "calib/step_q_gap": 0.07751377819548871, "calib/step_q_w": 0.31763157894736843, "calib/step_q_w_n": 494.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 433.2734375, "completions/mean_terminated_length": 434.9725646972656, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.08, "grad_norm": 0.07387439161539078, "learning_rate": 3.4722222222222224e-06, "loss": -0.0055, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03749409317970276, "mask/share_reasoning": 0.8042377829551697, "mask/share_step_conf": 0.15436190366744995, "num_tokens": 17915445.0, "reward": 1.0359642505645752, "reward_std": 0.08390337228775024, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.8587093353271484, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8694689273834229, "step": 75 }, { "adv/mean_abs_final_conf": 0.43272048234939575, "adv/mean_abs_reasoning": 0.2551872134208679, "adv/mean_abs_step_conf": 0.7533714771270752, "adv/ratio_final_to_reasoning": 1.695698136864447, "adv/ratio_step_to_reasoning": 2.9522305096242265, "adv/std_final_conf": 0.7211237549781799, "adv/std_reasoning": 0.5483068227767944, "adv/std_step_conf": 0.9313459396362305, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7246901597730327, "calib/avg_num_step_conf": 5.453125, "calib/ece": 0.14231372549019605, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5294117647058824, "calib/gap": 0.2826183365686128, "calib/mean_conf": 0.7153333333333334, "calib/mu_c": 0.7973480662983425, "calib/mu_w": 0.5147297297297297, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.07392156862745095, "calib/std_conf": 0.31995800378019384, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4056301652892562, "calib/step_q_c_n": 968.0, "calib/step_q_gap": 0.08953203444813468, "calib/step_q_w": 0.31609813084112154, "calib/step_q_w_n": 428.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 446.8671875, "completions/mean_terminated_length": 446.8671875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.08106666666666666, "grad_norm": 0.08966308832168579, "learning_rate": 3.444444444444445e-06, "loss": -0.031, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0387980118393898, "mask/share_reasoning": 0.823246955871582, "mask/share_step_conf": 0.13795502483844757, "num_tokens": 18132899.0, "reward": 1.003382682800293, "reward_std": 0.09004627168178558, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7974995970726013, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8702032566070557, "step": 76 }, { "adv/mean_abs_final_conf": 0.6654222011566162, "adv/mean_abs_reasoning": 0.4536302089691162, "adv/mean_abs_step_conf": 0.7507677674293518, "adv/ratio_final_to_reasoning": 1.4668824694651654, "adv/ratio_step_to_reasoning": 1.6550215408614137, "adv/std_final_conf": 0.8907448053359985, "adv/std_reasoning": 0.7392147779464722, "adv/std_step_conf": 0.9320644736289978, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6540658602150538, "calib/avg_num_step_conf": 6.3203125, "calib/ece": 0.21644268774703546, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.30039525691699603, "calib/gap": 0.174858870967742, "calib/mean_conf": 0.555098814229249, "calib/mu_c": 0.619375, "calib/mu_w": 0.444516129032258, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.06956521739130427, "calib/std_conf": 0.318255636736884, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.33561269146608314, "calib/step_q_c_n": 914.0, "calib/step_q_gap": 0.062359850556992225, "calib/step_q_w": 0.2732528409090909, "calib/step_q_w_n": 704.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2531.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 461.9609375, "completions/mean_terminated_length": 463.7725830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.08213333333333334, "grad_norm": 0.15110744535923004, "learning_rate": 3.416666666666667e-06, "loss": 0.0339, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03926243633031845, "mask/share_reasoning": 0.8083683848381042, "mask/share_step_conf": 0.1484629511833191, "num_tokens": 18355825.0, "reward": 0.9423686265945435, "reward_std": 0.1237025111913681, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7302652597427368, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8341595530509949, "step": 77 }, { "adv/mean_abs_final_conf": 0.651028573513031, "adv/mean_abs_reasoning": 0.4202715754508972, "adv/mean_abs_step_conf": 0.7488130331039429, "adv/ratio_final_to_reasoning": 1.5490663931163113, "adv/ratio_step_to_reasoning": 1.781736088862453, "adv/std_final_conf": 0.8752011060714722, "adv/std_reasoning": 0.6816163063049316, "adv/std_step_conf": 0.9324920773506165, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7661667315932996, "calib/avg_num_step_conf": 6.45703125, "calib/ece": 0.10632411067193673, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3241106719367589, "calib/gap": 0.2798311907544474, "calib/mean_conf": 0.6242687747035573, "calib/mu_c": 0.7370860927152317, "calib/mu_w": 0.45725490196078433, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06687747035573119, "calib/std_conf": 0.2997402678243543, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.34968652037617554, "calib/step_q_c_n": 957.0, "calib/step_q_gap": 0.07189916405433644, "calib/step_q_w": 0.2777873563218391, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 486.23046875, "completions/mean_terminated_length": 490.0590515136719, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.0832, "grad_norm": 0.09350687265396118, "learning_rate": 3.3888888888888893e-06, "loss": -0.0862, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034544557332992554, "mask/share_reasoning": 0.8210122585296631, "mask/share_step_conf": 0.13663063943386078, "num_tokens": 18588324.0, "reward": 0.9894336462020874, "reward_std": 0.11710391193628311, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7940328121185303, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8692094683647156, "step": 78 }, { "adv/mean_abs_final_conf": 0.5458283424377441, "adv/mean_abs_reasoning": 0.4055037498474121, "adv/mean_abs_step_conf": 0.7363581657409668, "adv/ratio_final_to_reasoning": 1.3460500492119618, "adv/ratio_step_to_reasoning": 1.8159096334326197, "adv/std_final_conf": 0.8096640110015869, "adv/std_reasoning": 0.681533694267273, "adv/std_step_conf": 0.9306913614273071, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6668287622382156, "calib/avg_num_step_conf": 6.5078125, "calib/ece": 0.17761718750000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.47265625, "calib/gap": 0.167319587628866, "calib/mean_conf": 0.7166015625000001, "calib/mu_c": 0.78, "calib/mu_w": 0.612680412371134, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13656250000000003, "calib/std_conf": 0.28650059554485846, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3289990186457311, "calib/step_q_c_n": 1019.0, "calib/step_q_gap": 0.07335759669828129, "calib/step_q_w": 0.2556414219474498, "calib/step_q_w_n": 647.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1188.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 488.79296875, "completions/mean_terminated_length": 490.7098388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.08426666666666667, "grad_norm": 0.09503999352455139, "learning_rate": 3.3611111111111117e-06, "loss": -0.0556, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03435365483164787, "mask/share_reasoning": 0.8235607147216797, "mask/share_step_conf": 0.13817936182022095, "num_tokens": 18819831.0, "reward": 0.9725010991096497, "reward_std": 0.09718985855579376, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7486811876296997, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8728834390640259, "step": 79 }, { "adv/mean_abs_final_conf": 0.6295976638793945, "adv/mean_abs_reasoning": 0.4713048040866852, "adv/mean_abs_step_conf": 0.7639672160148621, "adv/ratio_final_to_reasoning": 1.335860908737088, "adv/ratio_step_to_reasoning": 1.6209620809940835, "adv/std_final_conf": 0.8431612253189087, "adv/std_reasoning": 0.7205964922904968, "adv/std_step_conf": 0.9322971701622009, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7113510220657913, "calib/avg_num_step_conf": 6.55078125, "calib/ece": 0.19917647058823534, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6901960784313725, "calib/gap": 0.2162758900771624, "calib/mean_conf": 0.8442745098039216, "calib/mu_c": 0.9197590361445782, "calib/mu_w": 0.7034831460674158, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19623529411764712, "calib/std_conf": 0.24391288049592322, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3613254203758655, "calib/step_q_c_n": 1011.0, "calib/step_q_gap": 0.1055146095650547, "calib/step_q_w": 0.2558108108108108, "calib/step_q_w_n": 666.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2055.0, "completions/max_terminated_length": 2055.0, "completions/mean_length": 447.6796875, "completions/mean_terminated_length": 447.6796875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.08533333333333333, "grad_norm": 0.0750783234834671, "learning_rate": 3.3333333333333333e-06, "loss": -0.0, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.038063328713178635, "mask/share_reasoning": 0.8109001517295837, "mask/share_step_conf": 0.15103650093078613, "num_tokens": 19036597.0, "reward": 0.9909579157829285, "reward_std": 0.1509701907634735, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7711926102638245, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8818169832229614, "step": 80 }, { "adv/mean_abs_final_conf": 0.4786444902420044, "adv/mean_abs_reasoning": 0.3697603940963745, "adv/mean_abs_step_conf": 0.746623158454895, "adv/ratio_final_to_reasoning": 1.2944720361728366, "adv/ratio_step_to_reasoning": 2.019208034109502, "adv/std_final_conf": 0.7590821385383606, "adv/std_reasoning": 0.6612280607223511, "adv/std_step_conf": 0.9312000274658203, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6862087116725885, "calib/avg_num_step_conf": 6.4765625, "calib/ece": 0.2549003984063745, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7131474103585658, "calib/gap": 0.24628372154230815, "calib/mean_conf": 0.8377689243027889, "calib/mu_c": 0.9378523489932883, "calib/mu_w": 0.6915686274509801, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24952191235059762, "calib/std_conf": 0.2693621697204938, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3830434782608696, "calib/step_q_c_n": 805.0, "calib/step_q_gap": 0.15963198939803255, "calib/step_q_w": 0.22341148886283704, "calib/step_q_w_n": 853.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2855.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 494.91796875, "completions/mean_terminated_length": 496.8588562011719, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.0864, "grad_norm": 0.08149909973144531, "learning_rate": 3.3055555555555558e-06, "loss": 0.04, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03588252514600754, "mask/share_reasoning": 0.8174002170562744, "mask/share_step_conf": 0.14281101524829865, "num_tokens": 19269544.0, "reward": 0.943608283996582, "reward_std": 0.14294570684432983, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7283687591552734, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8471290469169617, "step": 81 }, { "adv/mean_abs_final_conf": 0.6110142469406128, "adv/mean_abs_reasoning": 0.5129064321517944, "adv/mean_abs_step_conf": 0.75091153383255, "adv/ratio_final_to_reasoning": 1.1912781915742936, "adv/ratio_step_to_reasoning": 1.4640322030711406, "adv/std_final_conf": 0.8275887966156006, "adv/std_reasoning": 0.7753466367721558, "adv/std_step_conf": 0.9330487847328186, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6737028681920723, "calib/avg_num_step_conf": 6.59375, "calib/ece": 0.28015873015873016, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7103174603174603, "calib/gap": 0.17536577505639717, "calib/mean_conf": 0.8317460317460318, "calib/mu_c": 0.9062068965517242, "calib/mu_w": 0.730841121495327, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26825396825396824, "calib/std_conf": 0.2665091182366948, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.383, "calib/step_q_c_n": 790.0, "calib/step_q_gap": 0.14293318485523387, "calib/step_q_w": 0.24006681514476613, "calib/step_q_w_n": 898.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2664.0, "completions/max_terminated_length": 2664.0, "completions/mean_length": 440.87890625, "completions/mean_terminated_length": 446.10675048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.08746666666666666, "grad_norm": 0.09818227589130402, "learning_rate": 3.277777777777778e-06, "loss": -0.0946, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.037333473563194275, "mask/share_reasoning": 0.805363118648529, "mask/share_step_conf": 0.14558462798595428, "num_tokens": 19487961.0, "reward": 0.9272757768630981, "reward_std": 0.1902877688407898, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6936218738555908, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8507733345031738, "step": 82 }, { "adv/mean_abs_final_conf": 0.5640679597854614, "adv/mean_abs_reasoning": 0.3593941926956177, "adv/mean_abs_step_conf": 0.7403038740158081, "adv/ratio_final_to_reasoning": 1.569496589676919, "adv/ratio_step_to_reasoning": 2.059865988549222, "adv/std_final_conf": 0.7934759259223938, "adv/std_reasoning": 0.6402297019958496, "adv/std_step_conf": 0.9326952695846558, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6180901432699993, "calib/avg_num_step_conf": 6.44921875, "calib/ece": 0.3313281249999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.73828125, "calib/gap": 0.1460898973129191, "calib/mean_conf": 0.8453906250000001, "calib/mu_c": 0.912158273381295, "calib/mu_w": 0.7660683760683759, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3168749999999999, "calib/std_conf": 0.27356558384071156, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3712214765100671, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.10336275686326801, "calib/step_q_w": 0.2678587196467991, "calib/step_q_w_n": 906.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 507.12890625, "completions/mean_terminated_length": 509.11767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.08853333333333334, "grad_norm": 0.09128120541572571, "learning_rate": 3.2500000000000002e-06, "loss": 0.0273, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0362110435962677, "mask/share_reasoning": 0.8244807124137878, "mask/share_step_conf": 0.13540202379226685, "num_tokens": 19725050.0, "reward": 0.9137445688247681, "reward_std": 0.13943439722061157, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6542484164237976, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8654282093048096, "step": 83 }, { "adv/mean_abs_final_conf": 0.558834969997406, "adv/mean_abs_reasoning": 0.36879509687423706, "adv/mean_abs_step_conf": 0.7573169469833374, "adv/ratio_final_to_reasoning": 1.5152993484291752, "adv/ratio_step_to_reasoning": 2.0534897383453834, "adv/std_final_conf": 0.7958536744117737, "adv/std_reasoning": 0.6402270197868347, "adv/std_step_conf": 0.9319294095039368, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6387340517775301, "calib/avg_num_step_conf": 5.2890625, "calib/ece": 0.3536078431372548, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.796078431372549, "calib/gap": 0.1412244518766257, "calib/mean_conf": 0.8860000000000001, "calib/mu_c": 0.9507971014492754, "calib/mu_w": 0.8095726495726497, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3492156862745097, "calib/std_conf": 0.22265175481358468, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.41620886981402005, "calib/step_q_c_n": 699.0, "calib/step_q_gap": 0.11069741943234063, "calib/step_q_w": 0.30551145038167943, "calib/step_q_w_n": 655.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 417.390625, "completions/mean_terminated_length": 417.390625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.0896, "grad_norm": 0.07120621204376221, "learning_rate": 3.2222222222222227e-06, "loss": 0.0132, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.040930844843387604, "mask/share_reasoning": 0.8206195831298828, "mask/share_step_conf": 0.13844957947731018, "num_tokens": 19937822.0, "reward": 0.9140485525131226, "reward_std": 0.1578373908996582, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.650799572467804, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8702661991119385, "step": 84 }, { "adv/mean_abs_final_conf": 0.5749181509017944, "adv/mean_abs_reasoning": 0.47961947321891785, "adv/mean_abs_step_conf": 0.7351856827735901, "adv/ratio_final_to_reasoning": 1.1986964312422286, "adv/ratio_step_to_reasoning": 1.5328520292127952, "adv/std_final_conf": 0.8260165452957153, "adv/std_reasoning": 0.7574634552001953, "adv/std_step_conf": 0.9331307411193848, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7308958306441203, "calib/avg_num_step_conf": 6.45703125, "calib/ece": 0.31855421686746993, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7108433734939759, "calib/gap": 0.2720536982057571, "calib/mean_conf": 0.8285943775100402, "calib/mu_c": 0.9618897637795275, "calib/mu_w": 0.6898360655737704, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.31855421686746993, "calib/std_conf": 0.2787479755334017, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.39964529331514326, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.14666703244557805, "calib/step_q_w": 0.2529782608695652, "calib/step_q_w_n": 920.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2173.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 486.53125, "completions/mean_terminated_length": 492.3004150390625, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.09066666666666667, "grad_norm": 0.08083637058734894, "learning_rate": 3.1944444444444443e-06, "loss": -0.0126, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03679840639233589, "mask/share_reasoning": 0.8089617490768433, "mask/share_step_conf": 0.14252111315727234, "num_tokens": 20170198.0, "reward": 0.9124147891998291, "reward_std": 0.1984560340642929, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6802054643630981, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8524364829063416, "step": 85 }, { "adv/mean_abs_final_conf": 0.5518361330032349, "adv/mean_abs_reasoning": 0.40247124433517456, "adv/mean_abs_step_conf": 0.7445517778396606, "adv/ratio_final_to_reasoning": 1.3711194048528608, "adv/ratio_step_to_reasoning": 1.8499502469289566, "adv/std_final_conf": 0.7940528392791748, "adv/std_reasoning": 0.681560218334198, "adv/std_step_conf": 0.9328047633171082, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.68415503875969, "calib/avg_num_step_conf": 6.01171875, "calib/ece": 0.3247244094488189, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7007874015748031, "calib/gap": 0.1996527131782947, "calib/mean_conf": 0.8325984251968505, "calib/mu_c": 0.9308527131782947, "calib/mu_w": 0.7312, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3247244094488189, "calib/std_conf": 0.2575924651610683, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.38953191489361705, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.10978371345476812, "calib/step_q_w": 0.27974820143884893, "calib/step_q_w_n": 834.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 453.5703125, "completions/mean_terminated_length": 455.34906005859375, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.09173333333333333, "grad_norm": 0.0909588634967804, "learning_rate": 3.1666666666666667e-06, "loss": 0.0044, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04099898040294647, "mask/share_reasoning": 0.8147042989730835, "mask/share_step_conf": 0.14039045572280884, "num_tokens": 20391824.0, "reward": 0.9142040014266968, "reward_std": 0.1749543696641922, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6614238023757935, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8701090812683105, "step": 86 }, { "adv/mean_abs_final_conf": 0.4856010675430298, "adv/mean_abs_reasoning": 0.42319798469543457, "adv/mean_abs_step_conf": 0.7525032758712769, "adv/ratio_final_to_reasoning": 1.1474560019289912, "adv/ratio_step_to_reasoning": 1.7781353009344678, "adv/std_final_conf": 0.7397770881652832, "adv/std_reasoning": 0.6816813945770264, "adv/std_step_conf": 0.9327853918075562, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5689840392879066, "calib/avg_num_step_conf": 5.2734375, "calib/ece": 0.24723320158102774, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9288537549407114, "calib/gap": 0.060748925721301306, "calib/mean_conf": 0.9579051383399211, "calib/mu_c": 0.9751933701657458, "calib/mu_w": 0.9144444444444445, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24486166007905144, "calib/std_conf": 0.1267812177227895, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4237377049180328, "calib/step_q_c_n": 915.0, "calib/step_q_gap": 0.08746184284906727, "calib/step_q_w": 0.33627586206896554, "calib/step_q_w_n": 435.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2326.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 413.2265625, "completions/mean_terminated_length": 414.8470764160156, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.0928, "grad_norm": 0.07672244310379028, "learning_rate": 3.138888888888889e-06, "loss": 0.1069, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04202680289745331, "mask/share_reasoning": 0.8173971176147461, "mask/share_step_conf": 0.13666987419128418, "num_tokens": 20603106.0, "reward": 0.9694166779518127, "reward_std": 0.15908363461494446, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7375199198722839, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8622509837150574, "step": 87 }, { "adv/mean_abs_final_conf": 0.5365962982177734, "adv/mean_abs_reasoning": 0.47331148386001587, "adv/mean_abs_step_conf": 0.7489129304885864, "adv/ratio_final_to_reasoning": 1.1337064840296045, "adv/ratio_step_to_reasoning": 1.5822834560888892, "adv/std_final_conf": 0.7939905524253845, "adv/std_reasoning": 0.7574472427368164, "adv/std_step_conf": 0.9319527745246887, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7550013702384215, "calib/avg_num_step_conf": 5.91796875, "calib/ece": 0.20233201581027666, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7391304347826086, "calib/gap": 0.300604275143875, "calib/mean_conf": 0.8419367588932808, "calib/mu_c": 0.9476829268292682, "calib/mu_w": 0.6470786516853932, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19802371541501973, "calib/std_conf": 0.27218302802277267, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.37860119047619045, "calib/step_q_c_n": 1008.0, "calib/step_q_gap": 0.09512979008171313, "calib/step_q_w": 0.2834714003944773, "calib/step_q_w_n": 507.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 489.859375, "completions/mean_terminated_length": 489.859375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.09386666666666667, "grad_norm": 0.10223963111639023, "learning_rate": 3.1111111111111116e-06, "loss": 0.1124, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03621866554021835, "mask/share_reasoning": 0.8318451642990112, "mask/share_step_conf": 0.13193616271018982, "num_tokens": 20838358.0, "reward": 0.9859171509742737, "reward_std": 0.17374292016029358, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7881089448928833, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8579440116882324, "step": 88 }, { "adv/mean_abs_final_conf": 0.5738560557365417, "adv/mean_abs_reasoning": 0.3232640326023102, "adv/mean_abs_step_conf": 0.7356339693069458, "adv/ratio_final_to_reasoning": 1.7751930244665293, "adv/ratio_step_to_reasoning": 2.2756443498678567, "adv/std_final_conf": 0.7938140630722046, "adv/std_reasoning": 0.6185097694396973, "adv/std_step_conf": 0.9329736828804016, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.715201870221773, "calib/avg_num_step_conf": 6.11328125, "calib/ece": 0.24146825396825383, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5952380952380952, "calib/gap": 0.23565236620964214, "calib/mean_conf": 0.7692460317460318, "calib/mu_c": 0.8805263157894739, "calib/mu_w": 0.6448739495798318, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24146825396825383, "calib/std_conf": 0.28588456951613805, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3869816272965879, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.11916095481838118, "calib/step_q_w": 0.26782067247820673, "calib/step_q_w_n": 803.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 465.7109375, "completions/mean_terminated_length": 473.10321044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.09493333333333333, "grad_norm": 0.0948285236954689, "learning_rate": 3.0833333333333336e-06, "loss": -0.1544, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.039537109434604645, "mask/share_reasoning": 0.8073158264160156, "mask/share_step_conf": 0.13752208650112152, "num_tokens": 21066468.0, "reward": 0.9474252462387085, "reward_std": 0.14904716610908508, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.716819167137146, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8772500157356262, "step": 89 }, { "adv/mean_abs_final_conf": 0.5179926156997681, "adv/mean_abs_reasoning": 0.36165764927864075, "adv/mean_abs_step_conf": 0.7658787965774536, "adv/ratio_final_to_reasoning": 1.4322733577817355, "adv/ratio_step_to_reasoning": 2.117690025650139, "adv/std_final_conf": 0.7583214640617371, "adv/std_reasoning": 0.6402938365936279, "adv/std_step_conf": 0.9270572662353516, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6684695512820513, "calib/avg_num_step_conf": 6.22265625, "calib/ece": 0.23126984126984115, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6388888888888888, "calib/gap": 0.20793269230769218, "calib/mean_conf": 0.7830952380952382, "calib/mu_c": 0.8623076923076922, "calib/mu_w": 0.654375, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19765873015873003, "calib/std_conf": 0.2968350626556248, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.36621276595744684, "calib/step_q_c_n": 940.0, "calib/step_q_gap": 0.10798918249649736, "calib/step_q_w": 0.2582235834609495, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 470.0390625, "completions/mean_terminated_length": 471.88238525390625, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.096, "grad_norm": 0.11417663842439651, "learning_rate": 3.055555555555556e-06, "loss": 0.0411, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.038944244384765625, "mask/share_reasoning": 0.8182889223098755, "mask/share_step_conf": 0.1388605684041977, "num_tokens": 21290118.0, "reward": 0.9442144632339478, "reward_std": 0.1354803889989853, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.731640636920929, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8388195037841797, "step": 90 }, { "adv/mean_abs_final_conf": 0.544395923614502, "adv/mean_abs_reasoning": 0.48889443278312683, "adv/mean_abs_step_conf": 0.7743960022926331, "adv/ratio_final_to_reasoning": 1.1135244893573897, "adv/ratio_step_to_reasoning": 1.583973860950375, "adv/std_final_conf": 0.7938852310180664, "adv/std_reasoning": 0.739333987236023, "adv/std_step_conf": 0.9330266118049622, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6385478547854786, "calib/avg_num_step_conf": 6.1171875, "calib/ece": 0.23681274900398405, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6454183266932271, "calib/gap": 0.166839603960396, "calib/mean_conf": 0.8036653386454184, "calib/mu_c": 0.8708, "calib/mu_w": 0.703960396039604, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22143426294820714, "calib/std_conf": 0.27356309052173255, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.38259675405742827, "calib/step_q_c_n": 801.0, "calib/step_q_gap": 0.0902045971946831, "calib/step_q_w": 0.29239215686274517, "calib/step_q_w_n": 765.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 476.4375, "completions/mean_terminated_length": 478.305908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.09706666666666666, "grad_norm": 0.07549665868282318, "learning_rate": 3.0277777777777776e-06, "loss": 0.0288, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034715428948402405, "mask/share_reasoning": 0.8294082880020142, "mask/share_step_conf": 0.13197004795074463, "num_tokens": 21519798.0, "reward": 0.9312552213668823, "reward_std": 0.15992335975170135, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7083617448806763, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8400861024856567, "step": 91 }, { "adv/mean_abs_final_conf": 0.6064884662628174, "adv/mean_abs_reasoning": 0.5143711566925049, "adv/mean_abs_step_conf": 0.7511153221130371, "adv/ratio_final_to_reasoning": 1.1790872376333126, "adv/ratio_step_to_reasoning": 1.4602594106225513, "adv/std_final_conf": 0.8270331621170044, "adv/std_reasoning": 0.7575141787528992, "adv/std_step_conf": 0.9315164685249329, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7373693379790942, "calib/avg_num_step_conf": 5.2265625, "calib/ece": 0.14392156862745092, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5411764705882353, "calib/gap": 0.2554147681586706, "calib/mean_conf": 0.7441568627450981, "calib/mu_c": 0.8353048780487805, "calib/mu_w": 0.5798901098901099, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12247058823529408, "calib/std_conf": 0.28358899924383246, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3873967889908257, "calib/step_q_c_n": 872.0, "calib/step_q_gap": 0.0632766173170059, "calib/step_q_w": 0.3241201716738198, "calib/step_q_w_n": 466.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 402.015625, "completions/mean_terminated_length": 403.5921936035156, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.09813333333333334, "grad_norm": 0.0977119654417038, "learning_rate": 3e-06, "loss": -0.0141, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0412004217505455, "mask/share_reasoning": 0.8171786665916443, "mask/share_step_conf": 0.1377146691083908, "num_tokens": 21729434.0, "reward": 1.0000758171081543, "reward_std": 0.12583737075328827, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7939882278442383, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8780382871627808, "step": 92 }, { "adv/mean_abs_final_conf": 0.5656777620315552, "adv/mean_abs_reasoning": 0.33399784564971924, "adv/mean_abs_step_conf": 0.7464229464530945, "adv/ratio_final_to_reasoning": 1.6936569184485417, "adv/ratio_step_to_reasoning": 2.2348136557620397, "adv/std_final_conf": 0.8103609085083008, "adv/std_reasoning": 0.5963025689125061, "adv/std_step_conf": 0.9319359064102173, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7271986417657047, "calib/avg_num_step_conf": 6.34375, "calib/ece": 0.14815999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.532, "calib/gap": 0.22942614601018663, "calib/mean_conf": 0.75656, "calib/mu_c": 0.8437419354838711, "calib/mu_w": 0.6143157894736845, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14235999999999996, "calib/std_conf": 0.2669804607082698, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.411528384279476, "calib/step_q_c_n": 916.0, "calib/step_q_gap": 0.08777132213258337, "calib/step_q_w": 0.32375706214689265, "calib/step_q_w_n": 708.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2319.0, "completions/max_terminated_length": 2319.0, "completions/mean_length": 528.00390625, "completions/mean_terminated_length": 528.00390625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.0992, "grad_norm": 0.10494104772806168, "learning_rate": 2.9722222222222225e-06, "loss": -0.0203, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.036547817289829254, "mask/share_reasoning": 0.8225446939468384, "mask/share_step_conf": 0.14090751111507416, "num_tokens": 21970379.0, "reward": 0.9656751751899719, "reward_std": 0.13884666562080383, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7642366886138916, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8507072925567627, "step": 93 }, { "adv/mean_abs_final_conf": 0.5616555213928223, "adv/mean_abs_reasoning": 0.3449043333530426, "adv/mean_abs_step_conf": 0.7426949739456177, "adv/ratio_final_to_reasoning": 1.6284385758004207, "adv/ratio_step_to_reasoning": 2.1533361634670976, "adv/std_final_conf": 0.810353696346283, "adv/std_reasoning": 0.6401544809341431, "adv/std_step_conf": 0.9311507940292358, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7501669337606839, "calib/avg_num_step_conf": 5.984375, "calib/ece": 0.15476190476190482, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5158730158730159, "calib/gap": 0.23957532051282038, "calib/mean_conf": 0.7634126984126984, "calib/mu_c": 0.854679487179487, "calib/mu_w": 0.6151041666666667, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14956349206349212, "calib/std_conf": 0.2580056683836915, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4527167630057804, "calib/step_q_c_n": 865.0, "calib/step_q_gap": 0.1465848289727969, "calib/step_q_w": 0.3061319340329835, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2253.0, "completions/max_terminated_length": 2253.0, "completions/mean_length": 452.3125, "completions/mean_terminated_length": 457.6759033203125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.10026666666666667, "grad_norm": 0.10385382175445557, "learning_rate": 2.944444444444445e-06, "loss": -0.0691, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0380202978849411, "mask/share_reasoning": 0.8159376978874207, "mask/share_step_conf": 0.13432320952415466, "num_tokens": 22194851.0, "reward": 0.9824118614196777, "reward_std": 0.12148786336183548, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7774211168289185, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8686525821685791, "step": 94 }, { "adv/mean_abs_final_conf": 0.5447875261306763, "adv/mean_abs_reasoning": 0.4278485178947449, "adv/mean_abs_step_conf": 0.7348989248275757, "adv/ratio_final_to_reasoning": 1.273318717595043, "adv/ratio_step_to_reasoning": 1.7176614948760167, "adv/std_final_conf": 0.7937759160995483, "adv/std_reasoning": 0.6816493272781372, "adv/std_step_conf": 0.9309818148612976, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6389552635072349, "calib/avg_num_step_conf": 5.6015625, "calib/ece": 0.23784313725490178, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7176470588235294, "calib/gap": 0.13794504181600975, "calib/mean_conf": 0.8537647058823531, "calib/mu_c": 0.9040740740740739, "calib/mu_w": 0.7661290322580642, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22815686274509786, "calib/std_conf": 0.23772274081253417, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4700542299349241, "calib/step_q_c_n": 922.0, "calib/step_q_gap": 0.06599172993492408, "calib/step_q_w": 0.4040625, "calib/step_q_w_n": 512.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 458.53125, "completions/mean_terminated_length": 460.3294372558594, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.10133333333333333, "grad_norm": 0.09247266501188278, "learning_rate": 2.916666666666667e-06, "loss": -0.0648, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.040411949157714844, "mask/share_reasoning": 0.8161001801490784, "mask/share_step_conf": 0.1395815908908844, "num_tokens": 22418363.0, "reward": 0.9670491218566895, "reward_std": 0.1566634327173233, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7251417636871338, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8831750750541687, "step": 95 }, { "adv/mean_abs_final_conf": 0.40070587396621704, "adv/mean_abs_reasoning": 0.34994882345199585, "adv/mean_abs_step_conf": 0.7387731075286865, "adv/ratio_final_to_reasoning": 1.1450413520855394, "adv/ratio_step_to_reasoning": 2.111088987930338, "adv/std_final_conf": 0.6824913024902344, "adv/std_reasoning": 0.6401877403259277, "adv/std_step_conf": 0.9318387508392334, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7476726726726727, "calib/avg_num_step_conf": 5.68359375, "calib/ece": 0.18413385826771655, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7952755905511811, "calib/gap": 0.2388753753753753, "calib/mean_conf": 0.8927952755905514, "calib/mu_c": 0.9623888888888888, "calib/mu_w": 0.7235135135135136, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18413385826771655, "calib/std_conf": 0.20217435519605295, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4986129032258065, "calib/step_q_c_n": 930.0, "calib/step_q_gap": 0.14602242703533025, "calib/step_q_w": 0.35259047619047623, "calib/step_q_w_n": 525.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1411.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 411.37109375, "completions/mean_terminated_length": 414.6102294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.1024, "grad_norm": 0.08027921617031097, "learning_rate": 2.888888888888889e-06, "loss": -0.0225, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04036235064268112, "mask/share_reasoning": 0.8120318651199341, "mask/share_step_conf": 0.1397933065891266, "num_tokens": 22629490.0, "reward": 0.9985183477401733, "reward_std": 0.1332436501979828, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.8084640502929688, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8502914905548096, "step": 96 }, { "adv/mean_abs_final_conf": 0.47524163126945496, "adv/mean_abs_reasoning": 0.38075166940689087, "adv/mean_abs_step_conf": 0.7631211280822754, "adv/ratio_final_to_reasoning": 1.2481669010401297, "adv/ratio_step_to_reasoning": 2.004248935457102, "adv/std_final_conf": 0.7217053174972534, "adv/std_reasoning": 0.6402749419212341, "adv/std_step_conf": 0.9316134452819824, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6074507389162562, "calib/avg_num_step_conf": 5.69140625, "calib/ece": 0.38542968750000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.859375, "calib/gap": 0.08488916256157597, "calib/mean_conf": 0.9283203125000001, "calib/mu_c": 0.966785714285714, "calib/mu_w": 0.8818965517241381, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38343750000000004, "calib/std_conf": 0.16455405806269968, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5338696808510639, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.12172783687943267, "calib/step_q_w": 0.4121418439716312, "calib/step_q_w_n": 705.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1227.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 405.8125, "completions/mean_terminated_length": 407.4039306640625, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.10346666666666667, "grad_norm": 0.09753809124231339, "learning_rate": 2.861111111111111e-06, "loss": -0.0108, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04243121296167374, "mask/share_reasoning": 0.8089368343353271, "mask/share_step_conf": 0.14472568035125732, "num_tokens": 22838450.0, "reward": 0.8967458009719849, "reward_std": 0.1497422307729721, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6216902732849121, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.862426221370697, "step": 97 }, { "adv/mean_abs_final_conf": 0.43978482484817505, "adv/mean_abs_reasoning": 0.45589399337768555, "adv/mean_abs_step_conf": 0.7524597644805908, "adv/ratio_final_to_reasoning": 0.9646646616022316, "adv/ratio_step_to_reasoning": 1.6505147587176374, "adv/std_final_conf": 0.721373438835144, "adv/std_reasoning": 0.7206504940986633, "adv/std_step_conf": 0.9322798848152161, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6187590187590187, "calib/avg_num_step_conf": 5.40625, "calib/ece": 0.24927710843373502, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8393574297188755, "calib/gap": 0.14336147186147152, "calib/mean_conf": 0.8998795180722894, "calib/mu_c": 0.9482424242424241, "calib/mu_w": 0.8048809523809526, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24325301204819286, "calib/std_conf": 0.21657146242178518, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5628714107365793, "calib/step_q_c_n": 801.0, "calib/step_q_gap": 0.1687890779750012, "calib/step_q_w": 0.3940823327615781, "calib/step_q_w_n": 583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 467.34765625, "completions/mean_terminated_length": 471.0275573730469, "completions/min_length": 0.0, "completions/min_terminated_length": 89.0, "epoch": 0.10453333333333334, "grad_norm": 0.07917863130569458, "learning_rate": 2.8333333333333335e-06, "loss": 0.0165, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.04089288413524628, "mask/share_reasoning": 0.8173341155052185, "mask/share_step_conf": 0.13396045565605164, "num_tokens": 23064275.0, "reward": 0.943149209022522, "reward_std": 0.15866869688034058, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7172074317932129, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8456535339355469, "step": 98 }, { "adv/mean_abs_final_conf": 0.5576080679893494, "adv/mean_abs_reasoning": 0.4876534342765808, "adv/mean_abs_step_conf": 0.7644209265708923, "adv/ratio_final_to_reasoning": 1.1434515350364427, "adv/ratio_step_to_reasoning": 1.5675495604883574, "adv/std_final_conf": 0.7939621210098267, "adv/std_reasoning": 0.7575070261955261, "adv/std_step_conf": 0.9333102107048035, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6657533383723859, "calib/avg_num_step_conf": 5.76171875, "calib/ece": 0.3549603174603174, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7380952380952381, "calib/gap": 0.16452380952380952, "calib/mean_conf": 0.8549603174603175, "calib/mu_c": 0.9372222222222221, "calib/mu_w": 0.7726984126984126, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3549603174603174, "calib/std_conf": 0.23729042217721738, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5305287009063445, "calib/step_q_c_n": 662.0, "calib/step_q_gap": 0.1325951215705511, "calib/step_q_w": 0.39793357933579343, "calib/step_q_w_n": 813.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2681.0, "completions/max_terminated_length": 2681.0, "completions/mean_length": 504.19140625, "completions/mean_terminated_length": 510.16998291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.1056, "grad_norm": 0.07687898725271225, "learning_rate": 2.805555555555556e-06, "loss": -0.0539, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03424970805644989, "mask/share_reasoning": 0.8309434056282043, "mask/share_step_conf": 0.12308812141418457, "num_tokens": 23299148.0, "reward": 0.8982889652252197, "reward_std": 0.18456372618675232, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6398026943206787, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8614625930786133, "step": 99 }, { "adv/mean_abs_final_conf": 0.47142088413238525, "adv/mean_abs_reasoning": 0.3072330355644226, "adv/mean_abs_step_conf": 0.7255579233169556, "adv/ratio_final_to_reasoning": 1.53440818389315, "adv/ratio_step_to_reasoning": 2.3615882386606692, "adv/std_final_conf": 0.7401483058929443, "adv/std_reasoning": 0.5961126685142517, "adv/std_step_conf": 0.929786741733551, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8007024265644955, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.245098814229249, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6877470355731226, "calib/gap": 0.3496711366538954, "calib/mean_conf": 0.8114229249011858, "calib/mu_c": 0.9606896551724139, "calib/mu_w": 0.6110185185185185, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.241699604743083, "calib/std_conf": 0.27266861372953083, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.530967365967366, "calib/step_q_c_n": 858.0, "calib/step_q_gap": 0.1171056458999124, "calib/step_q_w": 0.41386172006745364, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 488.44921875, "completions/mean_terminated_length": 492.2952880859375, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.10666666666666667, "grad_norm": 0.08317389339208603, "learning_rate": 2.7777777777777783e-06, "loss": -0.0064, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037921108305454254, "mask/share_reasoning": 0.8238252401351929, "mask/share_step_conf": 0.13044117391109467, "num_tokens": 23531599.0, "reward": 0.9863134026527405, "reward_std": 0.1421002447605133, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7859878540039062, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8757014274597168, "step": 100 }, { "adv/mean_abs_final_conf": 0.5873388051986694, "adv/mean_abs_reasoning": 0.4507104158401489, "adv/mean_abs_step_conf": 0.7487509250640869, "adv/ratio_final_to_reasoning": 1.3031400752162288, "adv/ratio_step_to_reasoning": 1.6612682972244477, "adv/std_final_conf": 0.8106685280799866, "adv/std_reasoning": 0.7014285326004028, "adv/std_step_conf": 0.9310752749443054, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7089002267573697, "calib/avg_num_step_conf": 6.23046875, "calib/ece": 0.2904365079365079, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5595238095238095, "calib/gap": 0.21246031746031757, "calib/mean_conf": 0.7738492063492064, "calib/mu_c": 0.880079365079365, "calib/mu_w": 0.6676190476190474, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.28214285714285703, "calib/std_conf": 0.2642507097033709, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4832345679012346, "calib/step_q_c_n": 810.0, "calib/step_q_gap": 0.08561673350633014, "calib/step_q_w": 0.39761783439490445, "calib/step_q_w_n": 785.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2518.0, "completions/max_terminated_length": 2518.0, "completions/mean_length": 509.09765625, "completions/mean_terminated_length": 511.0941467285156, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.10773333333333333, "grad_norm": 0.10075970739126205, "learning_rate": 2.7500000000000004e-06, "loss": -0.0249, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.035582467913627625, "mask/share_reasoning": 0.8273746967315674, "mask/share_step_conf": 0.13313661515712738, "num_tokens": 23768920.0, "reward": 0.9301528930664062, "reward_std": 0.16000109910964966, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.7002925872802734, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8647007346153259, "step": 101 }, { "adv/mean_abs_final_conf": 0.4486492872238159, "adv/mean_abs_reasoning": 0.3463757038116455, "adv/mean_abs_step_conf": 0.7536967992782593, "adv/ratio_final_to_reasoning": 1.2952677750971398, "adv/ratio_step_to_reasoning": 2.1759516934481917, "adv/std_final_conf": 0.740336537361145, "adv/std_reasoning": 0.6402208805084229, "adv/std_step_conf": 0.9320363402366638, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.728178953137846, "calib/avg_num_step_conf": 5.3125, "calib/ece": 0.20853754940711444, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7509881422924901, "calib/gap": 0.25324403946286633, "calib/mean_conf": 0.8557312252964429, "calib/mu_c": 0.9448170731707316, "calib/mu_w": 0.6915730337078653, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2080237154150196, "calib/std_conf": 0.24362536621728567, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5029542645241039, "calib/step_q_c_n": 809.0, "calib/step_q_gap": 0.0861121592609459, "calib/step_q_w": 0.41684210526315796, "calib/step_q_w_n": 551.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2359.0, "completions/max_terminated_length": 2359.0, "completions/mean_length": 416.1171875, "completions/mean_terminated_length": 419.3937072753906, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.1088, "grad_norm": 0.07325369119644165, "learning_rate": 2.7222222222222224e-06, "loss": -0.0532, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04374309629201889, "mask/share_reasoning": 0.8045555353164673, "mask/share_step_conf": 0.14388886094093323, "num_tokens": 23982142.0, "reward": 0.978011965751648, "reward_std": 0.1434057354927063, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7757730484008789, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8552509546279907, "step": 102 }, { "adv/mean_abs_final_conf": 0.4731312394142151, "adv/mean_abs_reasoning": 0.3409392833709717, "adv/mean_abs_step_conf": 0.7478026747703552, "adv/ratio_final_to_reasoning": 1.387728732037625, "adv/ratio_step_to_reasoning": 2.193360258684772, "adv/std_final_conf": 0.739928662776947, "adv/std_reasoning": 0.6185459494590759, "adv/std_step_conf": 0.9310590624809265, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7711931183925406, "calib/avg_num_step_conf": 5.1953125, "calib/ece": 0.1794488188976377, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6535433070866141, "calib/gap": 0.30495239345984615, "calib/mean_conf": 0.7965354330708664, "calib/mu_c": 0.9129936305732482, "calib/mu_w": 0.6080412371134021, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17893700787401562, "calib/std_conf": 0.277089422311816, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5100984009840099, "calib/step_q_c_n": 813.0, "calib/step_q_gap": 0.09677151510393256, "calib/step_q_w": 0.4133268858800774, "calib/step_q_w_n": 517.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2422.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 509.796875, "completions/mean_terminated_length": 509.796875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.10986666666666667, "grad_norm": 0.1110922247171402, "learning_rate": 2.6944444444444444e-06, "loss": -0.0246, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03668327257037163, "mask/share_reasoning": 0.8423339128494263, "mask/share_step_conf": 0.1209828332066536, "num_tokens": 24217202.0, "reward": 0.9978973269462585, "reward_std": 0.12469913065433502, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7930593490600586, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8816415667533875, "step": 103 }, { "adv/mean_abs_final_conf": 0.6043910980224609, "adv/mean_abs_reasoning": 0.42754340171813965, "adv/mean_abs_step_conf": 0.739775538444519, "adv/ratio_final_to_reasoning": 1.4136368274978293, "adv/ratio_step_to_reasoning": 1.730293428624166, "adv/std_final_conf": 0.8272894620895386, "adv/std_reasoning": 0.7013717293739319, "adv/std_step_conf": 0.9322694540023804, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7389520202020202, "calib/avg_num_step_conf": 5.828125, "calib/ece": 0.2618650793650794, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5158730158730159, "calib/gap": 0.2524621212121214, "calib/mean_conf": 0.7323412698412699, "calib/mu_c": 0.8645833333333335, "calib/mu_w": 0.6121212121212121, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2590079365079366, "calib/std_conf": 0.2834293860350912, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4800479233226837, "calib/step_q_c_n": 626.0, "calib/step_q_gap": 0.10273499029727956, "calib/step_q_w": 0.37731293302540414, "calib/step_q_w_n": 866.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 494.734375, "completions/mean_terminated_length": 494.734375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.11093333333333333, "grad_norm": 0.14319024980068207, "learning_rate": 2.666666666666667e-06, "loss": 0.0643, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03690874204039574, "mask/share_reasoning": 0.8321300745010376, "mask/share_step_conf": 0.13096114993095398, "num_tokens": 24450534.0, "reward": 0.9378859996795654, "reward_std": 0.14923880994319916, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7191511392593384, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8659958839416504, "step": 104 }, { "adv/mean_abs_final_conf": 0.6729190349578857, "adv/mean_abs_reasoning": 0.45059072971343994, "adv/mean_abs_step_conf": 0.7314640283584595, "adv/ratio_final_to_reasoning": 1.4934151783056853, "adv/ratio_step_to_reasoning": 1.6233446010388302, "adv/std_final_conf": 0.8753982782363892, "adv/std_reasoning": 0.7392308712005615, "adv/std_step_conf": 0.9325223565101624, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6679465668559628, "calib/avg_num_step_conf": 5.46484375, "calib/ece": 0.2029249011857707, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6086956521739131, "calib/gap": 0.1823289881259681, "calib/mean_conf": 0.7676679841897234, "calib/mu_c": 0.8426174496644296, "calib/mu_w": 0.6602884615384615, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19083003952569164, "calib/std_conf": 0.2887681627548953, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4685986394557824, "calib/step_q_c_n": 735.0, "calib/step_q_gap": 0.10343297680517993, "calib/step_q_w": 0.36516566265060246, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 456.05078125, "completions/mean_terminated_length": 457.8392333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.112, "grad_norm": 0.07508841156959534, "learning_rate": 2.6388888888888893e-06, "loss": -0.0351, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0396403968334198, "mask/share_reasoning": 0.8274632692337036, "mask/share_step_conf": 0.12899011373519897, "num_tokens": 24673043.0, "reward": 0.9453893899917603, "reward_std": 0.17942391335964203, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7189491987228394, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8585482239723206, "step": 105 }, { "adv/mean_abs_final_conf": 0.5386701822280884, "adv/mean_abs_reasoning": 0.37390226125717163, "adv/mean_abs_step_conf": 0.7243057489395142, "adv/ratio_final_to_reasoning": 1.4406711005622634, "adv/ratio_step_to_reasoning": 1.937152630487392, "adv/std_final_conf": 0.7936661839485168, "adv/std_reasoning": 0.6814560294151306, "adv/std_step_conf": 0.931763768196106, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7666728763040238, "calib/avg_num_step_conf": 5.5234375, "calib/ece": 0.28248031496063, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6692913385826772, "calib/gap": 0.3149528067560855, "calib/mean_conf": 0.8012992125984253, "calib/mu_c": 0.9525757575757576, "calib/mu_w": 0.6376229508196721, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2820472440944882, "calib/std_conf": 0.2834772224865189, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4911273792093705, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.10643517674699016, "calib/step_q_w": 0.38469220246238034, "calib/step_q_w_n": 731.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 463.83984375, "completions/mean_terminated_length": 465.6588439941406, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.11306666666666666, "grad_norm": 0.11216479539871216, "learning_rate": 2.6111111111111113e-06, "loss": -0.0928, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03690160810947418, "mask/share_reasoning": 0.8339164853096008, "mask/share_step_conf": 0.1252756118774414, "num_tokens": 24896370.0, "reward": 0.9552474021911621, "reward_std": 0.1446411907672882, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7421104907989502, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8668216466903687, "step": 106 }, { "adv/mean_abs_final_conf": 0.5357671976089478, "adv/mean_abs_reasoning": 0.4808008074760437, "adv/mean_abs_step_conf": 0.7439740896224976, "adv/ratio_final_to_reasoning": 1.1143225828206265, "adv/ratio_step_to_reasoning": 1.5473644762120469, "adv/std_final_conf": 0.7937635183334351, "adv/std_reasoning": 0.7392998933792114, "adv/std_step_conf": 0.932767391204834, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7604247104247105, "calib/avg_num_step_conf": 5.9140625, "calib/ece": 0.191383399209486, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6284584980237155, "calib/gap": 0.31738738738738725, "calib/mean_conf": 0.7723320158102768, "calib/mu_c": 0.9040540540540539, "calib/mu_w": 0.5866666666666667, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18936758893280614, "calib/std_conf": 0.29928719079143, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4520554854981085, "calib/step_q_c_n": 793.0, "calib/step_q_gap": 0.08247157426371177, "calib/step_q_w": 0.3695839112343967, "calib/step_q_w_n": 721.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 460.2421875, "completions/mean_terminated_length": 463.86614990234375, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.11413333333333334, "grad_norm": 0.059717439115047455, "learning_rate": 2.5833333333333337e-06, "loss": -0.0756, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03775336593389511, "mask/share_reasoning": 0.8178339004516602, "mask/share_step_conf": 0.13660022616386414, "num_tokens": 25118808.0, "reward": 0.9761337637901306, "reward_std": 0.1640060544013977, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7751832008361816, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8645843267440796, "step": 107 }, { "adv/mean_abs_final_conf": 0.5327150225639343, "adv/mean_abs_reasoning": 0.40658038854599, "adv/mean_abs_step_conf": 0.751448392868042, "adv/ratio_final_to_reasoning": 1.310232951640944, "adv/ratio_step_to_reasoning": 1.8482160331327504, "adv/std_final_conf": 0.7762723565101624, "adv/std_reasoning": 0.6815800666809082, "adv/std_step_conf": 0.9327512383460999, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6519259259259259, "calib/avg_num_step_conf": 5.08203125, "calib/ece": 0.18400000000000008, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7647058823529411, "calib/gap": 0.18659999999999977, "calib/mean_conf": 0.8554509803921571, "calib/mu_c": 0.9103333333333332, "calib/mu_w": 0.7237333333333335, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1667843137254903, "calib/std_conf": 0.2545378213528175, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4945033112582781, "calib/step_q_c_n": 906.0, "calib/step_q_gap": 0.08121217201777181, "calib/step_q_w": 0.4132911392405063, "calib/step_q_w_n": 395.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 442.36328125, "completions/mean_terminated_length": 444.0980529785156, "completions/min_length": 0.0, "completions/min_terminated_length": 88.0, "epoch": 0.1152, "grad_norm": 0.11303754150867462, "learning_rate": 2.5555555555555557e-06, "loss": -0.0309, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.040074534714221954, "mask/share_reasoning": 0.8284124732017517, "mask/share_step_conf": 0.12760674953460693, "num_tokens": 25335285.0, "reward": 0.9829171895980835, "reward_std": 0.16700205206871033, "rewards/accuracy_reward_step": 0.70703125, "rewards/final_brier_reward_step": 0.7740527391433716, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8527189493179321, "step": 108 }, { "adv/mean_abs_final_conf": 0.5506957769393921, "adv/mean_abs_reasoning": 0.2803175747394562, "adv/mean_abs_step_conf": 0.7329084277153015, "adv/ratio_final_to_reasoning": 1.9645424567162495, "adv/ratio_step_to_reasoning": 2.6145646715034196, "adv/std_final_conf": 0.809843122959137, "adv/std_reasoning": 0.5960097312927246, "adv/std_step_conf": 0.9302277565002441, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8472541507024266, "calib/avg_num_step_conf": 6.08984375, "calib/ece": 0.13517928286852596, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4820717131474104, "calib/gap": 0.44353639846743287, "calib/mean_conf": 0.6507968127490039, "calib/mu_c": 0.8557777777777777, "calib/mu_w": 0.4122413793103449, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12406374501992039, "calib/std_conf": 0.3490421762289683, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.44309278350515463, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.14832905425866677, "calib/step_q_w": 0.29476372924648786, "calib/step_q_w_n": 783.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2570.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 493.03125, "completions/mean_terminated_length": 498.87750244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.11626666666666667, "grad_norm": 0.09755361080169678, "learning_rate": 2.5277777777777778e-06, "loss": -0.0614, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03527119755744934, "mask/share_reasoning": 0.8214081525802612, "mask/share_step_conf": 0.1316019594669342, "num_tokens": 25566101.0, "reward": 1.0021953582763672, "reward_std": 0.12378311902284622, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.8209879398345947, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.881840169429779, "step": 109 }, { "adv/mean_abs_final_conf": 0.6958622336387634, "adv/mean_abs_reasoning": 0.4584200382232666, "adv/mean_abs_step_conf": 0.746253252029419, "adv/ratio_final_to_reasoning": 1.5179577147974805, "adv/ratio_step_to_reasoning": 1.6278809602689477, "adv/std_final_conf": 0.8754823803901672, "adv/std_reasoning": 0.720551609992981, "adv/std_step_conf": 0.9323999881744385, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6815437286025521, "calib/avg_num_step_conf": 5.109375, "calib/ece": 0.16834645669291332, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4015748031496063, "calib/gap": 0.204283846872082, "calib/mean_conf": 0.6311811023622047, "calib/mu_c": 0.7268888888888888, "calib/mu_w": 0.5226050420168068, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.134015748031496, "calib/std_conf": 0.3252282400819712, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4307389937106918, "calib/step_q_c_n": 636.0, "calib/step_q_gap": 0.07017351752021561, "calib/step_q_w": 0.3605654761904762, "calib/step_q_w_n": 672.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 410.23828125, "completions/mean_terminated_length": 413.468505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.11733333333333333, "grad_norm": 0.09466344863176346, "learning_rate": 2.5e-06, "loss": -0.1563, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04288126528263092, "mask/share_reasoning": 0.8201444149017334, "mask/share_step_conf": 0.1291617751121521, "num_tokens": 25776042.0, "reward": 0.9413056373596191, "reward_std": 0.15106236934661865, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7286386489868164, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8516287803649902, "step": 110 }, { "adv/mean_abs_final_conf": 0.6182411313056946, "adv/mean_abs_reasoning": 0.4406476616859436, "adv/mean_abs_step_conf": 0.743072509765625, "adv/ratio_final_to_reasoning": 1.4030282810086137, "adv/ratio_step_to_reasoning": 1.6863189672278898, "adv/std_final_conf": 0.8267171382904053, "adv/std_reasoning": 0.7014089226722717, "adv/std_step_conf": 0.9295375347137451, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7810117810117809, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.09736220472440935, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.40551181102362205, "calib/gap": 0.3560606060606062, "calib/mean_conf": 0.5971259842519686, "calib/mu_c": 0.7527272727272728, "calib/mu_w": 0.3966666666666666, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06574803149606287, "calib/std_conf": 0.3432521740559118, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4495, "calib/step_q_c_n": 680.0, "calib/step_q_gap": 0.1129550561797753, "calib/step_q_w": 0.3365449438202247, "calib/step_q_w_n": 712.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 455.4453125, "completions/mean_terminated_length": 457.2314147949219, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.1184, "grad_norm": 0.13044364750385284, "learning_rate": 2.4722222222222226e-06, "loss": -0.013, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.043011561036109924, "mask/share_reasoning": 0.8237156867980957, "mask/share_step_conf": 0.1293664574623108, "num_tokens": 26000044.0, "reward": 0.9977121353149414, "reward_std": 0.10175234079360962, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.803855836391449, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.881412148475647, "step": 111 }, { "adv/mean_abs_final_conf": 0.7288899421691895, "adv/mean_abs_reasoning": 0.5192070603370667, "adv/mean_abs_step_conf": 0.748733401298523, "adv/ratio_final_to_reasoning": 1.4038521388672907, "adv/ratio_step_to_reasoning": 1.4420709163940277, "adv/std_final_conf": 0.9040296077728271, "adv/std_reasoning": 0.7576315999031067, "adv/std_step_conf": 0.93282151222229, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7564194473409803, "calib/avg_num_step_conf": 5.21484375, "calib/ece": 0.14425702811244975, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.2971887550200803, "calib/gap": 0.32838047445255475, "calib/mean_conf": 0.4875502008032129, "calib/mu_c": 0.6352554744525547, "calib/mu_w": 0.30687499999999995, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.04080321285140557, "calib/std_conf": 0.35301003787046775, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4341702127659575, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.12570989530564008, "calib/step_q_w": 0.30846031746031743, "calib/step_q_w_n": 630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2571.0, "completions/max_terminated_length": 2571.0, "completions/mean_length": 480.9296875, "completions/mean_terminated_length": 490.5099792480469, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.11946666666666667, "grad_norm": 0.41237401962280273, "learning_rate": 2.4444444444444447e-06, "loss": -0.1422, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03407225385308266, "mask/share_reasoning": 0.8303850889205933, "mask/share_step_conf": 0.11601140350103378, "num_tokens": 26231082.0, "reward": 0.9550809860229492, "reward_std": 0.1408146470785141, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7650078535079956, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.843591570854187, "step": 112 }, { "adv/mean_abs_final_conf": 0.6568363904953003, "adv/mean_abs_reasoning": 0.406067430973053, "adv/mean_abs_step_conf": 0.7361575961112976, "adv/ratio_final_to_reasoning": 1.6175549684478108, "adv/ratio_step_to_reasoning": 1.8128949528093272, "adv/std_final_conf": 0.8588363528251648, "adv/std_reasoning": 0.7012441754341125, "adv/std_step_conf": 0.9312564730644226, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7611304176775293, "calib/avg_num_step_conf": 5.02734375, "calib/ece": 0.13626984126984118, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.40476190476190477, "calib/gap": 0.3400052455576684, "calib/mean_conf": 0.5753174603174603, "calib/mu_c": 0.71158940397351, "calib/mu_w": 0.3715841584158416, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05619047619047611, "calib/std_conf": 0.35933511464338164, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44981969486823853, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.11916598462088873, "calib/step_q_w": 0.3306537102473498, "calib/step_q_w_n": 566.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 410.3984375, "completions/mean_terminated_length": 413.6299133300781, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.12053333333333334, "grad_norm": 0.11442249268293381, "learning_rate": 2.4166666666666667e-06, "loss": -0.0525, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04052030295133591, "mask/share_reasoning": 0.8211332559585571, "mask/share_step_conf": 0.13053393363952637, "num_tokens": 26441344.0, "reward": 0.9872865676879883, "reward_std": 0.13370364904403687, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7810617089271545, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8786677122116089, "step": 113 }, { "adv/mean_abs_final_conf": 0.5734812021255493, "adv/mean_abs_reasoning": 0.4173528850078583, "adv/mean_abs_step_conf": 0.7532247304916382, "adv/ratio_final_to_reasoning": 1.3740918602126144, "adv/ratio_step_to_reasoning": 1.8047670389949642, "adv/std_final_conf": 0.8099099397659302, "adv/std_reasoning": 0.7013692855834961, "adv/std_step_conf": 0.9314282536506653, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8578020429728778, "calib/avg_num_step_conf": 5.08984375, "calib/ece": 0.08992063492063503, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5396825396825397, "calib/gap": 0.5081944346600915, "calib/mean_conf": 0.6596031746031747, "calib/mu_c": 0.8310179640718561, "calib/mu_w": 0.3228235294117647, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.043412698412698514, "calib/std_conf": 0.37626379802501275, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4893175853018373, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.1826817257824288, "calib/step_q_w": 0.30663585951940847, "calib/step_q_w_n": 541.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2508.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 437.5390625, "completions/mean_terminated_length": 440.9842529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.1216, "grad_norm": 0.08511318266391754, "learning_rate": 2.388888888888889e-06, "loss": -0.0256, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04344598576426506, "mask/share_reasoning": 0.8190726041793823, "mask/share_step_conf": 0.12966890633106232, "num_tokens": 26658378.0, "reward": 1.0297393798828125, "reward_std": 0.13722848892211914, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.8486093878746033, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8835256099700928, "step": 114 }, { "adv/mean_abs_final_conf": 0.582456648349762, "adv/mean_abs_reasoning": 0.469838410615921, "adv/mean_abs_step_conf": 0.763654351234436, "adv/ratio_final_to_reasoning": 1.2396956808750637, "adv/ratio_step_to_reasoning": 1.625355300843423, "adv/std_final_conf": 0.8094815015792847, "adv/std_reasoning": 0.72057044506073, "adv/std_step_conf": 0.9315211176872253, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6543276908335995, "calib/avg_num_step_conf": 4.7890625, "calib/ece": 0.2514453125, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.5703125, "calib/gap": 0.23725646758224195, "calib/mean_conf": 0.6680078125000002, "calib/mu_c": 0.7616129032258064, "calib/mu_w": 0.5243564356435645, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1569921875, "calib/std_conf": 0.38587807733008733, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.454953395472703, "calib/step_q_c_n": 751.0, "calib/step_q_gap": 0.06627971126217669, "calib/step_q_w": 0.38867368421052634, "calib/step_q_w_n": 475.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 383.76171875, "completions/mean_terminated_length": 385.2666931152344, "completions/min_length": 0.0, "completions/min_terminated_length": 99.0, "epoch": 0.12266666666666666, "grad_norm": 0.11096165329217911, "learning_rate": 2.361111111111111e-06, "loss": 0.0262, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.043351054191589355, "mask/share_reasoning": 0.8242461085319519, "mask/share_step_conf": 0.12849658727645874, "num_tokens": 26861885.0, "reward": 0.9501369595527649, "reward_std": 0.13774867355823517, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7216605544090271, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8575196266174316, "step": 115 }, { "adv/mean_abs_final_conf": 0.5877513289451599, "adv/mean_abs_reasoning": 0.3486030697822571, "adv/mean_abs_step_conf": 0.7445104718208313, "adv/ratio_final_to_reasoning": 1.6860187987222217, "adv/ratio_step_to_reasoning": 2.1356968321761025, "adv/std_final_conf": 0.8238668441772461, "adv/std_reasoning": 0.6402716636657715, "adv/std_step_conf": 0.9296499490737915, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7527027027027027, "calib/avg_num_step_conf": 6.0546875, "calib/ece": 0.17768924302788838, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6175298804780877, "calib/gap": 0.3859472329472329, "calib/mean_conf": 0.705179282868526, "calib/mu_c": 0.8758571428571429, "calib/mu_w": 0.48990990990991, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1625498007968127, "calib/std_conf": 0.37619327771809336, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45794158553546593, "calib/step_q_c_n": 719.0, "calib/step_q_gap": 0.16778514750899182, "calib/step_q_w": 0.2901564380264741, "calib/step_q_w_n": 831.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2832.0, "completions/max_terminated_length": 2832.0, "completions/mean_length": 507.55078125, "completions/mean_terminated_length": 509.54119873046875, "completions/min_length": 0.0, "completions/min_terminated_length": 74.0, "epoch": 0.12373333333333333, "grad_norm": 0.09604816138744354, "learning_rate": 2.3333333333333336e-06, "loss": 0.079, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04052261635661125, "mask/share_reasoning": 0.8265026807785034, "mask/share_step_conf": 0.12906846404075623, "num_tokens": 27096338.0, "reward": 0.9689432382583618, "reward_std": 0.15446476638317108, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7613726854324341, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8718262910842896, "step": 116 }, { "adv/mean_abs_final_conf": 0.6196193695068359, "adv/mean_abs_reasoning": 0.3499342203140259, "adv/mean_abs_step_conf": 0.7461419105529785, "adv/ratio_final_to_reasoning": 1.7706738396456299, "adv/ratio_step_to_reasoning": 2.1322347665324117, "adv/std_final_conf": 0.8284310698509216, "adv/std_reasoning": 0.6401571035385132, "adv/std_step_conf": 0.9326890110969543, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6507135826771654, "calib/avg_num_step_conf": 5.20703125, "calib/ece": 0.2643529411764704, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5607843137254902, "calib/gap": 0.21378075787401607, "calib/mean_conf": 0.673372549019608, "calib/mu_c": 0.7798437500000001, "calib/mu_w": 0.566062992125984, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21788235294117628, "calib/std_conf": 0.3752546827069617, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45486356340288925, "calib/step_q_c_n": 623.0, "calib/step_q_gap": 0.09852553523387514, "calib/step_q_w": 0.3563380281690141, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 441.6171875, "completions/mean_terminated_length": 441.6171875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.1248, "grad_norm": 0.12227895110845566, "learning_rate": 2.305555555555556e-06, "loss": -0.0571, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.039636991918087006, "mask/share_reasoning": 0.8293566703796387, "mask/share_step_conf": 0.13100633025169373, "num_tokens": 27315992.0, "reward": 0.9242790341377258, "reward_std": 0.14576169848442078, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6840121150016785, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8653271794319153, "step": 117 }, { "adv/mean_abs_final_conf": 0.550491452217102, "adv/mean_abs_reasoning": 0.3719319701194763, "adv/mean_abs_step_conf": 0.768734335899353, "adv/ratio_final_to_reasoning": 1.4800864040815496, "adv/ratio_step_to_reasoning": 2.0668681308907413, "adv/std_final_conf": 0.7908481359481812, "adv/std_reasoning": 0.6612310409545898, "adv/std_step_conf": 0.9323161244392395, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6923175580351416, "calib/avg_num_step_conf": 5.9765625, "calib/ece": 0.23633466135458167, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6454183266932271, "calib/gap": 0.2926074131076054, "calib/mean_conf": 0.7392828685258964, "calib/mu_c": 0.8710144927536232, "calib/mu_w": 0.5784070796460178, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21290836653386452, "calib/std_conf": 0.3629069822341826, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4210052219321148, "calib/step_q_c_n": 766.0, "calib/step_q_gap": 0.10131935805776926, "calib/step_q_w": 0.31968586387434555, "calib/step_q_w_n": 764.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2621.0, "completions/max_terminated_length": 2621.0, "completions/mean_length": 479.65625, "completions/mean_terminated_length": 485.3439025878906, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.12586666666666665, "grad_norm": 0.09555070847272873, "learning_rate": 2.277777777777778e-06, "loss": -0.0926, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.038702912628650665, "mask/share_reasoning": 0.8145185112953186, "mask/share_step_conf": 0.13505981862545013, "num_tokens": 27542792.0, "reward": 0.9307032823562622, "reward_std": 0.158748060464859, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7154749631881714, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8420253992080688, "step": 118 }, { "adv/mean_abs_final_conf": 0.6382849216461182, "adv/mean_abs_reasoning": 0.6090080738067627, "adv/mean_abs_step_conf": 0.7536479830741882, "adv/ratio_final_to_reasoning": 1.048073004445332, "adv/ratio_step_to_reasoning": 1.2375008074413798, "adv/std_final_conf": 0.8556181788444519, "adv/std_reasoning": 0.8265742063522339, "adv/std_step_conf": 0.9337841868400574, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7651271839671122, "calib/avg_num_step_conf": 5.42578125, "calib/ece": 0.19653386454183264, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5936254980079682, "calib/gap": 0.40546056012333015, "calib/mean_conf": 0.6560557768924303, "calib/mu_c": 0.8369784172661873, "calib/mu_w": 0.4315178571428571, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14940239043824696, "calib/std_conf": 0.4104879771187675, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.452749244712991, "calib/step_q_c_n": 662.0, "calib/step_q_gap": 0.1888565349468287, "calib/step_q_w": 0.2638927097661623, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2752.0, "completions/max_terminated_length": 2752.0, "completions/mean_length": 496.30859375, "completions/mean_terminated_length": 502.1936950683594, "completions/min_length": 0.0, "completions/min_terminated_length": 78.0, "epoch": 0.12693333333333334, "grad_norm": 0.16473527252674103, "learning_rate": 2.25e-06, "loss": -0.1114, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03695293143391609, "mask/share_reasoning": 0.8348743319511414, "mask/share_step_conf": 0.11645397543907166, "num_tokens": 27774911.0, "reward": 0.9588329792022705, "reward_std": 0.2034362256526947, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7515574097633362, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8629834651947021, "step": 119 }, { "adv/mean_abs_final_conf": 0.5428203344345093, "adv/mean_abs_reasoning": 0.4027462303638458, "adv/mean_abs_step_conf": 0.7498716115951538, "adv/ratio_final_to_reasoning": 1.3477974305162803, "adv/ratio_step_to_reasoning": 1.8618960403867981, "adv/std_final_conf": 0.7921384572982788, "adv/std_reasoning": 0.701229453086853, "adv/std_step_conf": 0.9321331977844238, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8131541077969648, "calib/avg_num_step_conf": 4.7890625, "calib/ece": 0.15433070866141735, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5511811023622047, "calib/gap": 0.49633961276818434, "calib/mean_conf": 0.6260629921259843, "calib/mu_c": 0.8175641025641027, "calib/mu_w": 0.3212244897959184, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08311023622047245, "calib/std_conf": 0.4156233483402194, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4483730715287518, "calib/step_q_c_n": 713.0, "calib/step_q_gap": 0.15654071285428783, "calib/step_q_w": 0.29183235867446394, "calib/step_q_w_n": 513.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2591.0, "completions/max_terminated_length": 2591.0, "completions/mean_length": 416.72265625, "completions/mean_terminated_length": 416.72265625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.128, "grad_norm": 0.08957294374704361, "learning_rate": 2.222222222222222e-06, "loss": 0.0547, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04134856536984444, "mask/share_reasoning": 0.8332849740982056, "mask/share_step_conf": 0.1253664195537567, "num_tokens": 27988280.0, "reward": 1.012019157409668, "reward_std": 0.14625471830368042, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.8189327716827393, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8847928047180176, "step": 120 }, { "adv/mean_abs_final_conf": 0.6672395467758179, "adv/mean_abs_reasoning": 0.488075315952301, "adv/mean_abs_step_conf": 0.7569321393966675, "adv/ratio_final_to_reasoning": 1.3670831631260498, "adv/ratio_step_to_reasoning": 1.5508510974782455, "adv/std_final_conf": 0.8629491925239563, "adv/std_reasoning": 0.7574973702430725, "adv/std_step_conf": 0.9334088563919067, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6913127413127413, "calib/avg_num_step_conf": 4.921875, "calib/ece": 0.24367588932806328, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.541501976284585, "calib/gap": 0.3026287001287002, "calib/mean_conf": 0.6179841897233203, "calib/mu_c": 0.7435810810810811, "calib/mu_w": 0.44095238095238093, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13833992094861663, "calib/std_conf": 0.4176135472525399, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4174746008708273, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.0926760019216154, "calib/step_q_w": 0.32479859894921187, "calib/step_q_w_n": 571.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2678.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 467.4765625, "completions/mean_terminated_length": 469.3098449707031, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.12906666666666666, "grad_norm": 0.0753481537103653, "learning_rate": 2.1944444444444445e-06, "loss": -0.0361, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0383126437664032, "mask/share_reasoning": 0.8395795822143555, "mask/share_step_conf": 0.11820157617330551, "num_tokens": 28213010.0, "reward": 0.9504961967468262, "reward_std": 0.16614177823066711, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7162296772003174, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8722625374794006, "step": 121 }, { "adv/mean_abs_final_conf": 0.556941032409668, "adv/mean_abs_reasoning": 0.43468087911605835, "adv/mean_abs_step_conf": 0.7523335218429565, "adv/ratio_final_to_reasoning": 1.2812641622107481, "adv/ratio_step_to_reasoning": 1.7307720628817584, "adv/std_final_conf": 0.7994430065155029, "adv/std_reasoning": 0.7013107538223267, "adv/std_step_conf": 0.9319691061973572, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8082039911308205, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.1551984126984126, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.623015873015873, "calib/gap": 0.5038747228381373, "calib/mean_conf": 0.6683730158730159, "calib/mu_c": 0.8443292682926827, "calib/mu_w": 0.3404545454545454, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08638888888888879, "calib/std_conf": 0.4209618282685004, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45235079171741777, "calib/step_q_c_n": 821.0, "calib/step_q_gap": 0.22553943946366148, "calib/step_q_w": 0.2268113522537563, "calib/step_q_w_n": 599.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 421.1484375, "completions/mean_terminated_length": 426.1423034667969, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.13013333333333332, "grad_norm": 0.07491142302751541, "learning_rate": 2.166666666666667e-06, "loss": -0.0503, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04027494415640831, "mask/share_reasoning": 0.8144368529319763, "mask/share_step_conf": 0.13356944918632507, "num_tokens": 28428168.0, "reward": 1.0099565982818604, "reward_std": 0.1610824018716812, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.811364471912384, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8835486769676208, "step": 122 }, { "adv/mean_abs_final_conf": 0.6732733845710754, "adv/mean_abs_reasoning": 0.47695034742355347, "adv/mean_abs_step_conf": 0.7559632062911987, "adv/ratio_final_to_reasoning": 1.4116215413366253, "adv/ratio_step_to_reasoning": 1.5849935121650498, "adv/std_final_conf": 0.8682565093040466, "adv/std_reasoning": 0.7574825882911682, "adv/std_step_conf": 0.933620810508728, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6533776301218162, "calib/avg_num_step_conf": 5.57421875, "calib/ece": 0.2892941176470589, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4745098039215686, "calib/gap": 0.2476153562200073, "calib/mean_conf": 0.5544705882352942, "calib/mu_c": 0.6768217054263566, "calib/mu_w": 0.42920634920634926, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1689411764705883, "calib/std_conf": 0.4278780453399867, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3547577092511013, "calib/step_q_c_n": 681.0, "calib/step_q_gap": 0.0827335805647742, "calib/step_q_w": 0.2720241286863271, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 494.90234375, "completions/mean_terminated_length": 496.8431701660156, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.1312, "grad_norm": 0.08160626143217087, "learning_rate": 2.138888888888889e-06, "loss": -0.0256, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03725016117095947, "mask/share_reasoning": 0.8364986777305603, "mask/share_step_conf": 0.12234492599964142, "num_tokens": 28660151.0, "reward": 0.9081982374191284, "reward_std": 0.18566684424877167, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6762874722480774, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8424526453018188, "step": 123 }, { "adv/mean_abs_final_conf": 0.5857831835746765, "adv/mean_abs_reasoning": 0.25135308504104614, "adv/mean_abs_step_conf": 0.7474823594093323, "adv/ratio_final_to_reasoning": 2.330519171781869, "adv/ratio_step_to_reasoning": 2.9738340362413607, "adv/std_final_conf": 0.8261035084724426, "adv/std_reasoning": 0.5725484490394592, "adv/std_step_conf": 0.9320715069770813, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6433784501061571, "calib/avg_num_step_conf": 5.5390625, "calib/ece": 0.3070750988142292, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5573122529644269, "calib/gap": 0.20498473991507438, "calib/mean_conf": 0.6358498023715415, "calib/mu_c": 0.7136305732484077, "calib/mu_w": 0.5086458333333334, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16118577075098808, "calib/std_conf": 0.4154265501233542, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.40996088657105606, "calib/step_q_c_n": 767.0, "calib/step_q_gap": 0.1465507483222081, "calib/step_q_w": 0.26341013824884796, "calib/step_q_w_n": 651.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2463.0, "completions/max_terminated_length": 2463.0, "completions/mean_length": 440.51953125, "completions/mean_terminated_length": 443.9881896972656, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.13226666666666667, "grad_norm": 0.11682891100645065, "learning_rate": 2.1111111111111114e-06, "loss": -0.0544, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03923002630472183, "mask/share_reasoning": 0.8280006051063538, "mask/share_step_conf": 0.12495690584182739, "num_tokens": 28879740.0, "reward": 0.9176300168037415, "reward_std": 0.1578715741634369, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6801886558532715, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8347588181495667, "step": 124 }, { "adv/mean_abs_final_conf": 0.6839307546615601, "adv/mean_abs_reasoning": 0.4573225677013397, "adv/mean_abs_step_conf": 0.7513834238052368, "adv/ratio_final_to_reasoning": 1.4955106154048572, "adv/ratio_step_to_reasoning": 1.6430053464930627, "adv/std_final_conf": 0.856546938419342, "adv/std_reasoning": 0.7206392884254456, "adv/std_step_conf": 0.9326479434967041, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6251910425941923, "calib/avg_num_step_conf": 4.9609375, "calib/ece": 0.32336, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.492, "calib/gap": 0.17775799056415692, "calib/mean_conf": 0.5783200000000001, "calib/mu_c": 0.6501342281879194, "calib/mu_w": 0.4723762376237625, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15284000000000003, "calib/std_conf": 0.42156610110396686, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.41558869701726836, "calib/step_q_c_n": 637.0, "calib/step_q_gap": 0.12487779654333475, "calib/step_q_w": 0.2907109004739336, "calib/step_q_w_n": 633.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 474.046875, "completions/mean_terminated_length": 479.6679992675781, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.13333333333333333, "grad_norm": 0.10979079455137253, "learning_rate": 2.0833333333333334e-06, "loss": -0.097, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.04010496288537979, "mask/share_reasoning": 0.8361002206802368, "mask/share_step_conf": 0.1120760440826416, "num_tokens": 29105904.0, "reward": 0.8949642181396484, "reward_std": 0.1865251064300537, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6494511365890503, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8295398354530334, "step": 125 }, { "adv/mean_abs_final_conf": 0.5566482543945312, "adv/mean_abs_reasoning": 0.3620987832546234, "adv/mean_abs_step_conf": 0.7504023909568787, "adv/ratio_final_to_reasoning": 1.5372828635082794, "adv/ratio_step_to_reasoning": 2.072369269545998, "adv/std_final_conf": 0.8037400245666504, "adv/std_reasoning": 0.6402396559715271, "adv/std_step_conf": 0.9239445328712463, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7824463118580766, "calib/avg_num_step_conf": 5.17578125, "calib/ece": 0.19692913385826766, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.49606299212598426, "calib/gap": 0.4998842203548089, "calib/mean_conf": 0.5429133858267716, "calib/mu_c": 0.7771111111111113, "calib/mu_w": 0.2772268907563024, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10417322834645666, "calib/std_conf": 0.451641119872186, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4148795180722892, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.1740323168620017, "calib/step_q_w": 0.24084720121028746, "calib/step_q_w_n": 661.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2763.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 461.94921875, "completions/mean_terminated_length": 461.94921875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.1344, "grad_norm": 0.0892636626958847, "learning_rate": 2.0555555555555555e-06, "loss": 0.0718, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.042710691690444946, "mask/share_reasoning": 0.8292683362960815, "mask/share_step_conf": 0.1280210316181183, "num_tokens": 29329627.0, "reward": 0.9750540256500244, "reward_std": 0.14126716554164886, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7896147966384888, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8565868735313416, "step": 126 }, { "adv/mean_abs_final_conf": 0.6591480374336243, "adv/mean_abs_reasoning": 0.4783036410808563, "adv/mean_abs_step_conf": 0.7303962707519531, "adv/ratio_final_to_reasoning": 1.3780953787934818, "adv/ratio_step_to_reasoning": 1.527055635832973, "adv/std_final_conf": 0.8759030699729919, "adv/std_reasoning": 0.7753259539604187, "adv/std_step_conf": 0.9340888857841492, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7108144192256343, "calib/avg_num_step_conf": 4.765625, "calib/ece": 0.2846558704453441, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.41295546558704455, "calib/gap": 0.2901962616822431, "calib/mean_conf": 0.4972874493927126, "calib/mu_c": 0.6230000000000001, "calib/mu_w": 0.332803738317757, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10757085020242904, "calib/std_conf": 0.4288431033039813, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.42916943521594675, "calib/step_q_c_n": 602.0, "calib/step_q_gap": 0.18598173295057455, "calib/step_q_w": 0.2431877022653722, "calib/step_q_w_n": 618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 410.05859375, "completions/mean_terminated_length": 414.92095947265625, "completions/min_length": 0.0, "completions/min_terminated_length": 60.0, "epoch": 0.13546666666666668, "grad_norm": 0.07838647067546844, "learning_rate": 2.027777777777778e-06, "loss": -0.0798, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0423613004386425, "mask/share_reasoning": 0.8236986398696899, "mask/share_step_conf": 0.12222124636173248, "num_tokens": 29538274.0, "reward": 0.902811586856842, "reward_std": 0.2152184695005417, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6757652163505554, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8290766477584839, "step": 127 }, { "adv/mean_abs_final_conf": 0.66679847240448, "adv/mean_abs_reasoning": 0.5108741521835327, "adv/mean_abs_step_conf": 0.7615749835968018, "adv/ratio_final_to_reasoning": 1.305210822576381, "adv/ratio_step_to_reasoning": 1.4907291362104462, "adv/std_final_conf": 0.8649638295173645, "adv/std_reasoning": 0.7576236128807068, "adv/std_step_conf": 0.9336161017417908, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7494089834515366, "calib/avg_num_step_conf": 4.09375, "calib/ece": 0.2813888888888887, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.34249377036611084, "calib/mean_conf": 0.4142460317460317, "calib/mu_c": 0.5651063829787234, "calib/mu_w": 0.22261261261261256, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.06805555555555545, "calib/std_conf": 0.42650739561700335, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4248576850094876, "calib/step_q_c_n": 527.0, "calib/step_q_gap": 0.15871565046054337, "calib/step_q_w": 0.26614203454894425, "calib/step_q_w_n": 521.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2748.0, "completions/max_terminated_length": 2748.0, "completions/mean_length": 442.75390625, "completions/mean_terminated_length": 444.490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.13653333333333334, "grad_norm": 0.08396448940038681, "learning_rate": 2.0000000000000003e-06, "loss": -0.0167, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04571385681629181, "mask/share_reasoning": 0.8379007577896118, "mask/share_step_conf": 0.11247918009757996, "num_tokens": 29758283.0, "reward": 0.911347508430481, "reward_std": 0.18492817878723145, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6947382688522339, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8240504264831543, "step": 128 }, { "adv/mean_abs_final_conf": 0.6443102955818176, "adv/mean_abs_reasoning": 0.35985320806503296, "adv/mean_abs_step_conf": 0.7655781507492065, "adv/ratio_final_to_reasoning": 1.790480899270953, "adv/ratio_step_to_reasoning": 2.1274734630428824, "adv/std_final_conf": 0.8533403873443604, "adv/std_reasoning": 0.6402210593223572, "adv/std_step_conf": 0.9322566390037537, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6648866033755274, "calib/avg_num_step_conf": 5.00390625, "calib/ece": 0.32421259842519684, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.32677165354330706, "calib/gap": 0.24921809071729956, "calib/mean_conf": 0.43492125984251967, "calib/mu_c": 0.5291139240506328, "calib/mu_w": 0.2798958333333333, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06854330708661417, "calib/std_conf": 0.41206197657651583, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3711275167785235, "calib/step_q_c_n": 745.0, "calib/step_q_gap": 0.08869281528598622, "calib/step_q_w": 0.2824347014925373, "calib/step_q_w_n": 536.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3063.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 396.50390625, "completions/mean_terminated_length": 396.50390625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1376, "grad_norm": 0.10386553406715393, "learning_rate": 1.9722222222222224e-06, "loss": 0.0465, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04456010460853577, "mask/share_reasoning": 0.8199877738952637, "mask/share_step_conf": 0.13545210659503937, "num_tokens": 29962172.0, "reward": 0.9159624576568604, "reward_std": 0.14984360337257385, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6719777584075928, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8372910022735596, "step": 129 }, { "adv/mean_abs_final_conf": 0.6527014970779419, "adv/mean_abs_reasoning": 0.335531085729599, "adv/mean_abs_step_conf": 0.7392266988754272, "adv/ratio_final_to_reasoning": 1.9452787680124137, "adv/ratio_step_to_reasoning": 2.203154134789056, "adv/std_final_conf": 0.8733097910881042, "adv/std_reasoning": 0.6401881575584412, "adv/std_step_conf": 0.932996392250061, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7235110568443903, "calib/avg_num_step_conf": 4.13671875, "calib/ece": 0.2717391304347826, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.383399209486166, "calib/gap": 0.34689458689458685, "calib/mean_conf": 0.46596837944664027, "calib/mu_c": 0.5907407407407407, "calib/mu_w": 0.24384615384615382, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.04869565217391305, "calib/std_conf": 0.4331048296504483, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.41537993920972643, "calib/step_q_c_n": 658.0, "calib/step_q_gap": 0.1030856748705743, "calib/step_q_w": 0.31229426433915214, "calib/step_q_w_n": 401.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 366.328125, "completions/mean_terminated_length": 370.6719665527344, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.13866666666666666, "grad_norm": 0.0782543420791626, "learning_rate": 1.944444444444445e-06, "loss": -0.0474, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04584982246160507, "mask/share_reasoning": 0.8227038383483887, "mask/share_step_conf": 0.11972758173942566, "num_tokens": 30161240.0, "reward": 0.9276007413864136, "reward_std": 0.15801388025283813, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.695837140083313, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8367080688476562, "step": 130 }, { "adv/mean_abs_final_conf": 0.636013925075531, "adv/mean_abs_reasoning": 0.3565048277378082, "adv/mean_abs_step_conf": 0.7499645352363586, "adv/ratio_final_to_reasoning": 1.784026121360937, "adv/ratio_step_to_reasoning": 2.1036588480308622, "adv/std_final_conf": 0.851593554019928, "adv/std_reasoning": 0.6611610054969788, "adv/std_step_conf": 0.9324222207069397, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6987875726193484, "calib/avg_num_step_conf": 4.0859375, "calib/ece": 0.22199999999999995, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.2901960784313726, "calib/gap": 0.34479982318767366, "calib/mean_conf": 0.3574509803921569, "calib/mu_c": 0.557570093457944, "calib/mu_w": 0.21277027027027026, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.07992156862745095, "calib/std_conf": 0.41659894259302566, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.41762124711316395, "calib/step_q_c_n": 433.0, "calib/step_q_gap": 0.1284695342257251, "calib/step_q_w": 0.28915171288743885, "calib/step_q_w_n": 613.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 392.80078125, "completions/mean_terminated_length": 392.80078125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.13973333333333332, "grad_norm": 0.07908864319324493, "learning_rate": 1.916666666666667e-06, "loss": -0.0475, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0426817312836647, "mask/share_reasoning": 0.8421868085861206, "mask/share_step_conf": 0.11513150483369827, "num_tokens": 30368005.0, "reward": 0.9415676593780518, "reward_std": 0.1567797064781189, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.7366679906845093, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8659986257553101, "step": 131 }, { "adv/mean_abs_final_conf": 0.6498449444770813, "adv/mean_abs_reasoning": 0.568773090839386, "adv/mean_abs_step_conf": 0.7566512823104858, "adv/ratio_final_to_reasoning": 1.142538131538626, "adv/ratio_step_to_reasoning": 1.3303218708779494, "adv/std_final_conf": 0.8433780670166016, "adv/std_reasoning": 0.7928308248519897, "adv/std_step_conf": 0.9342484474182129, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6983031530961125, "calib/avg_num_step_conf": 4.203125, "calib/ece": 0.24211764705882352, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.48627450980392156, "calib/gap": 0.3788406989996203, "calib/mean_conf": 0.5425882352941176, "calib/mu_c": 0.7000671140939598, "calib/mu_w": 0.3212264150943395, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10019607843137252, "calib/std_conf": 0.44859821651628495, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4333222036727879, "calib/step_q_c_n": 599.0, "calib/step_q_gap": 0.1482068996895594, "calib/step_q_w": 0.2851153039832285, "calib/step_q_w_n": 477.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 404.47265625, "completions/mean_terminated_length": 406.058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 0.1408, "grad_norm": 0.08786921203136444, "learning_rate": 1.888888888888889e-06, "loss": -0.0412, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04840783774852753, "mask/share_reasoning": 0.8275307416915894, "mask/share_step_conf": 0.1201552003622055, "num_tokens": 30577142.0, "reward": 0.9508693218231201, "reward_std": 0.18324634432792664, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7313722372055054, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.855522632598877, "step": 132 }, { "adv/mean_abs_final_conf": 0.7346389293670654, "adv/mean_abs_reasoning": 0.5648310780525208, "adv/mean_abs_step_conf": 0.7609961032867432, "adv/ratio_final_to_reasoning": 1.3006347524290354, "adv/ratio_step_to_reasoning": 1.3472985691767867, "adv/std_final_conf": 0.9023372530937195, "adv/std_reasoning": 0.8098082542419434, "adv/std_step_conf": 0.9346030354499817, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6944772399317853, "calib/avg_num_step_conf": 4.9296875, "calib/ece": 0.23782608695652174, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.2964426877470356, "calib/gap": 0.3451010101010101, "calib/mean_conf": 0.3673122529644269, "calib/mu_c": 0.5773737373737373, "calib/mu_w": 0.23227272727272724, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1069169960474308, "calib/std_conf": 0.41836767465201996, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3430426356589147, "calib/step_q_c_n": 516.0, "calib/step_q_gap": 0.06647427104765463, "calib/step_q_w": 0.27656836461126005, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 493.12109375, "completions/mean_terminated_length": 493.12109375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.14186666666666667, "grad_norm": 0.07785658538341522, "learning_rate": 1.8611111111111113e-06, "loss": 0.0052, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03774401545524597, "mask/share_reasoning": 0.8516237735748291, "mask/share_step_conf": 0.11063216626644135, "num_tokens": 30809725.0, "reward": 0.9364477396011353, "reward_std": 0.20268605649471283, "rewards/accuracy_reward_step": 0.38671875, "rewards/final_brier_reward_step": 0.7341363430023193, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.865321695804596, "step": 133 }, { "adv/mean_abs_final_conf": 0.7228103876113892, "adv/mean_abs_reasoning": 0.5332584381103516, "adv/mean_abs_step_conf": 0.7673026323318481, "adv/ratio_final_to_reasoning": 1.355459822019379, "adv/ratio_step_to_reasoning": 1.4388944974801579, "adv/std_final_conf": 0.885215163230896, "adv/std_reasoning": 0.7577138543128967, "adv/std_step_conf": 0.9339636564254761, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6724409945727629, "calib/avg_num_step_conf": 4.65625, "calib/ece": 0.3328853754940711, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.383399209486166, "calib/gap": 0.21952984980436696, "calib/mean_conf": 0.4512252964426877, "calib/mu_c": 0.5501438848920863, "calib/mu_w": 0.33061403508771936, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1173517786561265, "calib/std_conf": 0.4389149402054845, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.39446868217054254, "calib/step_q_c_n": 645.0, "calib/step_q_gap": 0.11713467851423537, "calib/step_q_w": 0.27733400365630717, "calib/step_q_w_n": 547.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 475.8359375, "completions/mean_terminated_length": 477.7019958496094, "completions/min_length": 0.0, "completions/min_terminated_length": 99.0, "epoch": 0.14293333333333333, "grad_norm": 0.07474420964717865, "learning_rate": 1.8333333333333333e-06, "loss": -0.0342, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.039326705038547516, "mask/share_reasoning": 0.8487839698791504, "mask/share_step_conf": 0.10798301547765732, "num_tokens": 31040491.0, "reward": 0.8866908550262451, "reward_std": 0.21303297579288483, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6380894780158997, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8321672677993774, "step": 134 }, { "adv/mean_abs_final_conf": 0.6431422233581543, "adv/mean_abs_reasoning": 0.34882500767707825, "adv/mean_abs_step_conf": 0.7356947064399719, "adv/ratio_final_to_reasoning": 1.8437388639106336, "adv/ratio_step_to_reasoning": 2.1090652626632633, "adv/std_final_conf": 0.8505651354789734, "adv/std_reasoning": 0.6403726935386658, "adv/std_step_conf": 0.9314845204353333, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6983677767539818, "calib/avg_num_step_conf": 4.69921875, "calib/ece": 0.29024096385542164, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.41365461847389556, "calib/gap": 0.28246215611425574, "calib/mean_conf": 0.5005220883534137, "calib/mu_c": 0.6219014084507043, "calib/mu_w": 0.33943925233644856, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11024096385542165, "calib/std_conf": 0.43708431602072095, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.40368770764119605, "calib/step_q_c_n": 602.0, "calib/step_q_gap": 0.12139153459627094, "calib/step_q_w": 0.2822961730449251, "calib/step_q_w_n": 601.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 456.7890625, "completions/mean_terminated_length": 462.20556640625, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.144, "grad_norm": 0.09601619839668274, "learning_rate": 1.8055555555555557e-06, "loss": -0.0984, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.04134872555732727, "mask/share_reasoning": 0.8315227031707764, "mask/share_step_conf": 0.11540976911783218, "num_tokens": 31263309.0, "reward": 0.9178197383880615, "reward_std": 0.17275235056877136, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6783995628356934, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8517711162567139, "step": 135 }, { "adv/mean_abs_final_conf": 0.6235309839248657, "adv/mean_abs_reasoning": 0.3897348642349243, "adv/mean_abs_step_conf": 0.7583878040313721, "adv/ratio_final_to_reasoning": 1.5998850530062247, "adv/ratio_step_to_reasoning": 1.9459070091667017, "adv/std_final_conf": 0.8260871171951294, "adv/std_reasoning": 0.6815566420555115, "adv/std_step_conf": 0.9311226606369019, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8587307501241926, "calib/avg_num_step_conf": 5.10546875, "calib/ece": 0.15692913385826765, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3031496062992126, "calib/gap": 0.5257886239443618, "calib/mean_conf": 0.3671653543307087, "calib/mu_c": 0.6404098360655739, "calib/mu_w": 0.11462121212121214, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.02188976377952751, "calib/std_conf": 0.422739281611019, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4003921568627451, "calib/step_q_c_n": 561.0, "calib/step_q_gap": 0.20574068233191398, "calib/step_q_w": 0.19465147453083112, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1721.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 437.71875, "completions/mean_terminated_length": 439.4353332519531, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.14506666666666668, "grad_norm": 0.08469892293214798, "learning_rate": 1.777777777777778e-06, "loss": -0.0622, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04148070514202118, "mask/share_reasoning": 0.829554557800293, "mask/share_step_conf": 0.12505844235420227, "num_tokens": 31483853.0, "reward": 0.9983916282653809, "reward_std": 0.14055410027503967, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.8112839460372925, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8925305008888245, "step": 136 }, { "adv/mean_abs_final_conf": 0.6505257487297058, "adv/mean_abs_reasoning": 0.39071184396743774, "adv/mean_abs_step_conf": 0.7679653167724609, "adv/ratio_final_to_reasoning": 1.664975758410644, "adv/ratio_step_to_reasoning": 1.965554227827462, "adv/std_final_conf": 0.8569141626358032, "adv/std_reasoning": 0.6815319657325745, "adv/std_step_conf": 0.9333835244178772, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.740466366591648, "calib/avg_num_step_conf": 4.52734375, "calib/ece": 0.24162055335968374, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3438735177865613, "calib/gap": 0.3437815703925983, "calib/mean_conf": 0.4397233201581028, "calib/mu_c": 0.608217054263566, "calib/mu_w": 0.26443548387096766, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08573122529644261, "calib/std_conf": 0.4285663232321683, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.429008547008547, "calib/step_q_c_n": 585.0, "calib/step_q_gap": 0.1425102891688258, "calib/step_q_w": 0.2864982578397212, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 408.57421875, "completions/mean_terminated_length": 410.1764831542969, "completions/min_length": 0.0, "completions/min_terminated_length": 87.0, "epoch": 0.14613333333333334, "grad_norm": 0.0840989202260971, "learning_rate": 1.75e-06, "loss": -0.0431, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04242321103811264, "mask/share_reasoning": 0.8325255513191223, "mask/share_step_conf": 0.12114499509334564, "num_tokens": 31695432.0, "reward": 0.9399605989456177, "reward_std": 0.16075079143047333, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7247363328933716, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8567474484443665, "step": 137 }, { "adv/mean_abs_final_conf": 0.6266717910766602, "adv/mean_abs_reasoning": 0.5125274658203125, "adv/mean_abs_step_conf": 0.7337398529052734, "adv/ratio_final_to_reasoning": 1.2227086992765488, "adv/ratio_step_to_reasoning": 1.4316107952008097, "adv/std_final_conf": 0.8565248847007751, "adv/std_reasoning": 0.7575325965881348, "adv/std_step_conf": 0.9332212209701538, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7519230769230768, "calib/avg_num_step_conf": 4.42578125, "calib/ece": 0.2463281249999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.42578125, "calib/gap": 0.40117948717948726, "calib/mean_conf": 0.49546875, "calib/mu_c": 0.6521794871794873, "calib/mu_w": 0.251, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0662109374999999, "calib/std_conf": 0.4419945195061105, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4335382308845577, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.1293107630733989, "calib/step_q_w": 0.3042274678111588, "calib/step_q_w_n": 466.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1222.0, "completions/max_terminated_length": 1222.0, "completions/mean_length": 412.43359375, "completions/mean_terminated_length": 414.0509948730469, "completions/min_length": 0.0, "completions/min_terminated_length": 95.0, "epoch": 0.1472, "grad_norm": 0.11178892850875854, "learning_rate": 1.7222222222222224e-06, "loss": 0.0155, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04397442191839218, "mask/share_reasoning": 0.8371366262435913, "mask/share_step_conf": 0.11498266458511353, "num_tokens": 31905351.0, "reward": 0.9681872129440308, "reward_std": 0.16365806758403778, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7441655993461609, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8711150884628296, "step": 138 }, { "adv/mean_abs_final_conf": 0.6846431493759155, "adv/mean_abs_reasoning": 0.3737006187438965, "adv/mean_abs_step_conf": 0.7485607266426086, "adv/ratio_final_to_reasoning": 1.8320631945357129, "adv/ratio_step_to_reasoning": 2.00310272206322, "adv/std_final_conf": 0.8709847331047058, "adv/std_reasoning": 0.6612383723258972, "adv/std_step_conf": 0.9338328838348389, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7626567732615405, "calib/avg_num_step_conf": 4.05859375, "calib/ece": 0.2467716535433071, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.38188976377952755, "calib/gap": 0.3783308162059231, "calib/mean_conf": 0.4805511811023622, "calib/mu_c": 0.6250318471337581, "calib/mu_w": 0.24670103092783505, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05460629921259846, "calib/std_conf": 0.4246449319686231, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4823040752351097, "calib/step_q_c_n": 638.0, "calib/step_q_gap": 0.1532018308460823, "calib/step_q_w": 0.3291022443890274, "calib/step_q_w_n": 401.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 369.4140625, "completions/mean_terminated_length": 370.8627624511719, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.14826666666666666, "grad_norm": 0.07893967628479004, "learning_rate": 1.6944444444444446e-06, "loss": -0.0048, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0466064028441906, "mask/share_reasoning": 0.8326088190078735, "mask/share_step_conf": 0.11687853187322617, "num_tokens": 32103017.0, "reward": 0.9592806100845337, "reward_std": 0.18043480813503265, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7375070452690125, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8599603176116943, "step": 139 }, { "adv/mean_abs_final_conf": 0.612659215927124, "adv/mean_abs_reasoning": 0.4621930420398712, "adv/mean_abs_step_conf": 0.746800422668457, "adv/ratio_final_to_reasoning": 1.3255483319765615, "adv/ratio_step_to_reasoning": 1.615775995615343, "adv/std_final_conf": 0.82439124584198, "adv/std_reasoning": 0.7393609285354614, "adv/std_step_conf": 0.9319517612457275, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7510753825541217, "calib/avg_num_step_conf": 3.69921875, "calib/ece": 0.23488000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.58, "calib/gap": 0.3750906142021013, "calib/mean_conf": 0.6427200000000001, "calib/mu_c": 0.7732515337423312, "calib/mu_w": 0.3981609195402299, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1128, "calib/std_conf": 0.42086838987978176, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5261441441441441, "calib/step_q_c_n": 555.0, "calib/step_q_gap": 0.18042985842985848, "calib/step_q_w": 0.34571428571428564, "calib/step_q_w_n": 392.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2684.0, "completions/max_terminated_length": 2684.0, "completions/mean_length": 395.125, "completions/mean_terminated_length": 399.810302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 0.14933333333333335, "grad_norm": 0.07556366175413132, "learning_rate": 1.6666666666666667e-06, "loss": -0.1017, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.046415865421295166, "mask/share_reasoning": 0.832078218460083, "mask/share_step_conf": 0.10978717356920242, "num_tokens": 32309185.0, "reward": 0.9508023858070374, "reward_std": 0.21228066086769104, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7359312772750854, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8461422920227051, "step": 140 }, { "adv/mean_abs_final_conf": 0.5549712181091309, "adv/mean_abs_reasoning": 0.3791113495826721, "adv/mean_abs_step_conf": 0.7596755027770996, "adv/ratio_final_to_reasoning": 1.4638739217911736, "adv/ratio_step_to_reasoning": 2.0038321290390138, "adv/std_final_conf": 0.784277617931366, "adv/std_reasoning": 0.6611729860305786, "adv/std_step_conf": 0.932152509689331, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8491539135710915, "calib/avg_num_step_conf": 4.5, "calib/ece": 0.14665354330708666, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5669291338582677, "calib/gap": 0.5754513584574936, "calib/mean_conf": 0.625, "calib/mu_c": 0.8311656441717793, "calib/mu_w": 0.2557142857142857, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.06496062992125987, "calib/std_conf": 0.43826676340502, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5248363095238094, "calib/step_q_c_n": 672.0, "calib/step_q_gap": 0.26125297619047616, "calib/step_q_w": 0.2635833333333333, "calib/step_q_w_n": 480.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1724.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 424.7421875, "completions/mean_terminated_length": 426.4078674316406, "completions/min_length": 0.0, "completions/min_terminated_length": 84.0, "epoch": 0.1504, "grad_norm": 0.07134176045656204, "learning_rate": 1.638888888888889e-06, "loss": 0.021, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04193870723247528, "mask/share_reasoning": 0.8437968492507935, "mask/share_step_conf": 0.11035820841789246, "num_tokens": 32525015.0, "reward": 1.0193904638290405, "reward_std": 0.14930438995361328, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.8318593502044678, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8811401128768921, "step": 141 }, { "adv/mean_abs_final_conf": 0.6422310471534729, "adv/mean_abs_reasoning": 0.38081085681915283, "adv/mean_abs_step_conf": 0.7638607025146484, "adv/ratio_final_to_reasoning": 1.6864830286560568, "adv/ratio_step_to_reasoning": 2.0058795300508097, "adv/std_final_conf": 0.8471424579620361, "adv/std_reasoning": 0.6814852356910706, "adv/std_step_conf": 0.9324362874031067, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7695195195195195, "calib/avg_num_step_conf": 4.60546875, "calib/ece": 0.20956862745098045, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.4745098039215686, "calib/gap": 0.4343843843843845, "calib/mean_conf": 0.5431372549019607, "calib/mu_c": 0.7322222222222223, "calib/mu_w": 0.29783783783783785, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09400000000000003, "calib/std_conf": 0.442025153201775, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5328217821782179, "calib/step_q_c_n": 606.0, "calib/step_q_gap": 0.21055302127071346, "calib/step_q_w": 0.3222687609075044, "calib/step_q_w_n": 573.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1445.0, "completions/max_terminated_length": 1445.0, "completions/mean_length": 414.46484375, "completions/mean_terminated_length": 416.0902099609375, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.15146666666666667, "grad_norm": 0.08121337741613388, "learning_rate": 1.6111111111111113e-06, "loss": -0.0221, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04288134723901749, "mask/share_reasoning": 0.8329954147338867, "mask/share_step_conf": 0.12021702527999878, "num_tokens": 32736278.0, "reward": 0.9745882153511047, "reward_std": 0.1629941165447235, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7661937475204468, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8720451593399048, "step": 142 }, { "adv/mean_abs_final_conf": 0.5666977763175964, "adv/mean_abs_reasoning": 0.23824170231819153, "adv/mean_abs_step_conf": 0.7360186576843262, "adv/ratio_final_to_reasoning": 2.378667423895102, "adv/ratio_step_to_reasoning": 3.0893779322534907, "adv/std_final_conf": 0.7825659513473511, "adv/std_reasoning": 0.5228584408760071, "adv/std_step_conf": 0.9154124855995178, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8805031446540881, "calib/avg_num_step_conf": 4.74609375, "calib/ece": 0.12600000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.488, "calib/gap": 0.5709067085953878, "calib/mean_conf": 0.56488, "calib/mu_c": 0.8069444444444445, "calib/mu_w": 0.2360377358490566, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.05744000000000006, "calib/std_conf": 0.43227235118614743, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5302007299270072, "calib/step_q_c_n": 548.0, "calib/step_q_gap": 0.3131092756541436, "calib/step_q_w": 0.21709145427286358, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2968.0, "completions/max_terminated_length": 2968.0, "completions/mean_length": 430.609375, "completions/mean_terminated_length": 434.0, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.15253333333333333, "grad_norm": 0.07603774964809418, "learning_rate": 1.5833333333333333e-06, "loss": -0.0706, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.04396149888634682, "mask/share_reasoning": 0.8335665464401245, "mask/share_step_conf": 0.11465941369533539, "num_tokens": 32953850.0, "reward": 0.9956755638122559, "reward_std": 0.1526007205247879, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.8216222524642944, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8634787201881409, "step": 143 }, { "adv/mean_abs_final_conf": 0.5291182994842529, "adv/mean_abs_reasoning": 0.2960050106048584, "adv/mean_abs_step_conf": 0.7715482711791992, "adv/ratio_final_to_reasoning": 1.7875315637497131, "adv/ratio_step_to_reasoning": 2.6065378744860195, "adv/std_final_conf": 0.7694202065467834, "adv/std_reasoning": 0.5726398825645447, "adv/std_step_conf": 0.9335837960243225, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7331118493909191, "calib/avg_num_step_conf": 4.44921875, "calib/ece": 0.24940944881889748, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.562992125984252, "calib/gap": 0.3708707087486158, "calib/mean_conf": 0.6143700787401575, "calib/mu_c": 0.7399404761904762, "calib/mu_w": 0.36906976744186043, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10118110236220457, "calib/std_conf": 0.44259635372771056, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.47892247043363995, "calib/step_q_c_n": 761.0, "calib/step_q_gap": 0.14701770852887808, "calib/step_q_w": 0.33190476190476187, "calib/step_q_w_n": 378.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 414.06640625, "completions/mean_terminated_length": 414.06640625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.1536, "grad_norm": 0.08561398833990097, "learning_rate": 1.5555555555555558e-06, "loss": -0.0063, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04731464385986328, "mask/share_reasoning": 0.8295125365257263, "mask/share_step_conf": 0.1231728121638298, "num_tokens": 33163979.0, "reward": 0.9652718901634216, "reward_std": 0.1510903239250183, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7343593835830688, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8672782182693481, "step": 144 }, { "adv/mean_abs_final_conf": 0.5362052917480469, "adv/mean_abs_reasoning": 0.41883423924446106, "adv/mean_abs_step_conf": 0.7684600353240967, "adv/ratio_final_to_reasoning": 1.2802327066557704, "adv/ratio_step_to_reasoning": 1.8347593470637187, "adv/std_final_conf": 0.7784587740898132, "adv/std_reasoning": 0.6816502213478088, "adv/std_step_conf": 0.9331372976303101, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7162085976039464, "calib/avg_num_step_conf": 4.46484375, "calib/ece": 0.20179282868525894, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6055776892430279, "calib/gap": 0.2933157152924596, "calib/mean_conf": 0.7152589641434263, "calib/mu_c": 0.8157575757575759, "calib/mu_w": 0.5224418604651163, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12984063745019916, "calib/std_conf": 0.3727803267091721, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5732499999999999, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.18438475177304964, "calib/step_q_w": 0.3888652482269503, "calib/step_q_w_n": 423.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 369.0703125, "completions/mean_terminated_length": 371.97637939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.15466666666666667, "grad_norm": 0.08192351460456848, "learning_rate": 1.527777777777778e-06, "loss": 0.0497, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04904334992170334, "mask/share_reasoning": 0.8118296265602112, "mask/share_step_conf": 0.13131451606750488, "num_tokens": 33361165.0, "reward": 0.9571807980537415, "reward_std": 0.16555394232273102, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7496456503868103, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.839715838432312, "step": 145 }, { "adv/mean_abs_final_conf": 0.664141058921814, "adv/mean_abs_reasoning": 0.46557626128196716, "adv/mean_abs_step_conf": 0.7499189376831055, "adv/ratio_final_to_reasoning": 1.4264925301240605, "adv/ratio_step_to_reasoning": 1.6107327629166466, "adv/std_final_conf": 0.8418440222740173, "adv/std_reasoning": 0.7206533551216125, "adv/std_step_conf": 0.9325315356254578, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7193112805272, "calib/avg_num_step_conf": 4.53515625, "calib/ece": 0.2748605577689242, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.601593625498008, "calib/gap": 0.3559006331567385, "calib/mean_conf": 0.6782868525896414, "calib/mu_c": 0.8796330275229357, "calib/mu_w": 0.5237323943661972, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2594422310756971, "calib/std_conf": 0.4038083809831653, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.512928870292887, "calib/step_q_c_n": 478.0, "calib/step_q_gap": 0.14349255989757226, "calib/step_q_w": 0.36943631039531477, "calib/step_q_w_n": 683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2815.0, "completions/max_terminated_length": 2815.0, "completions/mean_length": 418.5859375, "completions/mean_terminated_length": 421.88189697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.15573333333333333, "grad_norm": 0.10385292023420334, "learning_rate": 1.5e-06, "loss": -0.0516, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04161831736564636, "mask/share_reasoning": 0.8323127627372742, "mask/share_step_conf": 0.11825643479824066, "num_tokens": 33575539.0, "reward": 0.9033230543136597, "reward_std": 0.21980994939804077, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.6927086114883423, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8334687352180481, "step": 146 }, { "adv/mean_abs_final_conf": 0.5654524564743042, "adv/mean_abs_reasoning": 0.3860156834125519, "adv/mean_abs_step_conf": 0.7488675713539124, "adv/ratio_final_to_reasoning": 1.4648432195175354, "adv/ratio_step_to_reasoning": 1.9399926053096779, "adv/std_final_conf": 0.7916289567947388, "adv/std_reasoning": 0.6611943244934082, "adv/std_step_conf": 0.9337374567985535, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6490112994350283, "calib/avg_num_step_conf": 4.4609375, "calib/ece": 0.3687109375, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.69921875, "calib/gap": 0.2368521247850648, "calib/mean_conf": 0.7291015625000001, "calib/mu_c": 0.8567796610169489, "calib/mu_w": 0.6199275362318841, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31843750000000004, "calib/std_conf": 0.4083794830608641, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5961284046692606, "calib/step_q_c_n": 514.0, "calib/step_q_gap": 0.15821439193040715, "calib/step_q_w": 0.4379140127388535, "calib/step_q_w_n": 628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 401.33203125, "completions/mean_terminated_length": 402.9059143066406, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.1568, "grad_norm": 0.06466303765773773, "learning_rate": 1.4722222222222225e-06, "loss": 0.0024, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.042257264256477356, "mask/share_reasoning": 0.8345401287078857, "mask/share_step_conf": 0.11929632723331451, "num_tokens": 33781960.0, "reward": 0.8792563676834106, "reward_std": 0.1607673168182373, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.6305433511734009, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8357818722724915, "step": 147 }, { "adv/mean_abs_final_conf": 0.61143559217453, "adv/mean_abs_reasoning": 0.5280998945236206, "adv/mean_abs_step_conf": 0.7383086085319519, "adv/ratio_final_to_reasoning": 1.1578029053122298, "adv/ratio_step_to_reasoning": 1.3980472561880604, "adv/std_final_conf": 0.8347281217575073, "adv/std_reasoning": 0.7753977179527283, "adv/std_step_conf": 0.9333657622337341, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.757313829787234, "calib/avg_num_step_conf": 4.5078125, "calib/ece": 0.2012992125984252, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7283464566929134, "calib/gap": 0.40677792553191483, "calib/mean_conf": 0.7638976377952756, "calib/mu_c": 0.9144375, "calib/mu_w": 0.5076595744680852, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16763779527559056, "calib/std_conf": 0.38358052742291554, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5885483870967742, "calib/step_q_c_n": 682.0, "calib/step_q_gap": 0.19178991252050298, "calib/step_q_w": 0.39675847457627117, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1849.0, "completions/max_terminated_length": 1849.0, "completions/mean_length": 384.1953125, "completions/mean_terminated_length": 384.1953125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.15786666666666666, "grad_norm": 0.06249001622200012, "learning_rate": 1.4444444444444445e-06, "loss": -0.0046, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04991346597671509, "mask/share_reasoning": 0.8199547529220581, "mask/share_step_conf": 0.13013175129890442, "num_tokens": 33985426.0, "reward": 0.982866644859314, "reward_std": 0.1941990852355957, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7852691411972046, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8570265769958496, "step": 148 }, { "adv/mean_abs_final_conf": 0.5319117307662964, "adv/mean_abs_reasoning": 0.39539942145347595, "adv/mean_abs_step_conf": 0.722755491733551, "adv/ratio_final_to_reasoning": 1.3452516668107541, "adv/ratio_step_to_reasoning": 1.8279123653664548, "adv/std_final_conf": 0.7791368365287781, "adv/std_reasoning": 0.6814852356910706, "adv/std_step_conf": 0.9184855818748474, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7100657894736842, "calib/avg_num_step_conf": 4.72265625, "calib/ece": 0.24849206349206357, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7857142857142857, "calib/gap": 0.3319894736842104, "calib/mean_conf": 0.8090476190476191, "calib/mu_c": 0.9407894736842105, "calib/mu_w": 0.6088000000000001, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22718253968253976, "calib/std_conf": 0.3577862458681529, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5786814449917899, "calib/step_q_c_n": 609.0, "calib/step_q_gap": 0.1967981116584565, "calib/step_q_w": 0.38188333333333335, "calib/step_q_w_n": 600.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 441.7734375, "completions/mean_terminated_length": 441.7734375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.15893333333333334, "grad_norm": 0.0746147558093071, "learning_rate": 1.4166666666666667e-06, "loss": 0.0358, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04412522539496422, "mask/share_reasoning": 0.8377799391746521, "mask/share_step_conf": 0.11809486150741577, "num_tokens": 34202976.0, "reward": 0.9574228525161743, "reward_std": 0.19794991612434387, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7374711036682129, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.861749529838562, "step": 149 }, { "adv/mean_abs_final_conf": 0.4720293879508972, "adv/mean_abs_reasoning": 0.4032324552536011, "adv/mean_abs_step_conf": 0.757534384727478, "adv/ratio_final_to_reasoning": 1.1706135798370405, "adv/ratio_step_to_reasoning": 1.8786542969390925, "adv/std_final_conf": 0.737794041633606, "adv/std_reasoning": 0.6612992286682129, "adv/std_step_conf": 0.9328404068946838, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6284599239144694, "calib/avg_num_step_conf": 4.19921875, "calib/ece": 0.2726086956521739, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7865612648221344, "calib/gap": 0.2189610389610389, "calib/mean_conf": 0.8250988142292492, "calib/mu_c": 0.9107792207792207, "calib/mu_w": 0.6918181818181818, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2445059288537549, "calib/std_conf": 0.33064628611302, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5506818181818183, "calib/step_q_c_n": 616.0, "calib/step_q_gap": 0.13835066349772246, "calib/step_q_w": 0.4123311546840958, "calib/step_q_w_n": 459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 379.26171875, "completions/mean_terminated_length": 380.7490539550781, "completions/min_length": 0.0, "completions/min_terminated_length": 79.0, "epoch": 0.16, "grad_norm": 0.07002872973680496, "learning_rate": 1.3888888888888892e-06, "loss": -0.0237, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.05102251470088959, "mask/share_reasoning": 0.8146055936813354, "mask/share_step_conf": 0.13046567142009735, "num_tokens": 34405027.0, "reward": 0.9285831451416016, "reward_std": 0.1642819046974182, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6999351382255554, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8400436639785767, "step": 150 }, { "adv/mean_abs_final_conf": 0.6051285862922668, "adv/mean_abs_reasoning": 0.4240601360797882, "adv/mean_abs_step_conf": 0.7665138244628906, "adv/ratio_final_to_reasoning": 1.4269876718107972, "adv/ratio_step_to_reasoning": 1.8075592569226284, "adv/std_final_conf": 0.829787015914917, "adv/std_reasoning": 0.7014648914337158, "adv/std_step_conf": 0.9331311583518982, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7577142857142858, "calib/avg_num_step_conf": 4.6328125, "calib/ece": 0.26478087649402376, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6653386454183267, "calib/gap": 0.39082222222222207, "calib/mean_conf": 0.7074103585657371, "calib/mu_c": 0.9035999999999998, "calib/mu_w": 0.5127777777777778, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2370916334661353, "calib/std_conf": 0.4102337140359674, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5399395161290322, "calib/step_q_c_n": 496.0, "calib/step_q_gap": 0.1792438639551192, "calib/step_q_w": 0.36069565217391303, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2560.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 468.26953125, "completions/mean_terminated_length": 471.9566955566406, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.16106666666666666, "grad_norm": 0.06739109009504318, "learning_rate": 1.3611111111111112e-06, "loss": 0.0056, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04095233231782913, "mask/share_reasoning": 0.8410570621490479, "mask/share_step_conf": 0.11017806828022003, "num_tokens": 34631928.0, "reward": 0.9262844920158386, "reward_std": 0.21015754342079163, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7189491987228394, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8398697376251221, "step": 151 }, { "adv/mean_abs_final_conf": 0.6687546372413635, "adv/mean_abs_reasoning": 0.5585987567901611, "adv/mean_abs_step_conf": 0.778603732585907, "adv/ratio_final_to_reasoning": 1.1972003680856431, "adv/ratio_step_to_reasoning": 1.3938515313924897, "adv/std_final_conf": 0.846340000629425, "adv/std_reasoning": 0.7928636074066162, "adv/std_step_conf": 0.9337232112884521, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6698924046603426, "calib/avg_num_step_conf": 4.71484375, "calib/ece": 0.32246031746031745, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.7182539682539683, "calib/gap": 0.21262749092761213, "calib/mean_conf": 0.7651587301587303, "calib/mu_c": 0.8605035971223023, "calib/mu_w": 0.6478761061946902, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.26801587301587304, "calib/std_conf": 0.3784067143582462, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.508804920913884, "calib/step_q_c_n": 569.0, "calib/step_q_gap": 0.10819363564742634, "calib/step_q_w": 0.4006112852664576, "calib/step_q_w_n": 638.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2108.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 406.9296875, "completions/mean_terminated_length": 408.5255126953125, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.16213333333333332, "grad_norm": 0.07073374837636948, "learning_rate": 1.3333333333333334e-06, "loss": -0.0261, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.044950634241104126, "mask/share_reasoning": 0.8246068358421326, "mask/share_step_conf": 0.1265362799167633, "num_tokens": 34841494.0, "reward": 0.8939030170440674, "reward_std": 0.2405945062637329, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6507734060287476, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8331261873245239, "step": 152 }, { "adv/mean_abs_final_conf": 0.5792268514633179, "adv/mean_abs_reasoning": 0.39986711740493774, "adv/mean_abs_step_conf": 0.7697999477386475, "adv/ratio_final_to_reasoning": 1.4485483458164579, "adv/ratio_step_to_reasoning": 1.9251394131493085, "adv/std_final_conf": 0.7936047911643982, "adv/std_reasoning": 0.661339282989502, "adv/std_step_conf": 0.9338732361793518, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6221088877338877, "calib/avg_num_step_conf": 4.56640625, "calib/ece": 0.3182936507936508, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7857142857142857, "calib/gap": 0.17196205821205812, "calib/mean_conf": 0.8291666666666666, "calib/mu_c": 0.9001351351351351, "calib/mu_w": 0.728173076923077, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2800793650793651, "calib/std_conf": 0.32914828158421433, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5255292259083728, "calib/step_q_c_n": 633.0, "calib/step_q_gap": 0.15806653934120868, "calib/step_q_w": 0.36746268656716413, "calib/step_q_w_n": 536.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 385.21484375, "completions/mean_terminated_length": 389.7826232910156, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.1632, "grad_norm": 0.0813259556889534, "learning_rate": 1.3055555555555556e-06, "loss": -0.0518, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04396626353263855, "mask/share_reasoning": 0.8241199254989624, "mask/share_step_conf": 0.12019501626491547, "num_tokens": 35047429.0, "reward": 0.898368239402771, "reward_std": 0.21193614602088928, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.6636105179786682, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8206258416175842, "step": 153 }, { "adv/mean_abs_final_conf": 0.582364022731781, "adv/mean_abs_reasoning": 0.5326874256134033, "adv/mean_abs_step_conf": 0.7495837211608887, "adv/ratio_final_to_reasoning": 1.093256560470099, "adv/ratio_step_to_reasoning": 1.4071736728114497, "adv/std_final_conf": 0.7997604012489319, "adv/std_reasoning": 0.7753660082817078, "adv/std_step_conf": 0.9338908791542053, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6827019254813702, "calib/avg_num_step_conf": 4.04296875, "calib/ece": 0.33399209486166015, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7272727272727273, "calib/gap": 0.2772130532633157, "calib/mean_conf": 0.7707509881422925, "calib/mu_c": 0.9120967741935483, "calib/mu_w": 0.6348837209302326, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.307312252964427, "calib/std_conf": 0.3743587788543868, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5527254098360657, "calib/step_q_c_n": 488.0, "calib/step_q_gap": 0.1330361959420986, "calib/step_q_w": 0.4196892138939671, "calib/step_q_w_n": 547.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 372.59375, "completions/mean_terminated_length": 375.5275573730469, "completions/min_length": 0.0, "completions/min_terminated_length": 92.0, "epoch": 0.16426666666666667, "grad_norm": 0.06172942742705345, "learning_rate": 1.2777777777777779e-06, "loss": -0.0549, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.046777769923210144, "mask/share_reasoning": 0.8256240487098694, "mask/share_step_conf": 0.11978568881750107, "num_tokens": 35247253.0, "reward": 0.9011325836181641, "reward_std": 0.21180510520935059, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6619023084640503, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8458315134048462, "step": 154 }, { "adv/mean_abs_final_conf": 0.5934216380119324, "adv/mean_abs_reasoning": 0.39241135120391846, "adv/mean_abs_step_conf": 0.7745480537414551, "adv/ratio_final_to_reasoning": 1.5122438130072287, "adv/ratio_step_to_reasoning": 1.9738166374778425, "adv/std_final_conf": 0.8153236508369446, "adv/std_reasoning": 0.6814760565757751, "adv/std_step_conf": 0.934339702129364, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6706211180124223, "calib/avg_num_step_conf": 4.33203125, "calib/ece": 0.3203529411764705, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6705882352941176, "calib/gap": 0.2845248447204968, "calib/mean_conf": 0.741529411764706, "calib/mu_c": 0.8977391304347826, "calib/mu_w": 0.6132142857142858, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.30545098039215673, "calib/std_conf": 0.38204867531692693, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5163991323210412, "calib/step_q_c_n": 461.0, "calib/step_q_gap": 0.15479419404943623, "calib/step_q_w": 0.36160493827160495, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1271.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 365.33203125, "completions/mean_terminated_length": 365.33203125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.16533333333333333, "grad_norm": 0.08029747009277344, "learning_rate": 1.25e-06, "loss": -0.0348, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04825538396835327, "mask/share_reasoning": 0.8240121006965637, "mask/share_step_conf": 0.12773250043392181, "num_tokens": 35447994.0, "reward": 0.8786300420761108, "reward_std": 0.18942666053771973, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6564257740974426, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8125530481338501, "step": 155 }, { "adv/mean_abs_final_conf": 0.59581458568573, "adv/mean_abs_reasoning": 0.45483464002609253, "adv/mean_abs_step_conf": 0.7564510107040405, "adv/ratio_final_to_reasoning": 1.309958682240099, "adv/ratio_step_to_reasoning": 1.6631341242185185, "adv/std_final_conf": 0.8109592795372009, "adv/std_reasoning": 0.701445996761322, "adv/std_step_conf": 0.9301225543022156, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.666646123505485, "calib/avg_num_step_conf": 4.6015625, "calib/ece": 0.2979215686274509, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.592156862745098, "calib/gap": 0.26115432022679663, "calib/mean_conf": 0.6849803921568627, "calib/mu_c": 0.8099248120300753, "calib/mu_w": 0.5487704918032786, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23066666666666655, "calib/std_conf": 0.4056066620772085, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4784927066450568, "calib/step_q_c_n": 617.0, "calib/step_q_gap": 0.07659573694808708, "calib/step_q_w": 0.4018969696969697, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2247.0, "completions/max_terminated_length": 2247.0, "completions/mean_length": 415.80859375, "completions/mean_terminated_length": 415.80859375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.1664, "grad_norm": 0.07279438525438309, "learning_rate": 1.2222222222222223e-06, "loss": 0.0239, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04457136243581772, "mask/share_reasoning": 0.8282079100608826, "mask/share_step_conf": 0.12722070515155792, "num_tokens": 35659201.0, "reward": 0.9135178923606873, "reward_std": 0.18442893028259277, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6829797029495239, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.841712236404419, "step": 156 }, { "adv/mean_abs_final_conf": 0.6162358522415161, "adv/mean_abs_reasoning": 0.5435482263565063, "adv/mean_abs_step_conf": 0.7789421081542969, "adv/ratio_final_to_reasoning": 1.133728015952231, "adv/ratio_step_to_reasoning": 1.4330689907235548, "adv/std_final_conf": 0.8435057401657104, "adv/std_reasoning": 0.7753556370735168, "adv/std_step_conf": 0.9337359666824341, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7154391974553462, "calib/avg_num_step_conf": 4.6484375, "calib/ece": 0.28910156249999996, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.69921875, "calib/gap": 0.3077746513334968, "calib/mean_conf": 0.7451171875000001, "calib/mu_c": 0.8917910447761196, "calib/mu_w": 0.5840163934426228, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.255390625, "calib/std_conf": 0.38766325434207694, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5419318181818181, "calib/step_q_c_n": 616.0, "calib/step_q_gap": 0.12998059866962303, "calib/step_q_w": 0.4119512195121951, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 400.73828125, "completions/mean_terminated_length": 402.309814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.16746666666666668, "grad_norm": 0.08922214806079865, "learning_rate": 1.1944444444444446e-06, "loss": -0.0248, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04538150504231453, "mask/share_reasoning": 0.8202134966850281, "mask/share_step_conf": 0.1304987668991089, "num_tokens": 35865518.0, "reward": 0.9309684038162231, "reward_std": 0.19485074281692505, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7046737670898438, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8525755405426025, "step": 157 }, { "adv/mean_abs_final_conf": 0.5458027124404907, "adv/mean_abs_reasoning": 0.4660303294658661, "adv/mean_abs_step_conf": 0.7698330283164978, "adv/ratio_final_to_reasoning": 1.1711742303683423, "adv/ratio_step_to_reasoning": 1.6518946936325598, "adv/std_final_conf": 0.7610622644424438, "adv/std_reasoning": 0.720642626285553, "adv/std_step_conf": 0.9223465919494629, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.552269250382458, "calib/avg_num_step_conf": 4.44921875, "calib/ece": 0.39011811023622045, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8228346456692913, "calib/gap": 0.045982916879143354, "calib/mean_conf": 0.8454724409448822, "calib/mu_c": 0.8646621621621622, "calib/mu_w": 0.8186792452830188, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3264566929133858, "calib/std_conf": 0.3211365968048917, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5013793103448275, "calib/step_q_c_n": 667.0, "calib/step_q_gap": 0.05773524254821738, "calib/step_q_w": 0.44364406779661014, "calib/step_q_w_n": 472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 396.359375, "completions/mean_terminated_length": 396.359375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.16853333333333334, "grad_norm": 0.06744744628667831, "learning_rate": 1.1666666666666668e-06, "loss": -0.0448, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04748845472931862, "mask/share_reasoning": 0.8199511766433716, "mask/share_step_conf": 0.13256040215492249, "num_tokens": 36072226.0, "reward": 0.8509544134140015, "reward_std": 0.21927496790885925, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.5915929675102234, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.800940752029419, "step": 158 }, { "adv/mean_abs_final_conf": 0.5806899070739746, "adv/mean_abs_reasoning": 0.3605530560016632, "adv/mean_abs_step_conf": 0.7557182312011719, "adv/ratio_final_to_reasoning": 1.6105532803230378, "adv/ratio_step_to_reasoning": 2.095997298099966, "adv/std_final_conf": 0.7915095686912537, "adv/std_reasoning": 0.6402944922447205, "adv/std_step_conf": 0.9314445853233337, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6462706146926538, "calib/avg_num_step_conf": 4.24609375, "calib/ece": 0.3015748031496062, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6850393700787402, "calib/gap": 0.21737256371814107, "calib/mean_conf": 0.7509448818897638, "calib/mu_c": 0.8502173913043479, "calib/mu_w": 0.6328448275862069, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25460629921259836, "calib/std_conf": 0.3749896231023398, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5217921146953406, "calib/step_q_c_n": 558.0, "calib/step_q_gap": 0.15651801261594556, "calib/step_q_w": 0.36527410207939504, "calib/step_q_w_n": 529.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 373.90625, "completions/mean_terminated_length": 375.37255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.1696, "grad_norm": 0.07806520909070969, "learning_rate": 1.138888888888889e-06, "loss": -0.0167, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.047464147210121155, "mask/share_reasoning": 0.8260021805763245, "mask/share_step_conf": 0.12262741476297379, "num_tokens": 36272730.0, "reward": 0.9267535209655762, "reward_std": 0.1796480417251587, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6707344055175781, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8765225410461426, "step": 159 }, { "adv/mean_abs_final_conf": 0.5432534217834473, "adv/mean_abs_reasoning": 0.5357876420021057, "adv/mean_abs_step_conf": 0.7486876249313354, "adv/ratio_final_to_reasoning": 1.0139342142223433, "adv/ratio_step_to_reasoning": 1.3973588904247123, "adv/std_final_conf": 0.7767263054847717, "adv/std_reasoning": 0.7754621505737305, "adv/std_step_conf": 0.9328858852386475, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7101940457203615, "calib/avg_num_step_conf": 4.625, "calib/ece": 0.23509960159362542, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6812749003984063, "calib/gap": 0.33595627325890487, "calib/mean_conf": 0.7333466135458169, "calib/mu_c": 0.8658552631578948, "calib/mu_w": 0.52989898989899, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18143426294820714, "calib/std_conf": 0.3908199004657845, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5018335901386748, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.17026349668073087, "calib/step_q_w": 0.3315700934579439, "calib/step_q_w_n": 535.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 453.4140625, "completions/mean_terminated_length": 453.4140625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.17066666666666666, "grad_norm": 0.07252976298332214, "learning_rate": 1.111111111111111e-06, "loss": 0.0478, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04366093873977661, "mask/share_reasoning": 0.8330541849136353, "mask/share_step_conf": 0.12328487634658813, "num_tokens": 36493644.0, "reward": 0.940780758857727, "reward_std": 0.21383237838745117, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7339656352996826, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8335334062576294, "step": 160 }, { "adv/mean_abs_final_conf": 0.5750141143798828, "adv/mean_abs_reasoning": 0.5041825771331787, "adv/mean_abs_step_conf": 0.7313765287399292, "adv/ratio_final_to_reasoning": 1.1404878717734708, "adv/ratio_step_to_reasoning": 1.4506184106927948, "adv/std_final_conf": 0.8224272131919861, "adv/std_reasoning": 0.7753112316131592, "adv/std_step_conf": 0.9324716925621033, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6812957157784743, "calib/avg_num_step_conf": 4.03515625, "calib/ece": 0.22353174603174608, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7698412698412699, "calib/gap": 0.3074879832810864, "calib/mean_conf": 0.8030555555555556, "calib/mu_c": 0.9092121212121211, "calib/mu_w": 0.6017241379310347, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18591269841269845, "calib/std_conf": 0.35829815564994427, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.497725258493353, "calib/step_q_c_n": 677.0, "calib/step_q_gap": 0.09244435961694847, "calib/step_q_w": 0.4052808988764045, "calib/step_q_w_n": 356.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1934.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 369.64453125, "completions/mean_terminated_length": 371.0941467285156, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.17173333333333332, "grad_norm": 0.07589954882860184, "learning_rate": 1.0833333333333335e-06, "loss": -0.0766, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0506252720952034, "mask/share_reasoning": 0.8228753805160522, "mask/share_step_conf": 0.12259312719106674, "num_tokens": 36692193.0, "reward": 0.9586951732635498, "reward_std": 0.20655444264411926, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7506816387176514, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8409274816513062, "step": 161 }, { "adv/mean_abs_final_conf": 0.5890517234802246, "adv/mean_abs_reasoning": 0.35327833890914917, "adv/mean_abs_step_conf": 0.7464599609375, "adv/ratio_final_to_reasoning": 1.6673870390669723, "adv/ratio_step_to_reasoning": 2.112951400423855, "adv/std_final_conf": 0.8073970079421997, "adv/std_reasoning": 0.6402244567871094, "adv/std_step_conf": 0.9316359162330627, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6739285714285714, "calib/avg_num_step_conf": 4.57421875, "calib/ece": 0.21890196078431368, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7254901960784313, "calib/gap": 0.26711785714285696, "calib/mean_conf": 0.7849411764705884, "calib/mu_c": 0.868742857142857, "calib/mu_w": 0.6016250000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15878431372549015, "calib/std_conf": 0.3555110881459175, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4782210927573062, "calib/step_q_c_n": 787.0, "calib/step_q_gap": 0.08267421775730621, "calib/step_q_w": 0.39554687499999996, "calib/step_q_w_n": 384.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 369.203125, "completions/mean_terminated_length": 370.6510009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.1728, "grad_norm": 0.07268118858337402, "learning_rate": 1.0555555555555557e-06, "loss": 0.0244, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04668009281158447, "mask/share_reasoning": 0.8181484937667847, "mask/share_step_conf": 0.13126519322395325, "num_tokens": 36890853.0, "reward": 0.980265200138092, "reward_std": 0.18448534607887268, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7606140971183777, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8639787435531616, "step": 162 }, { "adv/mean_abs_final_conf": 0.5629448294639587, "adv/mean_abs_reasoning": 0.37756574153900146, "adv/mean_abs_step_conf": 0.7611972093582153, "adv/ratio_final_to_reasoning": 1.4909849266761617, "adv/ratio_step_to_reasoning": 2.016065351309385, "adv/std_final_conf": 0.7932924032211304, "adv/std_reasoning": 0.6403728723526001, "adv/std_step_conf": 0.9302433133125305, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6921878972794304, "calib/avg_num_step_conf": 5.0703125, "calib/ece": 0.25400793650793646, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5674603174603174, "calib/gap": 0.3168916857360794, "calib/mean_conf": 0.6593253968253967, "calib/mu_c": 0.80268115942029, "calib/mu_w": 0.48578947368421055, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1828571428571428, "calib/std_conf": 0.4148770391440933, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4744969512195122, "calib/step_q_c_n": 656.0, "calib/step_q_gap": 0.15307950573664614, "calib/step_q_w": 0.32141744548286605, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2275.0, "completions/max_terminated_length": 2275.0, "completions/mean_length": 466.16015625, "completions/mean_terminated_length": 466.16015625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.17386666666666667, "grad_norm": 0.08332052826881409, "learning_rate": 1.0277777777777777e-06, "loss": 0.0246, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04192296415567398, "mask/share_reasoning": 0.8303929567337036, "mask/share_step_conf": 0.12768401205539703, "num_tokens": 37115022.0, "reward": 0.9365798234939575, "reward_std": 0.1679585576057434, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7094457149505615, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8598076105117798, "step": 163 }, { "adv/mean_abs_final_conf": 0.6199654936790466, "adv/mean_abs_reasoning": 0.44486063718795776, "adv/mean_abs_step_conf": 0.7613261938095093, "adv/ratio_final_to_reasoning": 1.3936173305823536, "adv/ratio_step_to_reasoning": 1.7113813409565428, "adv/std_final_conf": 0.8239652514457703, "adv/std_reasoning": 0.7205904126167297, "adv/std_step_conf": 0.9310764074325562, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7449919484702092, "calib/avg_num_step_conf": 4.984375, "calib/ece": 0.2265199999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.568, "calib/gap": 0.40511111111111126, "calib/mean_conf": 0.63276, "calib/mu_c": 0.8191111111111112, "calib/mu_w": 0.414, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1596399999999999, "calib/std_conf": 0.43284914508405814, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4530561056105611, "calib/step_q_c_n": 606.0, "calib/step_q_gap": 0.16538446381951633, "calib/step_q_w": 0.28767164179104476, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2548.0, "completions/max_terminated_length": 2548.0, "completions/mean_length": 465.44140625, "completions/mean_terminated_length": 467.2666931152344, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.17493333333333333, "grad_norm": 0.07369590550661087, "learning_rate": 1.0000000000000002e-06, "loss": -0.0226, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.040351904928684235, "mask/share_reasoning": 0.8398116827011108, "mask/share_step_conf": 0.11593015491962433, "num_tokens": 37340311.0, "reward": 0.9514877796173096, "reward_std": 0.1814444661140442, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7391566038131714, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8630375862121582, "step": 164 }, { "adv/mean_abs_final_conf": 0.5729429125785828, "adv/mean_abs_reasoning": 0.3981139659881592, "adv/mean_abs_step_conf": 0.7356992959976196, "adv/ratio_final_to_reasoning": 1.439142962886219, "adv/ratio_step_to_reasoning": 1.8479615357666226, "adv/std_final_conf": 0.8126264214515686, "adv/std_reasoning": 0.6815700531005859, "adv/std_step_conf": 0.9325501322746277, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7445909614646012, "calib/avg_num_step_conf": 4.734375, "calib/ece": 0.28335968379446635, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6324110671936759, "calib/gap": 0.4067065676609908, "calib/mean_conf": 0.6860474308300396, "calib/mu_c": 0.9207476635514018, "calib/mu_w": 0.514041095890411, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2732411067193675, "calib/std_conf": 0.41896563496773, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.48675381263616546, "calib/step_q_c_n": 459.0, "calib/step_q_gap": 0.1439782482271349, "calib/step_q_w": 0.34277556440903056, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2636.0, "completions/max_terminated_length": 2636.0, "completions/mean_length": 451.30078125, "completions/mean_terminated_length": 453.07061767578125, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.176, "grad_norm": 0.07571450620889664, "learning_rate": 9.722222222222224e-07, "loss": -0.013, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04123295098543167, "mask/share_reasoning": 0.8342355489730835, "mask/share_step_conf": 0.12062521278858185, "num_tokens": 37561420.0, "reward": 0.9152418375015259, "reward_std": 0.1929076462984085, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.697473406791687, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8525415658950806, "step": 165 }, { "adv/mean_abs_final_conf": 0.5445481538772583, "adv/mean_abs_reasoning": 0.42633116245269775, "adv/mean_abs_step_conf": 0.7497313618659973, "adv/ratio_final_to_reasoning": 1.2772891166211124, "adv/ratio_step_to_reasoning": 1.758565706416503, "adv/std_final_conf": 0.7711051106452942, "adv/std_reasoning": 0.6816762089729309, "adv/std_step_conf": 0.9312403798103333, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.8083285385500576, "calib/avg_num_step_conf": 5.30859375, "calib/ece": 0.14529411764705874, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5843137254901961, "calib/gap": 0.5199769850402762, "calib/mean_conf": 0.658, "calib/mu_c": 0.8190909090909091, "calib/mu_w": 0.2991139240506328, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.05654901960784307, "calib/std_conf": 0.421354864180571, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4063033707865168, "calib/step_q_c_n": 890.0, "calib/step_q_gap": 0.168904650104214, "calib/step_q_w": 0.2373987206823028, "calib/step_q_w_n": 469.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 444.359375, "completions/mean_terminated_length": 446.10198974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.17706666666666668, "grad_norm": 0.07834941148757935, "learning_rate": 9.444444444444445e-07, "loss": -0.0351, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04008059576153755, "mask/share_reasoning": 0.8259658813476562, "mask/share_step_conf": 0.1300472915172577, "num_tokens": 37781360.0, "reward": 1.0240976810455322, "reward_std": 0.14218752086162567, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.8227410316467285, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8902982473373413, "step": 166 }, { "adv/mean_abs_final_conf": 0.4789263904094696, "adv/mean_abs_reasoning": 0.36526644229888916, "adv/mean_abs_step_conf": 0.7769738435745239, "adv/ratio_final_to_reasoning": 1.3111699705979973, "adv/ratio_step_to_reasoning": 2.1271426925628827, "adv/std_final_conf": 0.7239590287208557, "adv/std_reasoning": 0.6403273940086365, "adv/std_step_conf": 0.9335882663726807, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6627229080932785, "calib/avg_num_step_conf": 5.3046875, "calib/ece": 0.2313888888888887, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.753968253968254, "calib/gap": 0.2902098765432096, "calib/mean_conf": 0.80234126984127, "calib/mu_c": 0.9059876543209875, "calib/mu_w": 0.6157777777777779, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19543650793650777, "calib/std_conf": 0.3505682385557793, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.446627358490566, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.1079097114317425, "calib/step_q_w": 0.3387176470588235, "calib/step_q_w_n": 510.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2767.0, "completions/max_terminated_length": 2767.0, "completions/mean_length": 431.51171875, "completions/mean_terminated_length": 433.2039489746094, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.17813333333333334, "grad_norm": 0.06649410724639893, "learning_rate": 9.166666666666666e-07, "loss": 0.0577, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.042462125420570374, "mask/share_reasoning": 0.82489013671875, "mask/share_step_conf": 0.12874150276184082, "num_tokens": 37997435.0, "reward": 0.9571617841720581, "reward_std": 0.1660226434469223, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7435324192047119, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8473536372184753, "step": 167 }, { "adv/mean_abs_final_conf": 0.5585249066352844, "adv/mean_abs_reasoning": 0.4548640847206116, "adv/mean_abs_step_conf": 0.7345359921455383, "adv/ratio_final_to_reasoning": 1.2278940575806152, "adv/ratio_step_to_reasoning": 1.6148471968207996, "adv/std_final_conf": 0.7788551449775696, "adv/std_reasoning": 0.7205998301506042, "adv/std_step_conf": 0.9329665899276733, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6898065623555819, "calib/avg_num_step_conf": 4.77734375, "calib/ece": 0.2560714285714286, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6309523809523809, "calib/gap": 0.28005347593582886, "calib/mean_conf": 0.7059920634920636, "calib/mu_c": 0.8160130718954248, "calib/mu_w": 0.535959595959596, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17746031746031748, "calib/std_conf": 0.3982075512911148, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4612481426448736, "calib/step_q_c_n": 673.0, "calib/step_q_gap": 0.1121208699176009, "calib/step_q_w": 0.34912727272727273, "calib/step_q_w_n": 550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2865.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 481.0859375, "completions/mean_terminated_length": 482.9725646972656, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.1792, "grad_norm": 0.07671131938695908, "learning_rate": 8.88888888888889e-07, "loss": 0.0107, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03883073478937149, "mask/share_reasoning": 0.8422915935516357, "mask/share_step_conf": 0.11497138440608978, "num_tokens": 38225265.0, "reward": 0.9440748691558838, "reward_std": 0.18650661408901215, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7153031229972839, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8572216033935547, "step": 168 }, { "adv/mean_abs_final_conf": 0.517835259437561, "adv/mean_abs_reasoning": 0.35380783677101135, "adv/mean_abs_step_conf": 0.75236976146698, "adv/ratio_final_to_reasoning": 1.4636059623877415, "adv/ratio_step_to_reasoning": 2.1264926416933005, "adv/std_final_conf": 0.7613320350646973, "adv/std_reasoning": 0.640367329120636, "adv/std_step_conf": 0.93329256772995, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6736637512147716, "calib/avg_num_step_conf": 4.41015625, "calib/ece": 0.265595238095238, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.626984126984127, "calib/gap": 0.2759591836734695, "calib/mean_conf": 0.6991666666666666, "calib/mu_c": 0.8141496598639457, "calib/mu_w": 0.5381904761904762, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.19071428571428564, "calib/std_conf": 0.4071997140751171, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.46905582922824296, "calib/step_q_c_n": 609.0, "calib/step_q_gap": 0.1609019830743968, "calib/step_q_w": 0.30815384615384617, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2471.0, "completions/max_terminated_length": 2471.0, "completions/mean_length": 429.30859375, "completions/mean_terminated_length": 430.9921875, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.18026666666666666, "grad_norm": 0.09371823817491531, "learning_rate": 8.611111111111112e-07, "loss": 0.0385, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.042047228664159775, "mask/share_reasoning": 0.8362818956375122, "mask/share_step_conf": 0.1177646592259407, "num_tokens": 38439352.0, "reward": 0.9178427457809448, "reward_std": 0.15940846502780914, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6946808099746704, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8308483362197876, "step": 169 }, { "adv/mean_abs_final_conf": 0.5378715395927429, "adv/mean_abs_reasoning": 0.3950071334838867, "adv/mean_abs_step_conf": 0.743911623954773, "adv/ratio_final_to_reasoning": 1.3616755091201005, "adv/ratio_step_to_reasoning": 1.883286555849298, "adv/std_final_conf": 0.7942361235618591, "adv/std_reasoning": 0.6612808704376221, "adv/std_step_conf": 0.9310983419418335, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7955873662092062, "calib/avg_num_step_conf": 5.28515625, "calib/ece": 0.15759448818897626, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5590551181102362, "calib/gap": 0.455518221813645, "calib/mean_conf": 0.6541377952755906, "calib/mu_c": 0.8280955414012738, "calib/mu_w": 0.37257731958762885, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.09681102362204712, "calib/std_conf": 0.4139508709160388, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.42593333333333333, "calib/step_q_c_n": 795.0, "calib/step_q_gap": 0.13033297491039425, "calib/step_q_w": 0.2956003584229391, "calib/step_q_w_n": 558.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2704.0, "completions/max_terminated_length": 2704.0, "completions/mean_length": 466.9609375, "completions/mean_terminated_length": 466.9609375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.18133333333333335, "grad_norm": 0.07489202171564102, "learning_rate": 8.333333333333333e-07, "loss": -0.034, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03827427700161934, "mask/share_reasoning": 0.8382338881492615, "mask/share_step_conf": 0.1234918087720871, "num_tokens": 38663046.0, "reward": 0.9949482679367065, "reward_std": 0.16509473323822021, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.79248046875, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8771034479141235, "step": 170 }, { "adv/mean_abs_final_conf": 0.6157266497612, "adv/mean_abs_reasoning": 0.4606185257434845, "adv/mean_abs_step_conf": 0.7603050470352173, "adv/ratio_final_to_reasoning": 1.3367387878447907, "adv/ratio_step_to_reasoning": 1.6506176033801694, "adv/std_final_conf": 0.8095325231552124, "adv/std_reasoning": 0.7015524506568909, "adv/std_step_conf": 0.9335606694221497, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.729532722179781, "calib/avg_num_step_conf": 5.60546875, "calib/ece": 0.27175298804780873, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5179282868525896, "calib/gap": 0.3450993124522532, "calib/mean_conf": 0.5895219123505976, "calib/mu_c": 0.7710084033613441, "calib/mu_w": 0.4259090909090909, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19358565737051786, "calib/std_conf": 0.44340022045094263, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44571701720841295, "calib/step_q_c_n": 523.0, "calib/step_q_gap": 0.1940722803663077, "calib/step_q_w": 0.25164473684210525, "calib/step_q_w_n": 912.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2285.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 421.97265625, "completions/mean_terminated_length": 428.670654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.1824, "grad_norm": 0.07304099947214127, "learning_rate": 8.055555555555557e-07, "loss": -0.0313, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.040931448340415955, "mask/share_reasoning": 0.8156298398971558, "mask/share_step_conf": 0.12781374156475067, "num_tokens": 38877967.0, "reward": 0.9164899587631226, "reward_std": 0.20652560889720917, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.691113293170929, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.854366660118103, "step": 171 }, { "adv/mean_abs_final_conf": 0.5187068581581116, "adv/mean_abs_reasoning": 0.3769237995147705, "adv/mean_abs_step_conf": 0.7414641976356506, "adv/ratio_final_to_reasoning": 1.3761584140504373, "adv/ratio_step_to_reasoning": 1.9671461409180528, "adv/std_final_conf": 0.7765347361564636, "adv/std_reasoning": 0.681504487991333, "adv/std_step_conf": 0.9323990941047668, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7505080563216723, "calib/avg_num_step_conf": 4.53515625, "calib/ece": 0.18481927710843388, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.642570281124498, "calib/gap": 0.34192771084337337, "calib/mean_conf": 0.750120481927711, "calib/mu_c": 0.8640963855421686, "calib/mu_w": 0.5221686746987952, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13413654618473908, "calib/std_conf": 0.3591751839887356, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4400571428571428, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.09751918190269604, "calib/step_q_w": 0.3425379609544468, "calib/step_q_w_n": 461.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2572.0, "completions/max_terminated_length": 2572.0, "completions/mean_length": 403.46875, "completions/mean_terminated_length": 408.25299072265625, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.18346666666666667, "grad_norm": 0.07457685470581055, "learning_rate": 7.777777777777779e-07, "loss": -0.0248, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.04320889711380005, "mask/share_reasoning": 0.8211055397987366, "mask/share_step_conf": 0.12396682798862457, "num_tokens": 39084607.0, "reward": 0.9766821265220642, "reward_std": 0.17918381094932556, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7681636810302734, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8617630004882812, "step": 172 }, { "adv/mean_abs_final_conf": 0.5529680848121643, "adv/mean_abs_reasoning": 0.365217924118042, "adv/mean_abs_step_conf": 0.7145917415618896, "adv/ratio_final_to_reasoning": 1.5140770709639093, "adv/ratio_step_to_reasoning": 1.9566173902541724, "adv/std_final_conf": 0.7967362999916077, "adv/std_reasoning": 0.6402485966682434, "adv/std_step_conf": 0.9333781599998474, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6749213836477987, "calib/avg_num_step_conf": 4.94921875, "calib/ece": 0.24749019607843145, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7019607843137254, "calib/gap": 0.2984944968553457, "calib/mean_conf": 0.7567450980392157, "calib/mu_c": 0.8691194968553457, "calib/mu_w": 0.570625, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19035294117647067, "calib/std_conf": 0.3834241926321011, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4593935643564356, "calib/step_q_c_n": 808.0, "calib/step_q_gap": 0.05015609158955109, "calib/step_q_w": 0.4092374727668845, "calib/step_q_w_n": 459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1285.0, "completions/max_terminated_length": 1285.0, "completions/mean_length": 432.22265625, "completions/mean_terminated_length": 433.91766357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 66.0, "epoch": 0.18453333333333333, "grad_norm": 0.06947637349367142, "learning_rate": 7.5e-07, "loss": -0.0283, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04602811485528946, "mask/share_reasoning": 0.820665717124939, "mask/share_step_conf": 0.12939989566802979, "num_tokens": 39298416.0, "reward": 0.9457964897155762, "reward_std": 0.18005359172821045, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.73384690284729, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8350897431373596, "step": 173 }, { "adv/mean_abs_final_conf": 0.6815347671508789, "adv/mean_abs_reasoning": 0.5925767421722412, "adv/mean_abs_step_conf": 0.7791270613670349, "adv/ratio_final_to_reasoning": 1.1501206825170684, "adv/ratio_step_to_reasoning": 1.314812084103986, "adv/std_final_conf": 0.859148383140564, "adv/std_reasoning": 0.7929279804229736, "adv/std_step_conf": 0.9338409304618835, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6755434782608695, "calib/avg_num_step_conf": 5.37109375, "calib/ece": 0.2848207171314741, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.44621513944223107, "calib/gap": 0.29682608695652174, "calib/mean_conf": 0.540996015936255, "calib/mu_c": 0.7018260869565217, "calib/mu_w": 0.40499999999999997, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.18382470119521915, "calib/std_conf": 0.44170193748471687, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.43251327433628317, "calib/step_q_c_n": 565.0, "calib/step_q_gap": 0.13611697803998685, "calib/step_q_w": 0.2963962962962963, "calib/step_q_w_n": 810.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 496.765625, "completions/mean_terminated_length": 496.765625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.1856, "grad_norm": 0.07883763313293457, "learning_rate": 7.222222222222222e-07, "loss": -0.0019, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03562241047620773, "mask/share_reasoning": 0.8468090891838074, "mask/share_step_conf": 0.1175684779882431, "num_tokens": 39529820.0, "reward": 0.8818709850311279, "reward_std": 0.20740194618701935, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6686819791793823, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8122473955154419, "step": 174 }, { "adv/mean_abs_final_conf": 0.6320722103118896, "adv/mean_abs_reasoning": 0.3945220112800598, "adv/mean_abs_step_conf": 0.779804527759552, "adv/ratio_final_to_reasoning": 1.6021215350217806, "adv/ratio_step_to_reasoning": 1.976580534073145, "adv/std_final_conf": 0.8205243945121765, "adv/std_reasoning": 0.6613585948944092, "adv/std_step_conf": 0.9341087937355042, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7647095959595961, "calib/avg_num_step_conf": 4.8046875, "calib/ece": 0.20841269841269844, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4603174603174603, "calib/gap": 0.41150757575757585, "calib/mean_conf": 0.5643650793650795, "calib/mu_c": 0.7799166666666667, "calib/mu_w": 0.36840909090909085, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1482936507936508, "calib/std_conf": 0.4328752084401884, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.49258849557522116, "calib/step_q_c_n": 452.0, "calib/step_q_gap": 0.1995396523875605, "calib/step_q_w": 0.29304884318766067, "calib/step_q_w_n": 778.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2488.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 447.453125, "completions/mean_terminated_length": 449.2078552246094, "completions/min_length": 0.0, "completions/min_terminated_length": 67.0, "epoch": 0.18666666666666668, "grad_norm": 0.07463081926107407, "learning_rate": 6.944444444444446e-07, "loss": -0.0688, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04283197969198227, "mask/share_reasoning": 0.8326044082641602, "mask/share_step_conf": 0.12065736949443817, "num_tokens": 39750192.0, "reward": 0.9311752319335938, "reward_std": 0.19330263137817383, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7413758039474487, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8319121599197388, "step": 175 }, { "adv/mean_abs_final_conf": 0.6272318363189697, "adv/mean_abs_reasoning": 0.4546269178390503, "adv/mean_abs_step_conf": 0.751192569732666, "adv/ratio_final_to_reasoning": 1.3796627777790889, "adv/ratio_step_to_reasoning": 1.6523275245189235, "adv/std_final_conf": 0.8240246772766113, "adv/std_reasoning": 0.7014360427856445, "adv/std_step_conf": 0.933167040348053, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6834124723013613, "calib/avg_num_step_conf": 5.09765625, "calib/ece": 0.2514682539682539, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5198412698412699, "calib/gap": 0.279994301994302, "calib/mean_conf": 0.6372619047619049, "calib/mu_c": 0.7672592592592593, "calib/mu_w": 0.48726495726495733, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.17650793650793647, "calib/std_conf": 0.40794593243081617, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4296898079763663, "calib/step_q_c_n": 677.0, "calib/step_q_gap": 0.09962611370885033, "calib/step_q_w": 0.330063694267516, "calib/step_q_w_n": 628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 416.734375, "completions/mean_terminated_length": 423.3492431640625, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.18773333333333334, "grad_norm": 0.09122262895107269, "learning_rate": 6.666666666666667e-07, "loss": -0.1328, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.044397637248039246, "mask/share_reasoning": 0.8054189085960388, "mask/share_step_conf": 0.13455848395824432, "num_tokens": 39960940.0, "reward": 0.9191524982452393, "reward_std": 0.187923401594162, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6974878311157227, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8408170938491821, "step": 176 }, { "adv/mean_abs_final_conf": 0.6960560083389282, "adv/mean_abs_reasoning": 0.40270107984542847, "adv/mean_abs_step_conf": 0.7754931449890137, "adv/ratio_final_to_reasoning": 1.7284681943393352, "adv/ratio_step_to_reasoning": 1.925728993045354, "adv/std_final_conf": 0.8730778694152832, "adv/std_reasoning": 0.6815537810325623, "adv/std_step_conf": 0.9336084127426147, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7267478746351985, "calib/avg_num_step_conf": 5.48828125, "calib/ece": 0.2518142292490118, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4901185770750988, "calib/gap": 0.3365333079558433, "calib/mean_conf": 0.591505928853755, "calib/mu_c": 0.739154929577465, "calib/mu_w": 0.4026216216216217, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1410276679841897, "calib/std_conf": 0.42191286233484737, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42364583333333333, "calib/step_q_c_n": 672.0, "calib/step_q_gap": 0.15325156321055028, "calib/step_q_w": 0.27039427012278305, "calib/step_q_w_n": 733.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 454.7734375, "completions/mean_terminated_length": 458.3543395996094, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.1888, "grad_norm": 0.08699323982000351, "learning_rate": 6.388888888888889e-07, "loss": -0.0555, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04031483829021454, "mask/share_reasoning": 0.8218288421630859, "mask/share_step_conf": 0.13004378974437714, "num_tokens": 40181194.0, "reward": 0.9515196084976196, "reward_std": 0.18108904361724854, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7318902015686035, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8625553846359253, "step": 177 }, { "adv/mean_abs_final_conf": 0.6604294180870056, "adv/mean_abs_reasoning": 0.5337682962417603, "adv/mean_abs_step_conf": 0.7507667541503906, "adv/ratio_final_to_reasoning": 1.2372960753515354, "adv/ratio_step_to_reasoning": 1.4065405522143357, "adv/std_final_conf": 0.8525503277778625, "adv/std_reasoning": 0.7754141688346863, "adv/std_step_conf": 0.9337341785430908, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8208227040816326, "calib/avg_num_step_conf": 4.7890625, "calib/ece": 0.15789682539682534, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.503968253968254, "calib/gap": 0.48300000000000004, "calib/mean_conf": 0.6051190476190477, "calib/mu_c": 0.8197857142857143, "calib/mu_w": 0.3367857142857143, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10373015873015867, "calib/std_conf": 0.4244150803527591, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42572815533980585, "calib/step_q_c_n": 618.0, "calib/step_q_gap": 0.157339997445069, "calib/step_q_w": 0.26838815789473686, "calib/step_q_w_n": 608.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 414.16015625, "completions/mean_terminated_length": 417.4212646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 99.0, "epoch": 0.18986666666666666, "grad_norm": 0.08844917267560959, "learning_rate": 6.111111111111112e-07, "loss": -0.0594, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04261164367198944, "mask/share_reasoning": 0.821635365486145, "mask/share_step_conf": 0.12794049084186554, "num_tokens": 40393291.0, "reward": 0.9865151643753052, "reward_std": 0.1949872523546219, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7963793277740479, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8704010844230652, "step": 178 }, { "adv/mean_abs_final_conf": 0.642683207988739, "adv/mean_abs_reasoning": 0.5560048222541809, "adv/mean_abs_step_conf": 0.7505338191986084, "adv/ratio_final_to_reasoning": 1.1558950251243192, "adv/ratio_step_to_reasoning": 1.3498692622048831, "adv/std_final_conf": 0.8355764150619507, "adv/std_reasoning": 0.7754384279251099, "adv/std_step_conf": 0.9327178001403809, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7686349206349206, "calib/avg_num_step_conf": 4.78125, "calib/ece": 0.21637450199203184, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5059760956175299, "calib/gap": 0.42960698412698395, "calib/mean_conf": 0.5848207171314742, "calib/mu_c": 0.8004799999999999, "calib/mu_w": 0.3708730158730159, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.15159362549800792, "calib/std_conf": 0.4404573272187454, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.451613475177305, "calib/step_q_c_n": 564.0, "calib/step_q_gap": 0.1341286266924565, "calib/step_q_w": 0.3174848484848485, "calib/step_q_w_n": 660.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2786.0, "completions/max_terminated_length": 2786.0, "completions/mean_length": 450.55859375, "completions/mean_terminated_length": 452.3255310058594, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.19093333333333334, "grad_norm": 0.06448730826377869, "learning_rate": 5.833333333333334e-07, "loss": 0.0308, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03963598608970642, "mask/share_reasoning": 0.8372147083282471, "mask/share_step_conf": 0.11924304068088531, "num_tokens": 40614898.0, "reward": 0.9538394212722778, "reward_std": 0.18762339651584625, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7483574151992798, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8655713796615601, "step": 179 }, { "adv/mean_abs_final_conf": 0.6583120226860046, "adv/mean_abs_reasoning": 0.451434850692749, "adv/mean_abs_step_conf": 0.7848700284957886, "adv/ratio_final_to_reasoning": 1.4582658420717682, "adv/ratio_step_to_reasoning": 1.7386119553937116, "adv/std_final_conf": 0.8509746789932251, "adv/std_reasoning": 0.7206116318702698, "adv/std_step_conf": 0.9331481456756592, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7350760955129887, "calib/avg_num_step_conf": 5.02734375, "calib/ece": 0.2431075697211155, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4940239043824701, "calib/gap": 0.3431251639989503, "calib/mean_conf": 0.5827091633466135, "calib/mu_c": 0.7235135135135136, "calib/mu_w": 0.38038834951456324, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1180876494023904, "calib/std_conf": 0.4357882347351325, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4254787234042553, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.07888059256313379, "calib/step_q_w": 0.3465981308411215, "calib/step_q_w_n": 535.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1827.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 459.5859375, "completions/mean_terminated_length": 465.03558349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.192, "grad_norm": 0.09744112193584442, "learning_rate": 5.555555555555555e-07, "loss": -0.1025, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.037247009575366974, "mask/share_reasoning": 0.8301904201507568, "mask/share_step_conf": 0.1208437904715538, "num_tokens": 40836408.0, "reward": 0.9185820817947388, "reward_std": 0.1872258186340332, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7002886533737183, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8290631175041199, "step": 180 }, { "adv/mean_abs_final_conf": 0.6360911130905151, "adv/mean_abs_reasoning": 0.3917732238769531, "adv/mean_abs_step_conf": 0.756089448928833, "adv/ratio_final_to_reasoning": 1.6236206925930614, "adv/ratio_step_to_reasoning": 1.9299160913720412, "adv/std_final_conf": 0.8453565835952759, "adv/std_reasoning": 0.7012593150138855, "adv/std_step_conf": 0.9292190670967102, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7270427286356822, "calib/avg_num_step_conf": 4.4453125, "calib/ece": 0.2603543307086615, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.4448818897637795, "calib/gap": 0.3485044977511243, "calib/mean_conf": 0.5526377952755905, "calib/mu_c": 0.7419827586206895, "calib/mu_w": 0.3934782608695652, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17814960629921267, "calib/std_conf": 0.43033557989948107, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.48066921606118546, "calib/step_q_c_n": 523.0, "calib/step_q_gap": 0.12810011037012858, "calib/step_q_w": 0.35256910569105687, "calib/step_q_w_n": 615.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1371.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 388.63671875, "completions/mean_terminated_length": 391.69683837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.19306666666666666, "grad_norm": 0.0824630856513977, "learning_rate": 5.277777777777779e-07, "loss": -0.087, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0428725928068161, "mask/share_reasoning": 0.8240259289741516, "mask/share_step_conf": 0.12528899312019348, "num_tokens": 41042163.0, "reward": 0.9327545166015625, "reward_std": 0.16425344347953796, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.7208136320114136, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8564140796661377, "step": 181 }, { "adv/mean_abs_final_conf": 0.587517499923706, "adv/mean_abs_reasoning": 0.4299156665802002, "adv/mean_abs_step_conf": 0.7061957716941833, "adv/ratio_final_to_reasoning": 1.3665877882449893, "adv/ratio_step_to_reasoning": 1.6426379092245607, "adv/std_final_conf": 0.8210249543190002, "adv/std_reasoning": 0.7205567359924316, "adv/std_step_conf": 0.9328953623771667, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7631914180355348, "calib/avg_num_step_conf": 5.359375, "calib/ece": 0.20908730158730154, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5634920634920635, "calib/gap": 0.3618618840093866, "calib/mean_conf": 0.6721825396825396, "calib/mu_c": 0.8085987261146498, "calib/mu_w": 0.44673684210526315, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1291269841269841, "calib/std_conf": 0.3994117678913053, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4477542932628798, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.14672990301897737, "calib/step_q_w": 0.30102439024390243, "calib/step_q_w_n": 615.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2968.0, "completions/max_terminated_length": 2968.0, "completions/mean_length": 453.52734375, "completions/mean_terminated_length": 455.305908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.19413333333333332, "grad_norm": 0.08231616765260696, "learning_rate": 5.000000000000001e-07, "loss": -0.0436, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.039243005216121674, "mask/share_reasoning": 0.827210009098053, "mask/share_step_conf": 0.12964074313640594, "num_tokens": 41264426.0, "reward": 0.9717744588851929, "reward_std": 0.17918717861175537, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.7575777173042297, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8672209978103638, "step": 182 }, { "adv/mean_abs_final_conf": 0.5640467405319214, "adv/mean_abs_reasoning": 0.4868428111076355, "adv/mean_abs_step_conf": 0.7534921169281006, "adv/ratio_final_to_reasoning": 1.1585808143056198, "adv/ratio_step_to_reasoning": 1.5477112935359991, "adv/std_final_conf": 0.8089180588722229, "adv/std_reasoning": 0.7206279039382935, "adv/std_step_conf": 0.9286974668502808, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7397829457364341, "calib/avg_num_step_conf": 5.02734375, "calib/ece": 0.23244094488188966, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.468503937007874, "calib/gap": 0.3943975193798452, "calib/mean_conf": 0.545984251968504, "calib/mu_c": 0.7400775193798451, "calib/mu_w": 0.34567999999999993, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13527559055118102, "calib/std_conf": 0.44471128357199513, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.42944636678200687, "calib/step_q_c_n": 578.0, "calib/step_q_gap": 0.12737302404575862, "calib/step_q_w": 0.30207334273624825, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 472.8125, "completions/mean_terminated_length": 474.66668701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.1952, "grad_norm": 0.07106750458478928, "learning_rate": 4.7222222222222226e-07, "loss": 0.0104, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03746788948774338, "mask/share_reasoning": 0.8460267186164856, "mask/share_step_conf": 0.11259913444519043, "num_tokens": 41492146.0, "reward": 0.9547048807144165, "reward_std": 0.15090668201446533, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.74214768409729, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8680433034896851, "step": 183 }, { "adv/mean_abs_final_conf": 0.574894905090332, "adv/mean_abs_reasoning": 0.480303019285202, "adv/mean_abs_step_conf": 0.7650856375694275, "adv/ratio_final_to_reasoning": 1.1969421011466965, "adv/ratio_step_to_reasoning": 1.592922815076294, "adv/std_final_conf": 0.7902071475982666, "adv/std_reasoning": 0.7392838001251221, "adv/std_step_conf": 0.9323039650917053, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.778040810882902, "calib/avg_num_step_conf": 4.8515625, "calib/ece": 0.18841176470588228, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6862745098039216, "calib/gap": 0.4496585756201654, "calib/mean_conf": 0.7302549019607845, "calib/mu_c": 0.8924846625766872, "calib/mu_w": 0.4428260869565218, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13972549019607838, "calib/std_conf": 0.4014098117706824, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46454735376044565, "calib/step_q_c_n": 718.0, "calib/step_q_gap": 0.15134124689021666, "calib/step_q_w": 0.313206106870229, "calib/step_q_w_n": 524.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 394.96484375, "completions/mean_terminated_length": 396.5137634277344, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.19626666666666667, "grad_norm": 0.07112251967191696, "learning_rate": 4.444444444444445e-07, "loss": -0.0243, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04334566369652748, "mask/share_reasoning": 0.8269565105438232, "mask/share_step_conf": 0.12579162418842316, "num_tokens": 41698537.0, "reward": 1.0030958652496338, "reward_std": 0.1829470843076706, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.8042088747024536, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8754204511642456, "step": 184 }, { "adv/mean_abs_final_conf": 0.5484204888343811, "adv/mean_abs_reasoning": 0.4035857021808624, "adv/mean_abs_step_conf": 0.7365462183952332, "adv/ratio_final_to_reasoning": 1.3588699645970426, "adv/ratio_step_to_reasoning": 1.8250057284367278, "adv/std_final_conf": 0.7685003876686096, "adv/std_reasoning": 0.6613385081291199, "adv/std_step_conf": 0.9303016662597656, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7063632781717888, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.24650602409638556, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6024096385542169, "calib/gap": 0.32946611505122125, "calib/mean_conf": 0.6636947791164659, "calib/mu_c": 0.806595744680851, "calib/mu_w": 0.4771296296296298, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1719678714859438, "calib/std_conf": 0.4215471637407645, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42699507389162555, "calib/step_q_c_n": 609.0, "calib/step_q_gap": 0.17407892871014874, "calib/step_q_w": 0.2529161451814768, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 444.671875, "completions/mean_terminated_length": 449.9446716308594, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.19733333333333333, "grad_norm": 0.10078582167625427, "learning_rate": 4.1666666666666667e-07, "loss": -0.0798, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.041414953768253326, "mask/share_reasoning": 0.8224426507949829, "mask/share_step_conf": 0.12442367523908615, "num_tokens": 41919293.0, "reward": 0.9268284440040588, "reward_std": 0.15181460976600647, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7091015577316284, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8398678302764893, "step": 185 }, { "adv/mean_abs_final_conf": 0.5862541794776917, "adv/mean_abs_reasoning": 0.446882039308548, "adv/mean_abs_step_conf": 0.7424435615539551, "adv/ratio_final_to_reasoning": 1.3118767994900657, "adv/ratio_step_to_reasoning": 1.6613859950664471, "adv/std_final_conf": 0.8304627537727356, "adv/std_reasoning": 0.7391940951347351, "adv/std_step_conf": 0.9280949831008911, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7186588921282798, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.21861111111111103, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5952380952380952, "calib/gap": 0.3610667903525047, "calib/mean_conf": 0.6678968253968254, "calib/mu_c": 0.8083116883116884, "calib/mu_w": 0.4472448979591837, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13769841269841263, "calib/std_conf": 0.41985912847966006, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4870789865871833, "calib/step_q_c_n": 671.0, "calib/step_q_gap": 0.1807310242047382, "calib/step_q_w": 0.3063479623824451, "calib/step_q_w_n": 638.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2870.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 442.6875, "completions/mean_terminated_length": 446.1732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.1984, "grad_norm": 0.06984265148639679, "learning_rate": 3.8888888888888895e-07, "loss": -0.0135, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.04010544717311859, "mask/share_reasoning": 0.8295929431915283, "mask/share_step_conf": 0.1224890649318695, "num_tokens": 42137661.0, "reward": 0.9584858417510986, "reward_std": 0.1953909993171692, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7387632727622986, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8618021011352539, "step": 186 }, { "adv/mean_abs_final_conf": 0.6203031539916992, "adv/mean_abs_reasoning": 0.4454570710659027, "adv/mean_abs_step_conf": 0.7624503374099731, "adv/ratio_final_to_reasoning": 1.3925093893052807, "adv/ratio_step_to_reasoning": 1.7116135020273888, "adv/std_final_conf": 0.8146321773529053, "adv/std_reasoning": 0.7013767957687378, "adv/std_step_conf": 0.9314128160476685, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6024398395721924, "calib/avg_num_step_conf": 6.375, "calib/ece": 0.32418699186991873, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5609756097560976, "calib/gap": 0.16650935828877012, "calib/mean_conf": 0.6522357723577236, "calib/mu_c": 0.7266911764705883, "calib/mu_w": 0.5601818181818182, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21178861788617886, "calib/std_conf": 0.4154881071410142, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42377253814147015, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.15810843276276543, "calib/step_q_w": 0.2656641053787047, "calib/step_q_w_n": 911.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 511.98046875, "completions/mean_terminated_length": 516.0117797851562, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.19946666666666665, "grad_norm": 0.054615847766399384, "learning_rate": 3.611111111111111e-07, "loss": -0.0214, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03792574256658554, "mask/share_reasoning": 0.8272412419319153, "mask/share_step_conf": 0.12702052295207977, "num_tokens": 42370272.0, "reward": 0.8841220140457153, "reward_std": 0.18903331458568573, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6271160244941711, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8426903486251831, "step": 187 }, { "adv/mean_abs_final_conf": 0.5518736839294434, "adv/mean_abs_reasoning": 0.45593172311782837, "adv/mean_abs_step_conf": 0.729184627532959, "adv/ratio_final_to_reasoning": 1.2104305446340269, "adv/ratio_step_to_reasoning": 1.5993285629403609, "adv/std_final_conf": 0.7792059779167175, "adv/std_reasoning": 0.7393056750297546, "adv/std_step_conf": 0.9334509372711182, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.744102598179281, "calib/avg_num_step_conf": 5.1015625, "calib/ece": 0.23091999999999985, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.604, "calib/gap": 0.3446494783706558, "calib/mean_conf": 0.6945200000000001, "calib/mu_c": 0.833758389261745, "calib/mu_w": 0.48910891089108915, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.16471999999999987, "calib/std_conf": 0.39786828172147626, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4740087463556851, "calib/step_q_c_n": 686.0, "calib/step_q_gap": 0.16036358506536247, "calib/step_q_w": 0.3136451612903226, "calib/step_q_w_n": 620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2874.0, "completions/max_terminated_length": 2874.0, "completions/mean_length": 467.0078125, "completions/mean_terminated_length": 474.420654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.20053333333333334, "grad_norm": 0.05497977137565613, "learning_rate": 3.3333333333333335e-07, "loss": -0.0824, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.039705291390419006, "mask/share_reasoning": 0.8246287107467651, "mask/share_step_conf": 0.12004102021455765, "num_tokens": 42593898.0, "reward": 0.9408511519432068, "reward_std": 0.20251992344856262, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7240562438964844, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.849052369594574, "step": 188 }, { "adv/mean_abs_final_conf": 0.6120193004608154, "adv/mean_abs_reasoning": 0.39446377754211426, "adv/mean_abs_step_conf": 0.7473738193511963, "adv/ratio_final_to_reasoning": 1.5515221810080502, "adv/ratio_step_to_reasoning": 1.8946576641537236, "adv/std_final_conf": 0.8193371891975403, "adv/std_reasoning": 0.7012815475463867, "adv/std_step_conf": 0.933303713798523, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7354870129870129, "calib/avg_num_step_conf": 5.07421875, "calib/ece": 0.2661417322834645, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.484251968503937, "calib/gap": 0.3562532467532468, "calib/mean_conf": 0.5514960629921261, "calib/mu_c": 0.6917532467532468, "calib/mu_w": 0.33549999999999996, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10566929133858265, "calib/std_conf": 0.4516201433965361, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.43661951909476654, "calib/step_q_c_n": 707.0, "calib/step_q_gap": 0.14031884341909084, "calib/step_q_w": 0.2963006756756757, "calib/step_q_w_n": 592.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 440.23046875, "completions/mean_terminated_length": 441.9568786621094, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.2016, "grad_norm": 0.124534972012043, "learning_rate": 3.055555555555556e-07, "loss": -0.0073, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03976032882928848, "mask/share_reasoning": 0.8296130895614624, "mask/share_step_conf": 0.1267203837633133, "num_tokens": 42814365.0, "reward": 0.9378255605697632, "reward_std": 0.1679186224937439, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7129054665565491, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8463394045829773, "step": 189 }, { "adv/mean_abs_final_conf": 0.6316561698913574, "adv/mean_abs_reasoning": 0.40052786469459534, "adv/mean_abs_step_conf": 0.7750684022903442, "adv/ratio_final_to_reasoning": 1.577059239993199, "adv/ratio_step_to_reasoning": 1.9351173054622257, "adv/std_final_conf": 0.8263714909553528, "adv/std_reasoning": 0.661289632320404, "adv/std_step_conf": 0.9340594410896301, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7126622348844571, "calib/avg_num_step_conf": 5.3828125, "calib/ece": 0.2652777777777779, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5119047619047619, "calib/gap": 0.34605698005698016, "calib/mean_conf": 0.579404761904762, "calib/mu_c": 0.7400740740740741, "calib/mu_w": 0.39401709401709395, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1544841269841271, "calib/std_conf": 0.4439979386226176, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4244771723122238, "calib/step_q_c_n": 679.0, "calib/step_q_gap": 0.11141565586014945, "calib/step_q_w": 0.31306151645207436, "calib/step_q_w_n": 699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 508.71875, "completions/mean_terminated_length": 512.7244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.20266666666666666, "grad_norm": 0.06018049269914627, "learning_rate": 2.7777777777777776e-07, "loss": -0.0009, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03598202019929886, "mask/share_reasoning": 0.8418542146682739, "mask/share_step_conf": 0.1143513023853302, "num_tokens": 43050205.0, "reward": 0.9317029118537903, "reward_std": 0.18911895155906677, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7130597829818726, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8480023145675659, "step": 190 }, { "adv/mean_abs_final_conf": 0.6036766767501831, "adv/mean_abs_reasoning": 0.40422987937927246, "adv/mean_abs_step_conf": 0.7332354784011841, "adv/ratio_final_to_reasoning": 1.4933994431019728, "adv/ratio_step_to_reasoning": 1.8139071746183786, "adv/std_final_conf": 0.817194402217865, "adv/std_reasoning": 0.7013508677482605, "adv/std_step_conf": 0.9334839582443237, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6981868628210092, "calib/avg_num_step_conf": 5.68359375, "calib/ece": 0.25333333333333324, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5783132530120482, "calib/gap": 0.33146535036778946, "calib/mean_conf": 0.67285140562249, "calib/mu_c": 0.8365873015873017, "calib/mu_w": 0.5051219512195122, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21008032128514048, "calib/std_conf": 0.4111599592084048, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45365625, "calib/step_q_c_n": 640.0, "calib/step_q_gap": 0.13104275306748464, "calib/step_q_w": 0.32261349693251534, "calib/step_q_w_n": 815.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 458.2265625, "completions/mean_terminated_length": 461.83465576171875, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.20373333333333332, "grad_norm": 0.0659196600317955, "learning_rate": 2.5000000000000004e-07, "loss": -0.0602, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.04230055958032608, "mask/share_reasoning": 0.8142014741897583, "mask/share_step_conf": 0.1356854885816574, "num_tokens": 43271679.0, "reward": 0.9137133955955505, "reward_std": 0.2043464332818985, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.699204683303833, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8352533578872681, "step": 191 }, { "adv/mean_abs_final_conf": 0.6116088628768921, "adv/mean_abs_reasoning": 0.4545787572860718, "adv/mean_abs_step_conf": 0.7597032785415649, "adv/ratio_final_to_reasoning": 1.345440923214983, "adv/ratio_step_to_reasoning": 1.6712247687884692, "adv/std_final_conf": 0.8217902183532715, "adv/std_reasoning": 0.7206497192382812, "adv/std_step_conf": 0.9321048259735107, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7860244769048559, "calib/avg_num_step_conf": 4.578125, "calib/ece": 0.18760956175298799, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5816733067729084, "calib/gap": 0.4638748519542044, "calib/mean_conf": 0.6509561752988047, "calib/mu_c": 0.8394630872483221, "calib/mu_w": 0.37558823529411767, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.12247011952191228, "calib/std_conf": 0.4221108524837698, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.48903125000000003, "calib/step_q_c_n": 640.0, "calib/step_q_gap": 0.22457260338345864, "calib/step_q_w": 0.2644586466165414, "calib/step_q_w_n": 532.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2359.0, "completions/max_terminated_length": 2359.0, "completions/mean_length": 453.33984375, "completions/mean_terminated_length": 456.9094543457031, "completions/min_length": 0.0, "completions/min_terminated_length": 75.0, "epoch": 0.2048, "grad_norm": 0.0771113857626915, "learning_rate": 2.2222222222222224e-07, "loss": -0.0687, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04301437363028526, "mask/share_reasoning": 0.830362856388092, "mask/share_step_conf": 0.1188102513551712, "num_tokens": 43492710.0, "reward": 0.9810526371002197, "reward_std": 0.19653618335723877, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7776480913162231, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.873519778251648, "step": 192 }, { "adv/mean_abs_final_conf": 0.696414053440094, "adv/mean_abs_reasoning": 0.503356397151947, "adv/mean_abs_step_conf": 0.7308834195137024, "adv/ratio_final_to_reasoning": 1.3835406828650458, "adv/ratio_step_to_reasoning": 1.4520197292596886, "adv/std_final_conf": 0.8705217242240906, "adv/std_reasoning": 0.7575937509536743, "adv/std_step_conf": 0.9286946058273315, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7478233034571063, "calib/avg_num_step_conf": 5.27734375, "calib/ece": 0.21253968253968253, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.44841269841269843, "calib/gap": 0.3764302176696542, "calib/mean_conf": 0.5742063492063492, "calib/mu_c": 0.7385211267605634, "calib/mu_w": 0.3620909090909092, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11162698412698414, "calib/std_conf": 0.420615296038167, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.41900151285930404, "calib/step_q_c_n": 661.0, "calib/step_q_gap": 0.12649426648249246, "calib/step_q_w": 0.2925072463768116, "calib/step_q_w_n": 690.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2465.0, "completions/max_terminated_length": 2465.0, "completions/mean_length": 477.4609375, "completions/mean_terminated_length": 481.220458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.20586666666666667, "grad_norm": 0.07757709175348282, "learning_rate": 1.9444444444444447e-07, "loss": -0.1058, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.037302643060684204, "mask/share_reasoning": 0.8361595273017883, "mask/share_step_conf": 0.11872531473636627, "num_tokens": 43720652.0, "reward": 0.9659968018531799, "reward_std": 0.18238919973373413, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7463644742965698, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8785977363586426, "step": 193 }, { "adv/mean_abs_final_conf": 0.5742350816726685, "adv/mean_abs_reasoning": 0.44685792922973633, "adv/mean_abs_step_conf": 0.7691583633422852, "adv/ratio_final_to_reasoning": 1.285050670718759, "adv/ratio_step_to_reasoning": 1.7212592930108874, "adv/std_final_conf": 0.8032878637313843, "adv/std_reasoning": 0.7014285922050476, "adv/std_step_conf": 0.9336322546005249, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8076305994095752, "calib/avg_num_step_conf": 4.7890625, "calib/ece": 0.17900197628458492, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5533596837944664, "calib/gap": 0.4853922795533309, "calib/mean_conf": 0.6321442687747035, "calib/mu_c": 0.8355102040816328, "calib/mu_w": 0.35011792452830187, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11505928853754935, "calib/std_conf": 0.4271501896552184, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.48788321167883203, "calib/step_q_c_n": 685.0, "calib/step_q_gap": 0.18552184384149378, "calib/step_q_w": 0.30236136783733825, "calib/step_q_w_n": 541.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 402.9296875, "completions/mean_terminated_length": 406.10235595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.20693333333333333, "grad_norm": 0.12211701273918152, "learning_rate": 1.6666666666666668e-07, "loss": -0.0248, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04216302931308746, "mask/share_reasoning": 0.8294909596443176, "mask/share_step_conf": 0.12053349614143372, "num_tokens": 43929746.0, "reward": 0.9791326522827148, "reward_std": 0.15515466034412384, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7964698672294617, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8500766754150391, "step": 194 }, { "adv/mean_abs_final_conf": 0.6161271333694458, "adv/mean_abs_reasoning": 0.40150701999664307, "adv/mean_abs_step_conf": 0.7621166110038757, "adv/ratio_final_to_reasoning": 1.534536390857119, "adv/ratio_step_to_reasoning": 1.8981401894548386, "adv/std_final_conf": 0.8294500708580017, "adv/std_reasoning": 0.6612933874130249, "adv/std_step_conf": 0.9313527345657349, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7397683648903638, "calib/avg_num_step_conf": 4.71875, "calib/ece": 0.2267599999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.58, "calib/gap": 0.37528986921725543, "calib/mean_conf": 0.6614800000000001, "calib/mu_c": 0.8251063829787233, "calib/mu_w": 0.44981651376146786, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1621199999999999, "calib/std_conf": 0.4196358059079325, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4617519379844961, "calib/step_q_c_n": 645.0, "calib/step_q_gap": 0.07622795929888326, "calib/step_q_w": 0.38552397868561283, "calib/step_q_w_n": 563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2742.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 410.81640625, "completions/mean_terminated_length": 415.6877746582031, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.208, "grad_norm": 0.07284358888864517, "learning_rate": 1.3888888888888888e-07, "loss": -0.0873, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.04061251878738403, "mask/share_reasoning": 0.826056718826294, "mask/share_step_conf": 0.12161204218864441, "num_tokens": 44140899.0, "reward": 0.9444481730461121, "reward_std": 0.17379814386367798, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7354198694229126, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8480076193809509, "step": 195 }, { "adv/mean_abs_final_conf": 0.4589390754699707, "adv/mean_abs_reasoning": 0.2612670063972473, "adv/mean_abs_step_conf": 0.7599738836288452, "adv/ratio_final_to_reasoning": 1.7565902476494486, "adv/ratio_step_to_reasoning": 2.9088015900229345, "adv/std_final_conf": 0.7345681190490723, "adv/std_reasoning": 0.5724842548370361, "adv/std_step_conf": 0.9323859214782715, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7516464906228439, "calib/avg_num_step_conf": 4.28125, "calib/ece": 0.1903085937500001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.6328125, "calib/gap": 0.39288044909991854, "calib/mean_conf": 0.7239023437500001, "calib/mu_c": 0.8881140939597316, "calib/mu_w": 0.4952336448598131, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1660898437500001, "calib/std_conf": 0.38376090456103895, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5202241086587436, "calib/step_q_c_n": 589.0, "calib/step_q_gap": 0.14073692917156416, "calib/step_q_w": 0.3794871794871794, "calib/step_q_w_n": 507.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1233.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 344.3125, "completions/mean_terminated_length": 345.66278076171875, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.20906666666666668, "grad_norm": 0.07948347926139832, "learning_rate": 1.1111111111111112e-07, "loss": 0.0368, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04886038973927498, "mask/share_reasoning": 0.8164427280426025, "mask/share_step_conf": 0.1307906210422516, "num_tokens": 44331587.0, "reward": 0.9816206693649292, "reward_std": 0.1148589700460434, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7804819941520691, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8663530349731445, "step": 196 }, { "adv/mean_abs_final_conf": 0.5788639783859253, "adv/mean_abs_reasoning": 0.43158918619155884, "adv/mean_abs_step_conf": 0.7658077478408813, "adv/ratio_final_to_reasoning": 1.3412383741445255, "adv/ratio_step_to_reasoning": 1.7743904906388948, "adv/std_final_conf": 0.7939836382865906, "adv/std_reasoning": 0.6817267537117004, "adv/std_step_conf": 0.9311909675598145, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7302751695357329, "calib/avg_num_step_conf": 4.7578125, "calib/ece": 0.2826399999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.568, "calib/gap": 0.35685054773082925, "calib/mean_conf": 0.65592, "calib/mu_c": 0.858611111111111, "calib/mu_w": 0.5017605633802817, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25327999999999995, "calib/std_conf": 0.4114221112191225, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5152714932126696, "calib/step_q_c_n": 442.0, "calib/step_q_gap": 0.15917613238792738, "calib/step_q_w": 0.35609536082474225, "calib/step_q_w_n": 776.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2490.0, "completions/max_terminated_length": 2490.0, "completions/mean_length": 424.859375, "completions/mean_terminated_length": 431.60321044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.21013333333333334, "grad_norm": 0.06191324070096016, "learning_rate": 8.333333333333334e-08, "loss": -0.0622, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.040418848395347595, "mask/share_reasoning": 0.8236619234085083, "mask/share_step_conf": 0.1202942505478859, "num_tokens": 44545407.0, "reward": 0.912652850151062, "reward_std": 0.18009766936302185, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.6936922073364258, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8519259691238403, "step": 197 }, { "adv/mean_abs_final_conf": 0.552986741065979, "adv/mean_abs_reasoning": 0.3963276445865631, "adv/mean_abs_step_conf": 0.7611110806465149, "adv/ratio_final_to_reasoning": 1.3952767328225057, "adv/ratio_step_to_reasoning": 1.9204087603843096, "adv/std_final_conf": 0.802998960018158, "adv/std_reasoning": 0.6814876198768616, "adv/std_step_conf": 0.9322589039802551, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7270659354356226, "calib/avg_num_step_conf": 4.890625, "calib/ece": 0.23898039215686262, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.4666666666666667, "calib/gap": 0.3391617848685031, "calib/mean_conf": 0.5709019607843139, "calib/mu_c": 0.7211971830985916, "calib/mu_w": 0.3820353982300885, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12650980392156852, "calib/std_conf": 0.42496732307605706, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.45695652173913043, "calib/step_q_c_n": 690.0, "calib/step_q_gap": 0.10902057867863224, "calib/step_q_w": 0.3479359430604982, "calib/step_q_w_n": 562.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2413.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 410.4453125, "completions/mean_terminated_length": 410.4453125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.2112, "grad_norm": 0.078434057533741, "learning_rate": 5.555555555555556e-08, "loss": 0.0446, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.04607347771525383, "mask/share_reasoning": 0.8190134763717651, "mask/share_step_conf": 0.13491299748420715, "num_tokens": 44755865.0, "reward": 0.9583539962768555, "reward_std": 0.1432131975889206, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7369366884231567, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8696149587631226, "step": 198 }, { "adv/mean_abs_final_conf": 0.661697506904602, "adv/mean_abs_reasoning": 0.5621737837791443, "adv/mean_abs_step_conf": 0.7594678997993469, "adv/ratio_final_to_reasoning": 1.177033732267666, "adv/ratio_step_to_reasoning": 1.350948624274005, "adv/std_final_conf": 0.8686993718147278, "adv/std_reasoning": 0.8265514969825745, "adv/std_step_conf": 0.9334787130355835, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7286388140161726, "calib/avg_num_step_conf": 5.7421875, "calib/ece": 0.23304878048780484, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5487804878048781, "calib/gap": 0.34104851752021564, "calib/mean_conf": 0.6541869918699187, "calib/mu_c": 0.8011428571428573, "calib/mu_w": 0.4600943396226416, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15906504065040647, "calib/std_conf": 0.4130591179876554, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.437112970711297, "calib/step_q_c_n": 717.0, "calib/step_q_gap": 0.1297291725705268, "calib/step_q_w": 0.3073837981407702, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2308.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 504.63671875, "completions/mean_terminated_length": 514.6892700195312, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.21226666666666666, "grad_norm": 0.09063680469989777, "learning_rate": 2.777777777777778e-08, "loss": -0.0419, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03921530395746231, "mask/share_reasoning": 0.8160558342933655, "mask/share_step_conf": 0.1251976191997528, "num_tokens": 44989252.0, "reward": 0.9256468415260315, "reward_std": 0.23798325657844543, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7116104960441589, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8396830558776855, "step": 199 }, { "adv/mean_abs_final_conf": 0.5623447895050049, "adv/mean_abs_reasoning": 0.45310142636299133, "adv/mean_abs_step_conf": 0.7232848405838013, "adv/ratio_final_to_reasoning": 1.2411013446126207, "adv/ratio_step_to_reasoning": 1.5962978673220043, "adv/std_final_conf": 0.7876421809196472, "adv/std_reasoning": 0.7206158638000488, "adv/std_step_conf": 0.9335463047027588, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8116764514024788, "calib/avg_num_step_conf": 4.26953125, "calib/ece": 0.16904382470119506, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5896414342629482, "calib/gap": 0.5388421395955645, "calib/mean_conf": 0.634382470119522, "calib/mu_c": 0.8597945205479454, "calib/mu_w": 0.32095238095238093, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.11087649402390425, "calib/std_conf": 0.4465395021981601, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.507291280148423, "calib/step_q_c_n": 539.0, "calib/step_q_gap": 0.22765229097874784, "calib/step_q_w": 0.2796389891696751, "calib/step_q_w_n": 554.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2677.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 466.46875, "completions/mean_terminated_length": 470.1417236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 95.0, "epoch": 0.21333333333333335, "grad_norm": 0.06912390142679214, "learning_rate": 0.0, "loss": -0.0252, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.04253852739930153, "mask/share_reasoning": 0.842212975025177, "mask/share_step_conf": 0.10743597149848938, "num_tokens": 45216716.0, "reward": 0.9851292371749878, "reward_std": 0.18738599121570587, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7934476733207703, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8682170510292053, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.022248380884993823, "train_runtime": 12721.3125, "train_samples_per_second": 4.025, "train_steps_per_second": 0.016 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 45216716, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }