{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7490277290344238, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343300461769104, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04268835112452507, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.078, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 0.7281402349472046, "reward_std": 0.16804265975952148, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7420004606246948, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7698483467102051, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345317482948303, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.03963535279035568, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0095, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 0.6806339025497437, "reward_std": 0.16487614810466766, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7291916012763977, "step": 2 }, { "adv/mean_abs_final_conf": 0.8019323348999023, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.781822681427002, "adv/std_final_conf": 0.9299672245979309, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9331517219543457, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.4182875264270613, "calib/avg_num_step_conf": 4.8671875, "calib/ece": 0.22414342629482076, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.29880478087649404, "calib/gap": -0.007272022551092383, "calib/mean_conf": 0.8795219123505977, "calib/mu_c": 0.877030303030303, "calib/mu_w": 0.8843023255813954, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22314741035856578, "calib/std_conf": 0.051840339838680916, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7927968337730872, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.03380093213374291, "calib/step_q_w": 0.7589959016393443, "calib/step_q_w_n": 488.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 496.19921875, "completions/mean_terminated_length": 500.1062927246094, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.04587876424193382, "kl": 0.0007779151201248169, "learning_rate": 7.5e-07, "loss": 0.0432, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032924000173807144, "mask/share_reasoning": 0.8506391048431396, "mask/share_step_conf": 0.1086243987083435, "num_tokens": 690944.0, "reward": 0.7217806577682495, "reward_std": 0.1670541763305664, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7043820023536682, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.739179253578186, "step": 3 }, { "adv/mean_abs_final_conf": 0.7670461535453796, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7715258002281189, "adv/std_final_conf": 0.9306488037109375, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346135854721069, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4428854875283447, "calib/avg_num_step_conf": 5.16015625, "calib/ece": 0.20373015873015893, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.23412698412698413, "calib/gap": -0.0038690476190477163, "calib/mean_conf": 0.8703968253968254, "calib/mu_c": 0.8691071428571429, "calib/mu_w": 0.8729761904761906, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20373015873015893, "calib/std_conf": 0.04819604135839693, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.796887573964497, "calib/step_q_c_n": 845.0, "calib/step_q_gap": 0.0004169857292029011, "calib/step_q_w": 0.7964705882352942, "calib/step_q_w_n": 476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2324.0, "completions/max_terminated_length": 2324.0, "completions/mean_length": 523.91015625, "completions/mean_terminated_length": 525.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.004266666666666667, "grad_norm": 0.04513922333717346, "kl": 0.00026550889015197754, "learning_rate": 1.0000000000000002e-06, "loss": 0.0224, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032802172005176544, "mask/share_reasoning": 0.8497129082679749, "mask/share_step_conf": 0.1135786697268486, "num_tokens": 931233.0, "reward": 0.7231439352035522, "reward_std": 0.152303084731102, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.720788300037384, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7254995107650757, "step": 4 }, { "adv/mean_abs_final_conf": 0.7628365159034729, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7605997323989868, "adv/std_final_conf": 0.9310415387153625, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9340444207191467, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4282638481111763, "calib/avg_num_step_conf": 4.75390625, "calib/ece": 0.3553225806451613, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.2903225806451613, "calib/gap": -0.010919292751353837, "calib/mean_conf": 0.8835483870967742, "calib/mu_c": 0.8783969465648854, "calib/mu_w": 0.8893162393162393, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3553225806451613, "calib/std_conf": 0.04231589246917829, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7984055727554179, "calib/step_q_c_n": 646.0, "calib/step_q_gap": 0.012661264524244542, "calib/step_q_w": 0.7857443082311734, "calib/step_q_w_n": 571.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 532.23828125, "completions/mean_terminated_length": 534.3255004882812, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.005333333333333333, "grad_norm": 0.03690319508314133, "kl": 0.0002651214599609375, "learning_rate": 1.25e-06, "loss": -0.0964, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034227386116981506, "mask/share_reasoning": 0.8531880974769592, "mask/share_step_conf": 0.10867826640605927, "num_tokens": 1174174.0, "reward": 0.6350550651550293, "reward_std": 0.16529521346092224, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5958515405654907, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.6742585301399231, "step": 5 }, { "adv/mean_abs_final_conf": 0.7556248307228088, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7537124156951904, "adv/std_final_conf": 0.9313980340957642, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9342355728149414, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5419920624759954, "calib/avg_num_step_conf": 5.26171875, "calib/ece": 0.30031620553359695, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.25691699604743085, "calib/gap": 0.0024433491230316795, "calib/mean_conf": 0.8773913043478262, "calib/mu_c": 0.8784246575342466, "calib/mu_w": 0.8759813084112149, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30031620553359695, "calib/std_conf": 0.04182016179178139, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.8026478873239438, "calib/step_q_c_n": 710.0, "calib/step_q_gap": 0.0031502421120127577, "calib/step_q_w": 0.799497645211931, "calib/step_q_w_n": 637.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 445.99609375, "completions/mean_terminated_length": 447.7451171875, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.0064, "grad_norm": 0.041909750550985336, "kl": 0.0006876289844512939, "learning_rate": 1.5e-06, "loss": 0.0061, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037554264068603516, "mask/share_reasoning": 0.8299777507781982, "mask/share_step_conf": 0.12856173515319824, "num_tokens": 1394301.0, "reward": 0.6814709901809692, "reward_std": 0.15129441022872925, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6562492251396179, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7066926956176758, "step": 6 }, { "adv/mean_abs_final_conf": 0.7541855573654175, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7434177994728088, "adv/std_final_conf": 0.9303765296936035, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9332534074783325, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.43873786407766985, "calib/avg_num_step_conf": 5.625, "calib/ece": 0.2913833992094862, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.30039525691699603, "calib/gap": -0.009607766990291533, "calib/mean_conf": 0.8807114624505928, "calib/mu_c": 0.8768, "calib/mu_w": 0.8864077669902916, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.289604743083004, "calib/std_conf": 0.044262598383373285, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7931042382588774, "calib/step_q_c_n": 873.0, "calib/step_q_gap": 0.019700358188330713, "calib/step_q_w": 0.7734038800705467, "calib/step_q_w_n": 567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2309.0, "completions/max_terminated_length": 2309.0, "completions/mean_length": 547.6875, "completions/mean_terminated_length": 549.8353271484375, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.007466666666666667, "grad_norm": 0.04376210644841194, "kl": 0.000276029109954834, "learning_rate": 1.75e-06, "loss": 0.0142, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0305500365793705, "mask/share_reasoning": 0.850243091583252, "mask/share_step_conf": 0.11530055105686188, "num_tokens": 1641933.0, "reward": 0.6913090348243713, "reward_std": 0.156791090965271, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6555039286613464, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7271141409873962, "step": 7 }, { "adv/mean_abs_final_conf": 0.7654787302017212, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7579349279403687, "adv/std_final_conf": 0.9322072863578796, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352673888206482, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.527547938008931, "calib/avg_num_step_conf": 4.5390625, "calib/ece": 0.3118473895582329, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3132530120481928, "calib/gap": 0.01203703703703729, "calib/mean_conf": 0.8781124497991968, "calib/mu_c": 0.8833333333333334, "calib/mu_w": 0.8712962962962961, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3118473895582329, "calib/std_conf": 0.05985299432225467, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7923771790808241, "calib/step_q_c_n": 631.0, "calib/step_q_gap": 0.02527736740474118, "calib/step_q_w": 0.7670998116760829, "calib/step_q_w_n": 531.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2666.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 518.8828125, "completions/mean_terminated_length": 522.968505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.008533333333333334, "grad_norm": 0.04647514969110489, "kl": 0.00046569108963012695, "learning_rate": 2.0000000000000003e-06, "loss": -0.055, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032357074320316315, "mask/share_reasoning": 0.8551695346832275, "mask/share_step_conf": 0.10466089844703674, "num_tokens": 1881279.0, "reward": 0.6843916177749634, "reward_std": 0.1819065511226654, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6338390707969666, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.734944224357605, "step": 8 }, { "adv/mean_abs_final_conf": 0.775026798248291, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7733014822006226, "adv/std_final_conf": 0.9304593205451965, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345585107803345, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.5300694444444444, "calib/avg_num_step_conf": 4.99609375, "calib/ece": 0.2850819672131148, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.2581967213114754, "calib/gap": 0.006669444444444506, "calib/mean_conf": 0.8748360655737704, "calib/mu_c": 0.8775694444444445, "calib/mu_w": 0.8709, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.2848770491803279, "calib/std_conf": 0.051418144506781754, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.7824285714285714, "calib/step_q_c_n": 630.0, "calib/step_q_gap": 0.09888465771516608, "calib/step_q_w": 0.6835439137134053, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 512.48046875, "completions/mean_terminated_length": 520.6151123046875, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.0096, "grad_norm": 0.04125397652387619, "kl": 0.0003396272659301758, "learning_rate": 2.25e-06, "loss": -0.0476, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.033868737518787384, "mask/share_reasoning": 0.8458621501922607, "mask/share_step_conf": 0.10464408993721008, "num_tokens": 2120010.0, "reward": 0.6498497724533081, "reward_std": 0.19337286055088043, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6333246231079102, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.6663750410079956, "step": 9 }, { "adv/mean_abs_final_conf": 0.761988639831543, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7337956428527832, "adv/std_final_conf": 0.9305899143218994, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9335968494415283, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.4558519437551695, "calib/avg_num_step_conf": 5.41796875, "calib/ece": 0.2638955823293173, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3654618473895582, "calib/gap": -0.008000413564929976, "calib/mean_conf": 0.8863855421686747, "calib/mu_c": 0.8833974358974357, "calib/mu_w": 0.8913978494623657, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2618875502008033, "calib/std_conf": 0.041126066737147415, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7831216931216931, "calib/step_q_c_n": 756.0, "calib/step_q_gap": 0.010296522080759263, "calib/step_q_w": 0.7728251710409338, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 534.75, "completions/mean_terminated_length": 538.9606323242188, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.010666666666666666, "grad_norm": 0.049111757427453995, "kl": 0.00047522783279418945, "learning_rate": 2.5e-06, "loss": -0.001, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032005734741687775, "mask/share_reasoning": 0.848203718662262, "mask/share_step_conf": 0.11197805404663086, "num_tokens": 2363706.0, "reward": 0.6990261077880859, "reward_std": 0.16522833704948425, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6740808486938477, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.723971426486969, "step": 10 }, { "adv/mean_abs_final_conf": 0.7775152325630188, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7695873975753784, "adv/std_final_conf": 0.9275996685028076, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341156482696533, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4470198675496689, "calib/avg_num_step_conf": 5.37109375, "calib/ece": 0.3026171875, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.38671875, "calib/gap": -0.009318826868495722, "calib/mean_conf": 0.8887890625000001, "calib/mu_c": 0.8849668874172185, "calib/mu_w": 0.8942857142857142, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30078125, "calib/std_conf": 0.048304236412255744, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7787966751918158, "calib/step_q_c_n": 782.0, "calib/step_q_gap": -0.018420862750848532, "calib/step_q_w": 0.7972175379426644, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 501.5546875, "completions/mean_terminated_length": 503.5216064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.011733333333333333, "grad_norm": 0.03449160233139992, "kl": 0.0007895231246948242, "learning_rate": 2.7500000000000004e-06, "loss": -0.0331, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03353225439786911, "mask/share_reasoning": 0.8452185392379761, "mask/share_step_conf": 0.117343008518219, "num_tokens": 2596584.0, "reward": 0.6882535219192505, "reward_std": 0.13917958736419678, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6580113172531128, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.718495786190033, "step": 11 }, { "adv/mean_abs_final_conf": 0.7754501104354858, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7735542058944702, "adv/std_final_conf": 0.9258955717086792, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9328970909118652, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5133865993832352, "calib/avg_num_step_conf": 5.60546875, "calib/ece": 0.22546875000000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.46484375, "calib/gap": 0.0020493411830670993, "calib/mean_conf": 0.8927343750000001, "calib/mu_c": 0.8933908045977013, "calib/mu_w": 0.8913414634146342, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21925781250000004, "calib/std_conf": 0.05257754457331928, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7640579710144928, "calib/step_q_c_n": 897.0, "calib/step_q_gap": 0.008463175475459384, "calib/step_q_w": 0.7555947955390334, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 463.76953125, "completions/mean_terminated_length": 465.5882568359375, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.0128, "grad_norm": 0.0531257800757885, "kl": 0.0015319585800170898, "learning_rate": 3e-06, "loss": 0.0205, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.035761505365371704, "mask/share_reasoning": 0.8305474519729614, "mask/share_step_conf": 0.12978483736515045, "num_tokens": 2819485.0, "reward": 0.7614898681640625, "reward_std": 0.13899016380310059, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7311863303184509, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7917934060096741, "step": 12 }, { "adv/mean_abs_final_conf": 0.7574409246444702, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7679467797279358, "adv/std_final_conf": 0.9247254729270935, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341288208961487, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5054759060135403, "calib/avg_num_step_conf": 4.984375, "calib/ece": 0.2670980392156863, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5098039215686274, "calib/gap": 0.007996814018319531, "calib/mean_conf": 0.9023921568627451, "calib/mu_c": 0.9053086419753088, "calib/mu_w": 0.8973118279569893, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2670980392156863, "calib/std_conf": 0.047898617783230665, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7763171355498721, "calib/step_q_c_n": 782.0, "calib/step_q_gap": 0.02666126510452793, "calib/step_q_w": 0.7496558704453442, "calib/step_q_w_n": 494.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 478.09765625, "completions/mean_terminated_length": 479.9725646972656, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.013866666666666666, "grad_norm": 0.047136712819337845, "kl": 0.0021216869354248047, "learning_rate": 3.2500000000000002e-06, "loss": 0.0671, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034416116774082184, "mask/share_reasoning": 0.8466586470603943, "mask/share_step_conf": 0.11501900106668472, "num_tokens": 3046470.0, "reward": 0.7384437918663025, "reward_std": 0.15288321673870087, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6956464648246765, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7812410593032837, "step": 13 }, { "adv/mean_abs_final_conf": 0.7636485695838928, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7400187253952026, "adv/std_final_conf": 0.921985387802124, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9344073534011841, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5393435251798562, "calib/avg_num_step_conf": 5.58203125, "calib/ece": 0.36896414342629474, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7450199203187251, "calib/gap": 0.006576310380267181, "calib/mean_conf": 0.9227490039840637, "calib/mu_c": 0.92568345323741, "calib/mu_w": 0.9191071428571428, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36896414342629474, "calib/std_conf": 0.037871831833412944, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7489583333333333, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.04890191584391157, "calib/step_q_w": 0.7000564174894217, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2782.0, "completions/max_terminated_length": 2782.0, "completions/mean_length": 549.0, "completions/mean_terminated_length": 553.3228149414062, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.014933333333333333, "grad_norm": 0.04201479256153107, "kl": 0.00584101676940918, "learning_rate": 3.5e-06, "loss": -0.0607, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03314138203859329, "mask/share_reasoning": 0.8421472907066345, "mask/share_step_conf": 0.11689884215593338, "num_tokens": 3292414.0, "reward": 0.6788654327392578, "reward_std": 0.13546247780323029, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6064925789833069, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7512382864952087, "step": 14 }, { "adv/mean_abs_final_conf": 0.7457209825515747, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7551430463790894, "adv/std_final_conf": 0.9170833826065063, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934794545173645, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5344794094794094, "calib/avg_num_step_conf": 4.9453125, "calib/ece": 0.35776892430278884, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8207171314741036, "calib/gap": 0.013309375809375745, "calib/mean_conf": 0.9274900398406375, "calib/mu_c": 0.9332167832167831, "calib/mu_w": 0.9199074074074074, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.35776892430278884, "calib/std_conf": 0.052168846510283806, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7192296918767507, "calib/step_q_c_n": 714.0, "calib/step_q_gap": -0.004067409572524716, "calib/step_q_w": 0.7232971014492754, "calib/step_q_w_n": 552.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 475.109375, "completions/mean_terminated_length": 480.74310302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.016, "grad_norm": 0.041417043656110764, "kl": 0.008565902709960938, "learning_rate": 3.7500000000000005e-06, "loss": -0.0614, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034135397523641586, "mask/share_reasoning": 0.8391706943511963, "mask/share_step_conf": 0.11497519910335541, "num_tokens": 3521922.0, "reward": 0.6850520372390747, "reward_std": 0.1696966141462326, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6183484196662903, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7517555952072144, "step": 15 }, { "adv/mean_abs_final_conf": 0.7481696605682373, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7608056664466858, "adv/std_final_conf": 0.9169109463691711, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348391890525818, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5255022637238257, "calib/avg_num_step_conf": 6.68359375, "calib/ece": 0.31497959183673474, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8734693877551021, "calib/gap": 0.016136813808715234, "calib/mean_conf": 0.9353877551020408, "calib/mu_c": 0.9415131578947369, "calib/mu_w": 0.9253763440860217, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.31497959183673474, "calib/std_conf": 0.07009871067157779, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6679646936656283, "calib/step_q_c_n": 963.0, "calib/step_q_gap": 0.0389941054303341, "calib/step_q_w": 0.6289705882352942, "calib/step_q_w_n": 748.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 3000.0, "completions/mean_length": 693.61328125, "completions/mean_terminated_length": 696.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.017066666666666667, "grad_norm": 0.04022593051195145, "kl": 0.008114814758300781, "learning_rate": 4.000000000000001e-06, "loss": -0.0603, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.025023385882377625, "mask/share_reasoning": 0.8619957566261292, "mask/share_step_conf": 0.10907462239265442, "num_tokens": 3808335.0, "reward": 0.6982525587081909, "reward_std": 0.16592827439308167, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6386706829071045, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7578344941139221, "step": 16 }, { "adv/mean_abs_final_conf": 0.7530292868614197, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7690562009811401, "adv/std_final_conf": 0.9123719334602356, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345831871032715, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.43895716945996277, "calib/avg_num_step_conf": 5.515625, "calib/ece": 0.2460236220472442, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8937007874015748, "calib/gap": -0.010618249534450785, "calib/mean_conf": 0.9418503937007875, "calib/mu_c": 0.9387150837988826, "calib/mu_w": 0.9493333333333334, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.24157480314960642, "calib/std_conf": 0.04497726087600256, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6559919839679358, "calib/step_q_c_n": 998.0, "calib/step_q_gap": 0.0018132400065831167, "calib/step_q_w": 0.6541787439613527, "calib/step_q_w_n": 414.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2057.0, "completions/max_terminated_length": 2057.0, "completions/mean_length": 516.7421875, "completions/mean_terminated_length": 516.7421875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.018133333333333335, "grad_norm": 0.03731020539999008, "kl": 0.012533187866210938, "learning_rate": 4.25e-06, "loss": 0.0001, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033470265567302704, "mask/share_reasoning": 0.8462820053100586, "mask/share_step_conf": 0.12024769186973572, "num_tokens": 4044149.0, "reward": 0.7667979001998901, "reward_std": 0.16370335221290588, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7154378890991211, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8181577920913696, "step": 17 }, { "adv/mean_abs_final_conf": 0.7296743392944336, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7674551010131836, "adv/std_final_conf": 0.9048014879226685, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343299865722656, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5590599279835391, "calib/avg_num_step_conf": 4.94921875, "calib/ece": 0.3760317460317461, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9325396825396826, "calib/gap": 0.010787037037037095, "calib/mean_conf": 0.9474603174603174, "calib/mu_c": 0.9520833333333335, "calib/mu_w": 0.9412962962962964, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.3760317460317461, "calib/std_conf": 0.04386010422646486, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6303954802259887, "calib/step_q_c_n": 708.0, "calib/step_q_gap": -0.013862122636265295, "calib/step_q_w": 0.644257602862254, "calib/step_q_w_n": 559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 514.765625, "completions/mean_terminated_length": 516.7843627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.0192, "grad_norm": 0.03311552107334137, "kl": 0.0136260986328125, "learning_rate": 4.5e-06, "loss": -0.068, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03235187008976936, "mask/share_reasoning": 0.8596428632736206, "mask/share_step_conf": 0.10409902781248093, "num_tokens": 4286649.0, "reward": 0.6812572479248047, "reward_std": 0.13839876651763916, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6026082038879395, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7599062323570251, "step": 18 }, { "adv/mean_abs_final_conf": 0.7078334093093872, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7547190189361572, "adv/std_final_conf": 0.9071928262710571, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9344522953033447, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5825134461498097, "calib/avg_num_step_conf": 4.734375, "calib/ece": 0.34913043478260863, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9723320158102767, "calib/gap": 0.005721500721500572, "calib/mean_conf": 0.9578260869565217, "calib/mu_c": 0.9600649350649348, "calib/mu_w": 0.9543434343434343, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.34913043478260863, "calib/std_conf": 0.02437392728569622, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6258414464534076, "calib/step_q_c_n": 719.0, "calib/step_q_gap": 0.05862035111872199, "calib/step_q_w": 0.5672210953346856, "calib/step_q_w_n": 493.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 500.71875, "completions/mean_terminated_length": 500.71875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.020266666666666665, "grad_norm": 0.03164170682430267, "kl": 0.018739700317382812, "learning_rate": 4.75e-06, "loss": -0.0468, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032328102737665176, "mask/share_reasoning": 0.8583135604858398, "mask/share_step_conf": 0.10935834795236588, "num_tokens": 4519593.0, "reward": 0.712796688079834, "reward_std": 0.13104841113090515, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6345300674438477, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7910631895065308, "step": 19 }, { "adv/mean_abs_final_conf": 0.6984207630157471, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7432424426078796, "adv/std_final_conf": 0.8983197212219238, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347612261772156, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.45215311004784686, "calib/avg_num_step_conf": 5.484375, "calib/ece": 0.3531075697211156, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9800796812749004, "calib/gap": 0.01472953216374251, "calib/mean_conf": 0.957808764940239, "calib/mu_c": 0.9636184210526314, "calib/mu_w": 0.9488888888888889, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.35266932270916346, "calib/std_conf": 0.08103156683400753, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5916091954022988, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.012994058526292429, "calib/step_q_w": 0.5786151368760064, "calib/step_q_w_n": 621.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 488.15234375, "completions/mean_terminated_length": 488.15234375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.021333333333333333, "grad_norm": 0.034622061997652054, "kl": 0.027156829833984375, "learning_rate": 5e-06, "loss": -0.0, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03587206080555916, "mask/share_reasoning": 0.8324520587921143, "mask/share_step_conf": 0.1316758692264557, "num_tokens": 4749432.0, "reward": 0.7167103886604309, "reward_std": 0.14828336238861084, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6250980496406555, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8083226680755615, "step": 20 }, { "adv/mean_abs_final_conf": 0.7363002896308899, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7545459270477295, "adv/std_final_conf": 0.8944153189659119, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350616931915283, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.520718954248366, "calib/avg_num_step_conf": 6.04296875, "calib/ece": 0.3646640316205533, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9841897233201581, "calib/gap": 0.0015000000000002789, "calib/mean_conf": 0.9694071146245059, "calib/mu_c": 0.9700000000000002, "calib/mu_w": 0.9684999999999999, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3646640316205533, "calib/std_conf": 0.019028901172082977, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5519160997732426, "calib/step_q_c_n": 882.0, "calib/step_q_gap": 0.031510084735648536, "calib/step_q_w": 0.5204060150375941, "calib/step_q_w_n": 665.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2625.0, "completions/max_terminated_length": 2625.0, "completions/mean_length": 515.29296875, "completions/mean_terminated_length": 517.3137817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.0224, "grad_norm": 0.04176400601863861, "kl": 0.032192230224609375, "learning_rate": 4.9722222222222224e-06, "loss": -0.0224, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03280477970838547, "mask/share_reasoning": 0.8365024328231812, "mask/share_step_conf": 0.1267865151166916, "num_tokens": 4984307.0, "reward": 0.7157672643661499, "reward_std": 0.18473777174949646, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6209827661514282, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.8105518221855164, "step": 21 }, { "adv/mean_abs_final_conf": 0.6889126300811768, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7624020576477051, "adv/std_final_conf": 0.8601471185684204, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9344401955604553, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.40761486103233124, "calib/avg_num_step_conf": 5.62890625, "calib/ece": 0.2945275590551182, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.004523539421440792, "calib/mean_conf": 0.9716929133858269, "calib/mu_c": 0.9702325581395349, "calib/mu_w": 0.9747560975609757, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2945275590551182, "calib/std_conf": 0.014110370236287091, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5819481865284973, "calib/step_q_c_n": 965.0, "calib/step_q_gap": 0.027935581486480454, "calib/step_q_w": 0.5540126050420169, "calib/step_q_w_n": 476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2181.0, "completions/max_terminated_length": 2181.0, "completions/mean_length": 477.93359375, "completions/mean_terminated_length": 479.807861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.023466666666666667, "grad_norm": 0.0354311503469944, "kl": 0.048328399658203125, "learning_rate": 4.944444444444445e-06, "loss": -0.0143, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03266695886850357, "mask/share_reasoning": 0.836262583732605, "mask/share_step_conf": 0.12716418504714966, "num_tokens": 5208474.0, "reward": 0.7574120759963989, "reward_std": 0.13079439103603363, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6870542764663696, "rewards/format_reward_step": 0.9921875, "rewards/step_l2_reward": 0.8277699947357178, "step": 22 }, { "adv/mean_abs_final_conf": 0.7558117508888245, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7470569610595703, "adv/std_final_conf": 0.8796916007995605, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351850152015686, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.4980074719800748, "calib/avg_num_step_conf": 5.39453125, "calib/ece": 0.40570312500000005, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.99609375, "calib/gap": -0.0002914072229142084, "calib/mean_conf": 0.976015625, "calib/mu_c": 0.975890410958904, "calib/mu_w": 0.9761818181818183, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.40570312500000005, "calib/std_conf": 0.014188455020169575, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5975, "calib/step_q_c_n": 784.0, "calib/step_q_gap": 0.04490368509212739, "calib/step_q_w": 0.5525963149078726, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 495.65625, "completions/mean_terminated_length": 497.60003662109375, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.024533333333333334, "grad_norm": 0.026641665026545525, "kl": 0.04611968994140625, "learning_rate": 4.9166666666666665e-06, "loss": -0.0015, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03485388308763504, "mask/share_reasoning": 0.8375639319419861, "mask/share_step_conf": 0.12367593497037888, "num_tokens": 5439298.0, "reward": 0.6970131397247314, "reward_std": 0.16669629514217377, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.5900046825408936, "rewards/format_reward_step": 1.0, "rewards/step_l2_reward": 0.8040215969085693, "step": 23 }, { "adv/mean_abs_final_conf": 0.7947186827659607, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7730585932731628, "adv/std_final_conf": 0.9157706499099731, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354721307754517, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5589953271028038, "calib/avg_num_step_conf": 6.30859375, "calib/ece": 0.5504780876494024, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.003018561786085283, "calib/mean_conf": 0.976772908366534, "calib/mu_c": 0.9785046728971963, "calib/mu_w": 0.975486111111111, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.5504780876494024, "calib/std_conf": 0.013071372536486728, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5927873563218391, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.01646526709441798, "calib/step_q_w": 0.5763220892274211, "calib/step_q_w_n": 919.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2747.0, "completions/max_terminated_length": 2747.0, "completions/mean_length": 578.1171875, "completions/mean_terminated_length": 582.6693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.0256, "grad_norm": 0.024851465597748756, "kl": 0.027318954467773438, "learning_rate": 4.888888888888889e-06, "loss": -0.0613, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030539821833372116, "mask/share_reasoning": 0.8392899036407471, "mask/share_step_conf": 0.1223578006029129, "num_tokens": 5691808.0, "reward": 0.6098623275756836, "reward_std": 0.20302972197532654, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.4448503851890564, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.774874210357666, "step": 24 }, { "adv/mean_abs_final_conf": 0.6968058347702026, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7300499677658081, "adv/std_final_conf": 0.8484832644462585, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348785281181335, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5112973760932945, "calib/avg_num_step_conf": 5.7734375, "calib/ece": 0.3676587301587303, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0004916512059368605, "calib/mean_conf": 0.9787698412698415, "calib/mu_c": 0.9789610389610389, "calib/mu_w": 0.9784693877551021, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3676587301587303, "calib/std_conf": 0.012711488803198966, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5837976190476191, "calib/step_q_c_n": 840.0, "calib/step_q_gap": 0.009502948201224104, "calib/step_q_w": 0.574294670846395, "calib/step_q_w_n": 638.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 515.44921875, "completions/mean_terminated_length": 517.4706420898438, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.02666666666666667, "grad_norm": 0.02434094063937664, "kl": 0.033573150634765625, "learning_rate": 4.861111111111111e-06, "loss": -0.0468, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03205115348100662, "mask/share_reasoning": 0.8405336141586304, "mask/share_step_conf": 0.12350897490978241, "num_tokens": 5926987.0, "reward": 0.7052030563354492, "reward_std": 0.15304076671600342, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6174441576004028, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7929619550704956, "step": 25 }, { "adv/mean_abs_final_conf": 0.701289176940918, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7613296508789062, "adv/std_final_conf": 0.8713129758834839, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348834753036499, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.4972997299729973, "calib/avg_num_step_conf": 5.22265625, "calib/ece": 0.36984313725490203, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.009512665552269817, "calib/mean_conf": 0.973764705882353, "calib/mu_c": 0.9775324675324677, "calib/mu_w": 0.9680198019801979, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36984313725490203, "calib/std_conf": 0.062295497031516586, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.62498687664042, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.0509868766404199, "calib/step_q_w": 0.5740000000000001, "calib/step_q_w_n": 575.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2347.0, "completions/max_terminated_length": 2347.0, "completions/mean_length": 508.30859375, "completions/mean_terminated_length": 508.30859375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.027733333333333332, "grad_norm": 73.61032104492188, "kl": 64.53376007080078, "learning_rate": 4.833333333333333e-06, "loss": 1.4161, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.030653975903987885, "mask/share_reasoning": 0.857771635055542, "mask/share_step_conf": 0.11157442629337311, "num_tokens": 6162354.0, "reward": 0.7190060615539551, "reward_std": 0.17579255998134613, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.6222456693649292, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8157663345336914, "step": 26 }, { "adv/mean_abs_final_conf": 0.7280699610710144, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7500325441360474, "adv/std_final_conf": 0.878308117389679, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935150682926178, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5284929950776222, "calib/avg_num_step_conf": 6.078125, "calib/ece": 0.4282608695652175, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0016628802221377859, "calib/mean_conf": 0.9776679841897234, "calib/mu_c": 0.9784172661870502, "calib/mu_w": 0.9767543859649124, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4282608695652175, "calib/std_conf": 0.012842018279301592, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5972738853503184, "calib/step_q_c_n": 785.0, "calib/step_q_gap": 0.036378943716077194, "calib/step_q_w": 0.5608949416342413, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2678.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 519.99609375, "completions/mean_terminated_length": 519.99609375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.0288, "grad_norm": 0.027362557128071785, "kl": 0.040985107421875, "learning_rate": 4.805555555555556e-06, "loss": -0.0143, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032479990273714066, "mask/share_reasoning": 0.8387442827224731, "mask/share_step_conf": 0.1287757158279419, "num_tokens": 6400689.0, "reward": 0.6784756779670715, "reward_std": 0.18861247599124908, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5627851486206055, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.7941662073135376, "step": 27 }, { "adv/mean_abs_final_conf": 0.692762017250061, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7368833422660828, "adv/std_final_conf": 0.8757468461990356, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355268478393555, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5933370427041313, "calib/avg_num_step_conf": 5.3203125, "calib/ece": 0.3396385542168675, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9879518072289156, "calib/gap": 0.0077945472249270065, "calib/mean_conf": 0.9741767068273092, "calib/mu_c": 0.9770253164556962, "calib/mu_w": 0.9692307692307692, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3396385542168675, "calib/std_conf": 0.021458489384508266, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6009316037735849, "calib/step_q_c_n": 848.0, "calib/step_q_gap": 0.0021378294545186804, "calib/step_q_w": 0.5987937743190662, "calib/step_q_w_n": 514.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 549.1171875, "completions/mean_terminated_length": 560.0557861328125, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.029866666666666666, "grad_norm": 0.044755179435014725, "kl": 0.041538238525390625, "learning_rate": 4.777777777777778e-06, "loss": -0.1011, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03004935197532177, "mask/share_reasoning": 0.8406248688697815, "mask/share_step_conf": 0.10979451984167099, "num_tokens": 6648207.0, "reward": 0.7052626609802246, "reward_std": 0.18966689705848694, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6301574110984802, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7803678512573242, "step": 28 }, { "adv/mean_abs_final_conf": 0.7540074586868286, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7608869671821594, "adv/std_final_conf": 0.8825486898422241, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354591965675354, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5462503183091417, "calib/avg_num_step_conf": 5.76953125, "calib/ece": 0.5014741035856575, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.9960159362549801, "calib/gap": 0.009847211611917595, "calib/mean_conf": 0.975577689243028, "calib/mu_c": 0.9807563025210084, "calib/mu_w": 0.9709090909090908, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.5014741035856575, "calib/std_conf": 0.06301809121954761, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5873322932917316, "calib/step_q_c_n": 641.0, "calib/step_q_gap": 0.027404063626659814, "calib/step_q_w": 0.5599282296650718, "calib/step_q_w_n": 836.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2605.0, "completions/max_terminated_length": 2605.0, "completions/mean_length": 580.1484375, "completions/mean_terminated_length": 582.423583984375, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.030933333333333334, "grad_norm": 0.02169017121195793, "kl": 0.042827606201171875, "learning_rate": 4.75e-06, "loss": -0.0789, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.028678342700004578, "mask/share_reasoning": 0.8571142554283142, "mask/share_step_conf": 0.1103011816740036, "num_tokens": 6903853.0, "reward": 0.6350164413452148, "reward_std": 0.1887853443622589, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.49021053314208984, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7798223495483398, "step": 29 }, { "adv/mean_abs_final_conf": 0.7241463661193848, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7513007521629333, "adv/std_final_conf": 0.8834677934646606, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353786706924438, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.5153054313964514, "calib/avg_num_step_conf": 6.20703125, "calib/ece": 0.4110612244897961, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9836734693877551, "calib/gap": 0.009838141676824952, "calib/mean_conf": 0.9743265306122451, "calib/mu_c": 0.978623188405797, "calib/mu_w": 0.968785046728972, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4110612244897961, "calib/std_conf": 0.0643147197278946, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5532292917166867, "calib/step_q_c_n": 833.0, "calib/step_q_gap": 0.019089080076474918, "calib/step_q_w": 0.5341402116402117, "calib/step_q_w_n": 756.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 615.69140625, "completions/mean_terminated_length": 622.9921264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.032, "grad_norm": 0.034742482006549835, "kl": 0.05080413818359375, "learning_rate": 4.722222222222222e-06, "loss": -0.0564, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.028462983667850494, "mask/share_reasoning": 0.8461865186691284, "mask/share_step_conf": 0.11363177001476288, "num_tokens": 7168454.0, "reward": 0.6671479344367981, "reward_std": 0.20367759466171265, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.560566782951355, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.7737290859222412, "step": 30 }, { "adv/mean_abs_final_conf": 0.7788076996803284, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7677104473114014, "adv/std_final_conf": 0.9012274742126465, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355896711349487, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5034076015727391, "calib/avg_num_step_conf": 6.3046875, "calib/ece": 0.5326104417670683, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9839357429718876, "calib/gap": 0.014532110091743156, "calib/mean_conf": 0.9703614457831325, "calib/mu_c": 0.978532110091743, "calib/mu_w": 0.9639999999999999, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.5326104417670683, "calib/std_conf": 0.0843321857283264, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5646365105008078, "calib/step_q_c_n": 619.0, "calib/step_q_gap": 0.0229179175359836, "calib/step_q_w": 0.5417185929648242, "calib/step_q_w_n": 995.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 607.578125, "completions/mean_terminated_length": 607.578125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.03306666666666667, "grad_norm": 0.02423461340367794, "kl": 0.048610687255859375, "learning_rate": 4.694444444444445e-06, "loss": -0.0788, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.02809649333357811, "mask/share_reasoning": 0.8581934571266174, "mask/share_step_conf": 0.11371007561683655, "num_tokens": 7429906.0, "reward": 0.6151062250137329, "reward_std": 0.20114478468894958, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.4573843777179718, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7728281021118164, "step": 31 }, { "adv/mean_abs_final_conf": 0.7334926128387451, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7424031496047974, "adv/std_final_conf": 0.8949069976806641, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356457591056824, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5813483288052064, "calib/avg_num_step_conf": 5.86328125, "calib/ece": 0.49710317460317455, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9722222222222222, "calib/gap": 0.017518796992481378, "calib/mean_conf": 0.9693253968253969, "calib/mu_c": 0.9785714285714286, "calib/mu_w": 0.9610526315789473, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.49710317460317455, "calib/std_conf": 0.07284429244836445, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5945141065830721, "calib/step_q_c_n": 638.0, "calib/step_q_gap": 0.09094516104425399, "calib/step_q_w": 0.5035689455388181, "calib/step_q_w_n": 863.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2739.0, "completions/max_terminated_length": 2739.0, "completions/mean_length": 579.6484375, "completions/mean_terminated_length": 579.6484375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.034133333333333335, "grad_norm": 0.02419840358197689, "kl": 0.06121063232421875, "learning_rate": 4.666666666666667e-06, "loss": -0.0403, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0297107957303524, "mask/share_reasoning": 0.8531237840652466, "mask/share_step_conf": 0.11716543138027191, "num_tokens": 7685000.0, "reward": 0.6433032751083374, "reward_std": 0.17889225482940674, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.4952601194381714, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7913463115692139, "step": 32 }, { "adv/mean_abs_final_conf": 0.7403093576431274, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7863280773162842, "adv/std_final_conf": 0.8979626893997192, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354382753372192, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5144976606747106, "calib/avg_num_step_conf": 5.63671875, "calib/ece": 0.46631372549019623, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9882352941176471, "calib/gap": -0.005614996306328446, "calib/mean_conf": 0.9721960784313727, "calib/mu_c": 0.9694656488549619, "calib/mu_w": 0.9750806451612903, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.46239215686274526, "calib/std_conf": 0.0633808963634705, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5588917861799217, "calib/step_q_c_n": 767.0, "calib/step_q_gap": -0.024762059973924333, "calib/step_q_w": 0.5836538461538461, "calib/step_q_w_n": 676.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 530.28125, "completions/mean_terminated_length": 532.36083984375, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0352, "grad_norm": 0.026419516652822495, "kl": 0.07160186767578125, "learning_rate": 4.638888888888889e-06, "loss": -0.0381, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03006555698812008, "mask/share_reasoning": 0.852064847946167, "mask/share_step_conf": 0.11396338045597076, "num_tokens": 7927624.0, "reward": 0.6580392122268677, "reward_std": 0.17391237616539001, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5310878753662109, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.7849905490875244, "step": 33 }, { "adv/mean_abs_final_conf": 0.7404600977897644, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7718697786331177, "adv/std_final_conf": 0.9089685678482056, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351857304573059, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.48072519083969467, "calib/avg_num_step_conf": 6.2109375, "calib/ece": 0.4719521912350599, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9601593625498008, "calib/gap": -0.03308524173028016, "calib/mean_conf": 0.9585657370517928, "calib/mu_c": 0.9427480916030534, "calib/mu_w": 0.9758333333333336, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4543027888446216, "calib/std_conf": 0.11797568230315518, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.49260292164674635, "calib/step_q_c_n": 753.0, "calib/step_q_gap": -0.03952372112505764, "calib/step_q_w": 0.532126642771804, "calib/step_q_w_n": 837.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 527.76171875, "completions/mean_terminated_length": 527.76171875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.03626666666666667, "grad_norm": 0.01992679014801979, "kl": 0.0793304443359375, "learning_rate": 4.611111111111112e-06, "loss": -0.0357, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032113149762153625, "mask/share_reasoning": 0.8370362520217896, "mask/share_step_conf": 0.13085061311721802, "num_tokens": 8167843.0, "reward": 0.6504442691802979, "reward_std": 0.2038143277168274, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.519045352935791, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7818431854248047, "step": 34 }, { "adv/mean_abs_final_conf": 0.7602913975715637, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7359387278556824, "adv/std_final_conf": 0.9223473072052002, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9359014630317688, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5150700025926885, "calib/avg_num_step_conf": 5.97265625, "calib/ece": 0.43694779116465876, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.9317269076305221, "calib/gap": -0.0009625356494685766, "calib/mean_conf": 0.9481927710843374, "calib/mu_c": 0.9477443609022558, "calib/mu_w": 0.9487068965517244, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.4255020080321287, "calib/std_conf": 0.129554174477126, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5164872139973082, "calib/step_q_c_n": 743.0, "calib/step_q_gap": 0.024947773793745853, "calib/step_q_w": 0.4915394402035624, "calib/step_q_w_n": 786.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2838.0, "completions/max_terminated_length": 2838.0, "completions/mean_length": 609.65234375, "completions/mean_terminated_length": 612.0431518554688, "completions/min_length": 0.0, "completions/min_terminated_length": 216.0, "epoch": 0.037333333333333336, "grad_norm": 0.08677566051483154, "kl": 0.20432281494140625, "learning_rate": 4.583333333333333e-06, "loss": -0.0317, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.027661994099617004, "mask/share_reasoning": 0.8633013963699341, "mask/share_step_conf": 0.10513034462928772, "num_tokens": 8433170.0, "reward": 0.6564799547195435, "reward_std": 0.23788407444953918, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.5431902408599854, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7697696685791016, "step": 35 }, { "adv/mean_abs_final_conf": 0.7184458374977112, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7714102864265442, "adv/std_final_conf": 0.9145676493644714, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353185892105103, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5131339310731298, "calib/avg_num_step_conf": 6.1875, "calib/ece": 0.30407843137254903, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9137254901960784, "calib/gap": -0.0005365648641073584, "calib/mean_conf": 0.9516862745098039, "calib/mu_c": 0.9515116279069767, "calib/mu_w": 0.9520481927710841, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29062745098039217, "calib/std_conf": 0.11270601880270643, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.47419541015625, "calib/step_q_c_n": 1024.0, "calib/step_q_gap": 0.007927553013392818, "calib/step_q_w": 0.4662678571428572, "calib/step_q_w_n": 560.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2181.0, "completions/max_terminated_length": 2181.0, "completions/mean_length": 502.25390625, "completions/mean_terminated_length": 504.22357177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0384, "grad_norm": 0.02859259396791458, "kl": 0.09160614013671875, "learning_rate": 4.555555555555556e-06, "loss": -0.0312, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03455417603254318, "mask/share_reasoning": 0.8255020380020142, "mask/share_step_conf": 0.13603754341602325, "num_tokens": 8664459.0, "reward": 0.7533528804779053, "reward_std": 0.16682732105255127, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.6879905462265015, "rewards/format_reward_step": 0.99609375, "rewards/step_l2_reward": 0.8187150955200195, "step": 36 }, { "adv/mean_abs_final_conf": 0.724345326423645, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.752377986907959, "adv/std_final_conf": 0.9264432787895203, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355803728103638, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.4804136011135415, "calib/avg_num_step_conf": 5.5859375, "calib/ece": 0.53625, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8508064516129032, "calib/gap": -0.027505136872804425, "calib/mean_conf": 0.9165725806451614, "calib/mu_c": 0.9009345794392523, "calib/mu_w": 0.9284397163120567, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.5106854838709678, "calib/std_conf": 0.19489644369333167, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4986245353159851, "calib/step_q_c_n": 538.0, "calib/step_q_gap": 0.06208866087652326, "calib/step_q_w": 0.43653587443946185, "calib/step_q_w_n": 892.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2615.0, "completions/max_terminated_length": 2615.0, "completions/mean_length": 559.3046875, "completions/mean_terminated_length": 563.7086791992188, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.039466666666666664, "grad_norm": 0.03272354602813721, "kl": 0.09531402587890625, "learning_rate": 4.527777777777778e-06, "loss": -0.1342, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.030177973210811615, "mask/share_reasoning": 0.8478702306747437, "mask/share_step_conf": 0.11413928866386414, "num_tokens": 8914737.0, "reward": 0.6202419400215149, "reward_std": 0.1850229799747467, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.4493539035320282, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7911299467086792, "step": 37 }, { "adv/mean_abs_final_conf": 0.7505814433097839, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7393084764480591, "adv/std_final_conf": 0.9262279272079468, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.93584144115448, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5682171044119549, "calib/avg_num_step_conf": 5.70703125, "calib/ece": 0.4629192771084339, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8634538152610441, "calib/gap": 0.05123264329149957, "calib/mean_conf": 0.9118598393574298, "calib/mu_c": 0.9388135593220339, "calib/mu_w": 0.8875809160305344, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.4504417670682732, "calib/std_conf": 0.20639150264249348, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.493696682464455, "calib/step_q_c_n": 633.0, "calib/step_q_gap": 0.006098977150445273, "calib/step_q_w": 0.4875977053140097, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2836.0, "completions/max_terminated_length": 2836.0, "completions/mean_length": 550.43359375, "completions/mean_terminated_length": 550.43359375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.04053333333333333, "grad_norm": 0.029129181057214737, "kl": 0.09613037109375, "learning_rate": 4.5e-06, "loss": -0.0535, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.031951624900102615, "mask/share_reasoning": 0.8504153490066528, "mask/share_step_conf": 0.11763307452201843, "num_tokens": 9162536.0, "reward": 0.6530201435089111, "reward_std": 0.2201344519853592, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.5270024538040161, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7790378332138062, "step": 38 }, { "adv/mean_abs_final_conf": 0.7539465427398682, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.761415958404541, "adv/std_final_conf": 0.9165918231010437, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355933666229248, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5276138003410731, "calib/avg_num_step_conf": 5.8046875, "calib/ece": 0.45404453441295556, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8421052631578947, "calib/gap": 0.03396035681490217, "calib/mean_conf": 0.9127570850202429, "calib/mu_c": 0.930080991735537, "calib/mu_w": 0.8961206349206349, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.43846153846153857, "calib/std_conf": 0.197062096967272, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.522846, "calib/step_q_c_n": 600.0, "calib/step_q_gap": 0.054166541760722386, "calib/step_q_w": 0.46867945823927765, "calib/step_q_w_n": 886.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 561.53125, "completions/mean_terminated_length": 568.1897583007812, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.0416, "grad_norm": 0.031767114996910095, "kl": 0.0828094482421875, "learning_rate": 4.472222222222223e-06, "loss": -0.0377, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.030686449259519577, "mask/share_reasoning": 0.842280924320221, "mask/share_step_conf": 0.11531387269496918, "num_tokens": 9412376.0, "reward": 0.6522339582443237, "reward_std": 0.20904475450515747, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.5222196578979492, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7822482585906982, "step": 39 }, { "adv/mean_abs_final_conf": 0.7848721742630005, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7495676279067993, "adv/std_final_conf": 0.9360496997833252, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935839056968689, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5932433305342032, "calib/avg_num_step_conf": 5.140625, "calib/ece": 0.4544000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.78, "calib/gap": 0.06007880627866424, "calib/mean_conf": 0.89336, "calib/mu_c": 0.926283185840708, "calib/mu_w": 0.8662043795620438, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.44788000000000006, "calib/std_conf": 0.2023756665214472, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5128762430939227, "calib/step_q_c_n": 543.0, "calib/step_q_gap": 0.039638080092629036, "calib/step_q_w": 0.4732381630012936, "calib/step_q_w_n": 773.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2302.0, "completions/max_terminated_length": 2302.0, "completions/mean_length": 548.6015625, "completions/mean_terminated_length": 550.7529907226562, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.042666666666666665, "grad_norm": 0.030587203800678253, "kl": 0.096893310546875, "learning_rate": 4.444444444444444e-06, "loss": -0.0833, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.030954282730817795, "mask/share_reasoning": 0.8567135334014893, "mask/share_step_conf": 0.10842593014240265, "num_tokens": 9659578.0, "reward": 0.6597231030464172, "reward_std": 0.23681029677391052, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.5329078435897827, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.7865383625030518, "step": 40 }, { "adv/mean_abs_final_conf": 0.770974338054657, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7481893301010132, "adv/std_final_conf": 0.9226452708244324, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354609251022339, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5879452458051222, "calib/avg_num_step_conf": 5.359375, "calib/ece": 0.2507525896414342, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6812749003984063, "calib/gap": 0.08494085222254932, "calib/mean_conf": 0.8374545816733069, "calib/mu_c": 0.8641889534883721, "calib/mu_w": 0.7792481012658228, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20147410358565732, "calib/std_conf": 0.2666757460938386, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.491091425389755, "calib/step_q_c_n": 898.0, "calib/step_q_gap": 0.041407881085957554, "calib/step_q_w": 0.4496835443037974, "calib/step_q_w_n": 474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 494.796875, "completions/mean_terminated_length": 496.7372741699219, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.04373333333333333, "grad_norm": 0.02226729691028595, "kl": 0.100555419921875, "learning_rate": 4.416666666666667e-06, "loss": -0.048, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033622778952121735, "mask/share_reasoning": 0.8409379720687866, "mask/share_step_conf": 0.12153302878141403, "num_tokens": 9893494.0, "reward": 0.7606412172317505, "reward_std": 0.19762994349002838, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7124884128570557, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.8087941408157349, "step": 41 }, { "adv/mean_abs_final_conf": 0.7752968072891235, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.755486249923706, "adv/std_final_conf": 0.9295274019241333, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355602264404297, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5258946572580645, "calib/avg_num_step_conf": 5.93359375, "calib/ece": 0.38066547619047625, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6309523809523809, "calib/gap": 0.04844412802419351, "calib/mean_conf": 0.8299694444444443, "calib/mu_c": 0.85380703125, "calib/mu_w": 0.8053629032258065, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3513492063492064, "calib/std_conf": 0.25918747776435896, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4970636734693878, "calib/step_q_c_n": 735.0, "calib/step_q_gap": -0.0015517602040815426, "calib/step_q_w": 0.49861543367346933, "calib/step_q_w_n": 784.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2375.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 471.75, "completions/mean_terminated_length": 471.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.0448, "grad_norm": 0.0334896519780159, "kl": 0.1055450439453125, "learning_rate": 4.388888888888889e-06, "loss": 0.018, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03502010926604271, "mask/share_reasoning": 0.8282299041748047, "mask/share_step_conf": 0.1367499828338623, "num_tokens": 10118630.0, "reward": 0.6918591856956482, "reward_std": 0.21222494542598724, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5937366485595703, "rewards/format_reward_step": 0.98046875, "rewards/step_l2_reward": 0.7899817824363708, "step": 42 }, { "adv/mean_abs_final_conf": 0.8061068058013916, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7791420817375183, "adv/std_final_conf": 0.9365590214729309, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353040456771851, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5696798493408664, "calib/avg_num_step_conf": 5.34765625, "calib/ece": 0.32089328063241107, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5019762845849802, "calib/gap": 0.07902419962335205, "calib/mean_conf": 0.744798418972332, "calib/mu_c": 0.7816555555555554, "calib/mu_w": 0.7026313559322034, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2660474308300395, "calib/std_conf": 0.31904407131952894, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4950938858695652, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.012943648902740579, "calib/step_q_w": 0.48215023696682463, "calib/step_q_w_n": 633.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 536.25, "completions/mean_terminated_length": 536.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.04586666666666667, "grad_norm": 0.028792526572942734, "kl": 0.0963134765625, "learning_rate": 4.361111111111112e-06, "loss": -0.0078, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03177189826965332, "mask/share_reasoning": 0.851236879825592, "mask/share_step_conf": 0.11699119210243225, "num_tokens": 10361134.0, "reward": 0.715941309928894, "reward_std": 0.21710166335105896, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6365195512771606, "rewards/format_reward_step": 0.98828125, "rewards/step_l2_reward": 0.7953630685806274, "step": 43 }, { "adv/mean_abs_final_conf": 0.8082307577133179, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7397419214248657, "adv/std_final_conf": 0.9357187151908875, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356873631477356, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5607762017336485, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.3218108433734941, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.41767068273092367, "calib/gap": 0.09593624901497244, "calib/mean_conf": 0.6958598393574298, "calib/mu_c": 0.7501851851851852, "calib/mu_w": 0.6542489361702127, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.29196787148594383, "calib/std_conf": 0.3225989883784285, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4841393264840182, "calib/step_q_c_n": 584.0, "calib/step_q_gap": 0.02785838453440148, "calib/step_q_w": 0.4562809419496167, "calib/step_q_w_n": 913.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2823.0, "completions/max_terminated_length": 2823.0, "completions/mean_length": 581.10546875, "completions/mean_terminated_length": 583.3843383789062, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.046933333333333334, "grad_norm": 0.03097447007894516, "kl": 0.0913848876953125, "learning_rate": 4.333333333333334e-06, "loss": -0.0729, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0295084398239851, "mask/share_reasoning": 0.8522259593009949, "mask/share_step_conf": 0.11435935646295547, "num_tokens": 10616217.0, "reward": 0.6976090669631958, "reward_std": 0.22840039432048798, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.6077798008918762, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7874383926391602, "step": 44 }, { "adv/mean_abs_final_conf": 0.806146502494812, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.773840606212616, "adv/std_final_conf": 0.9366689920425415, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935440719127655, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.555874840357599, "calib/avg_num_step_conf": 5.625, "calib/ece": 0.28786334661354573, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.32270916334661354, "calib/gap": 0.06492851213282225, "calib/mean_conf": 0.606080876494024, "calib/mu_c": 0.6410025862068964, "calib/mu_w": 0.5760740740740742, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21589641434262938, "calib/std_conf": 0.33849399559315413, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.49986407766990293, "calib/step_q_c_n": 618.0, "calib/step_q_gap": 0.028646072803722944, "calib/step_q_w": 0.47121800486618, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2510.0, "completions/max_terminated_length": 2510.0, "completions/mean_length": 525.578125, "completions/mean_terminated_length": 527.6392211914062, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.048, "grad_norm": 0.027358626946806908, "kl": 0.101593017578125, "learning_rate": 4.305555555555556e-06, "loss": -0.0421, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03319074958562851, "mask/share_reasoning": 0.8404501080513, "mask/share_step_conf": 0.12245286256074905, "num_tokens": 10855813.0, "reward": 0.7107663750648499, "reward_std": 0.21518194675445557, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6325352787971497, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.7889974117279053, "step": 45 }, { "adv/mean_abs_final_conf": 0.8161130547523499, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.78526771068573, "adv/std_final_conf": 0.9366722702980042, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356869459152222, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5504032258064516, "calib/avg_num_step_conf": 6.5234375, "calib/ece": 0.2506731707317073, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.22764227642276422, "calib/gap": 0.06345403225806456, "calib/mean_conf": 0.5604650406504067, "calib/mu_c": 0.59245, "calib/mu_w": 0.5289959677419355, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.15760162601626018, "calib/std_conf": 0.3357974991030459, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.46490219941348976, "calib/step_q_c_n": 682.0, "calib/step_q_gap": 0.09222669334061528, "calib/step_q_w": 0.3726755060728745, "calib/step_q_w_n": 988.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 579.35546875, "completions/mean_terminated_length": 581.6275024414062, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.04906666666666667, "grad_norm": 0.023970698937773705, "kl": 0.090087890625, "learning_rate": 4.277777777777778e-06, "loss": -0.0945, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03279152512550354, "mask/share_reasoning": 0.8372425436973572, "mask/share_step_conf": 0.1260596662759781, "num_tokens": 11108896.0, "reward": 0.7093468904495239, "reward_std": 0.2259354442358017, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6335732936859131, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7851204872131348, "step": 46 }, { "adv/mean_abs_final_conf": 0.8137375116348267, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7501428127288818, "adv/std_final_conf": 0.9365950226783752, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354432821273804, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5766758186566664, "calib/avg_num_step_conf": 5.625, "calib/ece": 0.24800772357723574, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.14634146341463414, "calib/gap": 0.08106213754771313, "calib/mean_conf": 0.5145939024390244, "calib/mu_c": 0.5505116788321168, "calib/mu_w": 0.4694495412844037, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.10284552845528454, "calib/std_conf": 0.3287003753442299, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.44728381962864716, "calib/step_q_c_n": 754.0, "calib/step_q_gap": 0.04672740563447808, "calib/step_q_w": 0.4005564139941691, "calib/step_q_w_n": 686.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 560.16796875, "completions/mean_terminated_length": 562.36474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 63.0, "epoch": 0.050133333333333335, "grad_norm": 0.02703959122300148, "kl": 0.1092376708984375, "learning_rate": 4.25e-06, "loss": -0.1382, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.030245723202824593, "mask/share_reasoning": 0.8493208885192871, "mask/share_step_conf": 0.11652719229459763, "num_tokens": 11358275.0, "reward": 0.722832441329956, "reward_std": 0.20725570619106293, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6541945338249207, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7914702892303467, "step": 47 }, { "adv/mean_abs_final_conf": 0.8032364845275879, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7610890865325928, "adv/std_final_conf": 0.9366337656974792, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935653805732727, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.552454916414374, "calib/avg_num_step_conf": 5.15234375, "calib/ece": 0.243355421686747, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.09236947791164658, "calib/gap": 0.05796554561010914, "calib/mean_conf": 0.46853212851405623, "calib/mu_c": 0.5015887850467289, "calib/mu_w": 0.44362323943661974, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.14108433734939763, "calib/std_conf": 0.3066094013429056, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4177630975143403, "calib/step_q_c_n": 523.0, "calib/step_q_gap": 0.03488998193645093, "calib/step_q_w": 0.3828731155778894, "calib/step_q_w_n": 796.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2527.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 504.65234375, "completions/mean_terminated_length": 504.65234375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.0512, "grad_norm": 0.02584809623658657, "kl": 0.1268463134765625, "learning_rate": 4.222222222222223e-06, "loss": -0.0833, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03356067091226578, "mask/share_reasoning": 0.8469815254211426, "mask/share_step_conf": 0.11945784091949463, "num_tokens": 11591154.0, "reward": 0.7318041324615479, "reward_std": 0.21007341146469116, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.6651387214660645, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7984695434570312, "step": 48 }, { "adv/mean_abs_final_conf": 0.7968345284461975, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7676880955696106, "adv/std_final_conf": 0.9365724325180054, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351038932800293, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6370830608240681, "calib/avg_num_step_conf": 5.69921875, "calib/ece": 0.18111686746987948, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.07630522088353414, "calib/gap": 0.14417363636363645, "calib/mean_conf": 0.4253088353413655, "calib/mu_c": 0.48900000000000005, "calib/mu_w": 0.3448263636363636, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.02409638554216871, "calib/std_conf": 0.2932193968704206, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.40479230769230773, "calib/step_q_c_n": 793.0, "calib/step_q_gap": 0.0026721875721875676, "calib/step_q_w": 0.40212012012012016, "calib/step_q_w_n": 666.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 515.71875, "completions/mean_terminated_length": 517.7412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.05226666666666667, "grad_norm": 0.030575614422559738, "kl": 0.1236572265625, "learning_rate": 4.194444444444445e-06, "loss": -0.0208, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032488685101270676, "mask/share_reasoning": 0.8429965376853943, "mask/share_step_conf": 0.12060855329036713, "num_tokens": 11827714.0, "reward": 0.7448811531066895, "reward_std": 0.1802704930305481, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6911728978157043, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7985894083976746, "step": 49 }, { "adv/mean_abs_final_conf": 0.7772702574729919, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.742446780204773, "adv/std_final_conf": 0.9365054965019226, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935427188873291, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5517457139281692, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.2025691699604743, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.05533596837944664, "calib/gap": 0.04930421724440004, "calib/mean_conf": 0.42454545454545456, "calib/mu_c": 0.44832061068702295, "calib/mu_w": 0.3990163934426229, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.05466403162055336, "calib/std_conf": 0.26587370123610027, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.40103092783505156, "calib/step_q_c_n": 679.0, "calib/step_q_gap": 0.04098436402018485, "calib/step_q_w": 0.3600465638148667, "calib/step_q_w_n": 713.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 522.71875, "completions/mean_terminated_length": 522.71875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.05333333333333334, "grad_norm": 0.02779863029718399, "kl": 0.1237640380859375, "learning_rate": 4.166666666666667e-06, "loss": -0.1123, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03340587019920349, "mask/share_reasoning": 0.8507226705551147, "mask/share_step_conf": 0.11587147414684296, "num_tokens": 12066890.0, "reward": 0.7342731952667236, "reward_std": 0.19280801713466644, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6673511862754822, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8011952638626099, "step": 50 }, { "adv/mean_abs_final_conf": 0.7935712337493896, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7670265436172485, "adv/std_final_conf": 0.9364770650863647, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352994561195374, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6248664886515354, "calib/avg_num_step_conf": 4.90625, "calib/ece": 0.1584615384615385, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.044534412955465584, "calib/gap": 0.11627236315086775, "calib/mean_conf": 0.4502024291497976, "calib/mu_c": 0.5005714285714286, "calib/mu_w": 0.3842990654205608, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.020931174089068832, "calib/std_conf": 0.2637451373655593, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.40795526315789477, "calib/step_q_c_n": 684.0, "calib/step_q_gap": 0.030193025395657047, "calib/step_q_w": 0.3777622377622377, "calib/step_q_w_n": 572.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 503.4765625, "completions/mean_terminated_length": 509.4466552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.0544, "grad_norm": 0.03294004499912262, "kl": 0.1114349365234375, "learning_rate": 4.138888888888889e-06, "loss": -0.1588, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03207635134458542, "mask/share_reasoning": 0.8514719605445862, "mask/share_step_conf": 0.104732945561409, "num_tokens": 12305076.0, "reward": 0.7408157587051392, "reward_std": 0.18963393568992615, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6903417706489563, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7912896871566772, "step": 51 }, { "adv/mean_abs_final_conf": 0.7684762477874756, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7616976499557495, "adv/std_final_conf": 0.9363729357719421, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348715543746948, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6600189933523266, "calib/avg_num_step_conf": 4.9296875, "calib/ece": 0.21298418972332014, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.04743083003952569, "calib/gap": 0.12831908831908823, "calib/mean_conf": 0.4525494071146246, "calib/mu_c": 0.49870370370370365, "calib/mu_w": 0.3703846153846154, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.01260869565217391, "calib/std_conf": 0.24637194023352396, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.43677819083023545, "calib/step_q_c_n": 807.0, "calib/step_q_gap": 0.08304192709397173, "calib/step_q_w": 0.3537362637362637, "calib/step_q_w_n": 455.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1894.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 487.19921875, "completions/mean_terminated_length": 489.1098327636719, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.055466666666666664, "grad_norm": 0.036368850618600845, "kl": 0.1165313720703125, "learning_rate": 4.111111111111111e-06, "loss": -0.0632, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.034070298075675964, "mask/share_reasoning": 0.850233256816864, "mask/share_step_conf": 0.11179022490978241, "num_tokens": 12537751.0, "reward": 0.7687431573867798, "reward_std": 0.1700069010257721, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7088756561279297, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8286106586456299, "step": 52 }, { "adv/mean_abs_final_conf": 0.7876460552215576, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7616404294967651, "adv/std_final_conf": 0.9364662766456604, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351630806922913, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5511803011803011, "calib/avg_num_step_conf": 5.45703125, "calib/ece": 0.19147470355731222, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.07905138339920949, "calib/gap": 0.03547876136209471, "calib/mean_conf": 0.5397505928853755, "calib/mu_c": 0.5525117283950617, "calib/mu_w": 0.517032967032967, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.045454545454545456, "calib/std_conf": 0.2577434600437044, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.45877871825876665, "calib/step_q_c_n": 827.0, "calib/step_q_gap": 0.06298222703069645, "calib/step_q_w": 0.3957964912280702, "calib/step_q_w_n": 570.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 536.9609375, "completions/mean_terminated_length": 541.18896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.05653333333333333, "grad_norm": 0.04446394741535187, "kl": 0.101318359375, "learning_rate": 4.083333333333334e-06, "loss": -0.114, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030670396983623505, "mask/share_reasoning": 0.8516263961791992, "mask/share_step_conf": 0.10989070683717728, "num_tokens": 12781037.0, "reward": 0.7480237483978271, "reward_std": 0.17704400420188904, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6884797215461731, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8075677752494812, "step": 53 }, { "adv/mean_abs_final_conf": 0.7647438049316406, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7672848701477051, "adv/std_final_conf": 0.9361473917961121, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348371624946594, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.646909903201787, "calib/avg_num_step_conf": 4.93359375, "calib/ece": 0.11072289156626511, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.10040160642570281, "calib/gap": 0.11389873417721519, "calib/mean_conf": 0.6158634538152611, "calib/mu_c": 0.652, "calib/mu_w": 0.5381012658227848, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.021927710843373513, "calib/std_conf": 0.22712662425064317, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4767085427135679, "calib/step_q_c_n": 796.0, "calib/step_q_gap": 0.05126957055082698, "calib/step_q_w": 0.4254389721627409, "calib/step_q_w_n": 467.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2736.0, "completions/max_terminated_length": 2736.0, "completions/mean_length": 480.26953125, "completions/mean_terminated_length": 482.1529846191406, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.0576, "grad_norm": 0.04478487744927406, "kl": 0.1110992431640625, "learning_rate": 4.055555555555556e-06, "loss": -0.1372, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03677823022007942, "mask/share_reasoning": 0.8410859107971191, "mask/share_step_conf": 0.11822962015867233, "num_tokens": 13010218.0, "reward": 0.7791956663131714, "reward_std": 0.15325689315795898, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7515324354171753, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8068588972091675, "step": 54 }, { "adv/mean_abs_final_conf": 0.8050023317337036, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7768208980560303, "adv/std_final_conf": 0.9359601140022278, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349936246871948, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6821805006587616, "calib/avg_num_step_conf": 4.4765625, "calib/ece": 0.17230769230769225, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.12550607287449392, "calib/gap": 0.1376495388669302, "calib/mean_conf": 0.6965182186234817, "calib/mu_c": 0.7606060606060606, "calib/mu_w": 0.6229565217391304, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.16720647773279343, "calib/std_conf": 0.21997244645283676, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5090816326530613, "calib/step_q_c_n": 588.0, "calib/step_q_gap": 0.048275181040158044, "calib/step_q_w": 0.46080645161290323, "calib/step_q_w_n": 558.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2606.0, "completions/max_terminated_length": 2606.0, "completions/mean_length": 494.07421875, "completions/mean_terminated_length": 496.01177978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.058666666666666666, "grad_norm": 0.02491808868944645, "kl": 0.1061553955078125, "learning_rate": 4.027777777777779e-06, "loss": -0.0768, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.033575639128685, "mask/share_reasoning": 0.8579612970352173, "mask/share_step_conf": 0.10455679893493652, "num_tokens": 13244525.0, "reward": 0.7591003775596619, "reward_std": 0.17801684141159058, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7125382423400879, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8056625127792358, "step": 55 }, { "adv/mean_abs_final_conf": 0.7873808145523071, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7654106020927429, "adv/std_final_conf": 0.9311465620994568, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346365928649902, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6846971955019645, "calib/avg_num_step_conf": 5.265625, "calib/ece": 0.29641975308641977, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.32510288065843623, "calib/gap": 0.10250237095244541, "calib/mean_conf": 0.7961728395061728, "calib/mu_c": 0.8472131147540983, "calib/mu_w": 0.7447107438016529, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.2952674897119342, "calib/std_conf": 0.17294282626148935, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.538655589123867, "calib/step_q_c_n": 662.0, "calib/step_q_gap": 0.07869203227255511, "calib/step_q_w": 0.4599635568513119, "calib/step_q_w_n": 686.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2482.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 525.75390625, "completions/mean_terminated_length": 525.75390625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.05973333333333333, "grad_norm": 0.02171236462891102, "kl": 0.0927276611328125, "learning_rate": 4.000000000000001e-06, "loss": -0.0945, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03197760134935379, "mask/share_reasoning": 0.8525564670562744, "mask/share_step_conf": 0.11546589434146881, "num_tokens": 13485958.0, "reward": 0.7075331211090088, "reward_std": 0.17797213792800903, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6334031224250793, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.7816630601882935, "step": 56 }, { "adv/mean_abs_final_conf": 0.7610681653022766, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7601795196533203, "adv/std_final_conf": 0.9341049194335938, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350772500038147, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6245230662504336, "calib/avg_num_step_conf": 5.09375, "calib/ece": 0.2414516129032258, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.4717741935483871, "calib/gap": 0.058666666666666534, "calib/mean_conf": 0.8558064516129031, "calib/mu_c": 0.8778064516129033, "calib/mu_w": 0.8191397849462367, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.23612903225806453, "calib/std_conf": 0.1289886014856173, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5579113924050633, "calib/step_q_c_n": 790.0, "calib/step_q_gap": 0.028339407969265595, "calib/step_q_w": 0.5295719844357977, "calib/step_q_w_n": 514.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 511.91796875, "completions/mean_terminated_length": 513.925537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.0608, "grad_norm": 0.021534863859415054, "kl": 0.09047698974609375, "learning_rate": 3.972222222222223e-06, "loss": -0.1495, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.033345114439725876, "mask/share_reasoning": 0.8466142416000366, "mask/share_step_conf": 0.11613436043262482, "num_tokens": 13723801.0, "reward": 0.729364812374115, "reward_std": 0.2116679847240448, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.674622654914856, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.784106969833374, "step": 57 }, { "adv/mean_abs_final_conf": 0.7951971292495728, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7578577995300293, "adv/std_final_conf": 0.9347510933876038, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353342652320862, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.5490826302397214, "calib/avg_num_step_conf": 5.57421875, "calib/ece": 0.37469387755102046, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.46122448979591835, "calib/gap": 0.03215481451720903, "calib/mean_conf": 0.8400000000000001, "calib/mu_c": 0.8571929824561404, "calib/mu_w": 0.8250381679389314, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.37469387755102046, "calib/std_conf": 0.14832396974191325, "calib/step_conf_rate": 0.9296875, "calib/step_q_c": 0.5785714285714286, "calib/step_q_c_n": 609.0, "calib/step_q_gap": 0.04798463150541399, "calib/step_q_w": 0.5305867970660146, "calib/step_q_w_n": 818.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2742.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 592.01953125, "completions/mean_terminated_length": 594.3411865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.06186666666666667, "grad_norm": 0.02118760719895363, "kl": 0.0829010009765625, "learning_rate": 3.944444444444445e-06, "loss": -0.2503, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.02853698655962944, "mask/share_reasoning": 0.8620185852050781, "mask/share_step_conf": 0.10553818941116333, "num_tokens": 13981678.0, "reward": 0.653290867805481, "reward_std": 0.22565579414367676, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.5532636642456055, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7533180713653564, "step": 58 }, { "adv/mean_abs_final_conf": 0.7879300117492676, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7936879396438599, "adv/std_final_conf": 0.9314253926277161, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934874951839447, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.590799339179057, "calib/avg_num_step_conf": 5.6640625, "calib/ece": 0.37725099601593626, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5776892430278885, "calib/gap": 0.03818528402592447, "calib/mean_conf": 0.8830677290836653, "calib/mu_c": 0.9016279069767442, "calib/mu_w": 0.8634426229508197, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.3731872509960159, "calib/std_conf": 0.11880778692277857, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.6050219619326501, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.04757737262365391, "calib/step_q_w": 0.5574445893089962, "calib/step_q_w_n": 767.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2718.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 545.60546875, "completions/mean_terminated_length": 545.60546875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.06293333333333333, "grad_norm": 0.025029340758919716, "kl": 0.093658447265625, "learning_rate": 3.916666666666667e-06, "loss": -0.071, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0332186222076416, "mask/share_reasoning": 0.8511619567871094, "mask/share_step_conf": 0.11561942845582962, "num_tokens": 14227601.0, "reward": 0.6718493103981018, "reward_std": 0.2027919441461563, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.5778699517250061, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7658286094665527, "step": 59 }, { "adv/mean_abs_final_conf": 0.7543654441833496, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7350926399230957, "adv/std_final_conf": 0.931016206741333, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350695610046387, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6637228260869565, "calib/avg_num_step_conf": 5.453125, "calib/ece": 0.3563374485596708, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5432098765432098, "calib/gap": 0.0603362771739131, "calib/mean_conf": 0.8830864197530863, "calib/mu_c": 0.911640625, "calib/mu_w": 0.8513043478260869, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3563374485596708, "calib/std_conf": 0.10384352713739116, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.6065560821484992, "calib/step_q_c_n": 633.0, "calib/step_q_gap": 0.03696237310524886, "calib/step_q_w": 0.5695937090432504, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2703.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 528.80859375, "completions/mean_terminated_length": 532.972412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.064, "grad_norm": 0.026383792981505394, "kl": 0.08768463134765625, "learning_rate": 3.88888888888889e-06, "loss": -0.0774, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.031654588878154755, "mask/share_reasoning": 0.8491592407226562, "mask/share_step_conf": 0.11137367784976959, "num_tokens": 14471832.0, "reward": 0.6921672821044922, "reward_std": 0.22028900682926178, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6036254167556763, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.7807092070579529, "step": 60 }, { "adv/mean_abs_final_conf": 0.7289884090423584, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7417490482330322, "adv/std_final_conf": 0.9285774230957031, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348416924476624, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6275510204081632, "calib/avg_num_step_conf": 5.109375, "calib/ece": 0.3030677290836653, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6334661354581673, "calib/gap": 0.027582366279845116, "calib/mean_conf": 0.9126294820717131, "calib/mu_c": 0.9233986928104574, "calib/mu_w": 0.8958163265306123, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.3030677290836653, "calib/std_conf": 0.06633443403523974, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.6239313984168865, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.07113139841688643, "calib/step_q_w": 0.5528000000000001, "calib/step_q_w_n": 550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2027.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 458.60546875, "completions/mean_terminated_length": 458.60546875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.06506666666666666, "grad_norm": 0.019766852259635925, "kl": 0.0915374755859375, "learning_rate": 3.861111111111112e-06, "loss": -0.036, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.036816179752349854, "mask/share_reasoning": 0.8392590284347534, "mask/share_step_conf": 0.12392475455999374, "num_tokens": 14693299.0, "reward": 0.7142760157585144, "reward_std": 0.18115657567977905, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6492277383804321, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7793242931365967, "step": 61 }, { "adv/mean_abs_final_conf": 0.8142984509468079, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.768386960029602, "adv/std_final_conf": 0.9315682053565979, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351645112037659, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6190079999999999, "calib/avg_num_step_conf": 4.921875, "calib/ece": 0.3777199999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.528, "calib/gap": 0.04552000000000023, "calib/mean_conf": 0.87452, "calib/mu_c": 0.8972800000000002, "calib/mu_w": 0.85176, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.3761199999999999, "calib/std_conf": 0.12240249017074774, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5904244482173174, "calib/step_q_c_n": 589.0, "calib/step_q_gap": 0.04210850186858428, "calib/step_q_w": 0.5483159463487332, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2531.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 520.140625, "completions/mean_terminated_length": 520.140625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.06613333333333334, "grad_norm": 0.027727412059903145, "kl": 0.091064453125, "learning_rate": 3.833333333333334e-06, "loss": -0.1161, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.033058080822229385, "mask/share_reasoning": 0.858847975730896, "mask/share_step_conf": 0.10809390246868134, "num_tokens": 14933535.0, "reward": 0.6698973178863525, "reward_std": 0.23705357313156128, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.5798171758651733, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7599775791168213, "step": 62 }, { "adv/mean_abs_final_conf": 0.7539543509483337, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7746871709823608, "adv/std_final_conf": 0.9303356409072876, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353242516517639, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7003731343283583, "calib/avg_num_step_conf": 4.44140625, "calib/ece": 0.3334426229508196, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.5286885245901639, "calib/gap": 0.07777747625508813, "calib/mean_conf": 0.8826229508196721, "calib/mu_c": 0.9176865671641791, "calib/mu_w": 0.8399090909090909, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.875, "calib/pce": 0.3334426229508196, "calib/std_conf": 0.10850552058697308, "calib/step_conf_rate": 0.875, "calib/step_q_c": 0.5954471544715446, "calib/step_q_c_n": 615.0, "calib/step_q_gap": 0.04335903186618062, "calib/step_q_w": 0.552088122605364, "calib/step_q_w_n": 522.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2700.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 572.15234375, "completions/mean_terminated_length": 572.15234375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.0672, "grad_norm": 0.022924363613128662, "kl": 0.08589935302734375, "learning_rate": 3.8055555555555556e-06, "loss": -0.1857, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.030279844999313354, "mask/share_reasoning": 0.876987874507904, "mask/share_step_conf": 0.09273228049278259, "num_tokens": 15188646.0, "reward": 0.6390208601951599, "reward_std": 0.2594981789588928, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.570832371711731, "rewards/format_reward_step": 0.8515625, "rewards/step_l2_reward": 0.7072093486785889, "step": 63 }, { "adv/mean_abs_final_conf": 0.7875009775161743, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7415667772293091, "adv/std_final_conf": 0.9346509575843811, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355990886688232, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6633228840125391, "calib/avg_num_step_conf": 4.72265625, "calib/ece": 0.29401639344262287, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.5655737704918032, "calib/gap": 0.05561267850923057, "calib/mean_conf": 0.8855737704918034, "calib/mu_c": 0.9081379310344828, "calib/mu_w": 0.8525252525252522, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.2926639344262295, "calib/std_conf": 0.10430919658563556, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5758356940509914, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 0.03462297039293971, "calib/step_q_w": 0.5412127236580517, "calib/step_q_w_n": 503.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 528.9765625, "completions/mean_terminated_length": 533.1417236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 44.0, "epoch": 0.06826666666666667, "grad_norm": 0.024568337947130203, "kl": 0.092010498046875, "learning_rate": 3.777777777777778e-06, "loss": -0.2132, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03288152813911438, "mask/share_reasoning": 0.8555514812469482, "mask/share_step_conf": 0.10375450551509857, "num_tokens": 15427840.0, "reward": 0.6932743787765503, "reward_std": 0.25233006477355957, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6326503753662109, "rewards/format_reward_step": 0.9140625, "rewards/step_l2_reward": 0.7538983821868896, "step": 64 }, { "adv/mean_abs_final_conf": 0.7839001417160034, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7552789449691772, "adv/std_final_conf": 0.9260660409927368, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350070357322693, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6429189322787939, "calib/avg_num_step_conf": 4.31640625, "calib/ece": 0.37121568627450974, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.6313725490196078, "calib/gap": 0.050913865546218795, "calib/mean_conf": 0.9045490196078432, "calib/mu_c": 0.9283088235294118, "calib/mu_w": 0.877394957983193, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.37121568627450974, "calib/std_conf": 0.09942428663836936, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.6139208633093525, "calib/step_q_c_n": 556.0, "calib/step_q_gap": 0.05392086330935253, "calib/step_q_w": 0.5599999999999999, "calib/step_q_w_n": 549.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 424.796875, "completions/mean_terminated_length": 426.4627685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.06933333333333333, "grad_norm": 0.028239957988262177, "kl": 0.101104736328125, "learning_rate": 3.7500000000000005e-06, "loss": -0.1308, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03776917979121208, "mask/share_reasoning": 0.8475953340530396, "mask/share_step_conf": 0.11072924733161926, "num_tokens": 15641612.0, "reward": 0.6717511415481567, "reward_std": 0.20962993800640106, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5822335481643677, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7612687945365906, "step": 65 }, { "adv/mean_abs_final_conf": 0.762721598148346, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7631368637084961, "adv/std_final_conf": 0.9323623180389404, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354203939437866, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.709381913716814, "calib/avg_num_step_conf": 4.87890625, "calib/ece": 0.3873029045643154, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.48132780082987553, "calib/gap": 0.0859990320796461, "calib/mean_conf": 0.8561825726141079, "calib/mu_c": 0.901858407079646, "calib/mu_w": 0.8158593749999999, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.3873029045643154, "calib/std_conf": 0.13314715857788725, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.5657032755298652, "calib/step_q_c_n": 519.0, "calib/step_q_gap": 0.08078546731068714, "calib/step_q_w": 0.48491780821917807, "calib/step_q_w_n": 730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2623.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 581.00390625, "completions/mean_terminated_length": 583.2824096679688, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.0704, "grad_norm": 0.027212418615818024, "kl": 0.08538055419921875, "learning_rate": 3.7222222222222225e-06, "loss": -0.158, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.031282857060432434, "mask/share_reasoning": 0.8646191358566284, "mask/share_step_conf": 0.10019171983003616, "num_tokens": 15896701.0, "reward": 0.6610898971557617, "reward_std": 0.23560646176338196, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.5697222948074341, "rewards/format_reward_step": 0.90234375, "rewards/step_l2_reward": 0.7524575591087341, "step": 66 }, { "adv/mean_abs_final_conf": 0.7664501667022705, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7277902960777283, "adv/std_final_conf": 0.9249961972236633, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350871443748474, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7230132450331126, "calib/avg_num_step_conf": 4.37890625, "calib/ece": 0.27430278884462156, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5537848605577689, "calib/gap": 0.09163708609271526, "calib/mean_conf": 0.8679282868525896, "calib/mu_c": 0.9044370860927152, "calib/mu_w": 0.8128, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.27031872509960164, "calib/std_conf": 0.13576289838663588, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5753651685393258, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.048323603747149835, "calib/step_q_w": 0.527041564792176, "calib/step_q_w_n": 409.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2327.0, "completions/max_terminated_length": 2327.0, "completions/mean_length": 519.05078125, "completions/mean_terminated_length": 519.05078125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.07146666666666666, "grad_norm": 0.03001784160733223, "kl": 0.09526824951171875, "learning_rate": 3.694444444444445e-06, "loss": -0.1272, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0325121246278286, "mask/share_reasoning": 0.8712092638015747, "mask/share_step_conf": 0.09627857059240341, "num_tokens": 16134586.0, "reward": 0.726654589176178, "reward_std": 0.22766532003879547, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6616469025611877, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.791662335395813, "step": 67 }, { "adv/mean_abs_final_conf": 0.7865017056465149, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7429081201553345, "adv/std_final_conf": 0.9320579171180725, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935204803943634, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7609587199139438, "calib/avg_num_step_conf": 4.5546875, "calib/ece": 0.4028979591836734, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.49795918367346936, "calib/gap": 0.11741898615032953, "calib/mean_conf": 0.8559591836734693, "calib/mu_c": 0.9201801801801803, "calib/mu_w": 0.8027611940298508, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.4028979591836734, "calib/std_conf": 0.13943471286001208, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5615936254980078, "calib/step_q_c_n": 502.0, "calib/step_q_gap": 0.09680446887150179, "calib/step_q_w": 0.46478915662650605, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2482.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 507.08984375, "completions/mean_terminated_length": 507.08984375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.07253333333333334, "grad_norm": 0.02626444399356842, "kl": 0.10231781005859375, "learning_rate": 3.6666666666666666e-06, "loss": -0.2239, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03458181768655777, "mask/share_reasoning": 0.8608101010322571, "mask/share_step_conf": 0.10460809618234634, "num_tokens": 16368489.0, "reward": 0.6846904158592224, "reward_std": 0.21365387737751007, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.586223840713501, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7831569910049438, "step": 68 }, { "adv/mean_abs_final_conf": 0.7961992025375366, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7524364590644836, "adv/std_final_conf": 0.9349520206451416, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355664253234863, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6321779795062237, "calib/avg_num_step_conf": 4.59765625, "calib/ece": 0.36954545454545445, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.875, "calib/frac_conf_gt_0.9": 0.32644628099173556, "calib/gap": 0.06065951447630813, "calib/mean_conf": 0.8169834710743801, "calib/mu_c": 0.8498198198198196, "calib/mu_w": 0.7891603053435114, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.89453125, "calib/pce": 0.36392561983471067, "calib/std_conf": 0.1447382638283699, "calib/step_conf_rate": 0.89453125, "calib/step_q_c": 0.5257009345794392, "calib/step_q_c_n": 535.0, "calib/step_q_gap": 0.0514953271028038, "calib/step_q_w": 0.4742056074766354, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2717.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 615.87890625, "completions/mean_terminated_length": 615.87890625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.0736, "grad_norm": 0.031040944159030914, "kl": 0.08621978759765625, "learning_rate": 3.638888888888889e-06, "loss": -0.2658, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.029404189437627792, "mask/share_reasoning": 0.8814509510993958, "mask/share_step_conf": 0.08914485573768616, "num_tokens": 16630650.0, "reward": 0.6388742923736572, "reward_std": 0.2512214183807373, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.5482457280158997, "rewards/format_reward_step": 0.875, "rewards/step_l2_reward": 0.72950279712677, "step": 69 }, { "adv/mean_abs_final_conf": 0.7597904801368713, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7445217370986938, "adv/std_final_conf": 0.9348887801170349, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351232647895813, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7801652892561984, "calib/avg_num_step_conf": 4.578125, "calib/ece": 0.29930894308943085, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.35365853658536583, "calib/gap": 0.16780628099173567, "calib/mean_conf": 0.7911788617886178, "calib/mu_c": 0.8764462809917355, "calib/mu_w": 0.7086399999999998, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.29930894308943085, "calib/std_conf": 0.1871171769965541, "calib/step_conf_rate": 0.91796875, "calib/step_q_c": 0.5376904676258993, "calib/step_q_c_n": 556.0, "calib/step_q_gap": 0.07898429879473051, "calib/step_q_w": 0.4587061688311688, "calib/step_q_w_n": 616.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2429.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 543.51171875, "completions/mean_terminated_length": 543.51171875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.07466666666666667, "grad_norm": 0.028346510604023933, "kl": 0.092010498046875, "learning_rate": 3.6111111111111115e-06, "loss": -0.2265, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03363034129142761, "mask/share_reasoning": 0.8627834320068359, "mask/share_step_conf": 0.10358622670173645, "num_tokens": 16876781.0, "reward": 0.7075166702270508, "reward_std": 0.23655042052268982, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.646140992641449, "rewards/format_reward_step": 0.89453125, "rewards/step_l2_reward": 0.7688924074172974, "step": 70 }, { "adv/mean_abs_final_conf": 0.7985207438468933, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7744780778884888, "adv/std_final_conf": 0.9292157292366028, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356667995452881, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.6687994248741913, "calib/avg_num_step_conf": 4.765625, "calib/ece": 0.25822784810126587, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": 0.0902508986340761, "calib/mean_conf": 0.7828691983122363, "calib/mu_c": 0.8236153846153845, "calib/mu_w": 0.7333644859813084, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.9140625, "calib/pce": 0.24628691983122367, "calib/std_conf": 0.19327347696814814, "calib/step_conf_rate": 0.9140625, "calib/step_q_c": 0.4937739130434783, "calib/step_q_c_n": 575.0, "calib/step_q_gap": 0.04875685877991248, "calib/step_q_w": 0.4450170542635658, "calib/step_q_w_n": 645.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 542.9296875, "completions/mean_terminated_length": 547.2047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.07573333333333333, "grad_norm": 0.026825150474905968, "kl": 0.08586883544921875, "learning_rate": 3.5833333333333335e-06, "loss": -0.2883, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.03389473259449005, "mask/share_reasoning": 0.8552953600883484, "mask/share_step_conf": 0.10299739241600037, "num_tokens": 17120179.0, "reward": 0.6880367994308472, "reward_std": 0.23565784096717834, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6271457076072693, "rewards/format_reward_step": 0.89453125, "rewards/step_l2_reward": 0.748927891254425, "step": 71 }, { "adv/mean_abs_final_conf": 0.7596650123596191, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7277320623397827, "adv/std_final_conf": 0.9333326816558838, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347254037857056, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7557932263814617, "calib/avg_num_step_conf": 4.77734375, "calib/ece": 0.2757370517928288, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.32270916334661354, "calib/gap": 0.13253692386045335, "calib/mean_conf": 0.8016334661354582, "calib/mu_c": 0.864469696969697, "calib/mu_w": 0.7319327731092437, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.2757370517928288, "calib/std_conf": 0.15685695616309417, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5198018292682927, "calib/step_q_c_n": 656.0, "calib/step_q_gap": 0.06816161762808104, "calib/step_q_w": 0.4516402116402116, "calib/step_q_w_n": 567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 476.59375, "completions/mean_terminated_length": 476.59375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.0768, "grad_norm": 0.02857062965631485, "kl": 0.1085052490234375, "learning_rate": 3.555555555555556e-06, "loss": -0.1094, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034555524587631226, "mask/share_reasoning": 0.8587918877601624, "mask/share_step_conf": 0.10665258765220642, "num_tokens": 17346595.0, "reward": 0.7392154932022095, "reward_std": 0.20494453608989716, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6732785105705261, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8051523566246033, "step": 72 }, { "adv/mean_abs_final_conf": 0.7999264001846313, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7472314834594727, "adv/std_final_conf": 0.9317479133605957, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346880912780762, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6718409586056645, "calib/avg_num_step_conf": 4.32421875, "calib/ece": 0.17265060240963856, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.321285140562249, "calib/gap": 0.10831290849673203, "calib/mean_conf": 0.7830120481927711, "calib/mu_c": 0.8247712418300653, "calib/mu_w": 0.7164583333333333, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.17060240963855422, "calib/std_conf": 0.18381544092024615, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5335294117647059, "calib/step_q_c_n": 663.0, "calib/step_q_gap": 0.08039878113407528, "calib/step_q_w": 0.45313063063063064, "calib/step_q_w_n": 444.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2091.0, "completions/max_terminated_length": 2091.0, "completions/mean_length": 474.546875, "completions/mean_terminated_length": 474.546875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.07786666666666667, "grad_norm": 0.028949644416570663, "kl": 0.09024810791015625, "learning_rate": 3.5277777777777784e-06, "loss": -0.1175, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03428073972463608, "mask/share_reasoning": 0.8634704351425171, "mask/share_step_conf": 0.10224878787994385, "num_tokens": 17575111.0, "reward": 0.7635636925697327, "reward_std": 0.18511897325515747, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7138800621032715, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8132473826408386, "step": 73 }, { "adv/mean_abs_final_conf": 0.7531702518463135, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7528907656669617, "adv/std_final_conf": 0.9215183258056641, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350585341453552, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6780112044817928, "calib/avg_num_step_conf": 4.953125, "calib/ece": 0.277551867219917, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.890625, "calib/frac_conf_gt_0.9": 0.25311203319502074, "calib/gap": 0.1256932773109246, "calib/mean_conf": 0.7104979253112034, "calib/mu_c": 0.7814285714285716, "calib/mu_w": 0.655735294117647, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.27618257261410784, "calib/std_conf": 0.22586064304938835, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.47659090909090907, "calib/step_q_c_n": 528.0, "calib/step_q_gap": 0.061158476658476624, "calib/step_q_w": 0.41543243243243244, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2621.0, "completions/max_terminated_length": 2621.0, "completions/mean_length": 520.171875, "completions/mean_terminated_length": 522.2117919921875, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.07893333333333333, "grad_norm": 0.04748916998505592, "kl": 0.106292724609375, "learning_rate": 3.5e-06, "loss": -0.2116, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.034913722425699234, "mask/share_reasoning": 0.8536270260810852, "mask/share_step_conf": 0.10755302011966705, "num_tokens": 17812203.0, "reward": 0.6924230456352234, "reward_std": 0.2192223221063614, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.6199167966842651, "rewards/format_reward_step": 0.890625, "rewards/step_l2_reward": 0.7649291753768921, "step": 74 }, { "adv/mean_abs_final_conf": 0.7570982575416565, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7516752481460571, "adv/std_final_conf": 0.9306212067604065, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934241771697998, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7717879604672058, "calib/avg_num_step_conf": 4.98828125, "calib/ece": 0.12952000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.36, "calib/gap": 0.20682148040638615, "calib/mean_conf": 0.76, "calib/mu_c": 0.8352830188679246, "calib/mu_w": 0.6284615384615384, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.12676000000000004, "calib/std_conf": 0.21719484340103473, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5181912144702843, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.0815311747088529, "calib/step_q_w": 0.43666003976143136, "calib/step_q_w_n": 503.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2728.0, "completions/max_terminated_length": 2728.0, "completions/mean_length": 458.69921875, "completions/mean_terminated_length": 458.69921875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.08, "grad_norm": 0.03306273743510246, "kl": 0.1181640625, "learning_rate": 3.4722222222222224e-06, "loss": -0.085, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.036380767822265625, "mask/share_reasoning": 0.8421682119369507, "mask/share_step_conf": 0.1214509978890419, "num_tokens": 18034382.0, "reward": 0.8006812334060669, "reward_std": 0.1714782565832138, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7690855264663696, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8322769403457642, "step": 75 }, { "adv/mean_abs_final_conf": 0.7428159117698669, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7283253073692322, "adv/std_final_conf": 0.9327108860015869, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351808428764343, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7631460818263435, "calib/avg_num_step_conf": 4.35546875, "calib/ece": 0.11960159362549802, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.27091633466135456, "calib/gap": 0.23035462357316916, "calib/mean_conf": 0.6670916334661354, "calib/mu_c": 0.7707971014492754, "calib/mu_w": 0.5404424778761062, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.11844621513944223, "calib/std_conf": 0.25134678582140274, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5085766423357663, "calib/step_q_c_n": 548.0, "calib/step_q_gap": 0.09956429665675398, "calib/step_q_w": 0.4090123456790123, "calib/step_q_w_n": 567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 471.109375, "completions/mean_terminated_length": 471.109375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.08106666666666666, "grad_norm": 0.03533868491649628, "kl": 0.117095947265625, "learning_rate": 3.444444444444445e-06, "loss": -0.1891, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03789925202727318, "mask/share_reasoning": 0.8529291152954102, "mask/share_step_conf": 0.10917161405086517, "num_tokens": 18258042.0, "reward": 0.7864531874656677, "reward_std": 0.18198052048683167, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7521445751190186, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8207618594169617, "step": 76 }, { "adv/mean_abs_final_conf": 0.7651739120483398, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7566498517990112, "adv/std_final_conf": 0.9274589419364929, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351546764373779, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6154385964912281, "calib/avg_num_step_conf": 4.5703125, "calib/ece": 0.15612244897959182, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.1469387755102041, "calib/gap": 0.0977754385964913, "calib/mean_conf": 0.6050204081632653, "calib/mu_c": 0.6429333333333334, "calib/mu_w": 0.5451578947368421, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.07444897959183673, "calib/std_conf": 0.2362714186205692, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.4585238784370478, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.04535060077525238, "calib/step_q_w": 0.4131732776617954, "calib/step_q_w_n": 479.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2691.0, "completions/max_terminated_length": 2691.0, "completions/mean_length": 483.76953125, "completions/mean_terminated_length": 483.76953125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.08213333333333334, "grad_norm": 0.044293832033872604, "kl": 0.11598968505859375, "learning_rate": 3.416666666666667e-06, "loss": -0.2489, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03868642449378967, "mask/share_reasoning": 0.8451880812644958, "mask/share_step_conf": 0.11612546443939209, "num_tokens": 18486551.0, "reward": 0.7469485998153687, "reward_std": 0.19202597439289093, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6997886896133423, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.794108510017395, "step": 77 }, { "adv/mean_abs_final_conf": 0.7482144832611084, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7161533832550049, "adv/std_final_conf": 0.9358103275299072, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349690079689026, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7036554354736172, "calib/avg_num_step_conf": 4.5859375, "calib/ece": 0.08665338645418323, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.09561752988047809, "calib/gap": 0.16321233312142414, "calib/mean_conf": 0.5792430278884463, "calib/mu_c": 0.6579230769230769, "calib/mu_w": 0.4947107438016528, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.07398406374501992, "calib/std_conf": 0.22992252441260763, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.47225155279503106, "calib/step_q_c_n": 644.0, "calib/step_q_gap": 0.059949666002578283, "calib/step_q_w": 0.4123018867924528, "calib/step_q_w_n": 530.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 518.328125, "completions/mean_terminated_length": 518.328125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.0832, "grad_norm": 0.038161709904670715, "kl": 0.1164093017578125, "learning_rate": 3.3888888888888893e-06, "loss": -0.1816, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.032433316111564636, "mask/share_reasoning": 0.8684175610542297, "mask/share_step_conf": 0.09914913028478622, "num_tokens": 18727267.0, "reward": 0.7633633613586426, "reward_std": 0.20355360209941864, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7194637060165405, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.8072630167007446, "step": 78 }, { "adv/mean_abs_final_conf": 0.7439166307449341, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7461695075035095, "adv/std_final_conf": 0.9325432181358337, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348044991493225, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6385620915032678, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.12332015810276681, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.18181818181818182, "calib/gap": 0.12320457516339867, "calib/mean_conf": 0.6094071146245059, "calib/mu_c": 0.6581045751633987, "calib/mu_w": 0.5349, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.0639920948616601, "calib/std_conf": 0.252071348416406, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.45882926829268295, "calib/step_q_c_n": 820.0, "calib/step_q_gap": 0.038238550993104914, "calib/step_q_w": 0.42059071729957803, "calib/step_q_w_n": 474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 489.1875, "completions/mean_terminated_length": 491.1059265136719, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.08426666666666667, "grad_norm": 0.03568899631500244, "kl": 0.105499267578125, "learning_rate": 3.3611111111111117e-06, "loss": -0.1682, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03434523940086365, "mask/share_reasoning": 0.8567074537277222, "mask/share_step_conf": 0.10504105687141418, "num_tokens": 18958875.0, "reward": 0.7738837003707886, "reward_std": 0.18652935326099396, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7215226292610168, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8262448310852051, "step": 79 }, { "adv/mean_abs_final_conf": 0.7373925447463989, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7588784098625183, "adv/std_final_conf": 0.9170923829078674, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348174333572388, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.6928098391674551, "calib/avg_num_step_conf": 5.02734375, "calib/ece": 0.19398437500000007, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3984375, "calib/gap": 0.14165121412803539, "calib/mean_conf": 0.7442187499999999, "calib/mu_c": 0.802317880794702, "calib/mu_w": 0.6606666666666666, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.17417968750000007, "calib/std_conf": 0.24461739696603244, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4986915887850467, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.052037313692109954, "calib/step_q_w": 0.44665427509293676, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 424.6015625, "completions/mean_terminated_length": 426.2666931152344, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.08533333333333333, "grad_norm": 0.040913067758083344, "kl": 0.1164703369140625, "learning_rate": 3.3333333333333333e-06, "loss": -0.0902, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03784197196364403, "mask/share_reasoning": 0.8360422849655151, "mask/share_step_conf": 0.12220950424671173, "num_tokens": 19169733.0, "reward": 0.779463529586792, "reward_std": 0.15755271911621094, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7320824265480042, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8268446922302246, "step": 80 }, { "adv/mean_abs_final_conf": 0.7503237724304199, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7460405826568604, "adv/std_final_conf": 0.9145488142967224, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935416042804718, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.719396551724138, "calib/avg_num_step_conf": 4.859375, "calib/ece": 0.14979253112033195, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.3817427385892116, "calib/gap": 0.21781681034482758, "calib/mean_conf": 0.7126141078838175, "calib/mu_c": 0.7993793103448276, "calib/mu_w": 0.5815625, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.1303734439834025, "calib/std_conf": 0.27592209129547424, "calib/step_conf_rate": 0.9296875, "calib/step_q_c": 0.5065610142630744, "calib/step_q_c_n": 631.0, "calib/step_q_gap": 0.09197700121250346, "calib/step_q_w": 0.41458401305057097, "calib/step_q_w_n": 613.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2776.0, "completions/max_terminated_length": 2776.0, "completions/mean_length": 506.01953125, "completions/mean_terminated_length": 512.019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.0864, "grad_norm": 0.026412300765514374, "kl": 0.104217529296875, "learning_rate": 3.3055555555555558e-06, "loss": -0.1094, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03644543141126633, "mask/share_reasoning": 0.8455771207809448, "mask/share_step_conf": 0.10625872015953064, "num_tokens": 19405522.0, "reward": 0.7318441867828369, "reward_std": 0.2443157136440277, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6915718913078308, "rewards/format_reward_step": 0.89453125, "rewards/step_l2_reward": 0.7721166610717773, "step": 81 }, { "adv/mean_abs_final_conf": 0.7479487657546997, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7465330362319946, "adv/std_final_conf": 0.8972213864326477, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349361658096313, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7302356902356902, "calib/avg_num_step_conf": 4.953125, "calib/ece": 0.19634538152610445, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.4859437751004016, "calib/gap": 0.20788484848484867, "calib/mean_conf": 0.7700803212851406, "calib/mu_c": 0.8527333333333335, "calib/mu_w": 0.6448484848484848, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.18200803212851407, "calib/std_conf": 0.2656491510623321, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.518390977443609, "calib/step_q_c_n": 665.0, "calib/step_q_gap": 0.09661651641541663, "calib/step_q_w": 0.4217744610281923, "calib/step_q_w_n": 603.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2958.0, "completions/max_terminated_length": 2958.0, "completions/mean_length": 467.06640625, "completions/mean_terminated_length": 468.8980712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.08746666666666666, "grad_norm": 0.02705484628677368, "kl": 0.11284637451171875, "learning_rate": 3.277777777777778e-06, "loss": -0.047, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.038292378187179565, "mask/share_reasoning": 0.8431802988052368, "mask/share_step_conf": 0.1146211326122284, "num_tokens": 19630643.0, "reward": 0.7625449895858765, "reward_std": 0.19928604364395142, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7182894945144653, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8068004846572876, "step": 82 }, { "adv/mean_abs_final_conf": 0.7600466012954712, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7641905546188354, "adv/std_final_conf": 0.9219870567321777, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346417784690857, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7885663034812649, "calib/avg_num_step_conf": 4.84765625, "calib/ece": 0.2078225806451613, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.5403225806451613, "calib/gap": 0.2661785809194791, "calib/mean_conf": 0.7763709677419356, "calib/mu_c": 0.8901408450704225, "calib/mu_w": 0.6239622641509434, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.20580645161290323, "calib/std_conf": 0.2691888492796142, "calib/step_conf_rate": 0.9296875, "calib/step_q_c": 0.523275039745628, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.12392863451686975, "calib/step_q_w": 0.3993464052287582, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 526.734375, "completions/mean_terminated_length": 528.800048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.08853333333333334, "grad_norm": 0.08767831325531006, "kl": 0.1004638671875, "learning_rate": 3.2500000000000002e-06, "loss": -0.1739, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03471631184220314, "mask/share_reasoning": 0.8556775450706482, "mask/share_step_conf": 0.10569989681243896, "num_tokens": 19872751.0, "reward": 0.7552716732025146, "reward_std": 0.20535266399383545, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7049202919006348, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": 0.8056230545043945, "step": 83 }, { "adv/mean_abs_final_conf": 0.7469096779823303, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7453149557113647, "adv/std_final_conf": 0.9164627194404602, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347103238105774, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.75181598062954, "calib/avg_num_step_conf": 4.5546875, "calib/ece": 0.2758565737051793, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5617529880478087, "calib/gap": 0.21050019115585572, "calib/mean_conf": 0.7904382470119522, "calib/mu_c": 0.8893984962406015, "calib/mu_w": 0.6788983050847458, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.26820717131474103, "calib/std_conf": 0.2728136689746266, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5189006309148264, "calib/step_q_c_n": 634.0, "calib/step_q_gap": 0.07365627001257075, "calib/step_q_w": 0.44524436090225566, "calib/step_q_w_n": 532.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 443.578125, "completions/mean_terminated_length": 443.578125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.0896, "grad_norm": 0.03372732549905777, "kl": 0.11348724365234375, "learning_rate": 3.2222222222222227e-06, "loss": -0.0607, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03929414972662926, "mask/share_reasoning": 0.8493955135345459, "mask/share_step_conf": 0.11131033301353455, "num_tokens": 20092227.0, "reward": 0.7540429830551147, "reward_std": 0.19966432452201843, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.6865949630737305, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8214911222457886, "step": 84 }, { "adv/mean_abs_final_conf": 0.7252588868141174, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7495740056037903, "adv/std_final_conf": 0.8995879888534546, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350873231887817, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.721048951048951, "calib/avg_num_step_conf": 4.2890625, "calib/ece": 0.26483333333333337, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.8828125, "calib/frac_conf_gt_0.9": 0.625, "calib/gap": 0.23826573426573439, "calib/mean_conf": 0.8063333333333333, "calib/mu_c": 0.9155384615384616, "calib/mu_w": 0.6772727272727272, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.26475000000000004, "calib/std_conf": 0.27401774070709767, "calib/step_conf_rate": 0.91015625, "calib/step_q_c": 0.512791519434629, "calib/step_q_c_n": 566.0, "calib/step_q_gap": 0.06890054199101991, "calib/step_q_w": 0.44389097744360906, "calib/step_q_w_n": 532.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2949.0, "completions/max_terminated_length": 2949.0, "completions/mean_length": 521.86328125, "completions/mean_terminated_length": 528.0513916015625, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.09066666666666667, "grad_norm": 0.04208715260028839, "kl": 0.097900390625, "learning_rate": 3.1944444444444443e-06, "loss": -0.1331, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.035177890211343765, "mask/share_reasoning": 0.8520656824111938, "mask/share_step_conf": 0.10103766620159149, "num_tokens": 20333648.0, "reward": 0.6964380145072937, "reward_std": 0.25430044531822205, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6360054612159729, "rewards/format_reward_step": 0.8828125, "rewards/step_l2_reward": 0.7568705081939697, "step": 85 }, { "adv/mean_abs_final_conf": 0.7735016345977783, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7450942993164062, "adv/std_final_conf": 0.9120759963989258, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9358606934547424, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.726973241898615, "calib/avg_num_step_conf": 4.22265625, "calib/ece": 0.30175510204081624, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.5387755102040817, "calib/gap": 0.23299852090896878, "calib/mean_conf": 0.7548163265306123, "calib/mu_c": 0.8822522522522523, "calib/mu_w": 0.6492537313432836, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.30175510204081624, "calib/std_conf": 0.2997680513854063, "calib/step_conf_rate": 0.91015625, "calib/step_q_c": 0.49515030060120235, "calib/step_q_c_n": 499.0, "calib/step_q_gap": 0.06975511159776593, "calib/step_q_w": 0.4253951890034364, "calib/step_q_w_n": 582.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2581.0, "completions/max_terminated_length": 2581.0, "completions/mean_length": 467.9921875, "completions/mean_terminated_length": 469.8274841308594, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.09173333333333333, "grad_norm": 0.04277937859296799, "kl": 0.1038665771484375, "learning_rate": 3.1666666666666667e-06, "loss": -0.2479, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03794998303055763, "mask/share_reasoning": 0.8551421165466309, "mask/share_step_conf": 0.10300161689519882, "num_tokens": 20558966.0, "reward": 0.6861547231674194, "reward_std": 0.2804429829120636, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.6155492067337036, "rewards/format_reward_step": 0.90234375, "rewards/step_l2_reward": 0.7567602396011353, "step": 86 }, { "adv/mean_abs_final_conf": 0.7481697797775269, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7582245469093323, "adv/std_final_conf": 0.9145613312721252, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349111914634705, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.714245472837022, "calib/avg_num_step_conf": 3.984375, "calib/ece": 0.17402439024390248, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.6341463414634146, "calib/gap": 0.17621167002012095, "calib/mean_conf": 0.8381707317073173, "calib/mu_c": 0.8890285714285716, "calib/mu_w": 0.7128169014084507, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.9140625, "calib/pce": 0.15040650406504066, "calib/std_conf": 0.2357351141035544, "calib/step_conf_rate": 0.9140625, "calib/step_q_c": 0.533342939481268, "calib/step_q_c_n": 694.0, "calib/step_q_gap": 0.10831226463464222, "calib/step_q_w": 0.42503067484662577, "calib/step_q_w_n": 326.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2601.0, "completions/max_terminated_length": 2601.0, "completions/mean_length": 436.5625, "completions/mean_terminated_length": 443.4920959472656, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.0928, "grad_norm": 0.04709944874048233, "kl": 0.120513916015625, "learning_rate": 3.138888888888889e-06, "loss": -0.185, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03978754207491875, "mask/share_reasoning": 0.8435720205307007, "mask/share_step_conf": 0.10101540386676788, "num_tokens": 20776222.0, "reward": 0.748905599117279, "reward_std": 0.251054048538208, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7142261862754822, "rewards/format_reward_step": 0.90234375, "rewards/step_l2_reward": 0.7835850715637207, "step": 87 }, { "adv/mean_abs_final_conf": 0.7659087777137756, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7348840236663818, "adv/std_final_conf": 0.9296475052833557, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355849027633667, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8344155844155844, "calib/avg_num_step_conf": 4.19140625, "calib/ece": 0.15074803149606297, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.5275590551181102, "calib/gap": 0.3850636363636363, "calib/mean_conf": 0.7547637795275591, "calib/mu_c": 0.9063636363636363, "calib/mu_w": 0.5213, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.91796875, "calib/pce": 0.14960629921259838, "calib/std_conf": 0.2958408264482595, "calib/step_conf_rate": 0.91796875, "calib/step_q_c": 0.4859428571428571, "calib/step_q_c_n": 700.0, "calib/step_q_gap": 0.05403937188816543, "calib/step_q_w": 0.4319034852546917, "calib/step_q_w_n": 373.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 474.49609375, "completions/mean_terminated_length": 474.49609375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.09386666666666667, "grad_norm": 0.030993729829788208, "kl": 0.1110076904296875, "learning_rate": 3.1111111111111116e-06, "loss": -0.1947, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03716718778014183, "mask/share_reasoning": 0.862873911857605, "mask/share_step_conf": 0.09995885193347931, "num_tokens": 21007541.0, "reward": 0.7658560276031494, "reward_std": 0.2499769777059555, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7555378675460815, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": 0.7761742472648621, "step": 88 }, { "adv/mean_abs_final_conf": 0.7708313465118408, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7514389157295227, "adv/std_final_conf": 0.9173318147659302, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9359842538833618, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.8226373032493415, "calib/avg_num_step_conf": 3.84375, "calib/ece": 0.2592622950819672, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.86328125, "calib/frac_conf_gt_0.9": 0.48770491803278687, "calib/gap": 0.32950753225697516, "calib/mean_conf": 0.7145081967213115, "calib/mu_c": 0.89141592920354, "calib/mu_w": 0.5619083969465648, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.890625, "calib/pce": 0.25532786885245895, "calib/std_conf": 0.31482522463100265, "calib/step_conf_rate": 0.890625, "calib/step_q_c": 0.5899484536082473, "calib/step_q_c_n": 388.0, "calib/step_q_gap": 0.1898981180377775, "calib/step_q_w": 0.4000503355704698, "calib/step_q_w_n": 596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 511.546875, "completions/mean_terminated_length": 511.546875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.09493333333333333, "grad_norm": 0.03927363082766533, "kl": 0.10543060302734375, "learning_rate": 3.0833333333333336e-06, "loss": -0.1832, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03691529855132103, "mask/share_reasoning": 0.8710620999336243, "mask/share_step_conf": 0.0920226126909256, "num_tokens": 21247385.0, "reward": 0.6905965805053711, "reward_std": 0.2838398814201355, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.64203941822052, "rewards/format_reward_step": 0.86328125, "rewards/step_l2_reward": 0.7391537427902222, "step": 89 }, { "adv/mean_abs_final_conf": 0.7335283160209656, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7159548997879028, "adv/std_final_conf": 0.9096397161483765, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355127215385437, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6708086916074159, "calib/avg_num_step_conf": 5.3046875, "calib/ece": 0.19730666666666682, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.572, "calib/gap": 0.20785478547854785, "calib/mean_conf": 0.7626933333333333, "calib/mu_c": 0.8466666666666667, "calib/mu_w": 0.6388118811881188, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.18200000000000016, "calib/std_conf": 0.29880582948351814, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.47783783783783784, "calib/step_q_c_n": 740.0, "calib/step_q_gap": 0.08791874398670518, "calib/step_q_w": 0.38991909385113266, "calib/step_q_w_n": 618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 506.33984375, "completions/mean_terminated_length": 506.33984375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.096, "grad_norm": 0.035200610756874084, "kl": 0.1154632568359375, "learning_rate": 3.055555555555556e-06, "loss": -0.0727, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03638852387666702, "mask/share_reasoning": 0.8510505557060242, "mask/share_step_conf": 0.11256091296672821, "num_tokens": 21480328.0, "reward": 0.7477128505706787, "reward_std": 0.23210914433002472, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.6938439607620239, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8015816807746887, "step": 90 }, { "adv/mean_abs_final_conf": 0.7306746244430542, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7812052965164185, "adv/std_final_conf": 0.9120433330535889, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345096349716187, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7520235467255335, "calib/avg_num_step_conf": 4.40234375, "calib/ece": 0.20731999999999984, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.608, "calib/gap": 0.24788347046625203, "calib/mean_conf": 0.7935599999999999, "calib/mu_c": 0.8917218543046358, "calib/mu_w": 0.6438383838383838, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.19843999999999984, "calib/std_conf": 0.2806102749366102, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5113258785942493, "calib/step_q_c_n": 626.0, "calib/step_q_gap": 0.10529793448247277, "calib/step_q_w": 0.4060279441117765, "calib/step_q_w_n": 501.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 481.265625, "completions/mean_terminated_length": 481.265625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.09706666666666666, "grad_norm": 0.039202846586704254, "kl": 0.1165771484375, "learning_rate": 3.0277777777777776e-06, "loss": -0.1038, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03410191461443901, "mask/share_reasoning": 0.8663723468780518, "mask/share_step_conf": 0.09952573478221893, "num_tokens": 21711244.0, "reward": 0.780426025390625, "reward_std": 0.19417354464530945, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7288113236427307, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8320407271385193, "step": 91 }, { "adv/mean_abs_final_conf": 0.7099396586418152, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7603292465209961, "adv/std_final_conf": 0.8862280249595642, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935433566570282, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.716020436730123, "calib/avg_num_step_conf": 4.26171875, "calib/ece": 0.19447154471544736, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.88671875, "calib/frac_conf_gt_0.9": 0.5691056910569106, "calib/gap": 0.23869120940649502, "calib/mean_conf": 0.770569105691057, "calib/mu_c": 0.8617763157894738, "calib/mu_w": 0.6230851063829788, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.8984375, "calib/pce": 0.17357723577235795, "calib/std_conf": 0.3021543266185711, "calib/step_conf_rate": 0.8984375, "calib/step_q_c": 0.5257218309859155, "calib/step_q_c_n": 568.0, "calib/step_q_gap": 0.16543693614843935, "calib/step_q_w": 0.36028489483747617, "calib/step_q_w_n": 523.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2613.0, "completions/max_terminated_length": 2613.0, "completions/mean_length": 466.65234375, "completions/mean_terminated_length": 466.65234375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.09813333333333334, "grad_norm": 0.03666877746582031, "kl": 0.11454010009765625, "learning_rate": 3e-06, "loss": -0.1683, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.037959203124046326, "mask/share_reasoning": 0.8583852052688599, "mask/share_step_conf": 0.10365556925535202, "num_tokens": 21937427.0, "reward": 0.7203973531723022, "reward_std": 0.2536795139312744, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6676589250564575, "rewards/format_reward_step": 0.88671875, "rewards/step_l2_reward": 0.7731357216835022, "step": 92 }, { "adv/mean_abs_final_conf": 0.6869341731071472, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7488905191421509, "adv/std_final_conf": 0.8937593102455139, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355915188789368, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7200474898236093, "calib/avg_num_step_conf": 4.875, "calib/ece": 0.17467213114754096, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.3524590163934426, "calib/gap": 0.24422523744911817, "calib/mean_conf": 0.6580327868852459, "calib/mu_c": 0.768134328358209, "calib/mu_w": 0.5239090909090909, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.14176229508196717, "calib/std_conf": 0.3129673930123625, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.5013162118780097, "calib/step_q_c_n": 623.0, "calib/step_q_gap": 0.09379621187800963, "calib/step_q_w": 0.40752000000000005, "calib/step_q_w_n": 625.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 487.45703125, "completions/mean_terminated_length": 491.2952880859375, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.0992, "grad_norm": 0.046914614737033844, "kl": 0.108612060546875, "learning_rate": 2.9722222222222225e-06, "loss": -0.1615, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.0355367548763752, "mask/share_reasoning": 0.8486747145652771, "mask/share_step_conf": 0.10797599703073502, "num_tokens": 22167992.0, "reward": 0.7335118651390076, "reward_std": 0.24725374579429626, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6891273260116577, "rewards/format_reward_step": 0.8984375, "rewards/step_l2_reward": 0.7778963446617126, "step": 93 }, { "adv/mean_abs_final_conf": 0.7514272928237915, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7241270542144775, "adv/std_final_conf": 0.9320777058601379, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356358051300049, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.753915504080021, "calib/avg_num_step_conf": 3.984375, "calib/ece": 0.1504048582995951, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.3562753036437247, "calib/gap": 0.2947216372729665, "calib/mean_conf": 0.6417408906882591, "calib/mu_c": 0.7801526717557252, "calib/mu_w": 0.4854310344827587, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.1308906882591093, "calib/std_conf": 0.327217142534491, "calib/step_conf_rate": 0.9296875, "calib/step_q_c": 0.5525203252032521, "calib/step_q_c_n": 492.0, "calib/step_q_gap": 0.15824002217294908, "calib/step_q_w": 0.39428030303030304, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 433.24609375, "completions/mean_terminated_length": 436.657470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.10026666666666667, "grad_norm": 0.03202759847044945, "kl": 0.1334075927734375, "learning_rate": 2.944444444444445e-06, "loss": -0.157, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03922412917017937, "mask/share_reasoning": 0.8538342714309692, "mask/share_step_conf": 0.09912911057472229, "num_tokens": 22387583.0, "reward": 0.7599334716796875, "reward_std": 0.21481086313724518, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7212663888931274, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7986005544662476, "step": 94 }, { "adv/mean_abs_final_conf": 0.7279517650604248, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7865815758705139, "adv/std_final_conf": 0.9090731143951416, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9359537959098816, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.805735294117647, "calib/avg_num_step_conf": 3.58984375, "calib/ece": 0.15635593220338972, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.828125, "calib/frac_conf_gt_0.9": 0.4576271186440678, "calib/gap": 0.3495029411764706, "calib/mean_conf": 0.6955084745762713, "calib/mu_c": 0.8436029411764706, "calib/mu_w": 0.4941, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.84765625, "calib/pce": 0.13779661016949143, "calib/std_conf": 0.3303300800602154, "calib/step_conf_rate": 0.84765625, "calib/step_q_c": 0.5449999999999999, "calib/step_q_c_n": 506.0, "calib/step_q_gap": 0.1501089588377723, "calib/step_q_w": 0.3948910411622276, "calib/step_q_w_n": 413.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 454.76953125, "completions/mean_terminated_length": 456.552978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.10133333333333333, "grad_norm": 0.04344727098941803, "kl": 0.111175537109375, "learning_rate": 2.916666666666667e-06, "loss": -0.3002, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.03688206151127815, "mask/share_reasoning": 0.8678974509239197, "mask/share_step_conf": 0.09131423383951187, "num_tokens": 22610132.0, "reward": 0.6868751049041748, "reward_std": 0.2803459167480469, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6729038953781128, "rewards/format_reward_step": 0.828125, "rewards/step_l2_reward": 0.7008463740348816, "step": 95 }, { "adv/mean_abs_final_conf": 0.7357560992240906, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7510946393013, "adv/std_final_conf": 0.8989567160606384, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351812601089478, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8593205574912893, "calib/avg_num_step_conf": 3.69921875, "calib/ece": 0.04665322580645161, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.8828125, "calib/frac_conf_gt_0.9": 0.39919354838709675, "calib/gap": 0.42188443670150994, "calib/mean_conf": 0.65625, "calib/mu_c": 0.7991463414634147, "calib/mu_w": 0.37726190476190474, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.89453125, "calib/pce": 0.020806451612903232, "calib/std_conf": 0.32958069952592794, "calib/step_conf_rate": 0.89453125, "calib/step_q_c": 0.5652158894645942, "calib/step_q_c_n": 579.0, "calib/step_q_gap": 0.16162893294285507, "calib/step_q_w": 0.40358695652173915, "calib/step_q_w_n": 368.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 434.06640625, "completions/mean_terminated_length": 434.06640625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.1024, "grad_norm": 0.05312202498316765, "kl": 0.135467529296875, "learning_rate": 2.888888888888889e-06, "loss": -0.1138, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03864780068397522, "mask/share_reasoning": 0.8665939569473267, "mask/share_step_conf": 0.09475824236869812, "num_tokens": 22827069.0, "reward": 0.767045259475708, "reward_std": 0.24355840682983398, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7619215250015259, "rewards/format_reward_step": 0.8828125, "rewards/step_l2_reward": 0.7721689939498901, "step": 96 }, { "adv/mean_abs_final_conf": 0.7440429925918579, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7356593608856201, "adv/std_final_conf": 0.9356138706207275, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9362950325012207, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7425941599661448, "calib/avg_num_step_conf": 3.796875, "calib/ece": 0.16780082987551875, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.87109375, "calib/frac_conf_gt_0.9": 0.3360995850622407, "calib/gap": 0.26464381436027645, "calib/mean_conf": 0.6130290456431535, "calib/mu_c": 0.7250359712230215, "calib/mu_w": 0.4603921568627451, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.890625, "calib/pce": 0.10203319502074698, "calib/std_conf": 0.3310945559284828, "calib/step_conf_rate": 0.890625, "calib/step_q_c": 0.5561842105263157, "calib/step_q_c_n": 532.0, "calib/step_q_gap": 0.18259330143540664, "calib/step_q_w": 0.3735909090909091, "calib/step_q_w_n": 440.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2284.0, "completions/max_terminated_length": 2284.0, "completions/mean_length": 444.73046875, "completions/mean_terminated_length": 448.2322692871094, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.10346666666666667, "grad_norm": 0.04188898950815201, "kl": 0.128448486328125, "learning_rate": 2.861111111111111e-06, "loss": -0.3184, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.038415439426898956, "mask/share_reasoning": 0.8580073118209839, "mask/share_step_conf": 0.09576477110385895, "num_tokens": 23045992.0, "reward": 0.7061910629272461, "reward_std": 0.30235719680786133, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6707335710525513, "rewards/format_reward_step": 0.87109375, "rewards/step_l2_reward": 0.7416484951972961, "step": 97 }, { "adv/mean_abs_final_conf": 0.7775865793228149, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7880679368972778, "adv/std_final_conf": 0.9250810146331787, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9360907077789307, "calib/answer_extract_rate": 0.90625, "calib/auroc": 0.6808838643371018, "calib/avg_num_step_conf": 3.46875, "calib/ece": 0.18421739130434783, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.8125, "calib/frac_conf_gt_0.9": 0.4217391304347826, "calib/gap": 0.21805755395683457, "calib/mean_conf": 0.6917826086956521, "calib/mu_c": 0.7780575539568346, "calib/mu_w": 0.56, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.84375, "calib/pce": 0.13582608695652176, "calib/std_conf": 0.3202374739120057, "calib/step_conf_rate": 0.84375, "calib/step_q_c": 0.5706995884773661, "calib/step_q_c_n": 486.0, "calib/step_q_gap": 0.1187344143480129, "calib/step_q_w": 0.4519651741293532, "calib/step_q_w_n": 402.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2852.0, "completions/max_terminated_length": 2852.0, "completions/mean_length": 490.28515625, "completions/mean_terminated_length": 494.1456604003906, "completions/min_length": 0.0, "completions/min_terminated_length": 60.0, "epoch": 0.10453333333333334, "grad_norm": 0.0338410884141922, "kl": 0.1101226806640625, "learning_rate": 2.8333333333333335e-06, "loss": -0.2987, "mask/has_final_conf_rate": 0.8984375, "mask/share_final_conf": 0.03844252973794937, "mask/share_reasoning": 0.8692131638526917, "mask/share_step_conf": 0.08453181385993958, "num_tokens": 23277689.0, "reward": 0.6472041010856628, "reward_std": 0.3040306270122528, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6219742298126221, "rewards/format_reward_step": 0.8125, "rewards/step_l2_reward": 0.6724340915679932, "step": 98 }, { "adv/mean_abs_final_conf": 0.7542859315872192, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7670855522155762, "adv/std_final_conf": 0.9212293028831482, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9358445405960083, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7563062075016451, "calib/avg_num_step_conf": 3.56640625, "calib/ece": 0.1538235294117648, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.82421875, "calib/frac_conf_gt_0.9": 0.2605042016806723, "calib/gap": 0.3153067193097901, "calib/mean_conf": 0.5324789915966387, "calib/mu_c": 0.7192783505154638, "calib/mu_w": 0.40397163120567375, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.83984375, "calib/pce": 0.13936974789915976, "calib/std_conf": 0.34915887762958936, "calib/step_conf_rate": 0.83984375, "calib/step_q_c": 0.569861111111111, "calib/step_q_c_n": 360.0, "calib/step_q_gap": 0.15532223226843472, "calib/step_q_w": 0.4145388788426763, "calib/step_q_w_n": 553.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2424.0, "completions/max_terminated_length": 2424.0, "completions/mean_length": 557.60546875, "completions/mean_terminated_length": 557.60546875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.1056, "grad_norm": 0.03545257821679115, "kl": 0.108612060546875, "learning_rate": 2.805555555555556e-06, "loss": -0.3171, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.03294376656413078, "mask/share_reasoning": 0.8888589143753052, "mask/share_step_conf": 0.07819731533527374, "num_tokens": 23526236.0, "reward": 0.6546341180801392, "reward_std": 0.3044378459453583, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.6342898607254028, "rewards/format_reward_step": 0.82421875, "rewards/step_l2_reward": 0.6749784350395203, "step": 99 }, { "adv/mean_abs_final_conf": 0.7506855726242065, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7517011165618896, "adv/std_final_conf": 0.9101697206497192, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9359030723571777, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.8300634943283155, "calib/avg_num_step_conf": 3.640625, "calib/ece": 0.10945378151260507, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.86328125, "calib/frac_conf_gt_0.9": 0.41596638655462187, "calib/gap": 0.4138752942855104, "calib/mean_conf": 0.6333193277310926, "calib/mu_c": 0.8193893129770993, "calib/mu_w": 0.4055140186915889, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.890625, "calib/pce": 0.09617647058823532, "calib/std_conf": 0.3601883128571439, "calib/step_conf_rate": 0.890625, "calib/step_q_c": 0.5623626373626374, "calib/step_q_c_n": 546.0, "calib/step_q_gap": 0.1162227409895803, "calib/step_q_w": 0.44613989637305707, "calib/step_q_w_n": 386.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2924.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 543.05078125, "completions/mean_terminated_length": 543.05078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.10666666666666667, "grad_norm": 0.05612342059612274, "kl": 0.34037017822265625, "learning_rate": 2.7777777777777783e-06, "loss": -0.2418, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.035137295722961426, "mask/share_reasoning": 0.8781794905662537, "mask/share_step_conf": 0.08668322116136551, "num_tokens": 23772665.0, "reward": 0.7290331721305847, "reward_std": 0.2792990803718567, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7114968299865723, "rewards/format_reward_step": 0.86328125, "rewards/step_l2_reward": 0.7465694546699524, "step": 100 }, { "adv/mean_abs_final_conf": 0.7654350996017456, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7380661368370056, "adv/std_final_conf": 0.9364844560623169, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9359593987464905, "calib/answer_extract_rate": 0.90234375, "calib/auroc": 0.7199817726133516, "calib/avg_num_step_conf": 3.5, "calib/ece": 0.174698275862069, "calib/final_conf_rate": 0.90625, "calib/format_rate": 0.8203125, "calib/frac_conf_gt_0.9": 0.2974137931034483, "calib/gap": 0.2618310928837246, "calib/mean_conf": 0.5367672413793103, "calib/mu_c": 0.686868686868687, "calib/mu_w": 0.4250375939849624, "calib/nonempty_final_conf_rate": 0.90625, "calib/nonempty_reasoning_rate": 0.9296875, "calib/nonempty_step_conf_rate": 0.8515625, "calib/pce": 0.14237068965517247, "calib/std_conf": 0.3526749714377204, "calib/step_conf_rate": 0.8515625, "calib/step_q_c": 0.5205555555555555, "calib/step_q_c_n": 396.0, "calib/step_q_gap": 0.0778955555555556, "calib/step_q_w": 0.44265999999999994, "calib/step_q_w_n": 500.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 500.140625, "completions/mean_terminated_length": 506.0711669921875, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.10773333333333333, "grad_norm": 0.030985474586486816, "kl": 0.1172943115234375, "learning_rate": 2.7500000000000004e-06, "loss": -0.302, "mask/has_final_conf_rate": 0.90625, "mask/share_final_conf": 0.03270100802183151, "mask/share_reasoning": 0.8736139535903931, "mask/share_step_conf": 0.08196623623371124, "num_tokens": 24007693.0, "reward": 0.6413122415542603, "reward_std": 0.29796165227890015, "rewards/accuracy_reward_step": 0.38671875, "rewards/final_brier_reward_step": 0.5986918210983276, "rewards/format_reward_step": 0.8203125, "rewards/step_l2_reward": 0.6839326620101929, "step": 101 }, { "adv/mean_abs_final_conf": 0.6865018606185913, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7594526410102844, "adv/std_final_conf": 0.8888486623764038, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9339803457260132, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8220064724919094, "calib/avg_num_step_conf": 3.98046875, "calib/ece": 0.11754098360655743, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.42213114754098363, "calib/gap": 0.40786614335881016, "calib/mean_conf": 0.6522950819672132, "calib/mu_c": 0.824468085106383, "calib/mu_w": 0.41660194174757287, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.91015625, "calib/pce": 0.09598360655737712, "calib/std_conf": 0.35119651890906567, "calib/step_conf_rate": 0.91015625, "calib/step_q_c": 0.6135378323108384, "calib/step_q_c_n": 489.0, "calib/step_q_gap": 0.21772651155612133, "calib/step_q_w": 0.39581132075471703, "calib/step_q_w_n": 530.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 441.46484375, "completions/mean_terminated_length": 441.46484375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.1088, "grad_norm": 0.04502592608332634, "kl": 0.119964599609375, "learning_rate": 2.7222222222222224e-06, "loss": -0.1303, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.04176633059978485, "mask/share_reasoning": 0.8533238172531128, "mask/share_step_conf": 0.10490988194942474, "num_tokens": 24227404.0, "reward": 0.7619272470474243, "reward_std": 0.2033213973045349, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7460718750953674, "rewards/format_reward_step": 0.8984375, "rewards/step_l2_reward": 0.7777825593948364, "step": 102 }, { "adv/mean_abs_final_conf": 0.7332376837730408, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7705097198486328, "adv/std_final_conf": 0.8930462598800659, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9359943866729736, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.804710793082886, "calib/avg_num_step_conf": 3.50390625, "calib/ece": 0.16360515021459227, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.86328125, "calib/frac_conf_gt_0.9": 0.463519313304721, "calib/gap": 0.3344596004770423, "calib/mean_conf": 0.672961373390558, "calib/mu_c": 0.8222480620155038, "calib/mu_w": 0.4877884615384615, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 0.94140625, "calib/nonempty_step_conf_rate": 0.890625, "calib/pce": 0.14145922746781114, "calib/std_conf": 0.34641422928406723, "calib/step_conf_rate": 0.890625, "calib/step_q_c": 0.6104255319148936, "calib/step_q_c_n": 423.0, "calib/step_q_gap": 0.16222721967860665, "calib/step_q_w": 0.4481983122362869, "calib/step_q_w_n": 474.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 592.2734375, "completions/mean_terminated_length": 592.2734375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.10986666666666667, "grad_norm": 0.027462903410196304, "kl": 0.0913848876953125, "learning_rate": 2.6944444444444444e-06, "loss": -0.3395, "mask/has_final_conf_rate": 0.91015625, "mask/share_final_conf": 0.03436726704239845, "mask/share_reasoning": 0.8866649866104126, "mask/share_step_conf": 0.07896770536899567, "num_tokens": 24483578.0, "reward": 0.7046219110488892, "reward_std": 0.28575944900512695, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6755644679069519, "rewards/format_reward_step": 0.86328125, "rewards/step_l2_reward": 0.7336792349815369, "step": 103 }, { "adv/mean_abs_final_conf": 0.7110572457313538, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7327368855476379, "adv/std_final_conf": 0.8901990056037903, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354986548423767, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8121157444533548, "calib/avg_num_step_conf": 4.140625, "calib/ece": 0.1080408163265306, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.3020408163265306, "calib/gap": 0.36994854317027537, "calib/mean_conf": 0.575469387755102, "calib/mu_c": 0.7702586206896552, "calib/mu_w": 0.40031007751937986, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.90234375, "calib/pce": 0.1050204081632653, "calib/std_conf": 0.3335833342654053, "calib/step_conf_rate": 0.90234375, "calib/step_q_c": 0.6183047210300429, "calib/step_q_c_n": 466.0, "calib/step_q_gap": 0.17468519241051428, "calib/step_q_w": 0.4436195286195286, "calib/step_q_w_n": 594.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2955.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 507.60546875, "completions/mean_terminated_length": 507.60546875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.11093333333333333, "grad_norm": 0.03985770791769028, "kl": 0.111175537109375, "learning_rate": 2.666666666666667e-06, "loss": -0.2427, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03428574651479721, "mask/share_reasoning": 0.8698962926864624, "mask/share_step_conf": 0.09581795334815979, "num_tokens": 24720205.0, "reward": 0.7403326630592346, "reward_std": 0.22890345752239227, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.7359715104103088, "rewards/format_reward_step": 0.89453125, "rewards/step_l2_reward": 0.7446938753128052, "step": 104 }, { "adv/mean_abs_final_conf": 0.7528746128082275, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.761589765548706, "adv/std_final_conf": 0.9238329529762268, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935430109500885, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.766133442712606, "calib/avg_num_step_conf": 4.58203125, "calib/ece": 0.16840163934426228, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.430327868852459, "calib/gap": 0.3228370248837846, "calib/mean_conf": 0.6450409836065574, "calib/mu_c": 0.7852898550724638, "calib/mu_w": 0.4624528301886793, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.12393442622950818, "calib/std_conf": 0.35783329980801337, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.5743478260869566, "calib/step_q_c_n": 529.0, "calib/step_q_gap": 0.16812111801242235, "calib/step_q_w": 0.4062267080745342, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2921.0, "completions/max_terminated_length": 2921.0, "completions/mean_length": 493.16015625, "completions/mean_terminated_length": 495.0941467285156, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.112, "grad_norm": 0.05232438072562218, "kl": 0.10643768310546875, "learning_rate": 2.6388888888888893e-06, "loss": -0.1443, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03616030886769295, "mask/share_reasoning": 0.8623545169830322, "mask/share_step_conf": 0.09757894277572632, "num_tokens": 24952214.0, "reward": 0.7511138319969177, "reward_std": 0.25214970111846924, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7188429832458496, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7833847403526306, "step": 105 }, { "adv/mean_abs_final_conf": 0.6756178140640259, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7654324769973755, "adv/std_final_conf": 0.8626399636268616, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351634383201599, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7351481916699308, "calib/avg_num_step_conf": 4.26953125, "calib/ece": 0.19839357429718887, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5261044176706827, "calib/gap": 0.268264786525656, "calib/mean_conf": 0.7293975903614458, "calib/mu_c": 0.8489855072463768, "calib/mu_w": 0.5807207207207208, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.18678714859437762, "calib/std_conf": 0.3225239920626697, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.6267474048442906, "calib/step_q_c_n": 578.0, "calib/step_q_gap": 0.13886390969865953, "calib/step_q_w": 0.48788349514563106, "calib/step_q_w_n": 515.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2553.0, "completions/max_terminated_length": 2553.0, "completions/mean_length": 474.6640625, "completions/mean_terminated_length": 474.6640625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.11306666666666666, "grad_norm": 0.031673479825258255, "kl": 0.10186004638671875, "learning_rate": 2.6111111111111113e-06, "loss": -0.1221, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03562416881322861, "mask/share_reasoning": 0.8671010732650757, "mask/share_step_conf": 0.09727469086647034, "num_tokens": 25178312.0, "reward": 0.7384791970252991, "reward_std": 0.22508171200752258, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7021632790565491, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7747950553894043, "step": 106 }, { "adv/mean_abs_final_conf": 0.7364275455474854, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7385661005973816, "adv/std_final_conf": 0.8878201246261597, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356435537338257, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6748719699556163, "calib/avg_num_step_conf": 4.2265625, "calib/ece": 0.24524390243902444, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.90625, "calib/frac_conf_gt_0.9": 0.6260162601626016, "calib/gap": 0.18848685558210987, "calib/mean_conf": 0.8042682926829267, "calib/mu_c": 0.8816551724137931, "calib/mu_w": 0.6931683168316832, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.23004065040650412, "calib/std_conf": 0.28188548021137166, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.6186688311688311, "calib/step_q_c_n": 616.0, "calib/step_q_gap": 0.10918385262805852, "calib/step_q_w": 0.5094849785407726, "calib/step_q_w_n": 466.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 475.24609375, "completions/mean_terminated_length": 475.24609375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.11413333333333334, "grad_norm": 0.05143415182828903, "kl": 0.10625457763671875, "learning_rate": 2.5833333333333337e-06, "loss": -0.2211, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03504911810159683, "mask/share_reasoning": 0.862432599067688, "mask/share_step_conf": 0.102518230676651, "num_tokens": 25404591.0, "reward": 0.7037363052368164, "reward_std": 0.2831021547317505, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6533273458480835, "rewards/format_reward_step": 0.90625, "rewards/step_l2_reward": 0.7541453838348389, "step": 107 }, { "adv/mean_abs_final_conf": 0.6004273295402527, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7567219734191895, "adv/std_final_conf": 0.8114179372787476, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343503713607788, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7457677238116361, "calib/avg_num_step_conf": 4.83984375, "calib/ece": 0.18149193548387108, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7016129032258065, "calib/gap": 0.27782730834627056, "calib/mean_conf": 0.8316532258064516, "calib/mu_c": 0.9223952095808384, "calib/mu_w": 0.6445679012345679, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.1698790322580646, "calib/std_conf": 0.27590337089321565, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.6351222651222651, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.14395343395343396, "calib/step_q_w": 0.49116883116883114, "calib/step_q_w_n": 462.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2503.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 506.25390625, "completions/mean_terminated_length": 508.2392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.1152, "grad_norm": 0.029269879683852196, "kl": 0.087646484375, "learning_rate": 2.5555555555555557e-06, "loss": -0.0016, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034039564430713654, "mask/share_reasoning": 0.8554302453994751, "mask/share_step_conf": 0.10662389546632767, "num_tokens": 25637424.0, "reward": 0.7967321872711182, "reward_std": 0.17280207574367523, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7668719291687012, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8265925049781799, "step": 108 }, { "adv/mean_abs_final_conf": 0.6863927841186523, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7505053281784058, "adv/std_final_conf": 0.8743253350257874, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346165060997009, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.8137633970967304, "calib/avg_num_step_conf": 5.29296875, "calib/ece": 0.22839506172839505, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5308641975308642, "calib/gap": 0.38043345543345564, "calib/mean_conf": 0.6843621399176956, "calib/mu_c": 0.8816239316239318, "calib/mu_w": 0.5011904761904762, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.2156378600823045, "calib/std_conf": 0.3631298338915262, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.6091071428571428, "calib/step_q_c_n": 560.0, "calib/step_q_gap": 0.22157255166217432, "calib/step_q_w": 0.3875345911949685, "calib/step_q_w_n": 795.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2714.0, "completions/max_terminated_length": 2714.0, "completions/mean_length": 557.12890625, "completions/mean_terminated_length": 557.12890625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.11626666666666667, "grad_norm": 0.027278034016489983, "kl": 0.09093475341796875, "learning_rate": 2.5277777777777778e-06, "loss": -0.1032, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.032528433948755264, "mask/share_reasoning": 0.8628048896789551, "mask/share_step_conf": 0.10466665029525757, "num_tokens": 25884649.0, "reward": 0.7528167963027954, "reward_std": 0.2184036374092102, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.7128750085830688, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7927586436271667, "step": 109 }, { "adv/mean_abs_final_conf": 0.7272202968597412, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7695968151092529, "adv/std_final_conf": 0.9092324376106262, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345746040344238, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6763870967741936, "calib/avg_num_step_conf": 4.5390625, "calib/ece": 0.26220883534136546, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5622489959839357, "calib/gap": 0.2412077419354839, "calib/mean_conf": 0.740120481927711, "calib/mu_c": 0.86024, "calib/mu_w": 0.6190322580645161, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.25016064257028114, "calib/std_conf": 0.3194931753552225, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5981086142322097, "calib/step_q_c_n": 534.0, "calib/step_q_gap": 0.12742390085641359, "calib/step_q_w": 0.47068471337579615, "calib/step_q_w_n": 628.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 449.93359375, "completions/mean_terminated_length": 451.69805908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.11733333333333333, "grad_norm": 0.05075117200613022, "kl": 0.104644775390625, "learning_rate": 2.5e-06, "loss": -0.1052, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.036121100187301636, "mask/share_reasoning": 0.8499696850776672, "mask/share_step_conf": 0.11000296473503113, "num_tokens": 26104752.0, "reward": 0.7489697933197021, "reward_std": 0.19645348191261292, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6923683285713196, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8055711984634399, "step": 110 }, { "adv/mean_abs_final_conf": 0.6510974168777466, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7645963430404663, "adv/std_final_conf": 0.861767590045929, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355725646018982, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7273393761663556, "calib/avg_num_step_conf": 4.20703125, "calib/ece": 0.2798367346938776, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6204081632653061, "calib/gap": 0.2111796854172222, "calib/mean_conf": 0.778122448979592, "calib/mu_c": 0.8824193548387098, "calib/mu_w": 0.6712396694214876, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.27591836734693886, "calib/std_conf": 0.31335144464864084, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.5854054054054054, "calib/step_q_c_n": 518.0, "calib/step_q_gap": 0.11105835710486883, "calib/step_q_w": 0.4743470483005366, "calib/step_q_w_n": 559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2710.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 505.3671875, "completions/mean_terminated_length": 505.3671875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.1184, "grad_norm": 0.029809826985001564, "kl": 0.09429168701171875, "learning_rate": 2.4722222222222226e-06, "loss": -0.2014, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03598381206393242, "mask/share_reasoning": 0.8623403906822205, "mask/share_step_conf": 0.10167580842971802, "num_tokens": 26341534.0, "reward": 0.7034972310066223, "reward_std": 0.23738251626491547, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6384176015853882, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7685769200325012, "step": 111 }, { "adv/mean_abs_final_conf": 0.6813622713088989, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7548239231109619, "adv/std_final_conf": 0.8669583797454834, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353672862052917, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8494515108688336, "calib/avg_num_step_conf": 4.671875, "calib/ece": 0.16639344262295083, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5163934426229508, "calib/gap": 0.4569513426206338, "calib/mean_conf": 0.6565573770491803, "calib/mu_c": 0.8756692913385826, "calib/mu_w": 0.4187179487179488, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.15122950819672132, "calib/std_conf": 0.3894379524892531, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5153577371048254, "calib/step_q_c_n": 601.0, "calib/step_q_gap": 0.16880311525608582, "calib/step_q_w": 0.34655462184873953, "calib/step_q_w_n": 595.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2866.0, "completions/max_terminated_length": 2866.0, "completions/mean_length": 557.4453125, "completions/mean_terminated_length": 559.6314086914062, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.11946666666666667, "grad_norm": 0.4219500720500946, "kl": 0.161285400390625, "learning_rate": 2.4444444444444447e-06, "loss": 0.0028, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.031249389052391052, "mask/share_reasoning": 0.868740439414978, "mask/share_step_conf": 0.09610390663146973, "num_tokens": 26592160.0, "reward": 0.778309166431427, "reward_std": 0.21401192247867584, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7561488151550293, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8004695773124695, "step": 112 }, { "adv/mean_abs_final_conf": 0.731112003326416, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7389805316925049, "adv/std_final_conf": 0.8750357627868652, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351128935813904, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7472007722007723, "calib/avg_num_step_conf": 5.1796875, "calib/ece": 0.23422310756972098, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6055776892430279, "calib/gap": 0.2579607464607463, "calib/mean_conf": 0.7602788844621514, "calib/mu_c": 0.8743571428571428, "calib/mu_w": 0.6163963963963965, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2183665338645417, "calib/std_conf": 0.3249215687040707, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4958333333333333, "calib/step_q_c_n": 684.0, "calib/step_q_gap": 0.1295093457943925, "calib/step_q_w": 0.3663239875389408, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 481.0234375, "completions/mean_terminated_length": 481.0234375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.12053333333333334, "grad_norm": 0.02653811313211918, "kl": 0.1146087646484375, "learning_rate": 2.4166666666666667e-06, "loss": 0.0389, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.0368918776512146, "mask/share_reasoning": 0.8495938777923584, "mask/share_step_conf": 0.11351422965526581, "num_tokens": 26820502.0, "reward": 0.774255633354187, "reward_std": 0.21949368715286255, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7130206823348999, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8354905843734741, "step": 113 }, { "adv/mean_abs_final_conf": 0.6168874502182007, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7286229133605957, "adv/std_final_conf": 0.8332869410514832, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345226287841797, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7605859923740718, "calib/avg_num_step_conf": 5.49609375, "calib/ece": 0.2521200000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.696, "calib/gap": 0.2965569603317947, "calib/mean_conf": 0.82508, "calib/mu_c": 0.9425165562913906, "calib/mu_w": 0.645959595959596, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2366000000000001, "calib/std_conf": 0.2983182756721418, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5084565499351491, "calib/step_q_c_n": 771.0, "calib/step_q_gap": 0.14667982037540062, "calib/step_q_w": 0.3617767295597485, "calib/step_q_w_n": 636.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 468.2265625, "completions/mean_terminated_length": 473.7786865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.1216, "grad_norm": 0.034657854586839676, "kl": 0.11187744140625, "learning_rate": 2.388888888888889e-06, "loss": -0.0766, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03760165348649025, "mask/share_reasoning": 0.8239021301269531, "mask/share_step_conf": 0.12677747011184692, "num_tokens": 27045392.0, "reward": 0.7951518297195435, "reward_std": 0.19239452481269836, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7430340051651001, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8472696542739868, "step": 114 }, { "adv/mean_abs_final_conf": 0.738232433795929, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7586345672607422, "adv/std_final_conf": 0.8943834900856018, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354433417320251, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6038682864450128, "calib/avg_num_step_conf": 5.109375, "calib/ece": 0.3742231075697211, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6573705179282868, "calib/gap": 0.08126214833759571, "calib/mean_conf": 0.8058565737051795, "calib/mu_c": 0.8430882352941176, "calib/mu_w": 0.7618260869565219, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3191235059760956, "calib/std_conf": 0.3182127056584067, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.440656634746922, "calib/step_q_c_n": 731.0, "calib/step_q_gap": 0.028663567155934155, "calib/step_q_w": 0.41199306759098786, "calib/step_q_w_n": 577.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2734.0, "completions/max_terminated_length": 2734.0, "completions/mean_length": 457.87109375, "completions/mean_terminated_length": 459.66668701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.12266666666666666, "grad_norm": 0.035833049565553665, "kl": 0.11376953125, "learning_rate": 2.361111111111111e-06, "loss": -0.0914, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03642076253890991, "mask/share_reasoning": 0.8400872349739075, "mask/share_step_conf": 0.11958577483892441, "num_tokens": 27267871.0, "reward": 0.6914719343185425, "reward_std": 0.21936823427677155, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5942429304122925, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7887008190155029, "step": 115 }, { "adv/mean_abs_final_conf": 0.7463953495025635, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7141386866569519, "adv/std_final_conf": 0.9021322131156921, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350417256355286, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7284679878048781, "calib/avg_num_step_conf": 5.58203125, "calib/ece": 0.28270916334661356, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5816733067729084, "calib/gap": 0.2593025914634147, "calib/mean_conf": 0.7619123505976096, "calib/mu_c": 0.8941463414634148, "calib/mu_w": 0.63484375, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.2772908366533865, "calib/std_conf": 0.32402278504134363, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.43413078149920264, "calib/step_q_c_n": 627.0, "calib/step_q_gap": 0.07578913561391587, "calib/step_q_w": 0.3583416458852868, "calib/step_q_w_n": 802.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 516.63671875, "completions/mean_terminated_length": 518.6627807617188, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.12373333333333333, "grad_norm": 0.03071327693760395, "kl": 0.10791015625, "learning_rate": 2.3333333333333336e-06, "loss": -0.1317, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032803844660520554, "mask/share_reasoning": 0.8485397100448608, "mask/share_step_conf": 0.11475016176700592, "num_tokens": 27504650.0, "reward": 0.7508207559585571, "reward_std": 0.2131670117378235, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.676442563533783, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8251989483833313, "step": 116 }, { "adv/mean_abs_final_conf": 0.7544440031051636, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7447916269302368, "adv/std_final_conf": 0.9069930911064148, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348868131637573, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7259875541125542, "calib/avg_num_step_conf": 4.9453125, "calib/ece": 0.38098360655737695, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5368852459016393, "calib/gap": 0.1711742424242425, "calib/mean_conf": 0.7961475409836066, "calib/mu_c": 0.88875, "calib/mu_w": 0.7175757575757575, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3590573770491802, "calib/std_conf": 0.2844391857019804, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.44404444444444446, "calib/step_q_c_n": 495.0, "calib/step_q_gap": 0.08029606571552095, "calib/step_q_w": 0.3637483787289235, "calib/step_q_w_n": 771.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 492.6796875, "completions/mean_terminated_length": 494.6117858886719, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.1248, "grad_norm": 0.03247150406241417, "kl": 0.11163330078125, "learning_rate": 2.305555555555556e-06, "loss": -0.1601, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03434731066226959, "mask/share_reasoning": 0.847392737865448, "mask/share_step_conf": 0.11435367912054062, "num_tokens": 27737376.0, "reward": 0.705039381980896, "reward_std": 0.21480512619018555, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.607038676738739, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8030400276184082, "step": 117 }, { "adv/mean_abs_final_conf": 0.7744559049606323, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7268549799919128, "adv/std_final_conf": 0.9298220872879028, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353885650634766, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6918957284810944, "calib/avg_num_step_conf": 5.73828125, "calib/ece": 0.3121602409638554, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5622489959839357, "calib/gap": 0.18730857530003875, "calib/mean_conf": 0.7585224899598393, "calib/mu_c": 0.8510484126984126, "calib/mu_w": 0.6637398373983738, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2823293172690763, "calib/std_conf": 0.3323967390102428, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4162891207153503, "calib/step_q_c_n": 671.0, "calib/step_q_gap": 0.07127658938702952, "calib/step_q_w": 0.34501253132832077, "calib/step_q_w_n": 798.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 512.6796875, "completions/mean_terminated_length": 518.7589111328125, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.12586666666666665, "grad_norm": 0.022973336279392242, "kl": 0.111297607421875, "learning_rate": 2.277777777777778e-06, "loss": -0.1643, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03558062016963959, "mask/share_reasoning": 0.8346555829048157, "mask/share_step_conf": 0.11804504692554474, "num_tokens": 27972630.0, "reward": 0.7236074209213257, "reward_std": 0.2213224172592163, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6395763754844666, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8076384663581848, "step": 118 }, { "adv/mean_abs_final_conf": 0.7573567628860474, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7542221546173096, "adv/std_final_conf": 0.9039705395698547, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353067874908447, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7770766559829061, "calib/avg_num_step_conf": 4.984375, "calib/ece": 0.27346122448979593, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5306122448979592, "calib/gap": 0.32107946047008573, "calib/mean_conf": 0.6977224489795919, "calib/mu_c": 0.8654700854700856, "calib/mu_w": 0.5443906249999999, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.2468163265306123, "calib/std_conf": 0.3755614742284074, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.43715547703180213, "calib/step_q_c_n": 566.0, "calib/step_q_gap": 0.11232449111630916, "calib/step_q_w": 0.324830985915493, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 540.078125, "completions/mean_terminated_length": 542.1961059570312, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.12693333333333334, "grad_norm": 0.020019669085741043, "kl": 0.11639404296875, "learning_rate": 2.25e-06, "loss": -0.0749, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03293025493621826, "mask/share_reasoning": 0.8575558662414551, "mask/share_step_conf": 0.10560765862464905, "num_tokens": 28215954.0, "reward": 0.7540497183799744, "reward_std": 0.22805644571781158, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.6844784021377563, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8236210346221924, "step": 119 }, { "adv/mean_abs_final_conf": 0.7578812837600708, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7718867063522339, "adv/std_final_conf": 0.9244391322135925, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350153207778931, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7429498122226015, "calib/avg_num_step_conf": 5.09375, "calib/ece": 0.20085365853658543, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.508130081300813, "calib/gap": 0.3276824854899283, "calib/mean_conf": 0.6976016260162602, "calib/mu_c": 0.8321379310344827, "calib/mu_w": 0.5044554455445545, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.1545121951219513, "calib/std_conf": 0.36518780645182547, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.41482662968099865, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.11378031750261097, "calib/step_q_w": 0.3010463121783877, "calib/step_q_w_n": 583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 495.9375, "completions/mean_terminated_length": 495.9375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.128, "grad_norm": 0.021387485787272453, "kl": 0.1209259033203125, "learning_rate": 2.222222222222222e-06, "loss": -0.0694, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.0344981849193573, "mask/share_reasoning": 0.8499711751937866, "mask/share_step_conf": 0.11553068459033966, "num_tokens": 28449602.0, "reward": 0.7760567665100098, "reward_std": 0.21278806030750275, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7316077947616577, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.820505678653717, "step": 120 }, { "adv/mean_abs_final_conf": 0.7948448657989502, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.77760249376297, "adv/std_final_conf": 0.9363829493522644, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354314804077148, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7047554347826087, "calib/avg_num_step_conf": 5.75390625, "calib/ece": 0.2051028806584362, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.3991769547325103, "calib/gap": 0.256281929347826, "calib/mean_conf": 0.671604938271605, "calib/mu_c": 0.792890625, "calib/mu_w": 0.536608695652174, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.17497942386831278, "calib/std_conf": 0.34387571969084624, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.3972486686390532, "calib/step_q_c_n": 676.0, "calib/step_q_gap": 0.06644440264156265, "calib/step_q_w": 0.3308042659974906, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2666.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 556.609375, "completions/mean_terminated_length": 563.2095336914062, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.12906666666666666, "grad_norm": 0.02432946488261223, "kl": 0.1153717041015625, "learning_rate": 2.1944444444444445e-06, "loss": -0.0982, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.030991578474640846, "mask/share_reasoning": 0.848613440990448, "mask/share_step_conf": 0.1086762472987175, "num_tokens": 28697150.0, "reward": 0.742145836353302, "reward_std": 0.22360759973526, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6939094066619873, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7903823852539062, "step": 121 }, { "adv/mean_abs_final_conf": 0.7506898641586304, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7413808107376099, "adv/std_final_conf": 0.9135042428970337, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934847354888916, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7421900799154613, "calib/avg_num_step_conf": 5.71875, "calib/ece": 0.1756000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.456, "calib/gap": 0.3090033683376263, "calib/mean_conf": 0.68616, "calib/mu_c": 0.8134693877551021, "calib/mu_w": 0.5044660194174758, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1368800000000001, "calib/std_conf": 0.35813245370951796, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4297733333333334, "calib/step_q_c_n": 750.0, "calib/step_q_gap": 0.1313027450980393, "calib/step_q_w": 0.2984705882352941, "calib/step_q_w_n": 714.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 505.79296875, "completions/mean_terminated_length": 507.7764892578125, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.13013333333333332, "grad_norm": 0.019587241113185883, "kl": 0.1252899169921875, "learning_rate": 2.166666666666667e-06, "loss": -0.0778, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03283946216106415, "mask/share_reasoning": 0.8448941707611084, "mask/share_step_conf": 0.11836010217666626, "num_tokens": 28933977.0, "reward": 0.785601019859314, "reward_std": 0.19185268878936768, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7401956915855408, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8310062885284424, "step": 122 }, { "adv/mean_abs_final_conf": 0.7581375241279602, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.749475359916687, "adv/std_final_conf": 0.9185908436775208, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353261590003967, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.687900826446281, "calib/avg_num_step_conf": 5.7109375, "calib/ece": 0.21955284552845528, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.3252032520325203, "calib/gap": 0.21208528925619852, "calib/mean_conf": 0.6276016260162601, "calib/mu_c": 0.7319200000000001, "calib/mu_w": 0.5198347107438016, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.16951219512195123, "calib/std_conf": 0.34155124934966263, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.367410071942446, "calib/step_q_c_n": 695.0, "calib/step_q_gap": 0.0253109845891214, "calib/step_q_w": 0.3420990873533246, "calib/step_q_w_n": 767.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 570.0, "completions/mean_terminated_length": 574.4881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.1312, "grad_norm": 0.022155309095978737, "kl": 0.11624908447265625, "learning_rate": 2.138888888888889e-06, "loss": -0.078, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.029910586774349213, "mask/share_reasoning": 0.853052020072937, "mask/share_step_conf": 0.10922486335039139, "num_tokens": 29185185.0, "reward": 0.7430111765861511, "reward_std": 0.1918821930885315, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6940113306045532, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.792011022567749, "step": 123 }, { "adv/mean_abs_final_conf": 0.7517533302307129, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7596127986907959, "adv/std_final_conf": 0.9228248596191406, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353944659233093, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7227831715210356, "calib/avg_num_step_conf": 5.00390625, "calib/ece": 0.16616877470355734, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4031620553359684, "calib/gap": 0.27088122977346263, "calib/mean_conf": 0.6841869565217391, "calib/mu_c": 0.7944666666666665, "calib/mu_w": 0.5235854368932039, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.12873517786561267, "calib/std_conf": 0.33946601899337664, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.42114845938375356, "calib/step_q_c_n": 714.0, "calib/step_q_gap": 0.07587509077705162, "calib/step_q_w": 0.34527336860670194, "calib/step_q_w_n": 567.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2269.0, "completions/max_terminated_length": 2269.0, "completions/mean_length": 493.35546875, "completions/mean_terminated_length": 493.35546875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.13226666666666667, "grad_norm": 0.03565501794219017, "kl": 0.12664794921875, "learning_rate": 2.1111111111111114e-06, "loss": -0.0886, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03346759080886841, "mask/share_reasoning": 0.8561069369316101, "mask/share_step_conf": 0.11042545735836029, "num_tokens": 29418300.0, "reward": 0.7763141989707947, "reward_std": 0.18388475477695465, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7414895296096802, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8111388683319092, "step": 124 }, { "adv/mean_abs_final_conf": 0.7748832106590271, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7449538707733154, "adv/std_final_conf": 0.9294499158859253, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354871511459351, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6227737905369484, "calib/avg_num_step_conf": 5.40625, "calib/ece": 0.2821544715447155, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.3617886178861789, "calib/gap": 0.1516347687400318, "calib/mean_conf": 0.6411788617886177, "calib/mu_c": 0.7225438596491228, "calib/mu_w": 0.570909090909091, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.229959349593496, "calib/std_conf": 0.35106549854496266, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4089411764705882, "calib/step_q_c_n": 595.0, "calib/step_q_gap": 0.04208439573548056, "calib/step_q_w": 0.36685678073510763, "calib/step_q_w_n": 789.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2594.0, "completions/max_terminated_length": 2594.0, "completions/mean_length": 528.625, "completions/mean_terminated_length": 530.6980590820312, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.13333333333333333, "grad_norm": 0.024459734559059143, "kl": 0.11830902099609375, "learning_rate": 2.0833333333333334e-06, "loss": -0.0704, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03457402065396309, "mask/share_reasoning": 0.8480210304260254, "mask/share_step_conf": 0.11349868029356003, "num_tokens": 29658436.0, "reward": 0.7104007005691528, "reward_std": 0.21995657682418823, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6415195465087891, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.7792819142341614, "step": 125 }, { "adv/mean_abs_final_conf": 0.744687557220459, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7507373094558716, "adv/std_final_conf": 0.9142650961875916, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354282021522522, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7776556044248968, "calib/avg_num_step_conf": 5.7734375, "calib/ece": 0.23142857142857154, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4326530612244898, "calib/gap": 0.34006064240970274, "calib/mean_conf": 0.636734693877551, "calib/mu_c": 0.8074590163934426, "calib/mu_w": 0.4673983739837399, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.18510204081632667, "calib/std_conf": 0.38000530477018046, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4600654664484452, "calib/step_q_c_n": 611.0, "calib/step_q_gap": 0.1603538170828166, "calib/step_q_w": 0.2997116493656286, "calib/step_q_w_n": 867.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 544.24609375, "completions/mean_terminated_length": 544.24609375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.1344, "grad_norm": 0.024065330624580383, "kl": 0.1114044189453125, "learning_rate": 2.0555555555555555e-06, "loss": -0.0088, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03621228411793709, "mask/share_reasoning": 0.8417133092880249, "mask/share_step_conf": 0.12207438796758652, "num_tokens": 29903227.0, "reward": 0.7633358836174011, "reward_std": 0.1952163577079773, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7203140258789062, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.806357741355896, "step": 126 }, { "adv/mean_abs_final_conf": 0.7594637274742126, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7706195116043091, "adv/std_final_conf": 0.9249746203422546, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354552030563354, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7582298786441624, "calib/avg_num_step_conf": 6.2578125, "calib/ece": 0.20667634854771783, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.31950207468879666, "calib/gap": 0.3310576091505092, "calib/mean_conf": 0.5978049792531119, "calib/mu_c": 0.7818785046728972, "calib/mu_w": 0.450820895522388, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.18024896265560164, "calib/std_conf": 0.36942244838314564, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.45648788927335643, "calib/step_q_c_n": 578.0, "calib/step_q_gap": 0.16179745958585645, "calib/step_q_w": 0.2946904296875, "calib/step_q_w_n": 1024.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2815.0, "completions/max_terminated_length": 2815.0, "completions/mean_length": 562.96484375, "completions/mean_terminated_length": 565.172607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.13546666666666668, "grad_norm": 0.022668741643428802, "kl": 0.118194580078125, "learning_rate": 2.027777777777778e-06, "loss": -0.0401, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03429264575242996, "mask/share_reasoning": 0.8373519778251648, "mask/share_step_conf": 0.12444911152124405, "num_tokens": 30151018.0, "reward": 0.7562384009361267, "reward_std": 0.21813994646072388, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.7046475410461426, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8078293204307556, "step": 127 }, { "adv/mean_abs_final_conf": 0.7670286893844604, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7674146294593811, "adv/std_final_conf": 0.9218215942382812, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352871775627136, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7272982456140351, "calib/avg_num_step_conf": 4.7734375, "calib/ece": 0.2316849372384937, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.35564853556485354, "calib/gap": 0.2875707017543861, "calib/mean_conf": 0.6113276150627615, "calib/mu_c": 0.7617307017543861, "calib/mu_w": 0.47415999999999997, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.18301255230125524, "calib/std_conf": 0.3756346130230901, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.4362128731343283, "calib/step_q_c_n": 536.0, "calib/step_q_gap": 0.10781972444628168, "calib/step_q_w": 0.32839314868804664, "calib/step_q_w_n": 686.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2969.0, "completions/max_terminated_length": 2969.0, "completions/mean_length": 537.8359375, "completions/mean_terminated_length": 539.9451293945312, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.13653333333333334, "grad_norm": 0.02352401241660118, "kl": 0.118560791015625, "learning_rate": 2.0000000000000003e-06, "loss": -0.114, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.035591743886470795, "mask/share_reasoning": 0.8582073450088501, "mask/share_step_conf": 0.10229466110467911, "num_tokens": 30395368.0, "reward": 0.7304539680480957, "reward_std": 0.22143486142158508, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.681101381778717, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.7798066139221191, "step": 128 }, { "adv/mean_abs_final_conf": 0.7487454414367676, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7624181509017944, "adv/std_final_conf": 0.9342681169509888, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351850748062134, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6731820493262142, "calib/avg_num_step_conf": 5.31640625, "calib/ece": 0.23396785714285712, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.39285714285714285, "calib/gap": 0.2077184210526316, "calib/mean_conf": 0.6626988095238094, "calib/mu_c": 0.7566666666666667, "calib/mu_w": 0.5489482456140351, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.17452380952380953, "calib/std_conf": 0.3514597368484289, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.43732620320855614, "calib/step_q_c_n": 748.0, "calib/step_q_gap": 0.055645942197136866, "calib/step_q_w": 0.3816802610114193, "calib/step_q_w_n": 613.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 462.25, "completions/mean_terminated_length": 464.0627746582031, "completions/min_length": 0.0, "completions/min_terminated_length": 74.0, "epoch": 0.1376, "grad_norm": 0.0282196793705225, "kl": 0.1335296630859375, "learning_rate": 1.9722222222222224e-06, "loss": -0.1281, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03705020248889923, "mask/share_reasoning": 0.8317481875419617, "mask/share_step_conf": 0.1272953748703003, "num_tokens": 30616088.0, "reward": 0.7445838451385498, "reward_std": 0.18311186134815216, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6939468383789062, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7952207326889038, "step": 129 }, { "adv/mean_abs_final_conf": 0.7417335510253906, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7767922282218933, "adv/std_final_conf": 0.8977218866348267, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350262880325317, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7888799355358582, "calib/avg_num_step_conf": 4.875, "calib/ece": 0.17321236559139783, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4717741935483871, "calib/gap": 0.33700913242009123, "calib/mean_conf": 0.6984005376344086, "calib/mu_c": 0.8370091324200913, "calib/mu_w": 0.5000000000000001, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1414516129032258, "calib/std_conf": 0.3465609264910288, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.47448717948717956, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.12732600732600735, "calib/step_q_w": 0.3471611721611722, "calib/step_q_w_n": 546.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 461.69140625, "completions/mean_terminated_length": 461.69140625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.13866666666666666, "grad_norm": 0.02844446338713169, "kl": 0.129486083984375, "learning_rate": 1.944444444444445e-06, "loss": -0.0903, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03700219839811325, "mask/share_reasoning": 0.8433710336685181, "mask/share_step_conf": 0.11962681263685226, "num_tokens": 30839569.0, "reward": 0.7797413468360901, "reward_std": 0.1676107943058014, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7607482671737671, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.7987344264984131, "step": 130 }, { "adv/mean_abs_final_conf": 0.7331950664520264, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.751865029335022, "adv/std_final_conf": 0.9233747124671936, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350845813751221, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7598986486486486, "calib/avg_num_step_conf": 5.07421875, "calib/ece": 0.20532258064516135, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.2661290322580645, "calib/gap": 0.32641621621621636, "calib/mean_conf": 0.5329032258064516, "calib/mu_c": 0.7277000000000001, "calib/mu_w": 0.40128378378378377, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1675000000000001, "calib/std_conf": 0.3696538795072432, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4218803418803419, "calib/step_q_c_n": 468.0, "calib/step_q_gap": 0.10694652719923481, "calib/step_q_w": 0.3149338146811071, "calib/step_q_w_n": 831.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2406.0, "completions/max_terminated_length": 2406.0, "completions/mean_length": 481.13671875, "completions/mean_terminated_length": 483.0235595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.13973333333333332, "grad_norm": 0.02161959744989872, "kl": 0.125335693359375, "learning_rate": 1.916666666666667e-06, "loss": -0.0873, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.034892257302999496, "mask/share_reasoning": 0.8474602699279785, "mask/share_step_conf": 0.11374115198850632, "num_tokens": 31068948.0, "reward": 0.7788792848587036, "reward_std": 0.17850662767887115, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.7352492213249207, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8225093483924866, "step": 131 }, { "adv/mean_abs_final_conf": 0.7242047190666199, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7435998916625977, "adv/std_final_conf": 0.8903018832206726, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353998303413391, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7301073021181717, "calib/avg_num_step_conf": 5.31640625, "calib/ece": 0.24270161290322584, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4959677419354839, "calib/gap": 0.24520345596432558, "calib/mean_conf": 0.6760887096774193, "calib/mu_c": 0.767051282051282, "calib/mu_w": 0.5218478260869565, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.14487903225806456, "calib/std_conf": 0.379004990010915, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.48695418848167543, "calib/step_q_c_n": 764.0, "calib/step_q_gap": 0.11089053689038564, "calib/step_q_w": 0.3760636515912898, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2574.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 516.24609375, "completions/mean_terminated_length": 518.2706298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.1408, "grad_norm": 0.030215511098504066, "kl": 0.116729736328125, "learning_rate": 1.888888888888889e-06, "loss": -0.0388, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03585230931639671, "mask/share_reasoning": 0.8398134708404541, "mask/share_step_conf": 0.12042798101902008, "num_tokens": 31306699.0, "reward": 0.7527471780776978, "reward_std": 0.21779859066009521, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.6969640254974365, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8085302114486694, "step": 132 }, { "adv/mean_abs_final_conf": 0.7842279672622681, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7547422647476196, "adv/std_final_conf": 0.9241669774055481, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354990124702454, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7024958632101489, "calib/avg_num_step_conf": 5.7734375, "calib/ece": 0.2579268292682927, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.34552845528455284, "calib/gap": 0.2425606729178158, "calib/mean_conf": 0.5730487804878048, "calib/mu_c": 0.7189795918367347, "calib/mu_w": 0.47641891891891885, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.21630081300813012, "calib/std_conf": 0.375989823285099, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.43455341506129597, "calib/step_q_c_n": 571.0, "calib/step_q_gap": 0.06357215817044698, "calib/step_q_w": 0.370981256890849, "calib/step_q_w_n": 907.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2612.0, "completions/max_terminated_length": 2612.0, "completions/mean_length": 562.07421875, "completions/mean_terminated_length": 566.5, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.14186666666666667, "grad_norm": 0.03317323327064514, "kl": 0.11187744140625, "learning_rate": 1.8611111111111113e-06, "loss": -0.0778, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03122769482433796, "mask/share_reasoning": 0.855074405670166, "mask/share_step_conf": 0.10588540881872177, "num_tokens": 31556934.0, "reward": 0.7412779927253723, "reward_std": 0.22184857726097107, "rewards/accuracy_reward_step": 0.3828125, "rewards/final_brier_reward_step": 0.6710578203201294, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8114981651306152, "step": 133 }, { "adv/mean_abs_final_conf": 0.7846685647964478, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7685303688049316, "adv/std_final_conf": 0.9300345182418823, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356147646903992, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.6804602630074328, "calib/avg_num_step_conf": 4.36328125, "calib/ece": 0.2869747899159665, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.38235294117647056, "calib/gap": 0.22956975414522585, "calib/mean_conf": 0.5912605042016807, "calib/mu_c": 0.7185849056603775, "calib/mu_w": 0.4890151515151516, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.92578125, "calib/pce": 0.21642857142857153, "calib/std_conf": 0.38636325186236553, "calib/step_conf_rate": 0.92578125, "calib/step_q_c": 0.4460961538461538, "calib/step_q_c_n": 520.0, "calib/step_q_gap": 0.08304757763174841, "calib/step_q_w": 0.3630485762144054, "calib/step_q_w_n": 597.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 551.98828125, "completions/mean_terminated_length": 554.1529541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.14293333333333333, "grad_norm": 0.02674867957830429, "kl": 0.10691070556640625, "learning_rate": 1.8333333333333333e-06, "loss": -0.1443, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.030389247462153435, "mask/share_reasoning": 0.8766155242919922, "mask/share_step_conf": 0.08908900618553162, "num_tokens": 31807195.0, "reward": 0.6885699033737183, "reward_std": 0.25885897874832153, "rewards/accuracy_reward_step": 0.4140625, "rewards/final_brier_reward_step": 0.6310105323791504, "rewards/format_reward_step": 0.8984375, "rewards/step_l2_reward": 0.7461292743682861, "step": 134 }, { "adv/mean_abs_final_conf": 0.7832664251327515, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7556527853012085, "adv/std_final_conf": 0.9360292553901672, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9355044364929199, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6229491525423729, "calib/avg_num_step_conf": 5.84765625, "calib/ece": 0.2741316872427983, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.37448559670781895, "calib/gap": 0.14562169491525412, "calib/mean_conf": 0.6468065843621398, "calib/mu_c": 0.7175199999999998, "calib/mu_w": 0.5718983050847457, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.20326748971193412, "calib/std_conf": 0.3520078674150313, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.4287592319054653, "calib/step_q_c_n": 677.0, "calib/step_q_gap": 0.05091776849083113, "calib/step_q_w": 0.37784146341463415, "calib/step_q_w_n": 820.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2560.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 583.97265625, "completions/mean_terminated_length": 588.5708618164062, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.144, "grad_norm": 0.01977521926164627, "kl": 0.1051788330078125, "learning_rate": 1.8055555555555557e-06, "loss": -0.1116, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.032413728535175323, "mask/share_reasoning": 0.8518058061599731, "mask/share_step_conf": 0.10796800255775452, "num_tokens": 32062572.0, "reward": 0.7115935683250427, "reward_std": 0.21959969401359558, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6358980536460876, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7872891426086426, "step": 135 }, { "adv/mean_abs_final_conf": 0.7137893438339233, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7463816404342651, "adv/std_final_conf": 0.9127443432807922, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350497126579285, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8045137736475274, "calib/avg_num_step_conf": 5.58203125, "calib/ece": 0.17386178861788626, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.35365853658536583, "calib/gap": 0.4000192499170262, "calib/mean_conf": 0.5577642276422763, "calib/mu_c": 0.7707826086956522, "calib/mu_w": 0.370763358778626, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.13207317073170738, "calib/std_conf": 0.3906034019080313, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4524784853700516, "calib/step_q_c_n": 581.0, "calib/step_q_gap": 0.1576671646153346, "calib/step_q_w": 0.294811320754717, "calib/step_q_w_n": 848.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 508.5703125, "completions/mean_terminated_length": 512.5748291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.14506666666666668, "grad_norm": 0.021196873858571053, "kl": 0.1201629638671875, "learning_rate": 1.777777777777778e-06, "loss": -0.1318, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03404550999403, "mask/share_reasoning": 0.8395554423332214, "mask/share_step_conf": 0.11858654022216797, "num_tokens": 32301254.0, "reward": 0.7984526753425598, "reward_std": 0.20844683051109314, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7547625303268433, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8421428203582764, "step": 136 }, { "adv/mean_abs_final_conf": 0.7769197821617126, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7149304151535034, "adv/std_final_conf": 0.9298633337020874, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9358368515968323, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6837907037815125, "calib/avg_num_step_conf": 5.7421875, "calib/ece": 0.23072874493927123, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.31983805668016196, "calib/gap": 0.208855698529412, "calib/mean_conf": 0.5788259109311741, "calib/mu_c": 0.6870588235294119, "calib/mu_w": 0.47820312499999995, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.16388663967611333, "calib/std_conf": 0.36934321570666856, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.42313432835820897, "calib/step_q_c_n": 670.0, "calib/step_q_gap": 0.05207182835820895, "calib/step_q_w": 0.3710625, "calib/step_q_w_n": 800.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 526.03515625, "completions/mean_terminated_length": 528.0980834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.14613333333333334, "grad_norm": 0.024452555924654007, "kl": 0.1158905029296875, "learning_rate": 1.75e-06, "loss": -0.1716, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03136240318417549, "mask/share_reasoning": 0.8453494310379028, "mask/share_step_conf": 0.11938194930553436, "num_tokens": 32542903.0, "reward": 0.7437588572502136, "reward_std": 0.24398964643478394, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6762160062789917, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8113017082214355, "step": 137 }, { "adv/mean_abs_final_conf": 0.7433382868766785, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7487839460372925, "adv/std_final_conf": 0.9363875389099121, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9357391595840454, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7615031757233592, "calib/avg_num_step_conf": 4.921875, "calib/ece": 0.17493723849372372, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.40585774058577406, "calib/gap": 0.3412371206774876, "calib/mean_conf": 0.624142259414226, "calib/mu_c": 0.7797692307692308, "calib/mu_w": 0.43853211009174314, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.12757322175732205, "calib/std_conf": 0.374984513840238, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.48342105263157886, "calib/step_q_c_n": 608.0, "calib/step_q_gap": 0.1258750403616402, "calib/step_q_w": 0.35754601226993865, "calib/step_q_w_n": 652.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2417.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 513.0859375, "completions/mean_terminated_length": 513.0859375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.1472, "grad_norm": 0.024063071236014366, "kl": 0.1184539794921875, "learning_rate": 1.7222222222222224e-06, "loss": -0.1174, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.03499140590429306, "mask/share_reasoning": 0.856407105922699, "mask/share_step_conf": 0.10860144346952438, "num_tokens": 32778589.0, "reward": 0.7396991848945618, "reward_std": 0.25229695439338684, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7052828073501587, "rewards/format_reward_step": 0.91015625, "rewards/step_l2_reward": 0.7741155624389648, "step": 138 }, { "adv/mean_abs_final_conf": 0.774564266204834, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7389470338821411, "adv/std_final_conf": 0.9210377931594849, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353828430175781, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7207237965175828, "calib/avg_num_step_conf": 4.7578125, "calib/ece": 0.18032520325203244, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.35365853658536583, "calib/gap": 0.2964711505633322, "calib/mean_conf": 0.5891056910569107, "calib/mu_c": 0.7108275862068965, "calib/mu_w": 0.4143564356435643, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.08999999999999994, "calib/std_conf": 0.3777726940126769, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.45176470588235296, "calib/step_q_c_n": 697.0, "calib/step_q_gap": 0.08779157728350462, "calib/step_q_w": 0.36397312859884834, "calib/step_q_w_n": 521.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2171.0, "completions/max_terminated_length": 2171.0, "completions/mean_length": 455.25390625, "completions/mean_terminated_length": 455.25390625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.14826666666666666, "grad_norm": 0.02282751351594925, "kl": 0.133056640625, "learning_rate": 1.6944444444444446e-06, "loss": -0.0957, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03690766170620918, "mask/share_reasoning": 0.8504753112792969, "mask/share_step_conf": 0.11261705309152603, "num_tokens": 32998230.0, "reward": 0.7724387645721436, "reward_std": 0.24210603535175323, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.722465991973877, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8224115371704102, "step": 139 }, { "adv/mean_abs_final_conf": 0.743836522102356, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.766912043094635, "adv/std_final_conf": 0.9190491437911987, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348379373550415, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7683735837805605, "calib/avg_num_step_conf": 4.84765625, "calib/ece": 0.1443388429752065, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5165289256198347, "calib/gap": 0.3545706618962433, "calib/mean_conf": 0.7193801652892562, "calib/mu_c": 0.8453846153846154, "calib/mu_w": 0.4908139534883721, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.10954545454545442, "calib/std_conf": 0.3436460257467601, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.5234966887417218, "calib/step_q_c_n": 755.0, "calib/step_q_gap": 0.16304401384460254, "calib/step_q_w": 0.3604526748971193, "calib/step_q_w_n": 486.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 475.21484375, "completions/mean_terminated_length": 478.9566955566406, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.14933333333333335, "grad_norm": 0.03636842966079712, "kl": 0.220977783203125, "learning_rate": 1.6666666666666667e-06, "loss": -0.1199, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.035617418587207794, "mask/share_reasoning": 0.8433914184570312, "mask/share_step_conf": 0.11317867785692215, "num_tokens": 33224901.0, "reward": 0.7863426804542542, "reward_std": 0.22152948379516602, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7550433874130249, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.8176419734954834, "step": 140 }, { "adv/mean_abs_final_conf": 0.7288170456886292, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7470444440841675, "adv/std_final_conf": 0.9097040295600891, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351239800453186, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8300006871435442, "calib/avg_num_step_conf": 4.96875, "calib/ece": 0.1373306233062331, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.45121951219512196, "calib/gap": 0.4221253349824779, "calib/mean_conf": 0.6560840108401084, "calib/mu_c": 0.8259637188208617, "calib/mu_w": 0.40383838383838383, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.09792682926829271, "calib/std_conf": 0.3738567930432718, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5041971830985916, "calib/step_q_c_n": 710.0, "calib/step_q_gap": 0.15293158595064155, "calib/step_q_w": 0.3512655971479501, "calib/step_q_w_n": 561.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 527.9921875, "completions/mean_terminated_length": 527.9921875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1504, "grad_norm": 0.027496205642819405, "kl": 0.1147613525390625, "learning_rate": 1.638888888888889e-06, "loss": -0.0659, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.033179208636283875, "mask/share_reasoning": 0.8603793382644653, "mask/share_step_conf": 0.10644148290157318, "num_tokens": 33467163.0, "reward": 0.8010966777801514, "reward_std": 0.2073913812637329, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7743968963623047, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.827796459197998, "step": 141 }, { "adv/mean_abs_final_conf": 0.7331016659736633, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7484983205795288, "adv/std_final_conf": 0.912481963634491, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935049295425415, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8432870679300682, "calib/avg_num_step_conf": 5.546875, "calib/ece": 0.1430204081632654, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.3469387755102041, "calib/gap": 0.45674829841185105, "calib/mean_conf": 0.5617959183673469, "calib/mu_c": 0.7985593220338982, "calib/mu_w": 0.3418110236220472, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.11159183673469397, "calib/std_conf": 0.3909350893242674, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5323050847457627, "calib/step_q_c_n": 590.0, "calib/step_q_gap": 0.19322074739636502, "calib/step_q_w": 0.33908433734939764, "calib/step_q_w_n": 830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 540.51171875, "completions/mean_terminated_length": 540.51171875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.15146666666666667, "grad_norm": 0.03245364874601364, "kl": 0.1150970458984375, "learning_rate": 1.6111111111111113e-06, "loss": -0.0592, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03268226236104965, "mask/share_reasoning": 0.853368878364563, "mask/share_step_conf": 0.11394885182380676, "num_tokens": 33710694.0, "reward": 0.8005416393280029, "reward_std": 0.21339192986488342, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7789839506149292, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8220993876457214, "step": 142 }, { "adv/mean_abs_final_conf": 0.6933019161224365, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7632689476013184, "adv/std_final_conf": 0.9011903405189514, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348337650299072, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.8008998875140607, "calib/avg_num_step_conf": 5.24609375, "calib/ece": 0.1389121338912134, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.3598326359832636, "calib/gap": 0.4012788245219348, "calib/mean_conf": 0.5570711297071129, "calib/mu_c": 0.7451181102362204, "calib/mu_w": 0.3438392857142856, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.08230125523012555, "calib/std_conf": 0.3815468943622376, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.4791177514792899, "calib/step_q_c_n": 676.0, "calib/step_q_gap": 0.1619513346876857, "calib/step_q_w": 0.31716641679160423, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 535.3671875, "completions/mean_terminated_length": 535.3671875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.15253333333333333, "grad_norm": 0.025781175121665, "kl": 0.1120758056640625, "learning_rate": 1.5833333333333333e-06, "loss": -0.1834, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.030763279646635056, "mask/share_reasoning": 0.8582282662391663, "mask/share_step_conf": 0.11100847274065018, "num_tokens": 33955084.0, "reward": 0.7788114547729492, "reward_std": 0.20533671975135803, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7449921369552612, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.8126306533813477, "step": 143 }, { "adv/mean_abs_final_conf": 0.7452036142349243, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7641355991363525, "adv/std_final_conf": 0.9165805578231812, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349513649940491, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6835314091680814, "calib/avg_num_step_conf": 5.28125, "calib/ece": 0.19552400000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.42, "calib/gap": 0.2959222410865874, "calib/mean_conf": 0.595356, "calib/mu_c": 0.7078064516129032, "calib/mu_w": 0.41188421052631585, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.08544000000000002, "calib/std_conf": 0.38758931520876583, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4788480453972257, "calib/step_q_c_n": 793.0, "calib/step_q_gap": 0.10707702571922922, "calib/step_q_w": 0.3717710196779965, "calib/step_q_w_n": 559.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2199.0, "completions/max_terminated_length": 2199.0, "completions/mean_length": 497.19140625, "completions/mean_terminated_length": 497.19140625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.1536, "grad_norm": 0.025835543870925903, "kl": 0.1396636962890625, "learning_rate": 1.5555555555555558e-06, "loss": 0.0015, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.036746665835380554, "mask/share_reasoning": 0.8494962453842163, "mask/share_step_conf": 0.11375711113214493, "num_tokens": 34186493.0, "reward": 0.7822964191436768, "reward_std": 0.19108529388904572, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7314571738243103, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8331356048583984, "step": 144 }, { "adv/mean_abs_final_conf": 0.7166381478309631, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7478351593017578, "adv/std_final_conf": 0.8948644399642944, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356358051300049, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7113007852509389, "calib/avg_num_step_conf": 5.5703125, "calib/ece": 0.21483739837398375, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.3617886178861789, "calib/gap": 0.24325776715602587, "calib/mean_conf": 0.6160569105691057, "calib/mu_c": 0.7159310344827586, "calib/mu_w": 0.47267326732673276, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.12073170731707318, "calib/std_conf": 0.3594817588871085, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.49145888594164466, "calib/step_q_c_n": 754.0, "calib/step_q_gap": 0.12235174308450175, "calib/step_q_w": 0.3691071428571429, "calib/step_q_w_n": 672.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2765.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 483.09765625, "completions/mean_terminated_length": 484.9921875, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.15466666666666667, "grad_norm": 0.020173804834485054, "kl": 0.1165313720703125, "learning_rate": 1.527777777777778e-06, "loss": -0.0922, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03564931079745293, "mask/share_reasoning": 0.8389637470245361, "mask/share_step_conf": 0.12148070335388184, "num_tokens": 34412870.0, "reward": 0.7610074281692505, "reward_std": 0.2144685834646225, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7092058658599854, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8128089308738708, "step": 145 }, { "adv/mean_abs_final_conf": 0.7340985536575317, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.754753053188324, "adv/std_final_conf": 0.9043131470680237, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353973269462585, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.747416561932691, "calib/avg_num_step_conf": 5.1328125, "calib/ece": 0.20726720647773283, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.32793522267206476, "calib/gap": 0.3250254852674206, "calib/mean_conf": 0.5417611336032389, "calib/mu_c": 0.7444086021505375, "calib/mu_w": 0.4193831168831169, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.18625506072874498, "calib/std_conf": 0.38723643985522777, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.4945991561181435, "calib/step_q_c_n": 474.0, "calib/step_q_gap": 0.11234915611814345, "calib/step_q_w": 0.38225000000000003, "calib/step_q_w_n": 840.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2600.0, "completions/max_terminated_length": 2600.0, "completions/mean_length": 534.15234375, "completions/mean_terminated_length": 534.15234375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.15573333333333333, "grad_norm": 0.0326496884226799, "kl": 0.1174163818359375, "learning_rate": 1.5e-06, "loss": -0.0651, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.032749567180871964, "mask/share_reasoning": 0.8582661151885986, "mask/share_step_conf": 0.10898430645465851, "num_tokens": 34656829.0, "reward": 0.747028112411499, "reward_std": 0.22130638360977173, "rewards/accuracy_reward_step": 0.36328125, "rewards/final_brier_reward_step": 0.700760543346405, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.7932957410812378, "step": 146 }, { "adv/mean_abs_final_conf": 0.6888132095336914, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7391669750213623, "adv/std_final_conf": 0.8701976537704468, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350495934486389, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7470329971873498, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.24528925619834713, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.32730054194964686, "calib/mean_conf": 0.6731404958677685, "calib/mu_c": 0.8476106194690267, "calib/mu_w": 0.5203100775193799, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.22574380165289257, "calib/std_conf": 0.37821685383569825, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5680240549828179, "calib/step_q_c_n": 582.0, "calib/step_q_gap": 0.17246262829853798, "calib/step_q_w": 0.39556142668427996, "calib/step_q_w_n": 757.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2614.0, "completions/max_terminated_length": 2614.0, "completions/mean_length": 513.5078125, "completions/mean_terminated_length": 513.5078125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.1568, "grad_norm": 0.027548471465706825, "kl": 0.1119842529296875, "learning_rate": 1.4722222222222225e-06, "loss": -0.1514, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.033459752798080444, "mask/share_reasoning": 0.8553204536437988, "mask/share_step_conf": 0.11121977865695953, "num_tokens": 34891967.0, "reward": 0.741882860660553, "reward_std": 0.23327764868736267, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.6886242032051086, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7951413989067078, "step": 147 }, { "adv/mean_abs_final_conf": 0.6712465286254883, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7319573760032654, "adv/std_final_conf": 0.8693785667419434, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935432493686676, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8044133771929826, "calib/avg_num_step_conf": 5.58203125, "calib/ece": 0.16645161290322585, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.49193548387096775, "calib/gap": 0.3975328947368423, "calib/mean_conf": 0.6567741935483871, "calib/mu_c": 0.8106578947368422, "calib/mu_w": 0.41312499999999996, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.10516129032258069, "calib/std_conf": 0.38179235275781737, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5186184210526316, "calib/step_q_c_n": 760.0, "calib/step_q_gap": 0.15973949728581543, "calib/step_q_w": 0.3588789237668161, "calib/step_q_w_n": 669.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 531.484375, "completions/mean_terminated_length": 531.484375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.15786666666666666, "grad_norm": 0.02297389879822731, "kl": 0.119415283203125, "learning_rate": 1.4444444444444445e-06, "loss": -0.0599, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03818276524543762, "mask/share_reasoning": 0.8397675156593323, "mask/share_step_conf": 0.1220497414469719, "num_tokens": 35133139.0, "reward": 0.7869892120361328, "reward_std": 0.2096644788980484, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7629504203796387, "rewards/format_reward_step": 0.953125, "rewards/step_l2_reward": 0.8110280632972717, "step": 148 }, { "adv/mean_abs_final_conf": 0.7334018349647522, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7355673313140869, "adv/std_final_conf": 0.9075469970703125, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9344674944877625, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8240863347457628, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.14654471544715447, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.3902439024390244, "calib/gap": 0.44016419491525427, "calib/mean_conf": 0.5920731707317073, "calib/mu_c": 0.8211016949152542, "calib/mu_w": 0.38093749999999993, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1294715447154472, "calib/std_conf": 0.3841612269667263, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5430490018148821, "calib/step_q_c_n": 551.0, "calib/step_q_gap": 0.18139160470698446, "calib/step_q_w": 0.3616573971078977, "calib/step_q_w_n": 899.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 569.98828125, "completions/mean_terminated_length": 572.2235717773438, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.15893333333333334, "grad_norm": 0.02119380608201027, "kl": 0.10820770263671875, "learning_rate": 1.4166666666666667e-06, "loss": -0.0682, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.032900311052799225, "mask/share_reasoning": 0.8550213575363159, "mask/share_step_conf": 0.10817211866378784, "num_tokens": 35383512.0, "reward": 0.8044848442077637, "reward_std": 0.22013668715953827, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7762886881828308, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8326810002326965, "step": 149 }, { "adv/mean_abs_final_conf": 0.6614770889282227, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7001590728759766, "adv/std_final_conf": 0.8694519996643066, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9205872416496277, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7167489373371726, "calib/avg_num_step_conf": 4.5546875, "calib/ece": 0.19620408163265302, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5346938775510204, "calib/gap": 0.28749554367201435, "calib/mean_conf": 0.7184897959183673, "calib/mu_c": 0.8381818181818183, "calib/mu_w": 0.5506862745098039, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.1655102040816326, "calib/std_conf": 0.35515182855287136, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5661550888529887, "calib/step_q_c_n": 619.0, "calib/step_q_gap": 0.1576541747762063, "calib/step_q_w": 0.4085009140767824, "calib/step_q_w_n": 547.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2376.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 436.5625, "completions/mean_terminated_length": 438.2745361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.16, "grad_norm": 0.032520148903131485, "kl": 0.1268768310546875, "learning_rate": 1.3888888888888892e-06, "loss": -0.0493, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.040418535470962524, "mask/share_reasoning": 0.8314287066459656, "mask/share_step_conf": 0.12424648553133011, "num_tokens": 35600232.0, "reward": 0.7510082125663757, "reward_std": 0.2066630870103836, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7011894583702087, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8008269667625427, "step": 150 }, { "adv/mean_abs_final_conf": 0.7541616559028625, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7513711452484131, "adv/std_final_conf": 0.9041014909744263, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351353645324707, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7779142877791428, "calib/avg_num_step_conf": 4.84765625, "calib/ece": 0.17130252100840337, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.3067226890756303, "calib/gap": 0.362661704126617, "calib/mean_conf": 0.55609243697479, "calib/mu_c": 0.7648514851485149, "calib/mu_w": 0.40218978102189784, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.1515126050420168, "calib/std_conf": 0.3741661090607905, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.5342436974789916, "calib/step_q_c_n": 476.0, "calib/step_q_gap": 0.13857049486461248, "calib/step_q_w": 0.3956732026143791, "calib/step_q_w_n": 765.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 535.37109375, "completions/mean_terminated_length": 535.37109375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.16106666666666666, "grad_norm": 0.02657642960548401, "kl": 0.106231689453125, "learning_rate": 1.3611111111111112e-06, "loss": -0.1509, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.031711041927337646, "mask/share_reasoning": 0.8681683540344238, "mask/share_step_conf": 0.1001206785440445, "num_tokens": 35844311.0, "reward": 0.7364934682846069, "reward_std": 0.2154543697834015, "rewards/accuracy_reward_step": 0.39453125, "rewards/final_brier_reward_step": 0.7103277444839478, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.7626591920852661, "step": 151 }, { "adv/mean_abs_final_conf": 0.799831748008728, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7398291230201721, "adv/std_final_conf": 0.9359056353569031, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353933930397034, "calib/answer_extract_rate": 0.9140625, "calib/auroc": 0.7272593247825447, "calib/avg_num_step_conf": 5.33203125, "calib/ece": 0.20055276595744687, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.3148936170212766, "calib/gap": 0.3046464175143743, "calib/mean_conf": 0.567191914893617, "calib/mu_c": 0.7396088235294119, "calib/mu_w": 0.4349624060150376, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.953125, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.16685106382978726, "calib/std_conf": 0.36984893382234413, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.4948689138576779, "calib/step_q_c_n": 534.0, "calib/step_q_gap": 0.14458010519341802, "calib/step_q_w": 0.3502888086642599, "calib/step_q_w_n": 831.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 506.484375, "completions/mean_terminated_length": 510.4724426269531, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.16213333333333332, "grad_norm": 0.033719953149557114, "kl": 0.1153106689453125, "learning_rate": 1.3333333333333334e-06, "loss": -0.1198, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.03530751168727875, "mask/share_reasoning": 0.8421989679336548, "mask/share_step_conf": 0.11468097567558289, "num_tokens": 36079363.0, "reward": 0.7214515209197998, "reward_std": 0.24063536524772644, "rewards/accuracy_reward_step": 0.3984375, "rewards/final_brier_reward_step": 0.6684511303901672, "rewards/format_reward_step": 0.89453125, "rewards/step_l2_reward": 0.7744519114494324, "step": 152 }, { "adv/mean_abs_final_conf": 0.7516984939575195, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7617511749267578, "adv/std_final_conf": 0.9103282690048218, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350059032440186, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7148971466489715, "calib/avg_num_step_conf": 4.71875, "calib/ece": 0.18222672064777334, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.2631578947368421, "calib/gap": 0.28025746516257455, "calib/mean_conf": 0.535991902834008, "calib/mu_c": 0.6608029197080292, "calib/mu_w": 0.38054545454545463, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.08178137651821868, "calib/std_conf": 0.37345054783107035, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.49373608903020666, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.11404696985922219, "calib/step_q_w": 0.37968911917098447, "calib/step_q_w_n": 579.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 495.65234375, "completions/mean_terminated_length": 497.5960998535156, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.1632, "grad_norm": 0.03105360083281994, "kl": 0.11749267578125, "learning_rate": 1.3055555555555556e-06, "loss": -0.0638, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03299054503440857, "mask/share_reasoning": 0.8585386872291565, "mask/share_step_conf": 0.10456451773643494, "num_tokens": 36313570.0, "reward": 0.7574142217636108, "reward_std": 0.210285484790802, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7134543061256409, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8013741970062256, "step": 153 }, { "adv/mean_abs_final_conf": 0.7263320684432983, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7746478319168091, "adv/std_final_conf": 0.8918282985687256, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348356127738953, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6936760164693774, "calib/avg_num_step_conf": 4.64453125, "calib/ece": 0.2270988, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.336, "calib/gap": 0.2798690941842511, "calib/mean_conf": 0.5604212, "calib/mu_c": 0.7104310344827586, "calib/mu_w": 0.4305619402985075, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.16175999999999996, "calib/std_conf": 0.3910226783072307, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5556363636363637, "calib/step_q_c_n": 495.0, "calib/step_q_gap": 0.181702646057113, "calib/step_q_w": 0.3739337175792507, "calib/step_q_w_n": 694.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1656.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 455.3203125, "completions/mean_terminated_length": 458.905517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 105.0, "epoch": 0.16426666666666667, "grad_norm": 0.030806738883256912, "kl": 0.1298980712890625, "learning_rate": 1.2777777777777779e-06, "loss": -0.0773, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035829126834869385, "mask/share_reasoning": 0.8452428579330444, "mask/share_step_conf": 0.11111550778150558, "num_tokens": 36534572.0, "reward": 0.7740947008132935, "reward_std": 0.17938129603862762, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.7112395763397217, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8369499444961548, "step": 154 }, { "adv/mean_abs_final_conf": 0.7826955318450928, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7361176013946533, "adv/std_final_conf": 0.935632050037384, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9358672499656677, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6956580640791166, "calib/avg_num_step_conf": 5.359375, "calib/ece": 0.22495901639344262, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.26639344262295084, "calib/gap": 0.26259229153965985, "calib/mean_conf": 0.5129918032786884, "calib/mu_c": 0.656126126126126, "calib/mu_w": 0.3935338345864661, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.14151639344262296, "calib/std_conf": 0.37540860789762587, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.48486868686868684, "calib/step_q_c_n": 495.0, "calib/step_q_gap": 0.15307849302604143, "calib/step_q_w": 0.3317901938426454, "calib/step_q_w_n": 877.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 453.94140625, "completions/mean_terminated_length": 457.5157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.16533333333333333, "grad_norm": 0.02824331633746624, "kl": 0.1313629150390625, "learning_rate": 1.25e-06, "loss": -0.0847, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03731415420770645, "mask/share_reasoning": 0.8393542766571045, "mask/share_step_conf": 0.11551909893751144, "num_tokens": 36757997.0, "reward": 0.7349332571029663, "reward_std": 0.2547478675842285, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.6926234364509583, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.7772430181503296, "step": 155 }, { "adv/mean_abs_final_conf": 0.7364197373390198, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7438960075378418, "adv/std_final_conf": 0.9156517386436462, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347387552261353, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7281936030226045, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.1750806451612903, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.29838709677419356, "calib/gap": 0.33133737215816556, "calib/mean_conf": 0.5179032258064517, "calib/mu_c": 0.6902521008403361, "calib/mu_w": 0.3589147286821705, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10657258064516124, "calib/std_conf": 0.38794282542153785, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4847897897897898, "calib/step_q_c_n": 666.0, "calib/step_q_gap": 0.129165800571461, "calib/step_q_w": 0.35562398921832883, "calib/step_q_w_n": 742.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2671.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 520.078125, "completions/mean_terminated_length": 520.078125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.1664, "grad_norm": 0.02229756861925125, "kl": 0.1176910400390625, "learning_rate": 1.2222222222222223e-06, "loss": -0.0485, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03484189510345459, "mask/share_reasoning": 0.8441666960716248, "mask/share_step_conf": 0.12099139392375946, "num_tokens": 36995897.0, "reward": 0.7709934711456299, "reward_std": 0.21262159943580627, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.7337499856948853, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8082370162010193, "step": 156 }, { "adv/mean_abs_final_conf": 0.6937302350997925, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7066144943237305, "adv/std_final_conf": 0.8964200019836426, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352517127990723, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.794429525580114, "calib/avg_num_step_conf": 5.94140625, "calib/ece": 0.13188524590163936, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.38934426229508196, "calib/gap": 0.4263003511671142, "calib/mean_conf": 0.5777049180327868, "calib/mu_c": 0.7576595744680851, "calib/mu_w": 0.3313592233009709, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.06586065573770493, "calib/std_conf": 0.3969138876861177, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5136559036144578, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.16106545498927688, "calib/step_q_w": 0.3525904486251809, "calib/step_q_w_n": 691.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2595.0, "completions/max_terminated_length": 2595.0, "completions/mean_length": 513.89453125, "completions/mean_terminated_length": 517.94091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.16746666666666668, "grad_norm": 0.029141323640942574, "kl": 0.1168365478515625, "learning_rate": 1.1944444444444446e-06, "loss": -0.0538, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.034257110208272934, "mask/share_reasoning": 0.8325086832046509, "mask/share_step_conf": 0.1254216879606247, "num_tokens": 37231182.0, "reward": 0.7845574617385864, "reward_std": 0.19589388370513916, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7629241943359375, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8061907291412354, "step": 157 }, { "adv/mean_abs_final_conf": 0.7426419258117676, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7533231973648071, "adv/std_final_conf": 0.9092576503753662, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353457093238831, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7546258503401361, "calib/avg_num_step_conf": 4.875, "calib/ece": 0.1671428571428572, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.3673469387755102, "calib/gap": 0.3399285714285714, "calib/mean_conf": 0.6068163265306123, "calib/mu_c": 0.7525, "calib/mu_w": 0.41257142857142853, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.10126530612244904, "calib/std_conf": 0.3710267977267997, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5364376996805111, "calib/step_q_c_n": 626.0, "calib/step_q_gap": 0.14506953247793875, "calib/step_q_w": 0.3913681672025724, "calib/step_q_w_n": 622.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 458.75390625, "completions/mean_terminated_length": 462.36614990234375, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.16853333333333334, "grad_norm": 0.028471689671278, "kl": 0.1329345703125, "learning_rate": 1.1666666666666668e-06, "loss": -0.0743, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03755185008049011, "mask/share_reasoning": 0.8372524380683899, "mask/share_step_conf": 0.11738321185112, "num_tokens": 37453863.0, "reward": 0.7625678777694702, "reward_std": 0.23472969233989716, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7292590141296387, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.7958768606185913, "step": 158 }, { "adv/mean_abs_final_conf": 0.7335477471351624, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7329621911048889, "adv/std_final_conf": 0.9072332978248596, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348861575126648, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6955052493438321, "calib/avg_num_step_conf": 5.1875, "calib/ece": 0.22028340080971667, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.3522267206477733, "calib/gap": 0.24983136482939633, "calib/mean_conf": 0.5543724696356276, "calib/mu_c": 0.675748031496063, "calib/mu_w": 0.42591666666666667, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.13024291497975715, "calib/std_conf": 0.38340303279691007, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5337541528239202, "calib/step_q_c_n": 602.0, "calib/step_q_gap": 0.14590291315449871, "calib/step_q_w": 0.38785123966942153, "calib/step_q_w_n": 726.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 443.578125, "completions/mean_terminated_length": 445.31768798828125, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.1696, "grad_norm": 0.027787763625383377, "kl": 0.1327056884765625, "learning_rate": 1.138888888888889e-06, "loss": -0.1012, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03695898503065109, "mask/share_reasoning": 0.8409558534622192, "mask/share_step_conf": 0.11817888915538788, "num_tokens": 37672203.0, "reward": 0.765270471572876, "reward_std": 0.1936519891023636, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6986597776412964, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8318811058998108, "step": 159 }, { "adv/mean_abs_final_conf": 0.7273938655853271, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7440230250358582, "adv/std_final_conf": 0.9046057462692261, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353306293487549, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7467687413004575, "calib/avg_num_step_conf": 4.734375, "calib/ece": 0.18802419354838706, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3467741935483871, "calib/gap": 0.32727580035792403, "calib/mean_conf": 0.5456048387096775, "calib/mu_c": 0.6868085106382978, "calib/mu_w": 0.3595327102803738, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.0825403225806451, "calib/std_conf": 0.39425874340435424, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5295440729483283, "calib/step_q_c_n": 658.0, "calib/step_q_gap": 0.13961627511439328, "calib/step_q_w": 0.389927797833935, "calib/step_q_w_n": 554.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2503.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 490.8359375, "completions/mean_terminated_length": 490.8359375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.17066666666666666, "grad_norm": 0.026033377274870872, "kl": 0.1306304931640625, "learning_rate": 1.111111111111111e-06, "loss": -0.071, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0354008786380291, "mask/share_reasoning": 0.8589190244674683, "mask/share_step_conf": 0.10568006336688995, "num_tokens": 37902697.0, "reward": 0.7788609266281128, "reward_std": 0.20089104771614075, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7316604852676392, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8260613083839417, "step": 160 }, { "adv/mean_abs_final_conf": 0.6831102967262268, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7423896789550781, "adv/std_final_conf": 0.890784740447998, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347586035728455, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6956490929705216, "calib/avg_num_step_conf": 5.01953125, "calib/ece": 0.2265476190476191, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.38095238095238093, "calib/gap": 0.27446428571428566, "calib/mean_conf": 0.5822619047619049, "calib/mu_c": 0.67375, "calib/mu_w": 0.3992857142857143, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.07107142857142865, "calib/std_conf": 0.3908162536337356, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.522169373549884, "calib/step_q_c_n": 862.0, "calib/step_q_gap": 0.12871783690685806, "calib/step_q_w": 0.39345153664302596, "calib/step_q_w_n": 423.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 448.42578125, "completions/mean_terminated_length": 450.184326171875, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.17173333333333332, "grad_norm": 0.03522691875696182, "kl": 0.1334075927734375, "learning_rate": 1.0833333333333335e-06, "loss": -0.0441, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.037295594811439514, "mask/share_reasoning": 0.838032603263855, "mask/share_step_conf": 0.12076550722122192, "num_tokens": 38121414.0, "reward": 0.7708685994148254, "reward_std": 0.18391358852386475, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.720809817314148, "rewards/format_reward_step": 0.9765625, "rewards/step_l2_reward": 0.8209274411201477, "step": 161 }, { "adv/mean_abs_final_conf": 0.6881465315818787, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7457125782966614, "adv/std_final_conf": 0.8926540017127991, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345178008079529, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7456922162804516, "calib/avg_num_step_conf": 4.7578125, "calib/ece": 0.1726190476190475, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.48412698412698413, "calib/gap": 0.3740463458110516, "calib/mean_conf": 0.6173015873015874, "calib/mu_c": 0.7642483660130719, "calib/mu_w": 0.39020202020202027, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.09138888888888877, "calib/std_conf": 0.39745502430675955, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5439555555555555, "calib/step_q_c_n": 675.0, "calib/step_q_gap": 0.13649699201964394, "calib/step_q_w": 0.4074585635359116, "calib/step_q_w_n": 543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 433.66796875, "completions/mean_terminated_length": 435.36865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.1728, "grad_norm": 0.036304570734500885, "kl": 0.137939453125, "learning_rate": 1.0555555555555557e-06, "loss": -0.0579, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03735470026731491, "mask/share_reasoning": 0.8399271965026855, "mask/share_step_conf": 0.11881189048290253, "num_tokens": 38336577.0, "reward": 0.7964285016059875, "reward_std": 0.19373297691345215, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7544523477554321, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.838404655456543, "step": 162 }, { "adv/mean_abs_final_conf": 0.7097820043563843, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7364397644996643, "adv/std_final_conf": 0.8892258405685425, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9342389106750488, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8013877715205149, "calib/avg_num_step_conf": 5.5, "calib/ece": 0.16159183673469385, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.3795918367346939, "calib/gap": 0.3858004827031376, "calib/mean_conf": 0.563877551020408, "calib/mu_c": 0.7418181818181818, "calib/mu_w": 0.3560176991150442, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.09334693877551016, "calib/std_conf": 0.3870960241917022, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5473961661341853, "calib/step_q_c_n": 626.0, "calib/step_q_gap": 0.2016416904308605, "calib/step_q_w": 0.3457544757033248, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2742.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 540.84765625, "completions/mean_terminated_length": 540.84765625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.17386666666666667, "grad_norm": 0.02911502867937088, "kl": 0.1142425537109375, "learning_rate": 1.0277777777777777e-06, "loss": -0.1082, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.034962691366672516, "mask/share_reasoning": 0.8461117148399353, "mask/share_step_conf": 0.11892561614513397, "num_tokens": 38579866.0, "reward": 0.790389895439148, "reward_std": 0.1950080692768097, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7531589269638062, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8276206851005554, "step": 163 }, { "adv/mean_abs_final_conf": 0.7328827381134033, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7585192918777466, "adv/std_final_conf": 0.9071486592292786, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9354165196418762, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.7537022321751065, "calib/avg_num_step_conf": 5.55078125, "calib/ece": 0.17059322033898316, "calib/final_conf_rate": 0.921875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.3644067796610169, "calib/gap": 0.3148306003034024, "calib/mean_conf": 0.5703389830508475, "calib/mu_c": 0.715748031496063, "calib/mu_w": 0.40091743119266066, "calib/nonempty_final_conf_rate": 0.921875, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.10139830508474584, "calib/std_conf": 0.3783952113041983, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5105730659025788, "calib/step_q_c_n": 698.0, "calib/step_q_gap": 0.13347763021793152, "calib/step_q_w": 0.37709543568464726, "calib/step_q_w_n": 723.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2937.0, "completions/max_terminated_length": 2937.0, "completions/mean_length": 588.8046875, "completions/mean_terminated_length": 593.44091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.17493333333333333, "grad_norm": 0.029559997841715813, "kl": 0.10688018798828125, "learning_rate": 1.0000000000000002e-06, "loss": -0.0654, "mask/has_final_conf_rate": 0.921875, "mask/share_final_conf": 0.03045329451560974, "mask/share_reasoning": 0.8584093451499939, "mask/share_step_conf": 0.10332489013671875, "num_tokens": 38836736.0, "reward": 0.7449078559875488, "reward_std": 0.22561901807785034, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7001925706863403, "rewards/format_reward_step": 0.91796875, "rewards/step_l2_reward": 0.7896230816841125, "step": 164 }, { "adv/mean_abs_final_conf": 0.7392752170562744, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7389962673187256, "adv/std_final_conf": 0.9226400852203369, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351508021354675, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7567901234567901, "calib/avg_num_step_conf": 5.3203125, "calib/ece": 0.24765432098765444, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.39094650205761317, "calib/gap": 0.3417407407407406, "calib/mean_conf": 0.608477366255144, "calib/mu_c": 0.7983333333333332, "calib/mu_w": 0.4565925925925926, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.20584362139917706, "calib/std_conf": 0.3743918889676306, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5377734375, "calib/step_q_c_n": 512.0, "calib/step_q_gap": 0.1569616727941176, "calib/step_q_w": 0.38081176470588235, "calib/step_q_w_n": 850.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 546.93359375, "completions/mean_terminated_length": 549.0784912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.176, "grad_norm": 0.034481655806303024, "kl": 0.11751556396484375, "learning_rate": 9.722222222222224e-07, "loss": -0.0787, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03241300582885742, "mask/share_reasoning": 0.8556787371635437, "mask/share_step_conf": 0.10800202190876007, "num_tokens": 39082327.0, "reward": 0.759535014629364, "reward_std": 0.20878538489341736, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.712382435798645, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8066876530647278, "step": 165 }, { "adv/mean_abs_final_conf": 0.6982669830322266, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7666926980018616, "adv/std_final_conf": 0.8951435685157776, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345536231994629, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7946651949963208, "calib/avg_num_step_conf": 5.3515625, "calib/ece": 0.15782572614107884, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.36929460580912865, "calib/gap": 0.4003985283296541, "calib/mean_conf": 0.5773609958506224, "calib/mu_c": 0.726887417218543, "calib/mu_w": 0.3264888888888889, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.05431535269709547, "calib/std_conf": 0.38817360188537814, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.5173142112125163, "calib/step_q_c_n": 767.0, "calib/step_q_gap": 0.13766246991898395, "calib/step_q_w": 0.37965174129353235, "calib/step_q_w_n": 603.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 536.796875, "completions/mean_terminated_length": 536.796875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.17706666666666668, "grad_norm": 0.028494818136096, "kl": 0.1080780029296875, "learning_rate": 9.444444444444445e-07, "loss": -0.1071, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.032391056418418884, "mask/share_reasoning": 0.8516812324523926, "mask/share_step_conf": 0.11592767387628555, "num_tokens": 39325931.0, "reward": 0.7896512746810913, "reward_std": 0.18727052211761475, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7533988952636719, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8259036540985107, "step": 166 }, { "adv/mean_abs_final_conf": 0.6581146717071533, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.769611656665802, "adv/std_final_conf": 0.872353196144104, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350212216377258, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6861284745311877, "calib/avg_num_step_conf": 5.06640625, "calib/ece": 0.23337349397590368, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.4939759036144578, "calib/gap": 0.21856097885357095, "calib/mean_conf": 0.6993172690763052, "calib/mu_c": 0.7897260273972603, "calib/mu_w": 0.5711650485436893, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.17317269076305225, "calib/std_conf": 0.35386213329557303, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5674607703281026, "calib/step_q_c_n": 701.0, "calib/step_q_gap": 0.09635338777776709, "calib/step_q_w": 0.4711073825503355, "calib/step_q_w_n": 596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 513.32421875, "completions/mean_terminated_length": 515.3372802734375, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.17813333333333334, "grad_norm": 0.024843445047736168, "kl": 0.114715576171875, "learning_rate": 9.166666666666666e-07, "loss": -0.0683, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.033744700253009796, "mask/share_reasoning": 0.85487961769104, "mask/share_step_conf": 0.10746943950653076, "num_tokens": 39562950.0, "reward": 0.7541587352752686, "reward_std": 0.18127688765525818, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7017910480499268, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8065265417098999, "step": 167 }, { "adv/mean_abs_final_conf": 0.6994673609733582, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7407394051551819, "adv/std_final_conf": 0.8693518042564392, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341798424720764, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7639729325265144, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.15360000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.36, "calib/gap": 0.3358071442514153, "calib/mean_conf": 0.60912, "calib/mu_c": 0.755531914893617, "calib/mu_w": 0.41972477064220176, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.09936, "calib/std_conf": 0.36863969618043035, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5279178470254958, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 0.19868497031316706, "calib/step_q_w": 0.32923287671232876, "calib/step_q_w_n": 730.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2920.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 555.65625, "completions/mean_terminated_length": 555.65625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.1792, "grad_norm": 0.030230311676859856, "kl": 0.1105194091796875, "learning_rate": 8.88888888888889e-07, "loss": -0.0252, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03307843208312988, "mask/share_reasoning": 0.8582138419151306, "mask/share_step_conf": 0.10870774835348129, "num_tokens": 39809870.0, "reward": 0.8047250509262085, "reward_std": 0.187750905752182, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7603687047958374, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.8490815162658691, "step": 168 }, { "adv/mean_abs_final_conf": 0.6772477030754089, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7406015396118164, "adv/std_final_conf": 0.8583061099052429, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9343918561935425, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7615018688413184, "calib/avg_num_step_conf": 4.453125, "calib/ece": 0.15352459016393438, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.42213114754098363, "calib/gap": 0.353482840638804, "calib/mean_conf": 0.6317213114754098, "calib/mu_c": 0.7896296296296297, "calib/mu_w": 0.4361467889908257, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.11598360655737702, "calib/std_conf": 0.36956264054085053, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5921711568938193, "calib/step_q_c_n": 631.0, "calib/step_q_gap": 0.1683794083672967, "calib/step_q_w": 0.42379174852652257, "calib/step_q_w_n": 509.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 522.76953125, "completions/mean_terminated_length": 522.76953125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.18026666666666666, "grad_norm": 0.01928914710879326, "kl": 0.118133544921875, "learning_rate": 8.611111111111112e-07, "loss": -0.1741, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03361359238624573, "mask/share_reasoning": 0.8643434047698975, "mask/share_step_conf": 0.1020430251955986, "num_tokens": 40047883.0, "reward": 0.7742510437965393, "reward_std": 0.19515499472618103, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7330683469772339, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8154337406158447, "step": 169 }, { "adv/mean_abs_final_conf": 0.685404896736145, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7570817470550537, "adv/std_final_conf": 0.8591107130050659, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9352391958236694, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.8053858577347096, "calib/avg_num_step_conf": 5.49609375, "calib/ece": 0.15433884297520664, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.47107438016528924, "calib/gap": 0.37613720946562756, "calib/mean_conf": 0.6625206611570248, "calib/mu_c": 0.8195035460992909, "calib/mu_w": 0.4433663366336633, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.11710743801652895, "calib/std_conf": 0.37280909521354627, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5794684796044499, "calib/step_q_c_n": 809.0, "calib/step_q_gap": 0.15316413177836297, "calib/step_q_w": 0.42630434782608695, "calib/step_q_w_n": 598.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 535.41796875, "completions/mean_terminated_length": 541.766845703125, "completions/min_length": 0.0, "completions/min_terminated_length": 31.0, "epoch": 0.18133333333333335, "grad_norm": 0.02909146249294281, "kl": 0.1026763916015625, "learning_rate": 8.333333333333333e-07, "loss": -0.1038, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03135133534669876, "mask/share_reasoning": 0.8453192114830017, "mask/share_step_conf": 0.11161071807146072, "num_tokens": 40289102.0, "reward": 0.7756690979003906, "reward_std": 0.20986171066761017, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7445504069328308, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8067878484725952, "step": 170 }, { "adv/mean_abs_final_conf": 0.7240392565727234, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.750836193561554, "adv/std_final_conf": 0.8942993879318237, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346383810043335, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7470126208378088, "calib/avg_num_step_conf": 4.8671875, "calib/ece": 0.23118367346938784, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.46530612244897956, "calib/gap": 0.30954417293233083, "calib/mean_conf": 0.6495510204081633, "calib/mu_c": 0.8175892857142858, "calib/mu_w": 0.508045112781955, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.211795918367347, "calib/std_conf": 0.3764563590299157, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5977127659574468, "calib/step_q_c_n": 564.0, "calib/step_q_gap": 0.20283006801023273, "calib/step_q_w": 0.3948826979472141, "calib/step_q_w_n": 682.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2778.0, "completions/max_terminated_length": 2778.0, "completions/mean_length": 503.66796875, "completions/mean_terminated_length": 505.6431579589844, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.1824, "grad_norm": 0.028601879253983498, "kl": 0.11051177978515625, "learning_rate": 8.055555555555557e-07, "loss": -0.0867, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03329313173890114, "mask/share_reasoning": 0.8577781915664673, "mask/share_step_conf": 0.1050223857164383, "num_tokens": 40524937.0, "reward": 0.743405818939209, "reward_std": 0.2160402238368988, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.6792566180229187, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.8075549006462097, "step": 171 }, { "adv/mean_abs_final_conf": 0.7263467311859131, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7435145378112793, "adv/std_final_conf": 0.8930355906486511, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.934866726398468, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7686213991769547, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.18440329218106993, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.5185185185185185, "calib/gap": 0.31218518518518523, "calib/mean_conf": 0.7091769547325102, "calib/mu_c": 0.847925925925926, "calib/mu_w": 0.5357407407407407, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.16901234567901235, "calib/std_conf": 0.3398234556835382, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5687214611872146, "calib/step_q_c_n": 657.0, "calib/step_q_gap": 0.13083836482070593, "calib/step_q_w": 0.43788309636650863, "calib/step_q_w_n": 633.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 532.25, "completions/mean_terminated_length": 532.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.18346666666666667, "grad_norm": 0.025736555457115173, "kl": 0.108001708984375, "learning_rate": 7.777777777777779e-07, "loss": -0.0984, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03396880626678467, "mask/share_reasoning": 0.8518245220184326, "mask/share_step_conf": 0.11420667916536331, "num_tokens": 40764545.0, "reward": 0.7568353414535522, "reward_std": 0.2210984230041504, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7214609384536743, "rewards/format_reward_step": 0.9375, "rewards/step_l2_reward": 0.7922097444534302, "step": 172 }, { "adv/mean_abs_final_conf": 0.7614967823028564, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7911373972892761, "adv/std_final_conf": 0.908078670501709, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351539611816406, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.660099816128185, "calib/avg_num_step_conf": 5.58203125, "calib/ece": 0.2597188755020081, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5421686746987951, "calib/gap": 0.16133175728920413, "calib/mean_conf": 0.7683935742971887, "calib/mu_c": 0.8383687943262411, "calib/mu_w": 0.677037037037037, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.23092369477911653, "calib/std_conf": 0.305166735843764, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5414319248826291, "calib/step_q_c_n": 852.0, "calib/step_q_gap": 0.024690157118331113, "calib/step_q_w": 0.516741767764298, "calib/step_q_w_n": 577.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2027.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 509.953125, "completions/mean_terminated_length": 509.953125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.18453333333333333, "grad_norm": 0.029172774404287338, "kl": 0.11724853515625, "learning_rate": 7.5e-07, "loss": -0.0033, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03518332913517952, "mask/share_reasoning": 0.8441824316978455, "mask/share_step_conf": 0.12063427269458771, "num_tokens": 40998253.0, "reward": 0.7339141368865967, "reward_std": 0.21232792735099792, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.676971435546875, "rewards/format_reward_step": 0.96875, "rewards/step_l2_reward": 0.7908568382263184, "step": 173 }, { "adv/mean_abs_final_conf": 0.799006462097168, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7779349684715271, "adv/std_final_conf": 0.9240788221359253, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9356370568275452, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.6908530606527525, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.22924686192468613, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.3891213389121339, "calib/gap": 0.22902577391791556, "calib/mean_conf": 0.6314225941422595, "calib/mu_c": 0.7473728813559322, "calib/mu_w": 0.5183471074380166, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.18347280334728028, "calib/std_conf": 0.36237776076612777, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.5223063063063063, "calib/step_q_c_n": 555.0, "calib/step_q_gap": 0.08102078533201668, "calib/step_q_w": 0.4412855209742896, "calib/step_q_w_n": 739.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 548.2109375, "completions/mean_terminated_length": 550.36083984375, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.1856, "grad_norm": 0.026223061606287956, "kl": 0.1020355224609375, "learning_rate": 7.222222222222222e-07, "loss": -0.2093, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.030761104077100754, "mask/share_reasoning": 0.8606378436088562, "mask/share_step_conf": 0.10469479858875275, "num_tokens": 41242827.0, "reward": 0.7080938816070557, "reward_std": 0.25465157628059387, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6587077975273132, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7574799060821533, "step": 174 }, { "adv/mean_abs_final_conf": 0.6954188942909241, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.734821617603302, "adv/std_final_conf": 0.877416729927063, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350839257240295, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7567567567567569, "calib/avg_num_step_conf": 5.4375, "calib/ece": 0.2215833333333332, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.375, "calib/gap": 0.30361198407710027, "calib/mean_conf": 0.5615833333333333, "calib/mu_c": 0.7247747747747747, "calib/mu_w": 0.4211627906976744, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.95703125, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.16033333333333322, "calib/std_conf": 0.38855286176901194, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.5498676748582231, "calib/step_q_c_n": 529.0, "calib/step_q_gap": 0.16305423337502495, "calib/step_q_w": 0.38681344148319813, "calib/step_q_w_n": 863.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 532.87109375, "completions/mean_terminated_length": 539.1897583007812, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.18666666666666668, "grad_norm": 0.02312024123966694, "kl": 0.10526275634765625, "learning_rate": 6.944444444444446e-07, "loss": -0.182, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.03259507194161415, "mask/share_reasoning": 0.8478891253471375, "mask/share_step_conf": 0.10779708623886108, "num_tokens": 41485066.0, "reward": 0.7382111549377441, "reward_std": 0.21607854962348938, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.691319465637207, "rewards/format_reward_step": 0.93359375, "rewards/step_l2_reward": 0.7851028442382812, "step": 175 }, { "adv/mean_abs_final_conf": 0.6969413757324219, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7284245491027832, "adv/std_final_conf": 0.8782188892364502, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353442788124084, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7745104895104895, "calib/avg_num_step_conf": 4.91796875, "calib/ece": 0.20312500000000006, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.4791666666666667, "calib/gap": 0.3726923076923079, "calib/mean_conf": 0.676875, "calib/mu_c": 0.8476923076923079, "calib/mu_w": 0.475, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.16916666666666672, "calib/std_conf": 0.3678584207386496, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5631003039513678, "calib/step_q_c_n": 658.0, "calib/step_q_gap": 0.17672759180494507, "calib/step_q_w": 0.3863727121464227, "calib/step_q_w_n": 601.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2874.0, "completions/max_terminated_length": 2874.0, "completions/mean_length": 545.77734375, "completions/mean_terminated_length": 545.77734375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.18773333333333334, "grad_norm": 0.024735065177083015, "kl": 0.10872650146484375, "learning_rate": 6.666666666666667e-07, "loss": -0.1375, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.03495047986507416, "mask/share_reasoning": 0.85334712266922, "mask/share_step_conf": 0.11170239001512527, "num_tokens": 41728849.0, "reward": 0.7591984868049622, "reward_std": 0.2403138130903244, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7225527167320251, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7958441972732544, "step": 176 }, { "adv/mean_abs_final_conf": 0.7366968393325806, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7311983108520508, "adv/std_final_conf": 0.9075549244880676, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350935816764832, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.753664111873067, "calib/avg_num_step_conf": 5.3359375, "calib/ece": 0.15971428571428573, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.4204081632653061, "calib/gap": 0.34068441575904246, "calib/mean_conf": 0.6166938775510203, "calib/mu_c": 0.7710447761194029, "calib/mu_w": 0.43036036036036046, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.11473469387755103, "calib/std_conf": 0.37850523432509514, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5550551181102362, "calib/step_q_c_n": 635.0, "calib/step_q_gap": 0.17452160237836206, "calib/step_q_w": 0.38053351573187416, "calib/step_q_w_n": 731.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3025.0, "completions/max_terminated_length": 3025.0, "completions/mean_length": 519.1953125, "completions/mean_terminated_length": 523.283447265625, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.1888, "grad_norm": 0.0299546979367733, "kl": 0.10813140869140625, "learning_rate": 6.388888888888889e-07, "loss": -0.1086, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.033860474824905396, "mask/share_reasoning": 0.8479142189025879, "mask/share_step_conf": 0.11041281372308731, "num_tokens": 41965595.0, "reward": 0.7677517533302307, "reward_std": 0.2208181917667389, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7314074039459229, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.8040961623191833, "step": 177 }, { "adv/mean_abs_final_conf": 0.7266371250152588, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7638938426971436, "adv/std_final_conf": 0.8988336324691772, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.93440842628479, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7904185742315238, "calib/avg_num_step_conf": 5.01171875, "calib/ece": 0.15554216867469886, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3654618473895582, "calib/gap": 0.35714061478090253, "calib/mean_conf": 0.621004016064257, "calib/mu_c": 0.7787769784172662, "calib/mu_w": 0.4216363636363637, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10915662650602416, "calib/std_conf": 0.3606478385318377, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.587271268057785, "calib/step_q_c_n": 623.0, "calib/step_q_gap": 0.21184702563354246, "calib/step_q_w": 0.3754242424242425, "calib/step_q_w_n": 660.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2608.0, "completions/max_terminated_length": 2608.0, "completions/mean_length": 472.04296875, "completions/mean_terminated_length": 477.6403503417969, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.18986666666666666, "grad_norm": 0.02804921194911003, "kl": 0.11296844482421875, "learning_rate": 6.111111111111112e-07, "loss": 0.0177, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.035195671021938324, "mask/share_reasoning": 0.8432610630989075, "mask/share_step_conf": 0.10982454568147659, "num_tokens": 42192510.0, "reward": 0.807304859161377, "reward_std": 0.19166387617588043, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7737792730331421, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.840830385684967, "step": 178 }, { "adv/mean_abs_final_conf": 0.7112410068511963, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7365227937698364, "adv/std_final_conf": 0.9087821245193481, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9346781969070435, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7975643382352942, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.15419354838709684, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4314516129032258, "calib/gap": 0.3781144957983193, "calib/mean_conf": 0.6313709677419355, "calib/mu_c": 0.8021323529411765, "calib/mu_w": 0.42401785714285717, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.11858870967741941, "calib/std_conf": 0.3685692360164819, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5448756906077348, "calib/step_q_c_n": 724.0, "calib/step_q_gap": 0.14255048735570225, "calib/step_q_w": 0.4023252032520325, "calib/step_q_w_n": 615.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2604.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 516.94921875, "completions/mean_terminated_length": 516.94921875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.19093333333333334, "grad_norm": 0.024730678647756577, "kl": 0.1097259521484375, "learning_rate": 5.833333333333334e-07, "loss": -0.0799, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03368227928876877, "mask/share_reasoning": 0.854263424873352, "mask/share_step_conf": 0.11205431818962097, "num_tokens": 42431113.0, "reward": 0.8051706552505493, "reward_std": 0.18343767523765564, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7680890560150146, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8422523736953735, "step": 179 }, { "adv/mean_abs_final_conf": 0.7052978277206421, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7558290362358093, "adv/std_final_conf": 0.8929066061973572, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348551034927368, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7466370775008535, "calib/avg_num_step_conf": 5.328125, "calib/ece": 0.17609756097560975, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.44715447154471544, "calib/gap": 0.3255377261864116, "calib/mean_conf": 0.6577235772357723, "calib/mu_c": 0.7913793103448276, "calib/mu_w": 0.465841584158416, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.12219512195121952, "calib/std_conf": 0.3662295539577348, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5244253632760898, "calib/step_q_c_n": 757.0, "calib/step_q_gap": 0.08030674713111446, "calib/step_q_w": 0.44411861614497533, "calib/step_q_w_n": 607.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2937.0, "completions/max_terminated_length": 2937.0, "completions/mean_length": 559.89453125, "completions/mean_terminated_length": 562.0902099609375, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.192, "grad_norm": 0.033511292189359665, "kl": 0.1053466796875, "learning_rate": 5.555555555555555e-07, "loss": -0.0235, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.030846022069454193, "mask/share_reasoning": 0.861070454120636, "mask/share_step_conf": 0.1041773110628128, "num_tokens": 42678302.0, "reward": 0.7754788398742676, "reward_std": 0.19080181419849396, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.742948055267334, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8080097436904907, "step": 180 }, { "adv/mean_abs_final_conf": 0.7309097051620483, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.761759877204895, "adv/std_final_conf": 0.9095430970191956, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350034594535828, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7781721956886537, "calib/avg_num_step_conf": 4.828125, "calib/ece": 0.2001606425702812, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.42168674698795183, "calib/gap": 0.32375306570285284, "calib/mean_conf": 0.6502008032128515, "calib/mu_c": 0.8153278688524591, "calib/mu_w": 0.4915748031496063, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.18020080321285148, "calib/std_conf": 0.3559538910160631, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5650313479623824, "calib/step_q_c_n": 638.0, "calib/step_q_gap": 0.13148619746071016, "calib/step_q_w": 0.43354515050167225, "calib/step_q_w_n": 598.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 492.484375, "completions/mean_terminated_length": 494.41571044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.19306666666666666, "grad_norm": 0.026930317282676697, "kl": 0.12311553955078125, "learning_rate": 5.277777777777779e-07, "loss": -0.0894, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03484185039997101, "mask/share_reasoning": 0.8484460115432739, "mask/share_step_conf": 0.11280588805675507, "num_tokens": 42910642.0, "reward": 0.7667028307914734, "reward_std": 0.1992136836051941, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7242370843887329, "rewards/format_reward_step": 0.94921875, "rewards/step_l2_reward": 0.8091685175895691, "step": 181 }, { "adv/mean_abs_final_conf": 0.716362476348877, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7423475980758667, "adv/std_final_conf": 0.8897429704666138, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351845979690552, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7849022736338251, "calib/avg_num_step_conf": 5.40625, "calib/ece": 0.1755465587044534, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.48582995951417, "calib/gap": 0.3501781677968355, "calib/mean_conf": 0.6844534412955465, "calib/mu_c": 0.8389855072463768, "calib/mu_w": 0.48880733944954124, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1506477732793522, "calib/std_conf": 0.3539682573120276, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.545406976744186, "calib/step_q_c_n": 688.0, "calib/step_q_gap": 0.14319433306602514, "calib/step_q_w": 0.4022126436781609, "calib/step_q_w_n": 696.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2946.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 521.8359375, "completions/mean_terminated_length": 521.8359375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.19413333333333332, "grad_norm": 0.02837144397199154, "kl": 0.110260009765625, "learning_rate": 5.000000000000001e-07, "loss": -0.0358, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03232816234230995, "mask/share_reasoning": 0.8547617793083191, "mask/share_step_conf": 0.11291007697582245, "num_tokens": 43150392.0, "reward": 0.7891018390655518, "reward_std": 0.20950955152511597, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.7541359663009644, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8240677118301392, "step": 182 }, { "adv/mean_abs_final_conf": 0.6966094374656677, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7494475841522217, "adv/std_final_conf": 0.8805187344551086, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.935265839099884, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.711265756302521, "calib/avg_num_step_conf": 4.8671875, "calib/ece": 0.23927125506072872, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.44129554655870445, "calib/gap": 0.24173581932773103, "calib/mean_conf": 0.6294736842105263, "calib/mu_c": 0.7459375, "calib/mu_w": 0.504201680672269, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.17526315789473682, "calib/std_conf": 0.374413612133966, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5725888324873096, "calib/step_q_c_n": 591.0, "calib/step_q_gap": 0.16466516836517214, "calib/step_q_w": 0.4079236641221375, "calib/step_q_w_n": 655.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2304.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 516.7265625, "completions/mean_terminated_length": 516.7265625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1952, "grad_norm": 0.02867904305458069, "kl": 0.1077117919921875, "learning_rate": 4.7222222222222226e-07, "loss": -0.1072, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0338304303586483, "mask/share_reasoning": 0.8622312545776367, "mask/share_step_conf": 0.1039382666349411, "num_tokens": 43389354.0, "reward": 0.7498453259468079, "reward_std": 0.2144506722688675, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6893109679222107, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.8103797435760498, "step": 183 }, { "adv/mean_abs_final_conf": 0.722597599029541, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7299163937568665, "adv/std_final_conf": 0.9212670922279358, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351829290390015, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7574013157894738, "calib/avg_num_step_conf": 5.5234375, "calib/ece": 0.1932142857142856, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5357142857142857, "calib/gap": 0.31155526315789495, "calib/mean_conf": 0.7247222222222223, "calib/mu_c": 0.8483552631578949, "calib/mu_w": 0.5367999999999999, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.15738095238095226, "calib/std_conf": 0.3386192819046213, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5488351920693928, "calib/step_q_c_n": 807.0, "calib/step_q_gap": 0.10737555450761355, "calib/step_q_w": 0.44145963756177925, "calib/step_q_w_n": 607.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2298.0, "completions/max_terminated_length": 2298.0, "completions/mean_length": 507.171875, "completions/mean_terminated_length": 507.171875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.19626666666666667, "grad_norm": 0.025500323623418808, "kl": 0.1114654541015625, "learning_rate": 4.444444444444445e-07, "loss": -0.052, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033292390406131744, "mask/share_reasoning": 0.8502817749977112, "mask/share_step_conf": 0.11642585694789886, "num_tokens": 43624470.0, "reward": 0.794865608215332, "reward_std": 0.21812523901462555, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.756519079208374, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8332120776176453, "step": 184 }, { "adv/mean_abs_final_conf": 0.6467674970626831, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7386155724525452, "adv/std_final_conf": 0.8718067407608032, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341294169425964, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.8350427350427351, "calib/avg_num_step_conf": 5.50390625, "calib/ece": 0.1701673640167365, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.5439330543933054, "calib/gap": 0.44017663817663805, "calib/mean_conf": 0.7178661087866108, "calib/mu_c": 0.9094074074074073, "calib/mu_w": 0.4692307692307693, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.1615899581589959, "calib/std_conf": 0.36194160117195295, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.593742774566474, "calib/step_q_c_n": 692.0, "calib/step_q_gap": 0.22329647052184354, "calib/step_q_w": 0.3704463040446304, "calib/step_q_w_n": 717.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 556.5, "completions/mean_terminated_length": 558.682373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.19733333333333333, "grad_norm": 0.027720289304852486, "kl": 0.09789276123046875, "learning_rate": 4.1666666666666667e-07, "loss": -0.0947, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.03240993991494179, "mask/share_reasoning": 0.8568481802940369, "mask/share_step_conf": 0.10683561861515045, "num_tokens": 43873854.0, "reward": 0.7714694738388062, "reward_std": 0.19280867278575897, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7566031217575073, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.786335825920105, "step": 185 }, { "adv/mean_abs_final_conf": 0.6654640436172485, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7512376308441162, "adv/std_final_conf": 0.8803311586380005, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341305494308472, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7729294231388978, "calib/avg_num_step_conf": 5.51953125, "calib/ece": 0.17456349206349214, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.48412698412698413, "calib/gap": 0.3772716725749276, "calib/mean_conf": 0.6609126984126985, "calib/mu_c": 0.8211034482758621, "calib/mu_w": 0.4438317757009345, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1300396825396826, "calib/std_conf": 0.3737470607166107, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5759733333333333, "calib/step_q_c_n": 750.0, "calib/step_q_gap": 0.16490244343891408, "calib/step_q_w": 0.41107088989441926, "calib/step_q_w_n": 663.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2543.0, "completions/max_terminated_length": 2543.0, "completions/mean_length": 513.26953125, "completions/mean_terminated_length": 513.26953125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.1984, "grad_norm": 0.03017481043934822, "kl": 0.1173858642578125, "learning_rate": 3.8888888888888895e-07, "loss": -0.0154, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03294532373547554, "mask/share_reasoning": 0.8512404561042786, "mask/share_step_conf": 0.115814208984375, "num_tokens": 44110291.0, "reward": 0.8168442249298096, "reward_std": 0.16611357033252716, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7806402444839478, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8530482053756714, "step": 186 }, { "adv/mean_abs_final_conf": 0.7587389945983887, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7640891075134277, "adv/std_final_conf": 0.9130141139030457, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348838329315186, "calib/answer_extract_rate": 0.9296875, "calib/auroc": 0.7465395480225988, "calib/avg_num_step_conf": 5.296875, "calib/ece": 0.24495798319327733, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.921875, "calib/frac_conf_gt_0.9": 0.5042016806722689, "calib/gap": 0.29432062146892646, "calib/mean_conf": 0.7091596638655461, "calib/mu_c": 0.8550833333333333, "calib/mu_w": 0.5607627118644068, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.22495798319327734, "calib/std_conf": 0.33363437903968757, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.5535940099833612, "calib/step_q_c_n": 601.0, "calib/step_q_gap": 0.1172231490562089, "calib/step_q_w": 0.43637086092715227, "calib/step_q_w_n": 755.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 592.921875, "completions/mean_terminated_length": 595.2470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 78.0, "epoch": 0.19946666666666665, "grad_norm": 0.04078133404254913, "kl": 0.12639617919921875, "learning_rate": 3.611111111111111e-07, "loss": -0.0517, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.029552815482020378, "mask/share_reasoning": 0.865035891532898, "mask/share_step_conf": 0.10150502622127533, "num_tokens": 44363623.0, "reward": 0.7370548248291016, "reward_std": 0.24327854812145233, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6864898204803467, "rewards/format_reward_step": 0.921875, "rewards/step_l2_reward": 0.7876197099685669, "step": 187 }, { "adv/mean_abs_final_conf": 0.741357147693634, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7504744529724121, "adv/std_final_conf": 0.8928669691085815, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348607063293457, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7147405175955087, "calib/avg_num_step_conf": 5.5390625, "calib/ece": 0.23893004115226332, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.4732510288065844, "calib/gap": 0.23204162672874173, "calib/mean_conf": 0.6762139917695473, "calib/mu_c": 0.7802985074626867, "calib/mu_w": 0.548256880733945, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.1818518518518518, "calib/std_conf": 0.3621910738279023, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5239970501474926, "calib/step_q_c_n": 678.0, "calib/step_q_gap": 0.09524029339073592, "calib/step_q_w": 0.42875675675675673, "calib/step_q_w_n": 740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2787.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 581.21484375, "completions/mean_terminated_length": 583.494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.20053333333333334, "grad_norm": 0.025433626025915146, "kl": 0.0932159423828125, "learning_rate": 3.3333333333333335e-07, "loss": -0.0809, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.032101910561323166, "mask/share_reasoning": 0.8519271612167358, "mask/share_step_conf": 0.11206468939781189, "num_tokens": 44616486.0, "reward": 0.7399386167526245, "reward_std": 0.21706843376159668, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6801941394805908, "rewards/format_reward_step": 0.9453125, "rewards/step_l2_reward": 0.7996830940246582, "step": 188 }, { "adv/mean_abs_final_conf": 0.7388796210289001, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7620296478271484, "adv/std_final_conf": 0.9037927985191345, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347342848777771, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7080489763147331, "calib/avg_num_step_conf": 5.09375, "calib/ece": 0.21684210526315784, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.39271255060728744, "calib/gap": 0.24630603505954773, "calib/mean_conf": 0.5980566801619435, "calib/mu_c": 0.703758865248227, "calib/mu_w": 0.4574528301886792, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.12202429149797565, "calib/std_conf": 0.3709767565122908, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4952091254752851, "calib/step_q_c_n": 789.0, "calib/step_q_gap": 0.0966848536306249, "calib/step_q_w": 0.3985242718446602, "calib/step_q_w_n": 515.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2199.0, "completions/max_terminated_length": 2199.0, "completions/mean_length": 493.4765625, "completions/mean_terminated_length": 493.4765625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.2016, "grad_norm": 0.024370839819312096, "kl": 0.1257476806640625, "learning_rate": 3.055555555555556e-07, "loss": -0.085, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.035693950951099396, "mask/share_reasoning": 0.8530647158622742, "mask/share_step_conf": 0.11124132573604584, "num_tokens": 44850584.0, "reward": 0.7606855630874634, "reward_std": 0.18621540069580078, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7114140391349792, "rewards/format_reward_step": 0.96484375, "rewards/step_l2_reward": 0.8099571466445923, "step": 189 }, { "adv/mean_abs_final_conf": 0.7214258313179016, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7243682742118835, "adv/std_final_conf": 0.9232921004295349, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349976181983948, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8055611055611055, "calib/avg_num_step_conf": 5.5625, "calib/ece": 0.13370967741935494, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.39919354838709675, "calib/gap": 0.38047952047952055, "calib/mean_conf": 0.6365322580645162, "calib/mu_c": 0.7976223776223776, "calib/mu_w": 0.4171428571428571, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.09681451612903236, "calib/std_conf": 0.3574552630588191, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5187714987714988, "calib/step_q_c_n": 814.0, "calib/step_q_gap": 0.14439936762395778, "calib/step_q_w": 0.374372131147541, "calib/step_q_w_n": 610.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2608.0, "completions/max_terminated_length": 2608.0, "completions/mean_length": 571.48828125, "completions/mean_terminated_length": 571.48828125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.20266666666666666, "grad_norm": 0.03225598856806755, "kl": 0.09828948974609375, "learning_rate": 2.7777777777777776e-07, "loss": -0.0619, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.030426179990172386, "mask/share_reasoning": 0.8637335300445557, "mask/share_step_conf": 0.10584026575088501, "num_tokens": 45102493.0, "reward": 0.8016316294670105, "reward_std": 0.20212367177009583, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7732390761375427, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.830024242401123, "step": 190 }, { "adv/mean_abs_final_conf": 0.6836140751838684, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7573249340057373, "adv/std_final_conf": 0.8904557228088379, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9353086352348328, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7221316818774447, "calib/avg_num_step_conf": 5.92578125, "calib/ece": 0.2806854838709677, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5524193548387096, "calib/gap": 0.26793611473272494, "calib/mean_conf": 0.7280241935483871, "calib/mu_c": 0.8684745762711864, "calib/mu_w": 0.6005384615384615, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2664516129032258, "calib/std_conf": 0.3429928280005771, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5900663349917082, "calib/step_q_c_n": 603.0, "calib/step_q_gap": 0.17962869823022032, "calib/step_q_w": 0.4104376367614879, "calib/step_q_w_n": 914.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 533.52734375, "completions/mean_terminated_length": 533.52734375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.20373333333333332, "grad_norm": 0.03718110918998718, "kl": 0.1095123291015625, "learning_rate": 2.5000000000000004e-07, "loss": -0.0399, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03474542871117592, "mask/share_reasoning": 0.8406577706336975, "mask/share_step_conf": 0.12459678947925568, "num_tokens": 45343244.0, "reward": 0.7328799962997437, "reward_std": 0.1909710317850113, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.6795296669006348, "rewards/format_reward_step": 0.9609375, "rewards/step_l2_reward": 0.7862304449081421, "step": 191 }, { "adv/mean_abs_final_conf": 0.7038275003433228, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7256333827972412, "adv/std_final_conf": 0.8924391269683838, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348781704902649, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.79156223893066, "calib/avg_num_step_conf": 4.93359375, "calib/ece": 0.16763485477178416, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.44398340248962653, "calib/gap": 0.3649143692564744, "calib/mean_conf": 0.6344398340248963, "calib/mu_c": 0.79796992481203, "calib/mu_w": 0.4330555555555556, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.12510373443983397, "calib/std_conf": 0.37149886703162577, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5448253968253969, "calib/step_q_c_n": 630.0, "calib/step_q_gap": 0.15837989919506512, "calib/step_q_w": 0.38644549763033176, "calib/step_q_w_n": 633.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2567.0, "completions/max_terminated_length": 2567.0, "completions/mean_length": 531.1328125, "completions/mean_terminated_length": 533.2156982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.2048, "grad_norm": 0.030240915715694427, "kl": 0.1117401123046875, "learning_rate": 2.2222222222222224e-07, "loss": -0.1492, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.034587934613227844, "mask/share_reasoning": 0.852296769618988, "mask/share_step_conf": 0.10920904576778412, "num_tokens": 45584190.0, "reward": 0.7673456072807312, "reward_std": 0.2374992072582245, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.735599160194397, "rewards/format_reward_step": 0.9296875, "rewards/step_l2_reward": 0.7990920543670654, "step": 192 }, { "adv/mean_abs_final_conf": 0.7664552927017212, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7637758851051331, "adv/std_final_conf": 0.9189733266830444, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9349616169929504, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.775987861570248, "calib/avg_num_step_conf": 5.12109375, "calib/ece": 0.2009638554216867, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.3855421686746988, "calib/gap": 0.3164585485537191, "calib/mean_conf": 0.635421686746988, "calib/mu_c": 0.7980991735537191, "calib/mu_w": 0.481640625, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.1752208835341365, "calib/std_conf": 0.35633442125431675, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5100494233937397, "calib/step_q_c_n": 607.0, "calib/step_q_gap": 0.09916874157555788, "calib/step_q_w": 0.4108806818181818, "calib/step_q_w_n": 704.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2898.0, "completions/max_terminated_length": 2898.0, "completions/mean_length": 549.03515625, "completions/mean_terminated_length": 549.03515625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.20586666666666667, "grad_norm": 0.04683626815676689, "kl": 0.1035919189453125, "learning_rate": 1.9444444444444447e-07, "loss": -0.1273, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.032026469707489014, "mask/share_reasoning": 0.8625136613845825, "mask/share_step_conf": 0.10545986890792847, "num_tokens": 45830455.0, "reward": 0.7765910625457764, "reward_std": 0.20513297617435455, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.7382304668426514, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8149516582489014, "step": 193 }, { "adv/mean_abs_final_conf": 0.7188024520874023, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7388456463813782, "adv/std_final_conf": 0.8925259113311768, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9348104000091553, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8354273044880408, "calib/avg_num_step_conf": 4.88671875, "calib/ece": 0.2031967213114754, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.48770491803278687, "calib/gap": 0.4008196721311476, "calib/mean_conf": 0.666311475409836, "calib/mu_c": 0.8667213114754099, "calib/mu_w": 0.46590163934426226, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.18475409836065573, "calib/std_conf": 0.36803911236303505, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5610172143974961, "calib/step_q_c_n": 639.0, "calib/step_q_gap": 0.1487296326981497, "calib/step_q_w": 0.4122875816993464, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2607.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 498.07421875, "completions/mean_terminated_length": 500.0274658203125, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.20693333333333333, "grad_norm": 0.024641873314976692, "kl": 0.105682373046875, "learning_rate": 1.6666666666666668e-07, "loss": -0.1451, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.034925445914268494, "mask/share_reasoning": 0.8536133766174316, "mask/share_step_conf": 0.10755492746829987, "num_tokens": 46063906.0, "reward": 0.7782323360443115, "reward_std": 0.2255384773015976, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7424625158309937, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.814002275466919, "step": 194 }, { "adv/mean_abs_final_conf": 0.7369129657745361, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7271870374679565, "adv/std_final_conf": 0.9174585938453674, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9350939989089966, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7756257030371204, "calib/avg_num_step_conf": 5.30078125, "calib/ece": 0.16782426778242684, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.38493723849372385, "calib/gap": 0.364064257592801, "calib/mean_conf": 0.6210460251046024, "calib/mu_c": 0.7916535433070867, "calib/mu_w": 0.4275892857142857, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.12874476987447703, "calib/std_conf": 0.36755916891238644, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5499426111908179, "calib/step_q_c_n": 697.0, "calib/step_q_gap": 0.16221533846354513, "calib/step_q_w": 0.38772727272727275, "calib/step_q_w_n": 660.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2461.0, "completions/max_terminated_length": 2461.0, "completions/mean_length": 534.41015625, "completions/mean_terminated_length": 540.7470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.208, "grad_norm": 0.024135705083608627, "kl": 0.109375, "learning_rate": 1.3888888888888888e-07, "loss": -0.1971, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.03207322955131531, "mask/share_reasoning": 0.8431376814842224, "mask/share_step_conf": 0.11307032406330109, "num_tokens": 46306699.0, "reward": 0.757156252861023, "reward_std": 0.22981423139572144, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7289426326751709, "rewards/format_reward_step": 0.92578125, "rewards/step_l2_reward": 0.7853699326515198, "step": 195 }, { "adv/mean_abs_final_conf": 0.634067177772522, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7509913444519043, "adv/std_final_conf": 0.8568524718284607, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9345593452453613, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7871885225270576, "calib/avg_num_step_conf": 4.859375, "calib/ece": 0.1885770750988143, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.47035573122529645, "calib/gap": 0.33918701233324944, "calib/mean_conf": 0.708498023715415, "calib/mu_c": 0.8640145985401461, "calib/mu_w": 0.5248275862068966, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.17778656126482223, "calib/std_conf": 0.3414930554390387, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5664759036144578, "calib/step_q_c_n": 664.0, "calib/step_q_gap": 0.10759659326963023, "calib/step_q_w": 0.45887931034482754, "calib/step_q_w_n": 580.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 445.6484375, "completions/mean_terminated_length": 445.6484375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.20906666666666668, "grad_norm": 0.025791611522436142, "kl": 0.1186065673828125, "learning_rate": 1.1111111111111112e-07, "loss": -0.0162, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037724819034338, "mask/share_reasoning": 0.8455040454864502, "mask/share_step_conf": 0.11677109450101852, "num_tokens": 46523329.0, "reward": 0.7994219064712524, "reward_std": 0.15122339129447937, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7627730369567871, "rewards/format_reward_step": 0.984375, "rewards/step_l2_reward": 0.8360706567764282, "step": 196 }, { "adv/mean_abs_final_conf": 0.7248457670211792, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7576488256454468, "adv/std_final_conf": 0.9057278633117676, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9341248869895935, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8298182998053212, "calib/avg_num_step_conf": 5.875, "calib/ece": 0.16755020080321287, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.37751004016064255, "calib/gap": 0.39027644386761834, "calib/mean_conf": 0.6077108433734939, "calib/mu_c": 0.8177391304347825, "calib/mu_w": 0.4274626865671642, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.15670682730923696, "calib/std_conf": 0.36246808573389455, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5408744038155803, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.14510297524415178, "calib/step_q_w": 0.3957714285714285, "calib/step_q_w_n": 875.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3004.0, "completions/max_terminated_length": 3004.0, "completions/mean_length": 539.18359375, "completions/mean_terminated_length": 541.298095703125, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.21013333333333334, "grad_norm": 0.029260681942105293, "kl": 0.1084442138671875, "learning_rate": 8.333333333333334e-08, "loss": -0.0653, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03204625844955444, "mask/share_reasoning": 0.8442831635475159, "mask/share_step_conf": 0.11976435035467148, "num_tokens": 46766416.0, "reward": 0.806782603263855, "reward_std": 0.16487516462802887, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7711203098297119, "rewards/format_reward_step": 0.97265625, "rewards/step_l2_reward": 0.8424450159072876, "step": 197 }, { "adv/mean_abs_final_conf": 0.7016639709472656, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7586361765861511, "adv/std_final_conf": 0.8786138296127319, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9339083433151245, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8085466612488149, "calib/avg_num_step_conf": 5.625, "calib/ece": 0.13453061224489798, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4489795918367347, "calib/gap": 0.3924190708384126, "calib/mean_conf": 0.6554285714285714, "calib/mu_c": 0.8268115942028986, "calib/mu_w": 0.434392523364486, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11334693877551025, "calib/std_conf": 0.3682018080191506, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5540152477763659, "calib/step_q_c_n": 787.0, "calib/step_q_gap": 0.17266762143639658, "calib/step_q_w": 0.3813476263399693, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2952.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 495.03125, "completions/mean_terminated_length": 500.9012145996094, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.2112, "grad_norm": 0.029695816338062286, "kl": 0.1151885986328125, "learning_rate": 5.555555555555556e-08, "loss": -0.1509, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.034423746168613434, "mask/share_reasoning": 0.8307321667671204, "mask/share_step_conf": 0.12312532961368561, "num_tokens": 46998528.0, "reward": 0.8024085760116577, "reward_std": 0.18739835917949677, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.768500030040741, "rewards/format_reward_step": 0.95703125, "rewards/step_l2_reward": 0.8363170623779297, "step": 198 }, { "adv/mean_abs_final_conf": 0.7539445161819458, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7357039451599121, "adv/std_final_conf": 0.9093256592750549, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9351595044136047, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.6866793062284952, "calib/avg_num_step_conf": 5.3125, "calib/ece": 0.23665289256198357, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.4132231404958678, "calib/gap": 0.2108552770170634, "calib/mean_conf": 0.6543388429752066, "calib/mu_c": 0.7423404255319148, "calib/mu_w": 0.5314851485148514, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.15417355371900834, "calib/std_conf": 0.35539376494340014, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.51625, "calib/step_q_c_n": 712.0, "calib/step_q_gap": 0.10226851851851854, "calib/step_q_w": 0.41398148148148145, "calib/step_q_w_n": 648.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 559.0546875, "completions/mean_terminated_length": 565.683837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.21226666666666666, "grad_norm": 0.027883553877472878, "kl": 0.1039276123046875, "learning_rate": 2.777777777777778e-08, "loss": -0.1335, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03248848393559456, "mask/share_reasoning": 0.845879316329956, "mask/share_step_conf": 0.1099134162068367, "num_tokens": 47245846.0, "reward": 0.7473338842391968, "reward_std": 0.2364819198846817, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6842191219329834, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8104487061500549, "step": 199 }, { "adv/mean_abs_final_conf": 0.6555727124214172, "adv/mean_abs_reasoning": 0.0, "adv/mean_abs_step_conf": 0.7696542143821716, "adv/std_final_conf": 0.8538159728050232, "adv/std_reasoning": 0.0, "adv/std_step_conf": 0.9347708821296692, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.9080633802816902, "calib/avg_num_step_conf": 4.9765625, "calib/ece": 0.1094214876033059, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5247933884297521, "calib/gap": 0.5623704225352111, "calib/mean_conf": 0.6676859504132231, "calib/mu_c": 0.9000704225352111, "calib/mu_w": 0.33770000000000006, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.09516528925619847, "calib/std_conf": 0.3811872716705017, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5982966226138032, "calib/step_q_c_n": 681.0, "calib/step_q_gap": 0.24617183340638332, "calib/step_q_w": 0.3521247892074199, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2614.0, "completions/max_terminated_length": 2614.0, "completions/mean_length": 556.3125, "completions/mean_terminated_length": 560.6929321289062, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.21333333333333335, "grad_norm": 0.0319523885846138, "kl": 0.0972442626953125, "learning_rate": 0.0, "loss": -0.0845, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03294922411441803, "mask/share_reasoning": 0.8577835559844971, "mask/share_step_conf": 0.10145474225282669, "num_tokens": 47496310.0, "reward": 0.8319915533065796, "reward_std": 0.17969706654548645, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.8264539241790771, "rewards/format_reward_step": 0.94140625, "rewards/step_l2_reward": 0.8375290632247925, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": -0.09220705164130777, "train_runtime": 12915.8932, "train_samples_per_second": 3.964, "train_steps_per_second": 0.015 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 47496310, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }