{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "adv/mean_abs_final_conf": 0.773959219455719, "adv/mean_abs_reasoning": 0.47714588046073914, "adv/mean_abs_step_conf": 0.7557058930397034, "adv/ratio_final_to_reasoning": 1.622059942565935, "adv/ratio_step_to_reasoning": 1.583804710437786, "adv/std_final_conf": 0.9294352531433105, "adv/std_reasoning": 0.7393431663513184, "adv/std_step_conf": 0.9350293874740601, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.38076182006817844, "calib/avg_num_step_conf": 5.23046875, "calib/ece": 0.2003187250996017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.2948207171314741, "calib/gap": -0.026059730250481805, "calib/mean_conf": 0.8737051792828686, "calib/mu_c": 0.865606936416185, "calib/mu_w": 0.8916666666666668, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19239043824701207, "calib/std_conf": 0.09027744273295583, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7959393232205367, "calib/step_q_c_n": 857.0, "calib/step_q_gap": -0.006446568895645877, "calib/step_q_w": 0.8023858921161826, "calib/step_q_w_n": 482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 474.94921875, "completions/mean_terminated_length": 478.68896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0010666666666666667, "grad_norm": 0.04495124891400337, "kl": 0.000291675329208374, "learning_rate": 2.5000000000000004e-07, "loss": -0.0071, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03466901555657387, "mask/share_reasoning": 0.8340686559677124, "mask/share_step_conf": 0.12344987690448761, "num_tokens": 229171.0, "reward": 0.8151886463165283, "reward_std": 0.1976359486579895, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7142800688743591, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.5856284499168396, "step": 1 }, { "adv/mean_abs_final_conf": 0.7672724723815918, "adv/mean_abs_reasoning": 0.5104547739028931, "adv/mean_abs_step_conf": 0.7685759663581848, "adv/ratio_final_to_reasoning": 1.503115479781084, "adv/ratio_step_to_reasoning": 1.5056690732495641, "adv/std_final_conf": 0.9330522418022156, "adv/std_reasoning": 0.7575037479400635, "adv/std_step_conf": 0.9351121783256531, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.44343065693430656, "calib/avg_num_step_conf": 5.05859375, "calib/ece": 0.3349411764705883, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.2823529411764706, "calib/gap": 0.002352468143016151, "calib/mean_conf": 0.8721960784313726, "calib/mu_c": 0.8732846715328467, "calib/mu_w": 0.8709322033898306, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3349411764705883, "calib/std_conf": 0.07627016470309335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7954391371340525, "calib/step_q_c_n": 649.0, "calib/step_q_gap": 0.011011892552009073, "calib/step_q_w": 0.7844272445820434, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 492.9765625, "completions/mean_terminated_length": 494.9098205566406, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.0021333333333333334, "grad_norm": 0.03925655782222748, "kl": 0.00037539005279541016, "learning_rate": 5.000000000000001e-07, "loss": -0.0196, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03364308178424835, "mask/share_reasoning": 0.8523939251899719, "mask/share_step_conf": 0.11005672812461853, "num_tokens": 458661.0, "reward": 0.7541265487670898, "reward_std": 0.19602537155151367, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6320762038230896, "rewards/format_reward_step": 0.99609375, "rewards/step_l1_reward": 0.5699269771575928, "step": 2 }, { "adv/mean_abs_final_conf": 0.7629603147506714, "adv/mean_abs_reasoning": 0.456207275390625, "adv/mean_abs_step_conf": 0.7601202726364136, "adv/ratio_final_to_reasoning": 1.6723983941233527, "adv/ratio_step_to_reasoning": 1.6661730613251722, "adv/std_final_conf": 0.9287545680999756, "adv/std_reasoning": 0.7393346428871155, "adv/std_step_conf": 0.9348368048667908, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4309514251304697, "calib/avg_num_step_conf": 4.95703125, "calib/ece": 0.2599604743083003, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.31225296442687744, "calib/gap": -0.008907399973236974, "calib/mean_conf": 0.882806324110672, "calib/mu_c": 0.8794968553459118, "calib/mu_w": 0.8884042553191488, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2571541501976284, "calib/std_conf": 0.04582608082005031, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.7922554347826088, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.022049055795742123, "calib/step_q_w": 0.7702063789868667, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2826.0, "completions/max_terminated_length": 2826.0, "completions/mean_length": 507.96875, "completions/mean_terminated_length": 511.968505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.0032, "grad_norm": 0.04582794010639191, "kl": 0.0014024674892425537, "learning_rate": 7.5e-07, "loss": 0.0181, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032599903643131256, "mask/share_reasoning": 0.8509199023246765, "mask/share_step_conf": 0.10866767168045044, "num_tokens": 693957.0, "reward": 0.7870842218399048, "reward_std": 0.19606655836105347, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6834691166877747, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.5719493627548218, "step": 3 }, { "adv/mean_abs_final_conf": 0.7519498467445374, "adv/mean_abs_reasoning": 0.4396269917488098, "adv/mean_abs_step_conf": 0.7328969240188599, "adv/ratio_final_to_reasoning": 1.7104269320528431, "adv/ratio_step_to_reasoning": 1.6670880946218516, "adv/std_final_conf": 0.9299926161766052, "adv/std_reasoning": 0.7205653786659241, "adv/std_step_conf": 0.9349788427352905, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4655419222903886, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.2355731225296443, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.28063241106719367, "calib/gap": -0.002314928425358076, "calib/mean_conf": 0.8798418972332016, "calib/mu_c": 0.8790184049079753, "calib/mu_w": 0.8813333333333334, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2355731225296443, "calib/std_conf": 0.04344792330006411, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.8000382165605096, "calib/step_q_c_n": 785.0, "calib/step_q_gap": 0.02094194936993976, "calib/step_q_w": 0.7790962671905698, "calib/step_q_w_n": 509.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2604.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 506.82421875, "completions/mean_terminated_length": 506.82421875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.004266666666666667, "grad_norm": 0.04342903196811676, "kl": 0.00028195977210998535, "learning_rate": 1.0000000000000002e-06, "loss": 0.0294, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.033570367842912674, "mask/share_reasoning": 0.8493372201919556, "mask/share_step_conf": 0.11709243804216385, "num_tokens": 929872.0, "reward": 0.7906985878944397, "reward_std": 0.18596147000789642, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7032796740531921, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.5538987517356873, "step": 4 }, { "adv/mean_abs_final_conf": 0.7772389650344849, "adv/mean_abs_reasoning": 0.5261285305023193, "adv/mean_abs_step_conf": 0.7380795478820801, "adv/ratio_final_to_reasoning": 1.4772796379098065, "adv/ratio_step_to_reasoning": 1.4028502639410207, "adv/std_final_conf": 0.9322751760482788, "adv/std_reasoning": 0.757560670375824, "adv/std_step_conf": 0.9353094696998596, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.39873829344432876, "calib/avg_num_step_conf": 4.83984375, "calib/ece": 0.3755645161290322, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.27419354838709675, "calib/gap": -0.015967741935483892, "calib/mean_conf": 0.8755645161290322, "calib/mu_c": 0.8675806451612903, "calib/mu_w": 0.8835483870967742, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.3755645161290322, "calib/std_conf": 0.04874724838148169, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.7979742765273312, "calib/step_q_c_n": 622.0, "calib/step_q_gap": 0.019919171178870876, "calib/step_q_w": 0.7780551053484603, "calib/step_q_w_n": 617.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2635.0, "completions/max_terminated_length": 2635.0, "completions/mean_length": 530.97265625, "completions/mean_terminated_length": 533.054931640625, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.005333333333333333, "grad_norm": 0.037376418709754944, "kl": 0.00031588971614837646, "learning_rate": 1.25e-06, "loss": -0.0605, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03435836732387543, "mask/share_reasoning": 0.8499799370765686, "mask/share_step_conf": 0.11175543814897537, "num_tokens": 1172489.0, "reward": 0.6875247955322266, "reward_std": 0.18943388760089874, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.578542172908783, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.5074448585510254, "step": 5 }, { "adv/mean_abs_final_conf": 0.7624509334564209, "adv/mean_abs_reasoning": 0.42811134457588196, "adv/mean_abs_step_conf": 0.7660953998565674, "adv/ratio_final_to_reasoning": 1.780964095244334, "adv/ratio_step_to_reasoning": 1.7894769890190994, "adv/std_final_conf": 0.9306944608688354, "adv/std_reasoning": 0.701313316822052, "adv/std_step_conf": 0.9351617097854614, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5076169405815424, "calib/avg_num_step_conf": 5.22265625, "calib/ece": 0.3263636363636364, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.31225296442687744, "calib/gap": 0.0015790139064474351, "calib/mean_conf": 0.8797233201581027, "calib/mu_c": 0.8804285714285713, "calib/mu_w": 0.8788495575221239, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3263636363636364, "calib/std_conf": 0.04437001410662838, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7998520710059173, "calib/step_q_c_n": 676.0, "calib/step_q_gap": 0.005495036210153348, "calib/step_q_w": 0.7943570347957639, "calib/step_q_w_n": 661.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 453.703125, "completions/mean_terminated_length": 455.4823913574219, "completions/min_length": 0.0, "completions/min_terminated_length": 195.0, "epoch": 0.0064, "grad_norm": 0.04042307659983635, "kl": 0.009191513061523438, "learning_rate": 1.5e-06, "loss": 0.0336, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.036790646612644196, "mask/share_reasoning": 0.8311512470245361, "mask/share_step_conf": 0.12815183401107788, "num_tokens": 1394589.0, "reward": 0.7339399456977844, "reward_std": 0.17830194532871246, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6337183713912964, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.5279115438461304, "step": 6 }, { "adv/mean_abs_final_conf": 0.7737003564834595, "adv/mean_abs_reasoning": 0.471673846244812, "adv/mean_abs_step_conf": 0.7888888120651245, "adv/ratio_final_to_reasoning": 1.640329144054104, "adv/ratio_step_to_reasoning": 1.6725303265080103, "adv/std_final_conf": 0.9291283488273621, "adv/std_reasoning": 0.7206437587738037, "adv/std_step_conf": 0.9347559809684753, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.44000855492977825, "calib/avg_num_step_conf": 5.015625, "calib/ece": 0.2105555555555556, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.29365079365079366, "calib/gap": -0.008644756540956866, "calib/mean_conf": 0.8811904761904762, "calib/mu_c": 0.8783431952662721, "calib/mu_w": 0.886987951807229, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2105555555555556, "calib/std_conf": 0.04319772081835137, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.7899770378874857, "calib/step_q_c_n": 871.0, "calib/step_q_gap": 0.014650161374168413, "calib/step_q_w": 0.7753268765133173, "calib/step_q_w_n": 413.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 554.140625, "completions/mean_terminated_length": 554.140625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.007466666666666667, "grad_norm": 0.05730045214295387, "kl": 0.00027717649936676025, "learning_rate": 1.75e-06, "loss": 0.0567, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030790384858846664, "mask/share_reasoning": 0.8650421500205994, "mask/share_step_conf": 0.10416749119758606, "num_tokens": 1643873.0, "reward": 0.8190743923187256, "reward_std": 0.19272911548614502, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.717705488204956, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.5915369987487793, "step": 7 }, { "adv/mean_abs_final_conf": 0.7933437824249268, "adv/mean_abs_reasoning": 0.4549658000469208, "adv/mean_abs_step_conf": 0.7815483808517456, "adv/ratio_final_to_reasoning": 1.7437437766599357, "adv/ratio_step_to_reasoning": 1.7178178684444947, "adv/std_final_conf": 0.9299046993255615, "adv/std_reasoning": 0.7014166116714478, "adv/std_step_conf": 0.9355618953704834, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.47194433756573045, "calib/avg_num_step_conf": 4.99609375, "calib/ece": 0.3208764940239044, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.2788844621513944, "calib/gap": 0.01008464794151609, "calib/mean_conf": 0.8706772908366534, "calib/mu_c": 0.8752173913043478, "calib/mu_w": 0.8651327433628317, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3208764940239044, "calib/std_conf": 0.0805999629412233, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7944136807817589, "calib/step_q_c_n": 614.0, "calib/step_q_gap": 0.03266931987950328, "calib/step_q_w": 0.7617443609022556, "calib/step_q_w_n": 665.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 539.08203125, "completions/mean_terminated_length": 541.1961059570312, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.008533333333333334, "grad_norm": 0.0390574112534523, "kl": 0.0003594011068344116, "learning_rate": 2.0000000000000003e-06, "loss": 0.0279, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03241812810301781, "mask/share_reasoning": 0.8590602874755859, "mask/share_step_conf": 0.10461536049842834, "num_tokens": 1888390.0, "reward": 0.7530773878097534, "reward_std": 0.2001701146364212, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6282703280448914, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.5755406618118286, "step": 8 }, { "adv/mean_abs_final_conf": 0.7767202854156494, "adv/mean_abs_reasoning": 0.5762010812759399, "adv/mean_abs_step_conf": 0.7679073810577393, "adv/ratio_final_to_reasoning": 1.3480021309499797, "adv/ratio_step_to_reasoning": 1.3327072891937044, "adv/std_final_conf": 0.9332738518714905, "adv/std_reasoning": 0.8099425435066223, "adv/std_step_conf": 0.9358413815498352, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.49622048297544985, "calib/avg_num_step_conf": 4.48046875, "calib/ece": 0.2740800000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.288, "calib/gap": 0.009531072312529587, "calib/mean_conf": 0.87808, "calib/mu_c": 0.8818543046357618, "calib/mu_w": 0.8723232323232322, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.2740800000000001, "calib/std_conf": 0.07160107261766406, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.7836281859070465, "calib/step_q_c_n": 667.0, "calib/step_q_gap": -0.004455147426286721, "calib/step_q_w": 0.7880833333333332, "calib/step_q_w_n": 480.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 482.03125, "completions/mean_terminated_length": 487.7470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.0096, "grad_norm": 0.04033350199460983, "kl": 0.0003407597541809082, "learning_rate": 2.25e-06, "loss": -0.0697, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03390541672706604, "mask/share_reasoning": 0.8510233163833618, "mask/share_step_conf": 0.10335250198841095, "num_tokens": 2119326.0, "reward": 0.7477097511291504, "reward_std": 0.2547100782394409, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6577199697494507, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.5267621874809265, "step": 9 }, { "adv/mean_abs_final_conf": 0.7742620706558228, "adv/mean_abs_reasoning": 0.41145145893096924, "adv/mean_abs_step_conf": 0.7497866153717041, "adv/ratio_final_to_reasoning": 1.8817822949698755, "adv/ratio_step_to_reasoning": 1.8222966503018247, "adv/std_final_conf": 0.9290869832038879, "adv/std_reasoning": 0.7013988494873047, "adv/std_step_conf": 0.9350776076316833, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.4911539740605254, "calib/avg_num_step_conf": 5.140625, "calib/ece": 0.26777777777777767, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3412698412698413, "calib/gap": 0.0022986365147990018, "calib/mean_conf": 0.8828571428571429, "calib/mu_c": 0.883741935483871, "calib/mu_w": 0.881443298969072, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26777777777777767, "calib/std_conf": 0.048539905737234317, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.7831796588235294, "calib/step_q_c_n": 765.0, "calib/step_q_gap": 0.0010815666638197552, "calib/step_q_w": 0.7820980921597096, "calib/step_q_w_n": 551.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 517.20703125, "completions/mean_terminated_length": 519.2353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.010666666666666666, "grad_norm": 0.037387724965810776, "kl": 0.0003885924816131592, "learning_rate": 2.5e-06, "loss": 0.0666, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032854266464710236, "mask/share_reasoning": 0.8497961759567261, "mask/share_step_conf": 0.1134432703256607, "num_tokens": 2358531.0, "reward": 0.7786507606506348, "reward_std": 0.18859273195266724, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6794851422309875, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.5598475933074951, "step": 10 }, { "adv/mean_abs_final_conf": 0.7800927758216858, "adv/mean_abs_reasoning": 0.42066067457199097, "adv/mean_abs_step_conf": 0.7692493200302124, "adv/ratio_final_to_reasoning": 1.8544466430464548, "adv/ratio_step_to_reasoning": 1.8286694395973653, "adv/std_final_conf": 0.9302803874015808, "adv/std_reasoning": 0.6816288232803345, "adv/std_step_conf": 0.9348410964012146, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.43812219402693736, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.33092, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.384, "calib/gap": -0.012217450712473155, "calib/mean_conf": 0.8878799999999999, "calib/mu_c": 0.8825531914893617, "calib/mu_w": 0.8947706422018349, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3274, "calib/std_conf": 0.04921692391850593, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7793227091633466, "calib/step_q_c_n": 753.0, "calib/step_q_gap": 0.01725967191406297, "calib/step_q_w": 0.7620630372492836, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 544.44921875, "completions/mean_terminated_length": 544.44921875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.011733333333333333, "grad_norm": 0.04038803279399872, "kl": 0.000596165657043457, "learning_rate": 2.7500000000000004e-06, "loss": 0.1017, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032387614250183105, "mask/share_reasoning": 0.8491656184196472, "mask/share_step_conf": 0.11844678968191147, "num_tokens": 2602390.0, "reward": 0.7374609708786011, "reward_std": 0.17157597839832306, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6250066757202148, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.5452277660369873, "step": 11 }, { "adv/mean_abs_final_conf": 0.747472882270813, "adv/mean_abs_reasoning": 0.4134790897369385, "adv/mean_abs_step_conf": 0.7474517226219177, "adv/ratio_final_to_reasoning": 1.807764650798584, "adv/ratio_step_to_reasoning": 1.8077134761456923, "adv/std_final_conf": 0.9264019131660461, "adv/std_reasoning": 0.7013806104660034, "adv/std_step_conf": 0.9347985982894897, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5182906551380113, "calib/avg_num_step_conf": 5.51171875, "calib/ece": 0.2823809523809523, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.44841269841269843, "calib/gap": 0.0006185567010308368, "calib/mean_conf": 0.8897619047619049, "calib/mu_c": 0.8899999999999999, "calib/mu_w": 0.8893814432989691, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.27853174603174596, "calib/std_conf": 0.049975617637770565, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.7757761732851985, "calib/step_q_c_n": 831.0, "calib/step_q_gap": 0.021172725009336535, "calib/step_q_w": 0.754603448275862, "calib/step_q_w_n": 580.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2757.0, "completions/max_terminated_length": 2757.0, "completions/mean_length": 479.765625, "completions/mean_terminated_length": 481.6470947265625, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.0128, "grad_norm": 0.03798322379589081, "kl": 0.0012229681015014648, "learning_rate": 3e-06, "loss": 0.0572, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03574306145310402, "mask/share_reasoning": 0.830949068069458, "mask/share_step_conf": 0.1294015645980835, "num_tokens": 2829386.0, "reward": 0.7993886470794678, "reward_std": 0.17441141605377197, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6748757362365723, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6059328317642212, "step": 12 }, { "adv/mean_abs_final_conf": 0.7700250148773193, "adv/mean_abs_reasoning": 0.4588058590888977, "adv/mean_abs_step_conf": 0.7821769118309021, "adv/ratio_final_to_reasoning": 1.678324283840761, "adv/ratio_step_to_reasoning": 1.704810207489849, "adv/std_final_conf": 0.9316981434822083, "adv/std_reasoning": 0.7205221652984619, "adv/std_step_conf": 0.9355109333992004, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5232063492063492, "calib/avg_num_step_conf": 4.91015625, "calib/ece": 0.3004705882352941, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.403921568627451, "calib/gap": 0.0042761904761904335, "calib/mean_conf": 0.8887058823529412, "calib/mu_c": 0.8904666666666666, "calib/mu_w": 0.8861904761904762, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3004705882352941, "calib/std_conf": 0.04789993414283698, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.749095966620306, "calib/step_q_c_n": 719.0, "calib/step_q_gap": 0.002571803051532706, "calib/step_q_w": 0.7465241635687733, "calib/step_q_w_n": 538.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 474.79296875, "completions/mean_terminated_length": 476.6549377441406, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.013866666666666666, "grad_norm": 0.03955255076289177, "kl": 0.0017464160919189453, "learning_rate": 3.2500000000000002e-06, "loss": -0.0285, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0346112921833992, "mask/share_reasoning": 0.8478481769561768, "mask/share_step_conf": 0.11363425105810165, "num_tokens": 3055525.0, "reward": 0.7889747619628906, "reward_std": 0.18800292909145355, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6608437299728394, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.6014807224273682, "step": 13 }, { "adv/mean_abs_final_conf": 0.7658154964447021, "adv/mean_abs_reasoning": 0.5901321172714233, "adv/mean_abs_step_conf": 0.7683314085006714, "adv/ratio_final_to_reasoning": 1.297701775638955, "adv/ratio_step_to_reasoning": 1.3019650786898074, "adv/std_final_conf": 0.9326004981994629, "adv/std_reasoning": 0.8264757394790649, "adv/std_step_conf": 0.9356228709220886, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.4291593752025406, "calib/avg_num_step_conf": 5.203125, "calib/ece": 0.34828, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.496, "calib/gap": -0.011687730896364212, "calib/mean_conf": 0.9018800000000001, "calib/mu_c": 0.8966906474820143, "calib/mu_w": 0.9083783783783785, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.34707999999999994, "calib/std_conf": 0.04499850664188757, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.7377101449275362, "calib/step_q_c_n": 690.0, "calib/step_q_gap": 0.016650954896383574, "calib/step_q_w": 0.7210591900311526, "calib/step_q_w_n": 642.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 549.3046875, "completions/mean_terminated_length": 551.4588623046875, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.014933333333333333, "grad_norm": 0.035396840423345566, "kl": 0.004086017608642578, "learning_rate": 3.5e-06, "loss": 0.0084, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.032657988369464874, "mask/share_reasoning": 0.8497765064239502, "mask/share_step_conf": 0.11365923285484314, "num_tokens": 3301547.0, "reward": 0.7465819120407104, "reward_std": 0.2237074077129364, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6071679592132568, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.5828708410263062, "step": 14 }, { "adv/mean_abs_final_conf": 0.7601497173309326, "adv/mean_abs_reasoning": 0.41240811347961426, "adv/mean_abs_step_conf": 0.7841324806213379, "adv/ratio_final_to_reasoning": 1.8431977754203606, "adv/ratio_step_to_reasoning": 1.9013507615196283, "adv/std_final_conf": 0.9262159466743469, "adv/std_reasoning": 0.6815544366836548, "adv/std_step_conf": 0.9350322484970093, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.5059617400419287, "calib/avg_num_step_conf": 4.921875, "calib/ece": 0.282156862745098, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.5647058823529412, "calib/gap": 0.0034394654088049093, "calib/mean_conf": 0.9056862745098039, "calib/mu_c": 0.9069811320754717, "calib/mu_w": 0.9035416666666668, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.282156862745098, "calib/std_conf": 0.046427464064431254, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6902580645161289, "calib/step_q_c_n": 775.0, "calib/step_q_gap": -0.005523378782840149, "calib/step_q_w": 0.6957814432989691, "calib/step_q_w_n": 485.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 453.85546875, "completions/mean_terminated_length": 455.63531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.016, "grad_norm": 0.03976760804653168, "kl": 0.006726264953613281, "learning_rate": 3.7500000000000005e-06, "loss": -0.0166, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.0355098694562912, "mask/share_reasoning": 0.8437584638595581, "mask/share_step_conf": 0.11682546883821487, "num_tokens": 3525614.0, "reward": 0.8103689551353455, "reward_std": 0.17287077009677887, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.68243008852005, "rewards/format_reward_step": 0.99609375, "rewards/step_l1_reward": 0.6140890121459961, "step": 15 }, { "adv/mean_abs_final_conf": 0.7611852288246155, "adv/mean_abs_reasoning": 0.48267441987991333, "adv/mean_abs_step_conf": 0.7547823190689087, "adv/ratio_final_to_reasoning": 1.5770158878815126, "adv/ratio_step_to_reasoning": 1.5637504039611096, "adv/std_final_conf": 0.9273737072944641, "adv/std_reasoning": 0.7206409573554993, "adv/std_step_conf": 0.9352510571479797, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5414437619708078, "calib/avg_num_step_conf": 6.5078125, "calib/ece": 0.3295600000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.692, "calib/gap": 0.011701340730466692, "calib/mean_conf": 0.9175599999999999, "calib/mu_c": 0.9223809523809522, "calib/mu_w": 0.9106796116504855, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3295600000000001, "calib/std_conf": 0.050168181151004465, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.6367959183673468, "calib/step_q_c_n": 980.0, "calib/step_q_gap": 0.003457725947521695, "calib/step_q_w": 0.6333381924198251, "calib/step_q_w_n": 686.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2560.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 656.74609375, "completions/mean_terminated_length": 659.3215942382812, "completions/min_length": 0.0, "completions/min_terminated_length": 214.0, "epoch": 0.017066666666666667, "grad_norm": 0.03466910123825073, "kl": 0.008123397827148438, "learning_rate": 4.000000000000001e-06, "loss": 0.0199, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.025563012808561325, "mask/share_reasoning": 0.861224889755249, "mask/share_step_conf": 0.10930580645799637, "num_tokens": 3802589.0, "reward": 0.7798618674278259, "reward_std": 0.19322898983955383, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6369988322257996, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6125686168670654, "step": 16 }, { "adv/mean_abs_final_conf": 0.7376457452774048, "adv/mean_abs_reasoning": 0.45587021112442017, "adv/mean_abs_step_conf": 0.773207426071167, "adv/ratio_final_to_reasoning": 1.6181047308574412, "adv/ratio_step_to_reasoning": 1.6961130760529917, "adv/std_final_conf": 0.9277526140213013, "adv/std_reasoning": 0.7392179369926453, "adv/std_step_conf": 0.9353331923484802, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5414690069576218, "calib/avg_num_step_conf": 5.39453125, "calib/ece": 0.19047244094488186, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6456692913385826, "calib/gap": 0.0013456672991774, "calib/mean_conf": 0.9137795275590551, "calib/mu_c": 0.9141397849462364, "calib/mu_w": 0.912794117647059, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18598425196850388, "calib/std_conf": 0.05437445460105899, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6245174129353234, "calib/step_q_c_n": 1005.0, "calib/step_q_gap": 0.009118476765110684, "calib/step_q_w": 0.6153989361702127, "calib/step_q_w_n": 376.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2384.0, "completions/max_terminated_length": 2384.0, "completions/mean_length": 505.03515625, "completions/mean_terminated_length": 507.0157165527344, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.018133333333333335, "grad_norm": 0.04985566437244415, "kl": 0.01265716552734375, "learning_rate": 4.25e-06, "loss": 0.0316, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032756924629211426, "mask/share_reasoning": 0.8445059061050415, "mask/share_step_conf": 0.11883093416690826, "num_tokens": 4035406.0, "reward": 0.8795958757400513, "reward_std": 0.19196872413158417, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7625812292098999, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.6528604030609131, "step": 17 }, { "adv/mean_abs_final_conf": 0.754453182220459, "adv/mean_abs_reasoning": 0.40319809317588806, "adv/mean_abs_step_conf": 0.7516779899597168, "adv/ratio_final_to_reasoning": 1.871172495578599, "adv/ratio_step_to_reasoning": 1.8642895457142217, "adv/std_final_conf": 0.9269874691963196, "adv/std_reasoning": 0.701314389705658, "adv/std_step_conf": 0.9353558421134949, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5397597977243995, "calib/avg_num_step_conf": 4.94921875, "calib/ece": 0.37280632411067194, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7193675889328063, "calib/gap": 0.010398230088495497, "calib/mean_conf": 0.9103557312252964, "calib/mu_c": 0.9149999999999999, "calib/mu_w": 0.9046017699115044, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.364901185770751, "calib/std_conf": 0.11643023635425974, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.6056040756914118, "calib/step_q_c_n": 687.0, "calib/step_q_gap": 0.005276489484515312, "calib/step_q_w": 0.6003275862068965, "calib/step_q_w_n": 580.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2510.0, "completions/max_terminated_length": 2510.0, "completions/mean_length": 498.77734375, "completions/mean_terminated_length": 500.7333679199219, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.0192, "grad_norm": 0.034936245530843735, "kl": 0.013996124267578125, "learning_rate": 4.5e-06, "loss": -0.0685, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03259109705686569, "mask/share_reasoning": 0.8579654097557068, "mask/share_step_conf": 0.10553723573684692, "num_tokens": 4273813.0, "reward": 0.7512034177780151, "reward_std": 0.18561282753944397, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.601987898349762, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.594950258731842, "step": 18 }, { "adv/mean_abs_final_conf": 0.7315109372138977, "adv/mean_abs_reasoning": 0.37319090962409973, "adv/mean_abs_step_conf": 0.7599064111709595, "adv/ratio_final_to_reasoning": 1.9601520785989117, "adv/ratio_step_to_reasoning": 2.0362404109370797, "adv/std_final_conf": 0.9230652451515198, "adv/std_reasoning": 0.6815478205680847, "adv/std_step_conf": 0.9353557825088501, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5212645966029724, "calib/avg_num_step_conf": 4.5, "calib/ece": 0.30189723320158096, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.758893280632411, "calib/gap": 0.02426021762208075, "calib/mean_conf": 0.9224505928853755, "calib/mu_c": 0.931656050955414, "calib/mu_w": 0.9073958333333333, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30189723320158096, "calib/std_conf": 0.08276545823423914, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6106906077348067, "calib/step_q_c_n": 724.0, "calib/step_q_gap": 0.058798084370320636, "calib/step_q_w": 0.551892523364486, "calib/step_q_w_n": 428.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 473.13671875, "completions/mean_terminated_length": 476.8622131347656, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.020266666666666665, "grad_norm": 0.03250151127576828, "kl": 0.021490097045898438, "learning_rate": 4.75e-06, "loss": -0.0751, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03216724097728729, "mask/share_reasoning": 0.8552690744400024, "mask/share_step_conf": 0.10475122183561325, "num_tokens": 4499696.0, "reward": 0.814599871635437, "reward_std": 0.18274182081222534, "rewards/accuracy_reward_step": 0.61328125, "rewards/final_brier_reward_step": 0.6661339402198792, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6435343623161316, "step": 19 }, { "adv/mean_abs_final_conf": 0.7408556342124939, "adv/mean_abs_reasoning": 0.38389551639556885, "adv/mean_abs_step_conf": 0.7733815908432007, "adv/ratio_final_to_reasoning": 1.9298366419291821, "adv/ratio_step_to_reasoning": 2.0145627073338948, "adv/std_final_conf": 0.9134058952331543, "adv/std_reasoning": 0.6815377473831177, "adv/std_step_conf": 0.9353929162025452, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.4916993464052288, "calib/avg_num_step_conf": 5.5234375, "calib/ece": 0.348452380952381, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9047619047619048, "calib/gap": 0.005211764705882138, "calib/mean_conf": 0.9436904761904762, "calib/mu_c": 0.9458000000000001, "calib/mu_w": 0.940588235294118, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.348452380952381, "calib/std_conf": 0.038764032643126396, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.559713216957606, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.007621713689632159, "calib/step_q_w": 0.5520915032679738, "calib/step_q_w_n": 612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2729.0, "completions/max_terminated_length": 2729.0, "completions/mean_length": 475.04296875, "completions/mean_terminated_length": 475.04296875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.021333333333333333, "grad_norm": 0.042394109070301056, "kl": 0.028331756591796875, "learning_rate": 5e-06, "loss": 0.0176, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03551467880606651, "mask/share_reasoning": 0.8324639201164246, "mask/share_step_conf": 0.13202139735221863, "num_tokens": 4726179.0, "reward": 0.7870444655418396, "reward_std": 0.17159029841423035, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6286808252334595, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6313455104827881, "step": 20 }, { "adv/mean_abs_final_conf": 0.7471381425857544, "adv/mean_abs_reasoning": 0.4719105064868927, "adv/mean_abs_step_conf": 0.751166582107544, "adv/ratio_final_to_reasoning": 1.5832199798808806, "adv/ratio_step_to_reasoning": 1.5917564279285814, "adv/std_final_conf": 0.9129815697669983, "adv/std_reasoning": 0.7393201589584351, "adv/std_step_conf": 0.9356944561004639, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.48754306917572227, "calib/avg_num_step_conf": 5.41796875, "calib/ece": 0.346904761904762, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9801587301587301, "calib/gap": -0.0027643784786641756, "calib/mean_conf": 0.9562698412698413, "calib/mu_c": 0.9551948051948052, "calib/mu_w": 0.9579591836734693, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3460317460317461, "calib/std_conf": 0.023845871726307317, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5527120669056153, "calib/step_q_c_n": 837.0, "calib/step_q_gap": 0.030457521451069836, "calib/step_q_w": 0.5222545454545454, "calib/step_q_w_n": 550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2864.0, "completions/max_terminated_length": 2864.0, "completions/mean_length": 509.40234375, "completions/mean_terminated_length": 511.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.0224, "grad_norm": 0.030131801962852478, "kl": 0.027887344360351562, "learning_rate": 4.9722222222222224e-06, "loss": -0.0461, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03290124982595444, "mask/share_reasoning": 0.844321608543396, "mask/share_step_conf": 0.11887086927890778, "num_tokens": 4959546.0, "reward": 0.791034996509552, "reward_std": 0.20873993635177612, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.63130784034729, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6335747241973877, "step": 21 }, { "adv/mean_abs_final_conf": 0.734825074672699, "adv/mean_abs_reasoning": 0.31760308146476746, "adv/mean_abs_step_conf": 0.7737596035003662, "adv/ratio_final_to_reasoning": 2.3136585176810227, "adv/ratio_step_to_reasoning": 2.436247154567363, "adv/std_final_conf": 0.8874879479408264, "adv/std_reasoning": 0.5961337089538574, "adv/std_step_conf": 0.9354345798492432, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5057532424343198, "calib/avg_num_step_conf": 5.53515625, "calib/ece": 0.34626984126984117, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9841269841269841, "calib/gap": -0.0008234120385766097, "calib/mean_conf": 0.9613492063492063, "calib/mu_c": 0.9610322580645162, "calib/mu_w": 0.9618556701030928, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34626984126984117, "calib/std_conf": 0.02009316296113722, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5426682134570765, "calib/step_q_c_n": 862.0, "calib/step_q_gap": 0.010217763006626024, "calib/step_q_w": 0.5324504504504505, "calib/step_q_w_n": 555.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 474.7109375, "completions/mean_terminated_length": 476.57257080078125, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.023466666666666667, "grad_norm": 0.027000360190868378, "kl": 0.0365753173828125, "learning_rate": 4.944444444444445e-06, "loss": -0.0405, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03261125460267067, "mask/share_reasoning": 0.8394464254379272, "mask/share_step_conf": 0.12403606623411179, "num_tokens": 5182888.0, "reward": 0.7897454500198364, "reward_std": 0.15976503491401672, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6325070261955261, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6290149688720703, "step": 22 }, { "adv/mean_abs_final_conf": 0.729829728603363, "adv/mean_abs_reasoning": 0.5199911594390869, "adv/mean_abs_step_conf": 0.7715022563934326, "adv/ratio_final_to_reasoning": 1.403542570590293, "adv/ratio_step_to_reasoning": 1.483683409590367, "adv/std_final_conf": 0.9012270569801331, "adv/std_reasoning": 0.7753241062164307, "adv/std_step_conf": 0.9355471730232239, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5126816055158829, "calib/avg_num_step_conf": 5.51171875, "calib/ece": 0.45525490196078433, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.996078431372549, "calib/gap": 0.000213001723713524, "calib/mean_conf": 0.9689803921568627, "calib/mu_c": 0.9690839694656489, "calib/mu_w": 0.9688709677419354, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.45525490196078433, "calib/std_conf": 0.015228314953296749, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5279524438573315, "calib/step_q_c_n": 757.0, "calib/step_q_gap": -0.024876302320038568, "calib/step_q_w": 0.5528287461773701, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 516.83203125, "completions/mean_terminated_length": 518.8588256835938, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.024533333333333334, "grad_norm": 0.03263654559850693, "kl": 0.039920806884765625, "learning_rate": 4.9166666666666665e-06, "loss": -0.0187, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.033953502774238586, "mask/share_reasoning": 0.8383735418319702, "mask/share_step_conf": 0.1237666979432106, "num_tokens": 5419133.0, "reward": 0.7309042811393738, "reward_std": 0.21733129024505615, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5406855344772339, "rewards/format_reward_step": 0.99609375, "rewards/step_l1_reward": 0.6195605397224426, "step": 23 }, { "adv/mean_abs_final_conf": 0.7107611894607544, "adv/mean_abs_reasoning": 0.5182062983512878, "adv/mean_abs_step_conf": 0.7769420742988586, "adv/ratio_final_to_reasoning": 1.371579603957139, "adv/ratio_step_to_reasoning": 1.4992910676129527, "adv/std_final_conf": 0.8938801288604736, "adv/std_reasoning": 0.7754848599433899, "adv/std_step_conf": 0.9358326196670532, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.5574873434585664, "calib/avg_num_step_conf": 5.94921875, "calib/ece": 0.40676113360323884, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9959514170040485, "calib/gap": 0.0035791366906474664, "calib/mean_conf": 0.9695141700404858, "calib/mu_c": 0.9710791366906474, "calib/mu_w": 0.9674999999999999, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.40676113360323884, "calib/std_conf": 0.014529280836954818, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5566867469879518, "calib/step_q_c_n": 830.0, "calib/step_q_gap": 0.011347641648846496, "calib/step_q_w": 0.5453391053391053, "calib/step_q_w_n": 693.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 583.0859375, "completions/mean_terminated_length": 585.37255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.0256, "grad_norm": 0.03223908692598343, "kl": 0.032039642333984375, "learning_rate": 4.888888888888889e-06, "loss": 0.0627, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03086993470788002, "mask/share_reasoning": 0.8466017246246338, "mask/share_step_conf": 0.1186220794916153, "num_tokens": 5672915.0, "reward": 0.7419910430908203, "reward_std": 0.23589922487735748, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.5692902207374573, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6131293773651123, "step": 24 }, { "adv/mean_abs_final_conf": 0.7177988886833191, "adv/mean_abs_reasoning": 0.35805076360702515, "adv/mean_abs_step_conf": 0.7632608413696289, "adv/ratio_final_to_reasoning": 2.0047405609533393, "adv/ratio_step_to_reasoning": 2.131711251445166, "adv/std_final_conf": 0.8593156933784485, "adv/std_reasoning": 0.6186238527297974, "adv/std_step_conf": 0.935645580291748, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.49297475999745693, "calib/avg_num_step_conf": 5.2265625, "calib/ece": 0.39507874015748023, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": -0.0002867315150361094, "calib/mean_conf": 0.9726377952755906, "calib/mu_c": 0.9725170068027209, "calib/mu_w": 0.972803738317757, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3944881889763779, "calib/std_conf": 0.015412015606676835, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5808769633507853, "calib/step_q_c_n": 764.0, "calib/step_q_gap": 0.012619123629530993, "calib/step_q_w": 0.5682578397212543, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2690.0, "completions/max_terminated_length": 2690.0, "completions/mean_length": 501.6328125, "completions/mean_terminated_length": 501.6328125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.02666666666666667, "grad_norm": 0.029137631878256798, "kl": 0.042995452880859375, "learning_rate": 4.861111111111111e-06, "loss": 0.0453, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03312459588050842, "mask/share_reasoning": 0.8488258123397827, "mask/share_step_conf": 0.11804959177970886, "num_tokens": 5904557.0, "reward": 0.7627255916595459, "reward_std": 0.17514243721961975, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.5959746241569519, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.6161953210830688, "step": 25 }, { "adv/mean_abs_final_conf": 0.7123991847038269, "adv/mean_abs_reasoning": 0.4883148670196533, "adv/mean_abs_step_conf": 0.780587375164032, "adv/ratio_final_to_reasoning": 1.4588930889035472, "adv/ratio_step_to_reasoning": 1.598532889093085, "adv/std_final_conf": 0.8922529816627502, "adv/std_reasoning": 0.7575893402099609, "adv/std_step_conf": 0.9354667663574219, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.45722446236559133, "calib/avg_num_step_conf": 4.89453125, "calib/ece": 0.3526294820717131, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": 0.007183467741935234, "calib/mean_conf": 0.9691235059760956, "calib/mu_c": 0.9718709677419354, "calib/mu_w": 0.9646875000000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3521115537848605, "calib/std_conf": 0.06323317915048283, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5893456614509245, "calib/step_q_c_n": 703.0, "calib/step_q_gap": 0.051945661450924496, "calib/step_q_w": 0.5374, "calib/step_q_w_n": 550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2664.0, "completions/max_terminated_length": 2664.0, "completions/mean_length": 504.28125, "completions/mean_terminated_length": 506.25885009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 224.0, "epoch": 0.027733333333333332, "grad_norm": 0.04002292826771736, "kl": 0.04222869873046875, "learning_rate": 4.833333333333333e-06, "loss": 0.0213, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03132515773177147, "mask/share_reasoning": 0.8574528694152832, "mask/share_step_conf": 0.10731575638055801, "num_tokens": 6138893.0, "reward": 0.7889226675033569, "reward_std": 0.22167420387268066, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6185371279716492, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6452457904815674, "step": 26 }, { "adv/mean_abs_final_conf": 0.7595363855361938, "adv/mean_abs_reasoning": 0.5100986361503601, "adv/mean_abs_step_conf": 0.7679067850112915, "adv/ratio_final_to_reasoning": 1.488999051768309, "adv/ratio_step_to_reasoning": 1.5054084261165093, "adv/std_final_conf": 0.8876025080680847, "adv/std_reasoning": 0.7394004464149475, "adv/std_step_conf": 0.9355193376541138, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5076570153140306, "calib/avg_num_step_conf": 5.69140625, "calib/ece": 0.4701181102362205, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9960629921259843, "calib/gap": 0.007637795275590498, "calib/mean_conf": 0.9701181102362205, "calib/mu_c": 0.9739370078740158, "calib/mu_w": 0.9662992125984253, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4701181102362205, "calib/std_conf": 0.0623583893885775, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5613547486033519, "calib/step_q_c_n": 716.0, "calib/step_q_gap": 0.02405380393398615, "calib/step_q_w": 0.5373009446693657, "calib/step_q_w_n": 741.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1560.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 491.56640625, "completions/mean_terminated_length": 493.494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.0288, "grad_norm": 0.04091726988554001, "kl": 0.08211898803710938, "learning_rate": 4.805555555555556e-06, "loss": -0.0478, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.032995350658893585, "mask/share_reasoning": 0.8379353284835815, "mask/share_step_conf": 0.12516307830810547, "num_tokens": 6369950.0, "reward": 0.7274233102798462, "reward_std": 0.23430326581001282, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.5247871279716492, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6331844329833984, "step": 27 }, { "adv/mean_abs_final_conf": 0.6339473724365234, "adv/mean_abs_reasoning": 0.2670007348060608, "adv/mean_abs_step_conf": 0.7607395648956299, "adv/ratio_final_to_reasoning": 2.374328193879312, "adv/ratio_step_to_reasoning": 2.8492040122968287, "adv/std_final_conf": 0.8115496039390564, "adv/std_reasoning": 0.5728670954704285, "adv/std_step_conf": 0.9356757402420044, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.5102272727272728, "calib/avg_num_step_conf": 4.7734375, "calib/ece": 0.33338709677419387, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.9959677419354839, "calib/gap": 0.0023295454545455563, "calib/mean_conf": 0.9785483870967745, "calib/mu_c": 0.9793750000000001, "calib/mu_w": 0.9770454545454546, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.33338709677419387, "calib/std_conf": 0.014379283282595278, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5871241558441559, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.04378344787955413, "calib/step_q_w": 0.5433407079646018, "calib/step_q_w_n": 452.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2437.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 554.3671875, "completions/mean_terminated_length": 556.5411987304688, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.029866666666666666, "grad_norm": 0.025103984400629997, "kl": 0.03751373291015625, "learning_rate": 4.777777777777778e-06, "loss": 0.0124, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03064804896712303, "mask/share_reasoning": 0.863610029220581, "mask/share_step_conf": 0.10183563083410263, "num_tokens": 6618812.0, "reward": 0.7855503559112549, "reward_std": 0.16933844983577728, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6323234438896179, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6223709583282471, "step": 28 }, { "adv/mean_abs_final_conf": 0.7691308259963989, "adv/mean_abs_reasoning": 0.5598151683807373, "adv/mean_abs_step_conf": 0.7597047686576843, "adv/ratio_final_to_reasoning": 1.3739013685911836, "adv/ratio_step_to_reasoning": 1.3570635659178845, "adv/std_final_conf": 0.9021946787834167, "adv/std_reasoning": 0.7929183840751648, "adv/std_step_conf": 0.9357885122299194, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5087365591397849, "calib/avg_num_step_conf": 5.79296875, "calib/ece": 0.48376, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.992, "calib/gap": 0.0011162314388121208, "calib/mean_conf": 0.9797600000000001, "calib/mu_c": 0.9803225806451613, "calib/mu_w": 0.9792063492063492, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.48376, "calib/std_conf": 0.015644244948222977, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.554188790560472, "calib/step_q_c_n": 678.0, "calib/step_q_gap": 0.036076989318235886, "calib/step_q_w": 0.5181118012422361, "calib/step_q_w_n": 805.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 593.1875, "completions/mean_terminated_length": 595.5137329101562, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.030933333333333334, "grad_norm": 0.02336724102497101, "kl": 0.037761688232421875, "learning_rate": 4.75e-06, "loss": 0.0091, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.028588736429810524, "mask/share_reasoning": 0.8579009771347046, "mask/share_step_conf": 0.10960404574871063, "num_tokens": 6877796.0, "reward": 0.7166707515716553, "reward_std": 0.24979683756828308, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.5002988576889038, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6416363716125488, "step": 29 }, { "adv/mean_abs_final_conf": 0.7652782201766968, "adv/mean_abs_reasoning": 0.5875619053840637, "adv/mean_abs_step_conf": 0.7714072465896606, "adv/ratio_final_to_reasoning": 1.3024639840740997, "adv/ratio_step_to_reasoning": 1.3128952703041992, "adv/std_final_conf": 0.8903350830078125, "adv/std_reasoning": 0.7929739952087402, "adv/std_step_conf": 0.9361233115196228, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5122029958677685, "calib/avg_num_step_conf": 5.51953125, "calib/ece": 0.46855421686747, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.0008981146694215791, "calib/mean_conf": 0.9826104417670684, "calib/mu_c": 0.9830468750000001, "calib/mu_w": 0.9821487603305785, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.46855421686747, "calib/std_conf": 0.012706639451492419, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5545207547169811, "calib/step_q_c_n": 689.0, "calib/step_q_gap": 0.048747135932450725, "calib/step_q_w": 0.5057736187845304, "calib/step_q_w_n": 724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 559.4375, "completions/mean_terminated_length": 568.3175048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.032, "grad_norm": 0.03202415630221367, "kl": 0.039432525634765625, "learning_rate": 4.722222222222222e-06, "loss": -0.0889, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0293913371860981, "mask/share_reasoning": 0.8461931943893433, "mask/share_step_conf": 0.10879041999578476, "num_tokens": 7127996.0, "reward": 0.7057627439498901, "reward_std": 0.268536776304245, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5086121559143066, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6099447011947632, "step": 30 }, { "adv/mean_abs_final_conf": 0.7451807856559753, "adv/mean_abs_reasoning": 0.482657253742218, "adv/mean_abs_step_conf": 0.753348171710968, "adv/ratio_final_to_reasoning": 1.5439129524695143, "adv/ratio_step_to_reasoning": 1.5608346624234577, "adv/std_final_conf": 0.8795778155326843, "adv/std_reasoning": 0.7393043041229248, "adv/std_step_conf": 0.9356608986854553, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.5176118462507877, "calib/avg_num_step_conf": 6.14453125, "calib/ece": 0.5158893280632412, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.9762845849802372, "calib/gap": 0.019478260869565167, "calib/mean_conf": 0.9688537549407116, "calib/mu_c": 0.9794782608695654, "calib/mu_w": 0.9600000000000002, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.5150988142292491, "calib/std_conf": 0.10507606108140252, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.565764705882353, "calib/step_q_c_n": 595.0, "calib/step_q_gap": 0.10277902081077828, "calib/step_q_w": 0.46298568507157467, "calib/step_q_w_n": 978.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2834.0, "completions/max_terminated_length": 2834.0, "completions/mean_length": 591.6875, "completions/mean_terminated_length": 591.6875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.03306666666666667, "grad_norm": 0.024821175262331963, "kl": 0.0372772216796875, "learning_rate": 4.694444444444445e-06, "loss": 0.0015, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030024806037545204, "mask/share_reasoning": 0.8544356822967529, "mask/share_step_conf": 0.11553947627544403, "num_tokens": 7385380.0, "reward": 0.7112659215927124, "reward_std": 0.2211502492427826, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.48047342896461487, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6545584201812744, "step": 31 }, { "adv/mean_abs_final_conf": 0.7076499462127686, "adv/mean_abs_reasoning": 0.422512412071228, "adv/mean_abs_step_conf": 0.7611806392669678, "adv/ratio_final_to_reasoning": 1.6748619117335455, "adv/ratio_step_to_reasoning": 1.8015580549114054, "adv/std_final_conf": 0.8828017711639404, "adv/std_reasoning": 0.7204849720001221, "adv/std_step_conf": 0.9358897805213928, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5186550702716329, "calib/avg_num_step_conf": 5.40234375, "calib/ece": 0.45940476190476204, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9761904761904762, "calib/gap": 0.01669691813197205, "calib/mean_conf": 0.9645634920634922, "calib/mu_c": 0.9727131782945737, "calib/mu_w": 0.9560162601626017, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4560317460317462, "calib/std_conf": 0.11048687317216035, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5740236686390532, "calib/step_q_c_n": 676.0, "calib/step_q_gap": 0.056654503151075786, "calib/step_q_w": 0.5173691654879774, "calib/step_q_w_n": 707.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 534.02734375, "completions/mean_terminated_length": 538.2322998046875, "completions/min_length": 0.0, "completions/min_terminated_length": 149.0, "epoch": 0.034133333333333335, "grad_norm": 0.03505050763487816, "kl": 0.042835235595703125, "learning_rate": 4.666666666666667e-06, "loss": -0.0726, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0307695884257555, "mask/share_reasoning": 0.8490593433380127, "mask/share_step_conf": 0.11235859990119934, "num_tokens": 7628795.0, "reward": 0.7308146953582764, "reward_std": 0.20415005087852478, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.53291916847229, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6310539841651917, "step": 32 }, { "adv/mean_abs_final_conf": 0.6907575726509094, "adv/mean_abs_reasoning": 0.5340086221694946, "adv/mean_abs_step_conf": 0.7723900675773621, "adv/ratio_final_to_reasoning": 1.293532620961432, "adv/ratio_step_to_reasoning": 1.446399993392251, "adv/std_final_conf": 0.8698583245277405, "adv/std_reasoning": 0.7928417325019836, "adv/std_step_conf": 0.9358692765235901, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5592071611253197, "calib/avg_num_step_conf": 5.65234375, "calib/ece": 0.43330677290836683, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9840637450199203, "calib/gap": 0.015262787723785087, "calib/mean_conf": 0.9751394422310757, "calib/mu_c": 0.9821323529411766, "calib/mu_w": 0.9668695652173915, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.43330677290836683, "calib/std_conf": 0.06474805968641091, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5756370656370656, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.07524900593557315, "calib/step_q_w": 0.5003880597014925, "calib/step_q_w_n": 670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2601.0, "completions/max_terminated_length": 2601.0, "completions/mean_length": 554.0234375, "completions/mean_terminated_length": 554.0234375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.0352, "grad_norm": 0.020000923424959183, "kl": 0.046581268310546875, "learning_rate": 4.638888888888889e-06, "loss": 0.0408, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030136464163661003, "mask/share_reasoning": 0.8574041724205017, "mask/share_step_conf": 0.11245937645435333, "num_tokens": 7877497.0, "reward": 0.7474783658981323, "reward_std": 0.25027376413345337, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.5562992095947266, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.636313796043396, "step": 33 }, { "adv/mean_abs_final_conf": 0.7421075701713562, "adv/mean_abs_reasoning": 0.5471454858779907, "adv/mean_abs_step_conf": 0.7719881534576416, "adv/ratio_final_to_reasoning": 1.3563258572452894, "adv/ratio_step_to_reasoning": 1.410937626980238, "adv/std_final_conf": 0.9032068848609924, "adv/std_reasoning": 0.792746365070343, "adv/std_step_conf": 0.9356927871704102, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.46708090127408525, "calib/avg_num_step_conf": 5.7109375, "calib/ece": 0.42838582677165354, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.984251968503937, "calib/gap": -0.0087459988702695, "calib/mean_conf": 0.9724015748031496, "calib/mu_c": 0.9685106382978723, "calib/mu_w": 0.9772566371681418, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.42283464566929135, "calib/std_conf": 0.06648174929742734, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5360771276595745, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.040513747377884324, "calib/step_q_w": 0.49556338028169017, "calib/step_q_w_n": 710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 493.55078125, "completions/mean_terminated_length": 493.55078125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.03626666666666667, "grad_norm": 0.03208079934120178, "kl": 0.05084228515625, "learning_rate": 4.611111111111112e-06, "loss": 0.0492, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03344898670911789, "mask/share_reasoning": 0.8358134031295776, "mask/share_step_conf": 0.1307375729084015, "num_tokens": 8108958.0, "reward": 0.7671661376953125, "reward_std": 0.2366221696138382, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5657182931900024, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.6600202322006226, "step": 34 }, { "adv/mean_abs_final_conf": 0.7637979984283447, "adv/mean_abs_reasoning": 0.5293041467666626, "adv/mean_abs_step_conf": 0.764624297618866, "adv/ratio_final_to_reasoning": 1.4430228878691478, "adv/ratio_step_to_reasoning": 1.4445839925677768, "adv/std_final_conf": 0.9240019917488098, "adv/std_reasoning": 0.7754998803138733, "adv/std_step_conf": 0.9358949065208435, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5877698763533858, "calib/avg_num_step_conf": 4.98046875, "calib/ece": 0.47964000000000023, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.948, "calib/gap": 0.014407713498622687, "calib/mean_conf": 0.96364, "calib/mu_c": 0.9710743801652894, "calib/mu_w": 0.9566666666666667, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.47964000000000023, "calib/std_conf": 0.07725251063881354, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5402079722703639, "calib/step_q_c_n": 577.0, "calib/step_q_gap": 0.051884190035406896, "calib/step_q_w": 0.488323782234957, "calib/step_q_w_n": 698.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 567.93359375, "completions/mean_terminated_length": 572.405517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 234.0, "epoch": 0.037333333333333336, "grad_norm": 0.028312060981988907, "kl": 0.05146026611328125, "learning_rate": 4.583333333333333e-06, "loss": -0.011, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02935807779431343, "mask/share_reasoning": 0.8650037050247192, "mask/share_step_conf": 0.09782572090625763, "num_tokens": 8363605.0, "reward": 0.7115371227264404, "reward_std": 0.2542686462402344, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.5092089772224426, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6240215301513672, "step": 35 }, { "adv/mean_abs_final_conf": 0.6834421753883362, "adv/mean_abs_reasoning": 0.388292133808136, "adv/mean_abs_step_conf": 0.7604078650474548, "adv/ratio_final_to_reasoning": 1.7601236694793323, "adv/ratio_step_to_reasoning": 1.9583396078355524, "adv/std_final_conf": 0.8670485615730286, "adv/std_reasoning": 0.6816179156303406, "adv/std_step_conf": 0.9354241490364075, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.49876788553259144, "calib/avg_num_step_conf": 5.07421875, "calib/ece": 0.24920948616600794, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9604743083003953, "calib/gap": 0.0006724960254373968, "calib/mean_conf": 0.9601976284584981, "calib/mu_c": 0.9603783783783785, "calib/mu_w": 0.9597058823529411, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23909090909090913, "calib/std_conf": 0.10865127372890453, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5481837837837837, "calib/step_q_c_n": 925.0, "calib/step_q_gap": 0.018611591270414785, "calib/step_q_w": 0.529572192513369, "calib/step_q_w_n": 374.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 498.359375, "completions/mean_terminated_length": 498.359375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.0384, "grad_norm": 0.03301846235990524, "kl": 0.055049896240234375, "learning_rate": 4.555555555555556e-06, "loss": 0.0143, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0352259986102581, "mask/share_reasoning": 0.8447118997573853, "mask/share_step_conf": 0.12006211280822754, "num_tokens": 8593897.0, "reward": 0.8832883834838867, "reward_std": 0.19006985425949097, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7300878763198853, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6950825452804565, "step": 36 }, { "adv/mean_abs_final_conf": 0.7156381011009216, "adv/mean_abs_reasoning": 0.3434959352016449, "adv/mean_abs_step_conf": 0.77043616771698, "adv/ratio_final_to_reasoning": 2.0833961271792503, "adv/ratio_step_to_reasoning": 2.2429265931915765, "adv/std_final_conf": 0.9074951410293579, "adv/std_reasoning": 0.6187070608139038, "adv/std_step_conf": 0.9358178973197937, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6113832288401253, "calib/avg_num_step_conf": 5.125, "calib/ece": 0.49439516129032257, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.9435483870967742, "calib/gap": 0.015075757575757631, "calib/mean_conf": 0.9544758064516129, "calib/mu_c": 0.9625, "calib/mu_w": 0.9474242424242424, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.49056451612903224, "calib/std_conf": 0.107713251626541, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5432139037433155, "calib/step_q_c_n": 561.0, "calib/step_q_gap": 0.06206876392973354, "calib/step_q_w": 0.4811451398135819, "calib/step_q_w_n": 751.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3057.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 540.375, "completions/mean_terminated_length": 544.6299438476562, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.039466666666666664, "grad_norm": 0.024684589356184006, "kl": 0.046009063720703125, "learning_rate": 4.527777777777778e-06, "loss": -0.027, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03301898390054703, "mask/share_reasoning": 0.8481467962265015, "mask/share_step_conf": 0.11102168262004852, "num_tokens": 8839329.0, "reward": 0.693229079246521, "reward_std": 0.19042614102363586, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.49409645795822144, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6079866886138916, "step": 37 }, { "adv/mean_abs_final_conf": 0.7660816311836243, "adv/mean_abs_reasoning": 0.5371396541595459, "adv/mean_abs_step_conf": 0.7858396768569946, "adv/ratio_final_to_reasoning": 1.426224307312221, "adv/ratio_step_to_reasoning": 1.4630081223226497, "adv/std_final_conf": 0.9169600009918213, "adv/std_reasoning": 0.7753732800483704, "adv/std_step_conf": 0.9359428286552429, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.5588553603259485, "calib/avg_num_step_conf": 4.8828125, "calib/ece": 0.43199203187250995, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.9163346613545816, "calib/gap": 0.019633944486885824, "calib/mean_conf": 0.9578884462151395, "calib/mu_c": 0.9671969696969697, "calib/mu_w": 0.9475630252100838, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.43199203187250995, "calib/std_conf": 0.08018357613904677, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5569659442724457, "calib/step_q_c_n": 646.0, "calib/step_q_gap": 0.040508990630061525, "calib/step_q_w": 0.5164569536423842, "calib/step_q_w_n": 604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 490.453125, "completions/mean_terminated_length": 492.3764953613281, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.04053333333333333, "grad_norm": 0.024724984541535378, "kl": 0.04885101318359375, "learning_rate": 4.5e-06, "loss": 0.0312, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.034811899065971375, "mask/share_reasoning": 0.8496053218841553, "mask/share_step_conf": 0.11167655885219574, "num_tokens": 9071773.0, "reward": 0.7323600649833679, "reward_std": 0.2456468641757965, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.5410265922546387, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6275997161865234, "step": 38 }, { "adv/mean_abs_final_conf": 0.7384641766548157, "adv/mean_abs_reasoning": 0.45391255617141724, "adv/mean_abs_step_conf": 0.7741857171058655, "adv/ratio_final_to_reasoning": 1.6268864269441785, "adv/ratio_step_to_reasoning": 1.7055833917348149, "adv/std_final_conf": 0.9104777574539185, "adv/std_reasoning": 0.7205345630645752, "adv/std_step_conf": 0.9356355667114258, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.6231960034538053, "calib/avg_num_step_conf": 5.25390625, "calib/ece": 0.4303529411764707, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8980392156862745, "calib/gap": 0.031580115949179755, "calib/mean_conf": 0.9480000000000001, "calib/mu_c": 0.9629850746268658, "calib/mu_w": 0.931404958677686, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.42643137254901975, "calib/std_conf": 0.11951995485207957, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5479436310395315, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.10090314765584563, "calib/step_q_w": 0.44704048338368585, "calib/step_q_w_n": 662.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2067.0, "completions/max_terminated_length": 2067.0, "completions/mean_length": 497.9765625, "completions/mean_terminated_length": 499.929443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.0416, "grad_norm": 0.023097166791558266, "kl": 0.048549652099609375, "learning_rate": 4.472222222222223e-06, "loss": -0.047, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03459245711565018, "mask/share_reasoning": 0.8467061519622803, "mask/share_step_conf": 0.11479516327381134, "num_tokens": 9305343.0, "reward": 0.7707017660140991, "reward_std": 0.2181287407875061, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.5713586211204529, "rewards/format_reward_step": 0.99609375, "rewards/step_l1_reward": 0.6661387085914612, "step": 39 }, { "adv/mean_abs_final_conf": 0.7422031164169312, "adv/mean_abs_reasoning": 0.4518929123878479, "adv/mean_abs_step_conf": 0.7804073095321655, "adv/ratio_final_to_reasoning": 1.642431416981193, "adv/ratio_step_to_reasoning": 1.726973997906748, "adv/std_final_conf": 0.9173192977905273, "adv/std_reasoning": 0.7204866409301758, "adv/std_step_conf": 0.9359647035598755, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5731384615384616, "calib/avg_num_step_conf": 3.984375, "calib/ece": 0.4381960784313724, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8705882352941177, "calib/gap": 0.030233846153846056, "calib/mean_conf": 0.9333333333333333, "calib/mu_c": 0.9481538461538462, "calib/mu_w": 0.9179200000000002, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.43086274509803907, "calib/std_conf": 0.152799582520047, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5840674603174604, "calib/step_q_c_n": 504.0, "calib/step_q_gap": 0.03389304171280927, "calib/step_q_w": 0.5501744186046511, "calib/step_q_w_n": 516.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 480.10546875, "completions/mean_terminated_length": 480.10546875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.042666666666666665, "grad_norm": 0.02745964005589485, "kl": 0.045932769775390625, "learning_rate": 4.444444444444444e-06, "loss": -0.0108, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03717602416872978, "mask/share_reasoning": 0.8646169900894165, "mask/share_step_conf": 0.0982070192694664, "num_tokens": 9535010.0, "reward": 0.7465357780456543, "reward_std": 0.22075152397155762, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5602851510047913, "rewards/format_reward_step": 0.99609375, "rewards/step_l1_reward": 0.6320051550865173, "step": 40 }, { "adv/mean_abs_final_conf": 0.766808271408081, "adv/mean_abs_reasoning": 0.48946183919906616, "adv/mean_abs_step_conf": 0.7792272567749023, "adv/ratio_final_to_reasoning": 1.566635455509366, "adv/ratio_step_to_reasoning": 1.5920081901583902, "adv/std_final_conf": 0.9200118780136108, "adv/std_reasoning": 0.7394125461578369, "adv/std_step_conf": 0.9358551502227783, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5873993558776168, "calib/avg_num_step_conf": 4.015625, "calib/ece": 0.21915662650602405, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7991967871485943, "calib/gap": 0.04211352657004819, "calib/mean_conf": 0.9263855421686747, "calib/mu_c": 0.9380555555555555, "calib/mu_w": 0.8959420289855073, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.21132530120481924, "calib/std_conf": 0.1272026335882018, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5662734584450402, "calib/step_q_c_n": 746.0, "calib/step_q_gap": -0.013478314604605246, "calib/step_q_w": 0.5797517730496454, "calib/step_q_w_n": 282.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2159.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 451.453125, "completions/mean_terminated_length": 451.453125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.04373333333333333, "grad_norm": 0.029322663322091103, "kl": 0.049098968505859375, "learning_rate": 4.416666666666667e-06, "loss": 0.0466, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.039902739226818085, "mask/share_reasoning": 0.8526535034179688, "mask/share_step_conf": 0.10744375735521317, "num_tokens": 9757830.0, "reward": 0.8721837997436523, "reward_std": 0.2361713945865631, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7382097244262695, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6710015535354614, "step": 41 }, { "adv/mean_abs_final_conf": 0.7153269052505493, "adv/mean_abs_reasoning": 0.40300723910331726, "adv/mean_abs_step_conf": 0.7842440009117126, "adv/ratio_final_to_reasoning": 1.7749728437686054, "adv/ratio_step_to_reasoning": 1.9459799348930784, "adv/std_final_conf": 0.9028910398483276, "adv/std_reasoning": 0.6815569400787354, "adv/std_step_conf": 0.9356938600540161, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.5793828542988678, "calib/avg_num_step_conf": 4.86328125, "calib/ece": 0.39549019607843144, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8705882352941177, "calib/gap": 0.03367861142217232, "calib/mean_conf": 0.946078431372549, "calib/mu_c": 0.9611347517730495, "calib/mu_w": 0.9274561403508772, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3943137254901961, "calib/std_conf": 0.11195374449898195, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5352765752765752, "calib/step_q_c_n": 693.0, "calib/step_q_gap": 0.04048853179831441, "calib/step_q_w": 0.49478804347826083, "calib/step_q_w_n": 552.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 398.9453125, "completions/mean_terminated_length": 400.50982666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.0448, "grad_norm": 0.031161848455667496, "kl": 0.05927276611328125, "learning_rate": 4.388888888888889e-06, "loss": -0.0128, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.04035520553588867, "mask/share_reasoning": 0.8334106802940369, "mask/share_step_conf": 0.12232788652181625, "num_tokens": 9964328.0, "reward": 0.7765161991119385, "reward_std": 0.18666572868824005, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6000097990036011, "rewards/format_reward_step": 0.99609375, "rewards/step_l1_reward": 0.6436475515365601, "step": 42 }, { "adv/mean_abs_final_conf": 0.7596381306648254, "adv/mean_abs_reasoning": 0.551596999168396, "adv/mean_abs_step_conf": 0.7778820991516113, "adv/ratio_final_to_reasoning": 1.377161463550524, "adv/ratio_step_to_reasoning": 1.4102362781602682, "adv/std_final_conf": 0.9071572422981262, "adv/std_reasoning": 0.7753998041152954, "adv/std_step_conf": 0.9359604716300964, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6247401247401246, "calib/avg_num_step_conf": 4.45703125, "calib/ece": 0.38027559055118115, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8385826771653543, "calib/gap": 0.04776097776097776, "calib/mean_conf": 0.9275196850393701, "calib/mu_c": 0.9483916083916084, "calib/mu_w": 0.9006306306306306, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3724015748031496, "calib/std_conf": 0.14929937953859507, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5498345864661655, "calib/step_q_c_n": 665.0, "calib/step_q_gap": 0.02250265369305615, "calib/step_q_w": 0.5273319327731093, "calib/step_q_w_n": 476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2474.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 471.64453125, "completions/mean_terminated_length": 473.494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.04586666666666667, "grad_norm": 0.02614189125597477, "kl": 0.0488739013671875, "learning_rate": 4.361111111111112e-06, "loss": -0.011, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03692782297730446, "mask/share_reasoning": 0.853030800819397, "mask/share_step_conf": 0.10613512247800827, "num_tokens": 10190293.0, "reward": 0.776657223701477, "reward_std": 0.2550131678581238, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6174371242523193, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.6257209777832031, "step": 43 }, { "adv/mean_abs_final_conf": 0.753898561000824, "adv/mean_abs_reasoning": 0.4462771415710449, "adv/mean_abs_step_conf": 0.7800233364105225, "adv/ratio_final_to_reasoning": 1.6893057940338345, "adv/ratio_step_to_reasoning": 1.7478451476689556, "adv/std_final_conf": 0.920788586139679, "adv/std_reasoning": 0.7205285429954529, "adv/std_step_conf": 0.9358447790145874, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.5990694789081886, "calib/avg_num_step_conf": 4.49609375, "calib/ece": 0.45397637795275614, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8503937007874016, "calib/gap": 0.04638337468982656, "calib/mean_conf": 0.9195669291338584, "calib/mu_c": 0.9433064516129035, "calib/mu_w": 0.8969230769230769, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.44267716535433094, "calib/std_conf": 0.18730635525080677, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5581340579710145, "calib/step_q_c_n": 552.0, "calib/step_q_gap": 0.04305893276233341, "calib/step_q_w": 0.515075125208681, "calib/step_q_w_n": 599.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1420.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 465.8984375, "completions/mean_terminated_length": 469.5669250488281, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.046933333333333334, "grad_norm": 0.031128795817494392, "kl": 0.044940948486328125, "learning_rate": 4.333333333333334e-06, "loss": -0.0894, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034346550703048706, "mask/share_reasoning": 0.8601832389831543, "mask/share_step_conf": 0.09765768051147461, "num_tokens": 10415883.0, "reward": 0.7300410270690918, "reward_std": 0.23240987956523895, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.5478339791297913, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.6169354915618896, "step": 44 }, { "adv/mean_abs_final_conf": 0.7389326691627502, "adv/mean_abs_reasoning": 0.5605357885360718, "adv/mean_abs_step_conf": 0.7773196697235107, "adv/ratio_final_to_reasoning": 1.3182613568574992, "adv/ratio_step_to_reasoning": 1.3867440502837554, "adv/std_final_conf": 0.905174732208252, "adv/std_reasoning": 0.8098524212837219, "adv/std_step_conf": 0.9357939958572388, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6038591783272634, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.3743253968253967, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8373015873015873, "calib/gap": 0.07453133985048854, "calib/mean_conf": 0.8944047619047619, "calib/mu_c": 0.9272340425531913, "calib/mu_w": 0.8527027027027028, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3546031746031745, "calib/std_conf": 0.2332977074357667, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4982841068917019, "calib/step_q_c_n": 711.0, "calib/step_q_gap": 0.047696887548005884, "calib/step_q_w": 0.450587219343696, "calib/step_q_w_n": 579.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 496.25390625, "completions/mean_terminated_length": 496.25390625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.048, "grad_norm": 0.02884974703192711, "kl": 0.05104827880859375, "learning_rate": 4.305555555555556e-06, "loss": -0.0224, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.0379553847014904, "mask/share_reasoning": 0.8416377305984497, "mask/share_step_conf": 0.120406873524189, "num_tokens": 10647972.0, "reward": 0.778372049331665, "reward_std": 0.2504884898662567, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6139621138572693, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6357506513595581, "step": 45 }, { "adv/mean_abs_final_conf": 0.7263065576553345, "adv/mean_abs_reasoning": 0.43585142493247986, "adv/mean_abs_step_conf": 0.7749524712562561, "adv/ratio_final_to_reasoning": 1.6664085881280544, "adv/ratio_step_to_reasoning": 1.778019818052237, "adv/std_final_conf": 0.901611864566803, "adv/std_reasoning": 0.7206472754478455, "adv/std_step_conf": 0.9355313181877136, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.5389550264550265, "calib/avg_num_step_conf": 5.3828125, "calib/ece": 0.3745381526104417, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8795180722891566, "calib/gap": 0.01425595238095212, "calib/mean_conf": 0.9325301204819277, "calib/mu_c": 0.9385416666666665, "calib/mu_w": 0.9242857142857144, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.3643775100401606, "calib/std_conf": 0.1496865000243633, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.47231724137931025, "calib/step_q_c_n": 725.0, "calib/step_q_gap": 0.051122754396155656, "calib/step_q_w": 0.4211944869831546, "calib/step_q_w_n": 653.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2375.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 520.98828125, "completions/mean_terminated_length": 520.98828125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.04906666666666667, "grad_norm": 0.031708262860774994, "kl": 0.05353546142578125, "learning_rate": 4.277777777777778e-06, "loss": 0.0112, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03628102317452431, "mask/share_reasoning": 0.8419740200042725, "mask/share_step_conf": 0.12174495309591293, "num_tokens": 10886113.0, "reward": 0.7703714370727539, "reward_std": 0.2165904939174652, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.5983883142471313, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6353232860565186, "step": 46 }, { "adv/mean_abs_final_conf": 0.7263702750205994, "adv/mean_abs_reasoning": 0.4627372622489929, "adv/mean_abs_step_conf": 0.7697163820266724, "adv/ratio_final_to_reasoning": 1.5697250562669167, "adv/ratio_step_to_reasoning": 1.663398314381905, "adv/std_final_conf": 0.9154651165008545, "adv/std_reasoning": 0.7392899990081787, "adv/std_step_conf": 0.9355723857879639, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6251989558795441, "calib/avg_num_step_conf": 4.9609375, "calib/ece": 0.3846825396825397, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8492063492063492, "calib/gap": 0.05532692430126673, "calib/mean_conf": 0.9362698412698413, "calib/mu_c": 0.9610791366906473, "calib/mu_w": 0.9057522123893805, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3846825396825397, "calib/std_conf": 0.12995868773969066, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.46648985959438377, "calib/step_q_c_n": 641.0, "calib/step_q_gap": 0.0652021012478019, "calib/step_q_w": 0.40128775834658187, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 515.1875, "completions/mean_terminated_length": 515.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.050133333333333335, "grad_norm": 0.030317526310682297, "kl": 0.0471038818359375, "learning_rate": 4.25e-06, "loss": -0.0348, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03467638045549393, "mask/share_reasoning": 0.8593902587890625, "mask/share_step_conf": 0.10593339055776596, "num_tokens": 11123977.0, "reward": 0.7904351353645325, "reward_std": 0.2099846452474594, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6055483818054199, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6698529720306396, "step": 47 }, { "adv/mean_abs_final_conf": 0.7311999797821045, "adv/mean_abs_reasoning": 0.46249309182167053, "adv/mean_abs_step_conf": 0.7647923231124878, "adv/ratio_final_to_reasoning": 1.5809965439744185, "adv/ratio_step_to_reasoning": 1.653629722554599, "adv/std_final_conf": 0.8829014301300049, "adv/std_reasoning": 0.7205711007118225, "adv/std_step_conf": 0.935703694820404, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.5960507223519022, "calib/avg_num_step_conf": 4.83984375, "calib/ece": 0.44051587301587314, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.8492063492063492, "calib/gap": 0.011087628540786043, "calib/mean_conf": 0.9255158730158731, "calib/mu_c": 0.9308396946564885, "calib/mu_w": 0.9197520661157025, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.4230952380952382, "calib/std_conf": 0.16043384801629987, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.46140065146579806, "calib/step_q_c_n": 614.0, "calib/step_q_gap": 0.042584651465798096, "calib/step_q_w": 0.41881599999999997, "calib/step_q_w_n": 625.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1941.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 472.6875, "completions/mean_terminated_length": 474.54119873046875, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.0512, "grad_norm": 0.031771764159202576, "kl": 0.055675506591796875, "learning_rate": 4.222222222222223e-06, "loss": -0.0552, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03715451806783676, "mask/share_reasoning": 0.8365613222122192, "mask/share_step_conf": 0.1223779246211052, "num_tokens": 11348673.0, "reward": 0.747970461845398, "reward_std": 0.19774633646011353, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.5552191734313965, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6430654525756836, "step": 48 }, { "adv/mean_abs_final_conf": 0.6782076358795166, "adv/mean_abs_reasoning": 0.4114164113998413, "adv/mean_abs_step_conf": 0.7703295946121216, "adv/ratio_final_to_reasoning": 1.648470058770675, "adv/ratio_step_to_reasoning": 1.872384215279796, "adv/std_final_conf": 0.8339138627052307, "adv/std_reasoning": 0.6613471508026123, "adv/std_step_conf": 0.9351799488067627, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.695326278659612, "calib/avg_num_step_conf": 5.10546875, "calib/ece": 0.38141176470588223, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8823529411764706, "calib/gap": 0.04286848072562344, "calib/mean_conf": 0.9508235294117648, "calib/mu_c": 0.9689795918367347, "calib/mu_w": 0.9261111111111112, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.37788235294117634, "calib/std_conf": 0.09321548004959576, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4450980392156863, "calib/step_q_c_n": 714.0, "calib/step_q_gap": 0.05670006282445528, "calib/step_q_w": 0.388397976391231, "calib/step_q_w_n": 593.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 464.25390625, "completions/mean_terminated_length": 466.07452392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.05226666666666667, "grad_norm": 0.0340806320309639, "kl": 0.061573028564453125, "learning_rate": 4.194444444444445e-06, "loss": -0.0488, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03489847481250763, "mask/share_reasoning": 0.8403646945953369, "mask/share_step_conf": 0.12083058059215546, "num_tokens": 11572058.0, "reward": 0.8100435733795166, "reward_std": 0.18482841551303864, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6249706745147705, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.6818350553512573, "step": 49 }, { "adv/mean_abs_final_conf": 0.634462296962738, "adv/mean_abs_reasoning": 0.41344213485717773, "adv/mean_abs_step_conf": 0.7737225294113159, "adv/ratio_final_to_reasoning": 1.5345854799775331, "adv/ratio_step_to_reasoning": 1.8714167332717453, "adv/std_final_conf": 0.8445896506309509, "adv/std_reasoning": 0.6816203594207764, "adv/std_step_conf": 0.9350979924201965, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6930716401831403, "calib/avg_num_step_conf": 5.1796875, "calib/ece": 0.3456349206349206, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9563492063492064, "calib/gap": 0.02222730945327267, "calib/mean_conf": 0.9650000000000001, "calib/mu_c": 0.9732911392405065, "calib/mu_w": 0.9510638297872338, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.34182539682539675, "calib/std_conf": 0.08927618996819979, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4671608040201005, "calib/step_q_c_n": 796.0, "calib/step_q_gap": 0.0622174077936854, "calib/step_q_w": 0.4049433962264151, "calib/step_q_w_n": 530.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 492.6015625, "completions/mean_terminated_length": 494.5333557128906, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.05333333333333334, "grad_norm": 0.03873632475733757, "kl": 0.055904388427734375, "learning_rate": 4.166666666666667e-06, "loss": 0.025, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03604394197463989, "mask/share_reasoning": 0.8390281200408936, "mask/share_step_conf": 0.12102174013853073, "num_tokens": 11803524.0, "reward": 0.8262052536010742, "reward_std": 0.1869128942489624, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6440734267234802, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6880244612693787, "step": 50 }, { "adv/mean_abs_final_conf": 0.636472225189209, "adv/mean_abs_reasoning": 0.40898334980010986, "adv/mean_abs_step_conf": 0.766528844833374, "adv/ratio_final_to_reasoning": 1.5562301631601483, "adv/ratio_step_to_reasoning": 1.874229953879575, "adv/std_final_conf": 0.8273301720619202, "adv/std_reasoning": 0.6815751791000366, "adv/std_step_conf": 0.9350717067718506, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6419427710843373, "calib/avg_num_step_conf": 5.08203125, "calib/ece": 0.3037795275590552, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8937007874015748, "calib/gap": 0.02963992332968246, "calib/mean_conf": 0.9503937007874017, "calib/mu_c": 0.9606626506024099, "calib/mu_w": 0.9310227272727274, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30031496062992136, "calib/std_conf": 0.08893421061991631, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4518306351183064, "calib/step_q_c_n": 803.0, "calib/step_q_gap": 0.04450131784923006, "calib/step_q_w": 0.40732931726907634, "calib/step_q_w_n": 498.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2528.0, "completions/max_terminated_length": 2528.0, "completions/mean_length": 481.578125, "completions/mean_terminated_length": 483.4667053222656, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.0544, "grad_norm": 0.04923851415514946, "kl": 0.0549163818359375, "learning_rate": 4.138888888888889e-06, "loss": -0.0151, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.034107182174921036, "mask/share_reasoning": 0.8453450202941895, "mask/share_step_conf": 0.11664153635501862, "num_tokens": 12036104.0, "reward": 0.8583088517189026, "reward_std": 0.18460452556610107, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6855703592300415, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.7029223442077637, "step": 51 }, { "adv/mean_abs_final_conf": 0.6218217611312866, "adv/mean_abs_reasoning": 0.4193978011608124, "adv/mean_abs_step_conf": 0.7663338780403137, "adv/ratio_final_to_reasoning": 1.482653841794601, "adv/ratio_step_to_reasoning": 1.8272243581612708, "adv/std_final_conf": 0.8274714350700378, "adv/std_reasoning": 0.7014986276626587, "adv/std_step_conf": 0.935455322265625, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7210922787193974, "calib/avg_num_step_conf": 5.36328125, "calib/ece": 0.20900793650793664, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.8293650793650794, "calib/gap": 0.18994576271186436, "calib/mean_conf": 0.9082142857142858, "calib/mu_c": 0.9647457627118644, "calib/mu_w": 0.7748, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.20742063492063503, "calib/std_conf": 0.21163663480773778, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4427966101694915, "calib/step_q_c_n": 944.0, "calib/step_q_gap": 0.0587406661135475, "calib/step_q_w": 0.384055944055944, "calib/step_q_w_n": 429.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 496.47265625, "completions/mean_terminated_length": 498.4196472167969, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.055466666666666664, "grad_norm": 0.05165761709213257, "kl": 0.13477325439453125, "learning_rate": 4.111111111111111e-06, "loss": -0.0638, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03455036133527756, "mask/share_reasoning": 0.8432260751724243, "mask/share_step_conf": 0.11831727623939514, "num_tokens": 12271153.0, "reward": 0.9032639265060425, "reward_std": 0.21079207956790924, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7670800685882568, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.705854058265686, "step": 52 }, { "adv/mean_abs_final_conf": 0.5205949544906616, "adv/mean_abs_reasoning": 0.36230170726776123, "adv/mean_abs_step_conf": 0.7746355533599854, "adv/ratio_final_to_reasoning": 1.4369100229105816, "adv/ratio_step_to_reasoning": 2.138095233394764, "adv/std_final_conf": 0.7433344125747681, "adv/std_reasoning": 0.6610947251319885, "adv/std_step_conf": 0.9352044463157654, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7025191953804175, "calib/avg_num_step_conf": 5.33984375, "calib/ece": 0.36492187500000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.890625, "calib/gap": 0.052983057300590386, "calib/mean_conf": 0.9559375000000001, "calib/mu_c": 0.9772549019607844, "calib/mu_w": 0.924271844660194, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36160156250000003, "calib/std_conf": 0.10500697521474466, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.46941247002398084, "calib/step_q_c_n": 834.0, "calib/step_q_gap": -0.0009440027715162946, "calib/step_q_w": 0.47035647279549714, "calib/step_q_w_n": 533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 479.96875, "completions/mean_terminated_length": 481.85101318359375, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.05653333333333333, "grad_norm": 0.042538344860076904, "kl": 0.06697845458984375, "learning_rate": 4.083333333333334e-06, "loss": 0.0043, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.034105442464351654, "mask/share_reasoning": 0.8453474044799805, "mask/share_step_conf": 0.11664089560508728, "num_tokens": 12499849.0, "reward": 0.8117985129356384, "reward_std": 0.16796886920928955, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6456257700920105, "rewards/format_reward_step": 1.0, "rewards/step_l1_reward": 0.6584399938583374, "step": 53 }, { "adv/mean_abs_final_conf": 0.40769556164741516, "adv/mean_abs_reasoning": 0.29208871722221375, "adv/mean_abs_step_conf": 0.7596259713172913, "adv/ratio_final_to_reasoning": 1.3957935983444736, "adv/ratio_step_to_reasoning": 2.600668654857308, "adv/std_final_conf": 0.6534843444824219, "adv/std_reasoning": 0.572625994682312, "adv/std_step_conf": 0.9350718259811401, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7117191818684355, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.21621093750000006, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.9140625, "calib/gap": 0.1158753849798626, "calib/mean_conf": 0.9480859375000001, "calib/mu_c": 0.9784126984126985, "calib/mu_w": 0.8625373134328359, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21300781250000006, "calib/std_conf": 0.16047977517352796, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5242069741282339, "calib/step_q_c_n": 889.0, "calib/step_q_gap": 0.14961113117211383, "calib/step_q_w": 0.3745958429561201, "calib/step_q_w_n": 433.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 451.21484375, "completions/mean_terminated_length": 452.9843444824219, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.0576, "grad_norm": 0.04199191555380821, "kl": 0.0654144287109375, "learning_rate": 4.055555555555556e-06, "loss": 0.0302, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.037677109241485596, "mask/share_reasoning": 0.8396079540252686, "mask/share_step_conf": 0.11880867183208466, "num_tokens": 12721592.0, "reward": 0.9451124668121338, "reward_std": 0.14664433896541595, "rewards/accuracy_reward_step": 0.73828125, "rewards/final_brier_reward_step": 0.7817854881286621, "rewards/format_reward_step": 1.0, "rewards/step_l1_reward": 0.7607831358909607, "step": 54 }, { "adv/mean_abs_final_conf": 0.5741836428642273, "adv/mean_abs_reasoning": 0.43653303384780884, "adv/mean_abs_step_conf": 0.7608143091201782, "adv/ratio_final_to_reasoning": 1.3153269016163585, "adv/ratio_step_to_reasoning": 1.7428562104773622, "adv/std_final_conf": 0.8026610612869263, "adv/std_reasoning": 0.7013072967529297, "adv/std_step_conf": 0.9350804686546326, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.7028716853085333, "calib/avg_num_step_conf": 4.87109375, "calib/ece": 0.34066666666666673, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8352941176470589, "calib/gap": 0.1598504461480459, "calib/mean_conf": 0.9042745098039217, "calib/mu_c": 0.9726027397260274, "calib/mu_w": 0.8127522935779815, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3361960784313726, "calib/std_conf": 0.22750169819933921, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5114985590778098, "calib/step_q_c_n": 694.0, "calib/step_q_gap": 0.051444309529889376, "calib/step_q_w": 0.46005424954792046, "calib/step_q_w_n": 553.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 470.1875, "completions/mean_terminated_length": 470.1875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.058666666666666666, "grad_norm": 0.42477744817733765, "kl": 0.1440887451171875, "learning_rate": 4.027777777777779e-06, "loss": 0.0114, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.03672902286052704, "mask/share_reasoning": 0.847432017326355, "mask/share_step_conf": 0.11583895236253738, "num_tokens": 12949784.0, "reward": 0.8394672870635986, "reward_std": 0.21210014820098877, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6690832376480103, "rewards/format_reward_step": 0.99609375, "rewards/step_l1_reward": 0.6965700387954712, "step": 55 }, { "adv/mean_abs_final_conf": 0.6618785858154297, "adv/mean_abs_reasoning": 0.6114981174468994, "adv/mean_abs_step_conf": 0.7772350907325745, "adv/ratio_final_to_reasoning": 1.0823885911192608, "adv/ratio_step_to_reasoning": 1.2710343148359196, "adv/std_final_conf": 0.8749757409095764, "adv/std_reasoning": 0.8265246748924255, "adv/std_step_conf": 0.9351528882980347, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.646551724137931, "calib/avg_num_step_conf": 6.30859375, "calib/ece": 0.4008695652173913, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8458498023715415, "calib/gap": 0.06370123332494348, "calib/mean_conf": 0.9322529644268776, "calib/mu_c": 0.9614598540145988, "calib/mu_w": 0.8977586206896553, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3958102766798419, "calib/std_conf": 0.16489804983394518, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.45058333333333334, "calib/step_q_c_n": 840.0, "calib/step_q_gap": 0.052144623655913935, "calib/step_q_w": 0.3984387096774194, "calib/step_q_w_n": 775.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 535.3359375, "completions/mean_terminated_length": 535.3359375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.05973333333333333, "grad_norm": 0.05836059898138046, "kl": 0.0561370849609375, "learning_rate": 4.000000000000001e-06, "loss": 0.0696, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03186213970184326, "mask/share_reasoning": 0.8400670886039734, "mask/share_step_conf": 0.12807075679302216, "num_tokens": 13193670.0, "reward": 0.7989076375961304, "reward_std": 0.2353213131427765, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.5964038968086243, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6967238783836365, "step": 56 }, { "adv/mean_abs_final_conf": 0.5409480929374695, "adv/mean_abs_reasoning": 0.3886481523513794, "adv/mean_abs_step_conf": 0.7613285779953003, "adv/ratio_final_to_reasoning": 1.391871001224765, "adv/ratio_step_to_reasoning": 1.9589146980093657, "adv/std_final_conf": 0.7751472592353821, "adv/std_reasoning": 0.6613079309463501, "adv/std_step_conf": 0.935117781162262, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.699563257255565, "calib/avg_num_step_conf": 6.35546875, "calib/ece": 0.2616600790513834, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7944664031620553, "calib/gap": 0.10458016342631749, "calib/mean_conf": 0.9158102766798419, "calib/mu_c": 0.9505325443786983, "calib/mu_w": 0.8459523809523808, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.25474308300395254, "calib/std_conf": 0.17501619228623974, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4308325358851675, "calib/step_q_c_n": 1045.0, "calib/step_q_gap": 0.056107449974514556, "calib/step_q_w": 0.37472508591065296, "calib/step_q_w_n": 582.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2177.0, "completions/max_terminated_length": 2177.0, "completions/mean_length": 545.80078125, "completions/mean_terminated_length": 545.80078125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.0608, "grad_norm": 0.0570027232170105, "kl": 0.0592803955078125, "learning_rate": 3.972222222222223e-06, "loss": -0.0081, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03178969398140907, "mask/share_reasoning": 0.8385055065155029, "mask/share_step_conf": 0.1297047883272171, "num_tokens": 13440187.0, "reward": 0.8670452237129211, "reward_std": 0.17214056849479675, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7200671434402466, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6851169466972351, "step": 57 }, { "adv/mean_abs_final_conf": 0.6784148812294006, "adv/mean_abs_reasoning": 0.6007354855537415, "adv/mean_abs_step_conf": 0.7772684097290039, "adv/ratio_final_to_reasoning": 1.1293071535537083, "adv/ratio_step_to_reasoning": 1.2938613223631017, "adv/std_final_conf": 0.8778954148292542, "adv/std_reasoning": 0.8265618681907654, "adv/std_step_conf": 0.9353739619255066, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5667697775437672, "calib/avg_num_step_conf": 7.328125, "calib/ece": 0.243012048192771, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.42971887550200805, "calib/gap": 0.06912136369619604, "calib/mean_conf": 0.707269076305221, "calib/mu_c": 0.7369718309859156, "calib/mu_w": 0.6678504672897195, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.18999999999999992, "calib/std_conf": 0.29700339383811025, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.36587598425196854, "calib/step_q_c_n": 1016.0, "calib/step_q_gap": 0.04170156564731736, "calib/step_q_w": 0.3241744186046512, "calib/step_q_w_n": 860.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2969.0, "completions/max_terminated_length": 2969.0, "completions/mean_length": 615.85546875, "completions/mean_terminated_length": 618.2706298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.06186666666666667, "grad_norm": 0.047018811106681824, "kl": 0.05673980712890625, "learning_rate": 3.944444444444445e-06, "loss": -0.0241, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.0279830489307642, "mask/share_reasoning": 0.8379050493240356, "mask/share_step_conf": 0.13020563125610352, "num_tokens": 13704166.0, "reward": 0.8076508641242981, "reward_std": 0.2049730271100998, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6558917760848999, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6555036306381226, "step": 58 }, { "adv/mean_abs_final_conf": 0.6874507665634155, "adv/mean_abs_reasoning": 0.458517462015152, "adv/mean_abs_step_conf": 0.7650613784790039, "adv/ratio_final_to_reasoning": 1.4992902637603327, "adv/ratio_step_to_reasoning": 1.6685545085166724, "adv/std_final_conf": 0.8893415331840515, "adv/std_reasoning": 0.7392930388450623, "adv/std_step_conf": 0.9357045292854309, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6364625850340135, "calib/avg_num_step_conf": 7.03125, "calib/ece": 0.21376518218623483, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3643724696356275, "calib/gap": 0.14626530612244903, "calib/mean_conf": 0.6440485829959514, "calib/mu_c": 0.7032653061224491, "calib/mu_w": 0.557, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.13133603238866398, "calib/std_conf": 0.32124978247604635, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4269165786694825, "calib/step_q_c_n": 947.0, "calib/step_q_gap": 0.07184037702821638, "calib/step_q_w": 0.35507620164126613, "calib/step_q_w_n": 853.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3067.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 624.58203125, "completions/mean_terminated_length": 624.58203125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.06293333333333333, "grad_norm": 0.0582418367266655, "kl": 0.07353973388671875, "learning_rate": 3.916666666666667e-06, "loss": 0.0168, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03092966601252556, "mask/share_reasoning": 0.8400421142578125, "mask/share_step_conf": 0.12902818620204926, "num_tokens": 13970307.0, "reward": 0.8103795051574707, "reward_std": 0.157393217086792, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.698492169380188, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6144543290138245, "step": 59 }, { "adv/mean_abs_final_conf": 0.7558591365814209, "adv/mean_abs_reasoning": 0.4506644010543823, "adv/mean_abs_step_conf": 0.7565865516662598, "adv/ratio_final_to_reasoning": 1.677210657893101, "adv/ratio_step_to_reasoning": 1.6788247527342666, "adv/std_final_conf": 0.9360891580581665, "adv/std_reasoning": 0.7205802798271179, "adv/std_step_conf": 0.935444176197052, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6880823613735584, "calib/avg_num_step_conf": 6.76953125, "calib/ece": 0.1567063492063492, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.1388888888888889, "calib/gap": 0.19722356160813198, "calib/mean_conf": 0.4832142857142857, "calib/mu_c": 0.5638255033557047, "calib/mu_w": 0.3666019417475727, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.024325396825396826, "calib/std_conf": 0.29236241119278317, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3974741200828158, "calib/step_q_c_n": 966.0, "calib/step_q_gap": 0.08141153859650546, "calib/step_q_w": 0.31606258148631033, "calib/step_q_w_n": 767.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2833.0, "completions/max_terminated_length": 2833.0, "completions/mean_length": 544.828125, "completions/mean_terminated_length": 544.828125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.064, "grad_norm": 0.08643894642591476, "kl": 0.08319854736328125, "learning_rate": 3.88888888888889e-06, "loss": 0.0183, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.030775226652622223, "mask/share_reasoning": 0.8308082222938538, "mask/share_step_conf": 0.13841655850410461, "num_tokens": 14218639.0, "reward": 0.8752537965774536, "reward_std": 0.15258923172950745, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7446839809417725, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6925423741340637, "step": 60 }, { "adv/mean_abs_final_conf": 0.7297815084457397, "adv/mean_abs_reasoning": 0.4073347747325897, "adv/mean_abs_step_conf": 0.7520349621772766, "adv/ratio_final_to_reasoning": 1.7916012914066382, "adv/ratio_step_to_reasoning": 1.8462331448891844, "adv/std_final_conf": 0.9128029346466064, "adv/std_reasoning": 0.6816322803497314, "adv/std_step_conf": 0.9352243542671204, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.6146551724137931, "calib/avg_num_step_conf": 6.7890625, "calib/ece": 0.20610236220472442, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.24803149606299213, "calib/gap": 0.11521839080459773, "calib/mean_conf": 0.5919291338582677, "calib/mu_c": 0.6282183908045977, "calib/mu_w": 0.513, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05649606299212595, "calib/std_conf": 0.303227745834928, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.39431642304385206, "calib/step_q_c_n": 1163.0, "calib/step_q_gap": 0.009342510000373783, "calib/step_q_w": 0.3849739130434783, "calib/step_q_w_n": 575.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2774.0, "completions/max_terminated_length": 2774.0, "completions/mean_length": 487.109375, "completions/mean_terminated_length": 487.109375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.06506666666666666, "grad_norm": 0.09446429461240768, "kl": 0.07703399658203125, "learning_rate": 3.861111111111112e-06, "loss": 0.0665, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03652501851320267, "mask/share_reasoning": 0.8129298686981201, "mask/share_step_conf": 0.1505451202392578, "num_tokens": 14447403.0, "reward": 0.8667026162147522, "reward_std": 0.14637422561645508, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7276128530502319, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.6714172959327698, "step": 61 }, { "adv/mean_abs_final_conf": 0.7487633228302002, "adv/mean_abs_reasoning": 0.6052176356315613, "adv/mean_abs_step_conf": 0.7552255392074585, "adv/ratio_final_to_reasoning": 1.2371802782132166, "adv/ratio_step_to_reasoning": 1.2478577865949987, "adv/std_final_conf": 0.9146410226821899, "adv/std_reasoning": 0.8100412487983704, "adv/std_step_conf": 0.9358305335044861, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.5865681444991789, "calib/avg_num_step_conf": 7.26953125, "calib/ece": 0.2374799999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.324, "calib/gap": 0.09063054187192132, "calib/mean_conf": 0.6482800000000001, "calib/mu_c": 0.686344827586207, "calib/mu_w": 0.5957142857142856, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15287999999999993, "calib/std_conf": 0.2954519277310608, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42051906779661014, "calib/step_q_c_n": 944.0, "calib/step_q_gap": 0.04951579625898739, "calib/step_q_w": 0.37100327153762275, "calib/step_q_w_n": 917.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 581.796875, "completions/mean_terminated_length": 584.0784912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.06613333333333334, "grad_norm": 0.038679976016283035, "kl": 0.07125091552734375, "learning_rate": 3.833333333333334e-06, "loss": 0.0125, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02881879359483719, "mask/share_reasoning": 0.8347760438919067, "mask/share_step_conf": 0.13249891996383667, "num_tokens": 14703423.0, "reward": 0.8286094665527344, "reward_std": 0.20025327801704407, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.6919933557510376, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6566318869590759, "step": 62 }, { "adv/mean_abs_final_conf": 0.6951231956481934, "adv/mean_abs_reasoning": 0.5237023830413818, "adv/mean_abs_step_conf": 0.7333637475967407, "adv/ratio_final_to_reasoning": 1.3273248664848374, "adv/ratio_step_to_reasoning": 1.400344492109733, "adv/std_final_conf": 0.8929726481437683, "adv/std_reasoning": 0.7754513025283813, "adv/std_step_conf": 0.9356421232223511, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7179027113237639, "calib/avg_num_step_conf": 7.015625, "calib/ece": 0.1588446215139443, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.38247011952191234, "calib/gap": 0.2145235247208932, "calib/mean_conf": 0.7020318725099601, "calib/mu_c": 0.7866447368421053, "calib/mu_w": 0.5721212121212121, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12764940239043832, "calib/std_conf": 0.28001325911278524, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4216988416988417, "calib/step_q_c_n": 1036.0, "calib/step_q_gap": 0.04329094696199953, "calib/step_q_w": 0.37840789473684217, "calib/step_q_w_n": 760.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 628.02734375, "completions/mean_terminated_length": 628.02734375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.0672, "grad_norm": 0.04206541180610657, "kl": 0.06488037109375, "learning_rate": 3.8055555555555556e-06, "loss": 0.0215, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.028620831668376923, "mask/share_reasoning": 0.8470596075057983, "mask/share_step_conf": 0.12431956827640533, "num_tokens": 14972838.0, "reward": 0.8802470564842224, "reward_std": 0.17957141995429993, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7607605457305908, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6848897933959961, "step": 63 }, { "adv/mean_abs_final_conf": 0.7531286478042603, "adv/mean_abs_reasoning": 0.47588926553726196, "adv/mean_abs_step_conf": 0.7466508150100708, "adv/ratio_final_to_reasoning": 1.582571203731618, "adv/ratio_step_to_reasoning": 1.5689591446596904, "adv/std_final_conf": 0.9304572343826294, "adv/std_reasoning": 0.7575029730796814, "adv/std_step_conf": 0.935704231262207, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6162650602409638, "calib/avg_num_step_conf": 7.03125, "calib/ece": 0.1742629482071713, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.4581673306772908, "calib/gap": 0.0986194188518783, "calib/mean_conf": 0.7979282868525897, "calib/mu_c": 0.8313253012048194, "calib/mu_w": 0.7327058823529411, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1554183266932271, "calib/std_conf": 0.21911741389575573, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4524823943661972, "calib/step_q_c_n": 1136.0, "calib/step_q_gap": 0.03605167147463095, "calib/step_q_w": 0.41643072289156624, "calib/step_q_w_n": 664.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2246.0, "completions/max_terminated_length": 2246.0, "completions/mean_length": 546.33203125, "completions/mean_terminated_length": 548.4745483398438, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.06826666666666667, "grad_norm": 0.04303797706961632, "kl": 0.06490325927734375, "learning_rate": 3.777777777777778e-06, "loss": 0.0244, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.030777184292674065, "mask/share_reasoning": 0.8323039412498474, "mask/share_step_conf": 0.13301262259483337, "num_tokens": 15216475.0, "reward": 0.8568915724754333, "reward_std": 0.19936087727546692, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7388273477554321, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6491745710372925, "step": 64 }, { "adv/mean_abs_final_conf": 0.6149617433547974, "adv/mean_abs_reasoning": 0.3183043599128723, "adv/mean_abs_step_conf": 0.7738634347915649, "adv/ratio_final_to_reasoning": 1.931992837054219, "adv/ratio_step_to_reasoning": 2.4312058904986102, "adv/std_final_conf": 0.8313688039779663, "adv/std_reasoning": 0.5960519313812256, "adv/std_step_conf": 0.9354342222213745, "calib/answer_extract_rate": 0.99609375, "calib/auroc": 0.645457957957958, "calib/avg_num_step_conf": 6.1796875, "calib/ece": 0.3441176470588235, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7333333333333333, "calib/gap": 0.08773460960960955, "calib/mean_conf": 0.9088235294117648, "calib/mu_c": 0.9470138888888889, "calib/mu_w": 0.8592792792792794, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3441176470588235, "calib/std_conf": 0.14786648798796906, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.539734219269103, "calib/step_q_c_n": 903.0, "calib/step_q_gap": 0.0739757509333151, "calib/step_q_w": 0.46575846833578793, "calib/step_q_w_n": 679.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 479.6171875, "completions/mean_terminated_length": 479.6171875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.06933333333333333, "grad_norm": 0.036914076656103134, "kl": 0.06595611572265625, "learning_rate": 3.7500000000000005e-06, "loss": 0.022, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.0350305512547493, "mask/share_reasoning": 0.8252483010292053, "mask/share_step_conf": 0.13972117006778717, "num_tokens": 15444281.0, "reward": 0.8163434267044067, "reward_std": 0.16152450442314148, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6505656242370605, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.671183705329895, "step": 65 }, { "adv/mean_abs_final_conf": 0.7002946138381958, "adv/mean_abs_reasoning": 0.48234987258911133, "adv/mean_abs_step_conf": 0.7682009339332581, "adv/ratio_final_to_reasoning": 1.4518395331571698, "adv/ratio_step_to_reasoning": 1.592621824091676, "adv/std_final_conf": 0.8663851618766785, "adv/std_reasoning": 0.739382803440094, "adv/std_step_conf": 0.9357068538665771, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6751748912837965, "calib/avg_num_step_conf": 7.52734375, "calib/ece": 0.4038492063492064, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5992063492063492, "calib/gap": 0.08574588769143521, "calib/mean_conf": 0.8813095238095239, "calib/mu_c": 0.9252032520325205, "calib/mu_w": 0.8394573643410853, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3985317460317461, "calib/std_conf": 0.15848073160561013, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4955657142857143, "calib/step_q_c_n": 875.0, "calib/step_q_gap": 0.09857902227050513, "calib/step_q_w": 0.39698669201520914, "calib/step_q_w_n": 1052.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2860.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 621.703125, "completions/mean_terminated_length": 621.703125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.0704, "grad_norm": 0.041863150894641876, "kl": 0.052066802978515625, "learning_rate": 3.7222222222222225e-06, "loss": 0.04, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.02926797792315483, "mask/share_reasoning": 0.8379676342010498, "mask/share_step_conf": 0.1327643245458603, "num_tokens": 15709789.0, "reward": 0.7863322496414185, "reward_std": 0.21922364830970764, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.603674590587616, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6760210394859314, "step": 66 }, { "adv/mean_abs_final_conf": 0.6475679874420166, "adv/mean_abs_reasoning": 0.4235314726829529, "adv/mean_abs_step_conf": 0.7702065706253052, "adv/ratio_final_to_reasoning": 1.5289725302817647, "adv/ratio_step_to_reasoning": 1.8185344426619892, "adv/std_final_conf": 0.8636981248855591, "adv/std_reasoning": 0.7205429673194885, "adv/std_step_conf": 0.9358491897583008, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7918367346938776, "calib/avg_num_step_conf": 7.15625, "calib/ece": 0.31620967741935485, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7620967741935484, "calib/gap": 0.12286530612244906, "calib/mean_conf": 0.9210483870967743, "calib/mu_c": 0.9696, "calib/mu_w": 0.846734693877551, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31620967741935485, "calib/std_conf": 0.1313719083512535, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5373821464393179, "calib/step_q_c_n": 997.0, "calib/step_q_gap": 0.10374142787644369, "calib/step_q_w": 0.43364071856287423, "calib/step_q_w_n": 835.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2529.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 590.82421875, "completions/mean_terminated_length": 597.830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.07146666666666666, "grad_norm": 0.0307986568659544, "kl": 0.06261444091796875, "learning_rate": 3.694444444444445e-06, "loss": -0.053, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.028731655329465866, "mask/share_reasoning": 0.8310511708259583, "mask/share_step_conf": 0.1284984052181244, "num_tokens": 15966048.0, "reward": 0.834770917892456, "reward_std": 0.21885615587234497, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6805233955383301, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6772996783256531, "step": 67 }, { "adv/mean_abs_final_conf": 0.6067262291908264, "adv/mean_abs_reasoning": 0.38977935910224915, "adv/mean_abs_step_conf": 0.7842588424682617, "adv/ratio_final_to_reasoning": 1.5565889137594546, "adv/ratio_step_to_reasoning": 2.012058422679407, "adv/std_final_conf": 0.823989748954773, "adv/std_reasoning": 0.6815210580825806, "adv/std_step_conf": 0.9356105327606201, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6929762212457999, "calib/avg_num_step_conf": 6.9453125, "calib/ece": 0.37750000000000006, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8650793650793651, "calib/gap": 0.051254846213491745, "calib/mean_conf": 0.9568650793650795, "calib/mu_c": 0.9784246575342466, "calib/mu_w": 0.9271698113207548, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.37750000000000006, "calib/std_conf": 0.07394827162136619, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5131938325991189, "calib/step_q_c_n": 908.0, "calib/step_q_gap": 0.0837110739784292, "calib/step_q_w": 0.4294827586206897, "calib/step_q_w_n": 870.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2596.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 561.75390625, "completions/mean_terminated_length": 561.75390625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.07253333333333334, "grad_norm": 0.027804771438241005, "kl": 0.0568084716796875, "learning_rate": 3.6666666666666666e-06, "loss": 0.0825, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.032831259071826935, "mask/share_reasoning": 0.8298499584197998, "mask/share_step_conf": 0.13731881976127625, "num_tokens": 16213945.0, "reward": 0.7992095351219177, "reward_std": 0.1799999475479126, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6234105229377747, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6640709638595581, "step": 68 }, { "adv/mean_abs_final_conf": 0.7303532958030701, "adv/mean_abs_reasoning": 0.5670326948165894, "adv/mean_abs_step_conf": 0.7677696347236633, "adv/ratio_final_to_reasoning": 1.2880267795480609, "adv/ratio_step_to_reasoning": 1.3540129903303084, "adv/std_final_conf": 0.8972042798995972, "adv/std_reasoning": 0.7929468154907227, "adv/std_step_conf": 0.9360941052436829, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.68072126727334, "calib/avg_num_step_conf": 6.9375, "calib/ece": 0.4413114754098361, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.6639344262295082, "calib/gap": 0.0621449275362318, "calib/mean_conf": 0.9126229508196722, "calib/mu_c": 0.9454782608695652, "calib/mu_w": 0.8833333333333334, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.4413114754098361, "calib/std_conf": 0.1102627508606521, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.567404255319149, "calib/step_q_c_n": 705.0, "calib/step_q_gap": 0.10929968015575026, "calib/step_q_w": 0.4581045751633987, "calib/step_q_w_n": 1071.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2934.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 680.00390625, "completions/mean_terminated_length": 688.0671997070312, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.0736, "grad_norm": 0.04531551152467728, "kl": 0.0455322265625, "learning_rate": 3.638888888888889e-06, "loss": -0.0052, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.027244973927736282, "mask/share_reasoning": 0.84705650806427, "mask/share_step_conf": 0.1139797642827034, "num_tokens": 16492522.0, "reward": 0.722407341003418, "reward_std": 0.2560442090034485, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.5479320287704468, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6164138913154602, "step": 69 }, { "adv/mean_abs_final_conf": 0.7253612279891968, "adv/mean_abs_reasoning": 0.558610737323761, "adv/mean_abs_step_conf": 0.7696953415870667, "adv/ratio_final_to_reasoning": 1.2985092829835638, "adv/ratio_step_to_reasoning": 1.3778742336292844, "adv/std_final_conf": 0.8815363049507141, "adv/std_reasoning": 0.7755623459815979, "adv/std_step_conf": 0.9357798099517822, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.781258064516129, "calib/avg_num_step_conf": 7.234375, "calib/ece": 0.4071084337349399, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6586345381526104, "calib/gap": 0.12092645161290316, "calib/mean_conf": 0.9051004016064257, "calib/mu_c": 0.9658064516129032, "calib/mu_w": 0.8448800000000001, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.4071084337349399, "calib/std_conf": 0.1355136367411435, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5355799755799756, "calib/step_q_c_n": 819.0, "calib/step_q_gap": 0.13376003366322825, "calib/step_q_w": 0.40181994191674736, "calib/step_q_w_n": 1033.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2787.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 614.0625, "completions/mean_terminated_length": 621.3438720703125, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.07466666666666667, "grad_norm": 0.023433836176991463, "kl": 0.0515594482421875, "learning_rate": 3.6111111111111115e-06, "loss": -0.0691, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03000956028699875, "mask/share_reasoning": 0.8231422901153564, "mask/share_step_conf": 0.13512945175170898, "num_tokens": 16756714.0, "reward": 0.7856619358062744, "reward_std": 0.2529831528663635, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6080878973007202, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.672610878944397, "step": 70 }, { "adv/mean_abs_final_conf": 0.6923969388008118, "adv/mean_abs_reasoning": 0.5871611833572388, "adv/mean_abs_step_conf": 0.773187518119812, "adv/ratio_final_to_reasoning": 1.1792280525798071, "adv/ratio_step_to_reasoning": 1.316823284705099, "adv/std_final_conf": 0.8802487850189209, "adv/std_reasoning": 0.8098160624504089, "adv/std_step_conf": 0.9354393482208252, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7177836678026001, "calib/avg_num_step_conf": 7.9375, "calib/ece": 0.3804347826086957, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.691699604743083, "calib/gap": 0.05442572257983136, "calib/mean_conf": 0.9220948616600791, "calib/mu_c": 0.9466187050359716, "calib/mu_w": 0.8921929824561402, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3765612648221344, "calib/std_conf": 0.11336027640266048, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5415829810901002, "calib/step_q_c_n": 899.0, "calib/step_q_gap": 0.12280972424985304, "calib/step_q_w": 0.41877325684024713, "calib/step_q_w_n": 1133.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2831.0, "completions/max_terminated_length": 2831.0, "completions/mean_length": 612.87109375, "completions/mean_terminated_length": 612.87109375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.07573333333333333, "grad_norm": 0.03572480008006096, "kl": 0.05677032470703125, "learning_rate": 3.5833333333333335e-06, "loss": 0.0773, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0304461307823658, "mask/share_reasoning": 0.8326575756072998, "mask/share_step_conf": 0.1368963122367859, "num_tokens": 17018017.0, "reward": 0.8050858378410339, "reward_std": 0.21967308223247528, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6202863454818726, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6836353540420532, "step": 71 }, { "adv/mean_abs_final_conf": 0.6634776592254639, "adv/mean_abs_reasoning": 0.48542171716690063, "adv/mean_abs_step_conf": 0.7575856447219849, "adv/ratio_final_to_reasoning": 1.3668067079853021, "adv/ratio_step_to_reasoning": 1.5606752189488613, "adv/std_final_conf": 0.8854225873947144, "adv/std_reasoning": 0.7574900984764099, "adv/std_step_conf": 0.9355804324150085, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7398999493414387, "calib/avg_num_step_conf": 6.60546875, "calib/ece": 0.36264782608695645, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7035573122529645, "calib/gap": 0.09162114994934134, "calib/mean_conf": 0.9100794466403164, "calib/mu_c": 0.9506390070921985, "calib/mu_w": 0.8590178571428572, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35770750988142286, "calib/std_conf": 0.13658480423355987, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5223102310231023, "calib/step_q_c_n": 909.0, "calib/step_q_gap": 0.11248925915609453, "calib/step_q_w": 0.4098209718670077, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2077.0, "completions/max_terminated_length": 2077.0, "completions/mean_length": 545.14453125, "completions/mean_terminated_length": 545.14453125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.0768, "grad_norm": 0.03880830854177475, "kl": 0.05988311767578125, "learning_rate": 3.555555555555556e-06, "loss": 0.0675, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03043207712471485, "mask/share_reasoning": 0.8391535878181458, "mask/share_step_conf": 0.13041435182094574, "num_tokens": 17261982.0, "reward": 0.8158684968948364, "reward_std": 0.2149871587753296, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6477128863334656, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6762114763259888, "step": 72 }, { "adv/mean_abs_final_conf": 0.6486594676971436, "adv/mean_abs_reasoning": 0.521686315536499, "adv/mean_abs_step_conf": 0.7540993094444275, "adv/ratio_final_to_reasoning": 1.2433898463103563, "adv/ratio_step_to_reasoning": 1.4455033359824982, "adv/std_final_conf": 0.8541032075881958, "adv/std_reasoning": 0.7753783464431763, "adv/std_step_conf": 0.9358484148979187, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7639059304703477, "calib/avg_num_step_conf": 6.5, "calib/ece": 0.2678260869565219, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7193675889328063, "calib/gap": 0.11345398773006132, "calib/mean_conf": 0.9120948616600792, "calib/mu_c": 0.9524539877300614, "calib/mu_w": 0.8390000000000001, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2678260869565219, "calib/std_conf": 0.13503900363688165, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5559819639278557, "calib/step_q_c_n": 998.0, "calib/step_q_gap": 0.0849909729368647, "calib/step_q_w": 0.470990990990991, "calib/step_q_w_n": 666.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2527.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 537.78515625, "completions/mean_terminated_length": 539.8941650390625, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.07786666666666667, "grad_norm": 0.03681972622871399, "kl": 0.05597686767578125, "learning_rate": 3.5277777777777784e-06, "loss": 0.0088, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03080468252301216, "mask/share_reasoning": 0.8387721180915833, "mask/share_step_conf": 0.12651696801185608, "num_tokens": 17506687.0, "reward": 0.8686050176620483, "reward_std": 0.21645140647888184, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7242633104324341, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6879468560218811, "step": 73 }, { "adv/mean_abs_final_conf": 0.6617860794067383, "adv/mean_abs_reasoning": 0.4485233724117279, "adv/mean_abs_step_conf": 0.7783040404319763, "adv/ratio_final_to_reasoning": 1.4754773555016507, "adv/ratio_step_to_reasoning": 1.735258602571823, "adv/std_final_conf": 0.8622788190841675, "adv/std_reasoning": 0.7205154299736023, "adv/std_step_conf": 0.9355837106704712, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.747049071618037, "calib/avg_num_step_conf": 7.1171875, "calib/ece": 0.31747967479674793, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.540650406504065, "calib/gap": 0.15674270557029168, "calib/mean_conf": 0.8459349593495936, "calib/mu_c": 0.9198461538461539, "calib/mu_w": 0.7631034482758622, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.31747967479674793, "calib/std_conf": 0.1783749471926312, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.47568750000000004, "calib/step_q_c_n": 800.0, "calib/step_q_gap": 0.08489953522504895, "calib/step_q_w": 0.3907879647749511, "calib/step_q_w_n": 1022.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2748.0, "completions/max_terminated_length": 2748.0, "completions/mean_length": 597.7109375, "completions/mean_terminated_length": 600.054931640625, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.07893333333333333, "grad_norm": 0.05273786559700966, "kl": 0.055866241455078125, "learning_rate": 3.5e-06, "loss": 0.0226, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.030816104263067245, "mask/share_reasoning": 0.8347541093826294, "mask/share_step_conf": 0.13052353262901306, "num_tokens": 17763629.0, "reward": 0.8106918931007385, "reward_std": 0.19002105295658112, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6691164374351501, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6585173606872559, "step": 74 }, { "adv/mean_abs_final_conf": 0.5904118418693542, "adv/mean_abs_reasoning": 0.44580191373825073, "adv/mean_abs_step_conf": 0.7511703968048096, "adv/ratio_final_to_reasoning": 1.3243815777246981, "adv/ratio_step_to_reasoning": 1.684986927278768, "adv/std_final_conf": 0.8323133587837219, "adv/std_reasoning": 0.7392011880874634, "adv/std_step_conf": 0.9356184601783752, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8356928838951312, "calib/avg_num_step_conf": 6.484375, "calib/ece": 0.19395256916996043, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6996047430830039, "calib/gap": 0.21269513108614224, "calib/mean_conf": 0.8975098814229249, "calib/mu_c": 0.9605617977528089, "calib/mu_w": 0.7478666666666667, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19395256916996043, "calib/std_conf": 0.1561712248891746, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5161583561643835, "calib/step_q_c_n": 1095.0, "calib/step_q_gap": 0.0628468517396048, "calib/step_q_w": 0.45331150442477874, "calib/step_q_w_n": 565.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2654.0, "completions/max_terminated_length": 2654.0, "completions/mean_length": 499.62890625, "completions/mean_terminated_length": 501.5882568359375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.08, "grad_norm": 0.04962025210261345, "kl": 0.09801483154296875, "learning_rate": 3.4722222222222224e-06, "loss": 0.0597, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.033663444221019745, "mask/share_reasoning": 0.8257121443748474, "mask/share_step_conf": 0.13671818375587463, "num_tokens": 17996286.0, "reward": 0.9173955917358398, "reward_std": 0.19051072001457214, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.8085620999336243, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6895103454589844, "step": 75 }, { "adv/mean_abs_final_conf": 0.6051506400108337, "adv/mean_abs_reasoning": 0.37921708822250366, "adv/mean_abs_step_conf": 0.7788043022155762, "adv/ratio_final_to_reasoning": 1.595789479971337, "adv/ratio_step_to_reasoning": 2.053716265440593, "adv/std_final_conf": 0.8431809544563293, "adv/std_reasoning": 0.6612383723258972, "adv/std_step_conf": 0.9355038404464722, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7292253266912136, "calib/avg_num_step_conf": 6.14453125, "calib/ece": 0.16392857142857148, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.5158730158730159, "calib/gap": 0.16805068226120867, "calib/mean_conf": 0.8232936507936508, "calib/mu_c": 0.8773099415204679, "calib/mu_w": 0.7092592592592593, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15432539682539687, "calib/std_conf": 0.19721036900281702, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.542784679089027, "calib/step_q_c_n": 966.0, "calib/step_q_gap": 0.08678797398194299, "calib/step_q_w": 0.45599670510708395, "calib/step_q_w_n": 607.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2950.0, "completions/max_terminated_length": 2950.0, "completions/mean_length": 530.9140625, "completions/mean_terminated_length": 530.9140625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.08106666666666666, "grad_norm": 0.06318492442369461, "kl": 0.16355514526367188, "learning_rate": 3.444444444444445e-06, "loss": -0.0061, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033482976257801056, "mask/share_reasoning": 0.8393796682357788, "mask/share_step_conf": 0.12713731825351715, "num_tokens": 18235256.0, "reward": 0.9013264179229736, "reward_std": 0.16557905077934265, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7829316854476929, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6892523765563965, "step": 76 }, { "adv/mean_abs_final_conf": 0.6626811027526855, "adv/mean_abs_reasoning": 0.43668264150619507, "adv/mean_abs_step_conf": 0.7610800266265869, "adv/ratio_final_to_reasoning": 1.5175347947584592, "adv/ratio_step_to_reasoning": 1.7428675983123312, "adv/std_final_conf": 0.8612998127937317, "adv/std_reasoning": 0.7014240622520447, "adv/std_step_conf": 0.935611367225647, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.6517267473858008, "calib/avg_num_step_conf": 6.4765625, "calib/ece": 0.19436000000000006, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.52, "calib/gap": 0.10325261419922949, "calib/mean_conf": 0.8215600000000001, "calib/mu_c": 0.8595569620253163, "calib/mu_w": 0.7563043478260868, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.19196000000000008, "calib/std_conf": 0.19563528925017593, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.474421052631579, "calib/step_q_c_n": 950.0, "calib/step_q_gap": 0.03443517692536424, "calib/step_q_w": 0.4399858757062148, "calib/step_q_w_n": 708.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2747.0, "completions/max_terminated_length": 2747.0, "completions/mean_length": 522.6328125, "completions/mean_terminated_length": 524.682373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.08213333333333334, "grad_norm": 0.055999234318733215, "kl": 0.061328887939453125, "learning_rate": 3.416666666666667e-06, "loss": -0.0112, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.035561397671699524, "mask/share_reasoning": 0.8199267387390137, "mask/share_step_conf": 0.14060565829277039, "num_tokens": 18473714.0, "reward": 0.8609846234321594, "reward_std": 0.1922544538974762, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7206558585166931, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6833446025848389, "step": 77 }, { "adv/mean_abs_final_conf": 0.6377897262573242, "adv/mean_abs_reasoning": 0.4574580788612366, "adv/mean_abs_step_conf": 0.7704879641532898, "adv/ratio_final_to_reasoning": 1.3942036565295608, "adv/ratio_step_to_reasoning": 1.684281029796845, "adv/std_final_conf": 0.854850709438324, "adv/std_reasoning": 0.7013858556747437, "adv/std_step_conf": 0.9357221722602844, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.7476748511904762, "calib/avg_num_step_conf": 5.84375, "calib/ece": 0.23414062499999996, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.49609375, "calib/gap": 0.20053571428571415, "calib/mean_conf": 0.796640625, "calib/mu_c": 0.8843749999999999, "calib/mu_w": 0.6838392857142858, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.23414062499999996, "calib/std_conf": 0.21584781004126352, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4897462514417532, "calib/step_q_c_n": 867.0, "calib/step_q_gap": 0.04480189532092649, "calib/step_q_w": 0.4449443561208267, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 546.359375, "completions/mean_terminated_length": 548.5020141601562, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.0832, "grad_norm": 0.05781480669975281, "kl": 0.05998992919921875, "learning_rate": 3.3888888888888893e-06, "loss": 0.0201, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.030948851257562637, "mask/share_reasoning": 0.851616382598877, "mask/share_step_conf": 0.11352851986885071, "num_tokens": 18721606.0, "reward": 0.8687750101089478, "reward_std": 0.17642651498317719, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7433953285217285, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.6832171082496643, "step": 78 }, { "adv/mean_abs_final_conf": 0.6819567084312439, "adv/mean_abs_reasoning": 0.398343026638031, "adv/mean_abs_step_conf": 0.7532308101654053, "adv/ratio_final_to_reasoning": 1.711983548919833, "adv/ratio_step_to_reasoning": 1.890909994138936, "adv/std_final_conf": 0.8833400011062622, "adv/std_reasoning": 0.6815299391746521, "adv/std_step_conf": 0.9356348514556885, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7230061349693251, "calib/avg_num_step_conf": 6.66796875, "calib/ece": 0.19762845849802374, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5928853754940712, "calib/gap": 0.19834287661895045, "calib/mean_conf": 0.841897233201581, "calib/mu_c": 0.9124539877300615, "calib/mu_w": 0.714111111111111, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19762845849802374, "calib/std_conf": 0.21865280406965443, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.4832502250225023, "calib/step_q_c_n": 1111.0, "calib/step_q_gap": 0.0726789162976701, "calib/step_q_w": 0.4105713087248322, "calib/step_q_w_n": 596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2130.0, "completions/max_terminated_length": 2130.0, "completions/mean_length": 554.69140625, "completions/mean_terminated_length": 554.69140625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.08426666666666667, "grad_norm": 0.06547747552394867, "kl": 0.06035614013671875, "learning_rate": 3.3611111111111117e-06, "loss": 0.036, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.030871810391545296, "mask/share_reasoning": 0.8451499342918396, "mask/share_step_conf": 0.12397824227809906, "num_tokens": 18969983.0, "reward": 0.8965235948562622, "reward_std": 0.17538022994995117, "rewards/accuracy_reward_step": 0.63671875, "rewards/final_brier_reward_step": 0.7657819986343384, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.7022650837898254, "step": 79 }, { "adv/mean_abs_final_conf": 0.5756165981292725, "adv/mean_abs_reasoning": 0.4700927436351776, "adv/mean_abs_step_conf": 0.7659281492233276, "adv/ratio_final_to_reasoning": 1.2244745444868814, "adv/ratio_step_to_reasoning": 1.629312852822373, "adv/std_final_conf": 0.7832956910133362, "adv/std_reasoning": 0.7206355333328247, "adv/std_step_conf": 0.9355859756469727, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.698109243697479, "calib/avg_num_step_conf": 6.671875, "calib/ece": 0.2573517786561266, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7707509881422925, "calib/gap": 0.12273669467787152, "calib/mean_conf": 0.9213833992094862, "calib/mu_c": 0.9626190476190478, "calib/mu_w": 0.8398823529411763, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2573517786561266, "calib/std_conf": 0.1523026917754319, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5325895273401298, "calib/step_q_c_n": 1079.0, "calib/step_q_gap": 0.10716822368353196, "calib/step_q_w": 0.4254213036565978, "calib/step_q_w_n": 629.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 522.10546875, "completions/mean_terminated_length": 522.10546875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.08533333333333333, "grad_norm": 0.04910649359226227, "kl": 0.06691741943359375, "learning_rate": 3.3333333333333333e-06, "loss": 0.0123, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.032491035759449005, "mask/share_reasoning": 0.8334447741508484, "mask/share_step_conf": 0.13406416773796082, "num_tokens": 19205802.0, "reward": 0.8781391382217407, "reward_std": 0.20743659138679504, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7335456609725952, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6938263773918152, "step": 80 }, { "adv/mean_abs_final_conf": 0.6354163289070129, "adv/mean_abs_reasoning": 0.46208250522613525, "adv/mean_abs_step_conf": 0.7575079202651978, "adv/ratio_final_to_reasoning": 1.375114447572628, "adv/ratio_step_to_reasoning": 1.6393347761445467, "adv/std_final_conf": 0.8282449245452881, "adv/std_reasoning": 0.7207011580467224, "adv/std_step_conf": 0.9358264803886414, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.736551979282279, "calib/avg_num_step_conf": 6.43359375, "calib/ece": 0.21672131147540985, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.7213114754098361, "calib/gap": 0.22207621161672209, "calib/mean_conf": 0.868360655737705, "calib/mu_c": 0.9457232704402514, "calib/mu_w": 0.7236470588235293, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21672131147540985, "calib/std_conf": 0.22344677238969748, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5613417136414882, "calib/step_q_c_n": 887.0, "calib/step_q_gap": 0.15183250311517238, "calib/step_q_w": 0.40950921052631584, "calib/step_q_w_n": 760.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 600.55859375, "completions/mean_terminated_length": 600.55859375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.0864, "grad_norm": 0.04552274942398071, "kl": 0.05413055419921875, "learning_rate": 3.3055555555555558e-06, "loss": 0.0972, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.032387781888246536, "mask/share_reasoning": 0.838586688041687, "mask/share_step_conf": 0.12902551889419556, "num_tokens": 19465793.0, "reward": 0.8567100167274475, "reward_std": 0.22046364843845367, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.738071084022522, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.6612863540649414, "step": 81 }, { "adv/mean_abs_final_conf": 0.665834903717041, "adv/mean_abs_reasoning": 0.4954150915145874, "adv/mean_abs_step_conf": 0.784247875213623, "adv/ratio_final_to_reasoning": 1.3439939862983274, "adv/ratio_step_to_reasoning": 1.5830116777751229, "adv/std_final_conf": 0.8343429565429688, "adv/std_reasoning": 0.7393837571144104, "adv/std_step_conf": 0.9358690977096558, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7510314185972706, "calib/avg_num_step_conf": 5.68359375, "calib/ece": 0.38615079365079374, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8134920634920635, "calib/gap": 0.12391558235480793, "calib/mean_conf": 0.9298015873015874, "calib/mu_c": 0.9863503649635036, "calib/mu_w": 0.8624347826086957, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.38615079365079374, "calib/std_conf": 0.15069534703011933, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.6145556944444445, "calib/step_q_c_n": 720.0, "calib/step_q_gap": 0.14329038832199548, "calib/step_q_w": 0.471265306122449, "calib/step_q_w_n": 735.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2384.0, "completions/max_terminated_length": 2384.0, "completions/mean_length": 500.875, "completions/mean_terminated_length": 504.81890869140625, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.08746666666666666, "grad_norm": 0.0390843003988266, "kl": 0.059844970703125, "learning_rate": 3.277777777777778e-06, "loss": -0.0256, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03583943471312523, "mask/share_reasoning": 0.832611083984375, "mask/share_step_conf": 0.12373694777488708, "num_tokens": 19699569.0, "reward": 0.7950599193572998, "reward_std": 0.24772369861602783, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6315449476242065, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6546686887741089, "step": 82 }, { "adv/mean_abs_final_conf": 0.6100971102714539, "adv/mean_abs_reasoning": 0.4027612805366516, "adv/mean_abs_step_conf": 0.7715537548065186, "adv/ratio_final_to_reasoning": 1.5147859035966456, "adv/ratio_step_to_reasoning": 1.9156601989607254, "adv/std_final_conf": 0.7979010343551636, "adv/std_reasoning": 0.6818188428878784, "adv/std_step_conf": 0.9355340600013733, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.6761347517730495, "calib/avg_num_step_conf": 6.19140625, "calib/ece": 0.35539419087136925, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.8132780082987552, "calib/gap": 0.07043120567375882, "calib/mean_conf": 0.9124066390041493, "calib/mu_c": 0.9416312056737587, "calib/mu_w": 0.8711999999999999, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.3413692946058091, "calib/std_conf": 0.18854987287646935, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5327183271832718, "calib/step_q_c_n": 813.0, "calib/step_q_gap": 0.1325978608620283, "calib/step_q_w": 0.40012046632124354, "calib/step_q_w_n": 772.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 600.59765625, "completions/mean_terminated_length": 605.3267822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.08853333333333334, "grad_norm": 0.033184923231601715, "kl": 0.05547332763671875, "learning_rate": 3.2500000000000002e-06, "loss": -0.0594, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03150860220193863, "mask/share_reasoning": 0.8455889821052551, "mask/share_step_conf": 0.11508992314338684, "num_tokens": 19960586.0, "reward": 0.7805687785148621, "reward_std": 0.21249890327453613, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.607671856880188, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.6558094024658203, "step": 83 }, { "adv/mean_abs_final_conf": 0.5975565314292908, "adv/mean_abs_reasoning": 0.4591618478298187, "adv/mean_abs_step_conf": 0.7478587031364441, "adv/ratio_final_to_reasoning": 1.3014071928092028, "adv/ratio_step_to_reasoning": 1.628747481244623, "adv/std_final_conf": 0.7702383399009705, "adv/std_reasoning": 0.720573365688324, "adv/std_step_conf": 0.9358304142951965, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7617763157894737, "calib/avg_num_step_conf": 5.46875, "calib/ece": 0.3046825396825397, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.8214285714285714, "calib/gap": 0.17052631578947408, "calib/mean_conf": 0.907857142857143, "calib/mu_c": 0.9755263157894739, "calib/mu_w": 0.8049999999999998, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.3046825396825397, "calib/std_conf": 0.19799233247482304, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6060867065868264, "calib/step_q_c_n": 835.0, "calib/step_q_gap": 0.131697326055853, "calib/step_q_w": 0.4743893805309734, "calib/step_q_w_n": 565.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 470.6953125, "completions/mean_terminated_length": 472.54119873046875, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.0896, "grad_norm": 0.03209591656923294, "kl": 0.06945037841796875, "learning_rate": 3.2222222222222227e-06, "loss": -0.0231, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03710485249757767, "mask/share_reasoning": 0.83646160364151, "mask/share_step_conf": 0.12252729386091232, "num_tokens": 20187004.0, "reward": 0.832172155380249, "reward_std": 0.21966418623924255, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.6917449235916138, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6585367918014526, "step": 84 }, { "adv/mean_abs_final_conf": 0.6900614500045776, "adv/mean_abs_reasoning": 0.5189496278762817, "adv/mean_abs_step_conf": 0.7603774070739746, "adv/ratio_final_to_reasoning": 1.329727227724478, "adv/ratio_step_to_reasoning": 1.4652239181396032, "adv/std_final_conf": 0.8699139952659607, "adv/std_reasoning": 0.7927501201629639, "adv/std_step_conf": 0.9358454942703247, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7442937701396347, "calib/avg_num_step_conf": 6.1796875, "calib/ece": 0.33542857142857135, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.7510204081632653, "calib/gap": 0.17743421052631592, "calib/mean_conf": 0.8782857142857144, "calib/mu_c": 0.9593984962406017, "calib/mu_w": 0.7819642857142858, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.33542857142857135, "calib/std_conf": 0.23573835374959975, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5876065159574467, "calib/step_q_c_n": 752.0, "calib/step_q_gap": 0.15322073282491655, "calib/step_q_w": 0.4343857831325302, "calib/step_q_w_n": 830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2884.0, "completions/max_terminated_length": 2884.0, "completions/mean_length": 566.55859375, "completions/mean_terminated_length": 571.0196533203125, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.09066666666666667, "grad_norm": 0.02877894788980484, "kl": 0.059356689453125, "learning_rate": 3.1944444444444443e-06, "loss": -0.0493, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.0347464419901371, "mask/share_reasoning": 0.8299733400344849, "mask/share_step_conf": 0.12746763229370117, "num_tokens": 20439867.0, "reward": 0.787490725517273, "reward_std": 0.24738119542598724, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6393284797668457, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6403404474258423, "step": 85 }, { "adv/mean_abs_final_conf": 0.6203538179397583, "adv/mean_abs_reasoning": 0.46224188804626465, "adv/mean_abs_step_conf": 0.7593492269515991, "adv/ratio_final_to_reasoning": 1.3420545259577787, "adv/ratio_step_to_reasoning": 1.6427529537859575, "adv/std_final_conf": 0.8301018476486206, "adv/std_reasoning": 0.7574146389961243, "adv/std_step_conf": 0.9356285333633423, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6894151303242212, "calib/avg_num_step_conf": 5.69140625, "calib/ece": 0.3686454183266932, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7529880478087649, "calib/gap": 0.14154545454545453, "calib/mean_conf": 0.8787649402390438, "calib/mu_c": 0.947, "calib/mu_w": 0.8054545454545454, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.36474103585657364, "calib/std_conf": 0.23264901947107577, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5542901726427623, "calib/step_q_c_n": 753.0, "calib/step_q_gap": 0.07619684877912591, "calib/step_q_w": 0.47809332386363634, "calib/step_q_w_n": 704.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2851.0, "completions/max_terminated_length": 2851.0, "completions/mean_length": 511.73046875, "completions/mean_terminated_length": 513.7373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.09173333333333333, "grad_norm": 0.05549116060137749, "kl": 0.07260894775390625, "learning_rate": 3.1666666666666667e-06, "loss": -0.0172, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03642665967345238, "mask/share_reasoning": 0.8344923257827759, "mask/share_step_conf": 0.12517479062080383, "num_tokens": 20676382.0, "reward": 0.7826776504516602, "reward_std": 0.21747705340385437, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.624239444732666, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6434596180915833, "step": 86 }, { "adv/mean_abs_final_conf": 0.5463113784790039, "adv/mean_abs_reasoning": 0.43426644802093506, "adv/mean_abs_step_conf": 0.7794994115829468, "adv/ratio_final_to_reasoning": 1.2580096412437265, "adv/ratio_step_to_reasoning": 1.794979591758304, "adv/std_final_conf": 0.7795889377593994, "adv/std_reasoning": 0.7013609409332275, "adv/std_step_conf": 0.935667097568512, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.6329787234042552, "calib/avg_num_step_conf": 5.03515625, "calib/ece": 0.2235968379446641, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8932806324110671, "calib/gap": 0.10783878887070386, "calib/mean_conf": 0.9435177865612648, "calib/mu_c": 0.9712234042553193, "calib/mu_w": 0.8633846153846154, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2120158102766799, "calib/std_conf": 0.1738629823745123, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5968572486772488, "calib/step_q_c_n": 945.0, "calib/step_q_gap": 0.012293295188876674, "calib/step_q_w": 0.5845639534883721, "calib/step_q_w_n": 344.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2405.0, "completions/max_terminated_length": 2405.0, "completions/mean_length": 442.85546875, "completions/mean_terminated_length": 444.5921936035156, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.0928, "grad_norm": 0.033377598971128464, "kl": 0.07457733154296875, "learning_rate": 3.138888888888889e-06, "loss": 0.0571, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.04012666642665863, "mask/share_reasoning": 0.8329303860664368, "mask/share_step_conf": 0.1230367049574852, "num_tokens": 20895249.0, "reward": 0.9138012528419495, "reward_std": 0.20177030563354492, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7707230448722839, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.7123481631278992, "step": 87 }, { "adv/mean_abs_final_conf": 0.584118127822876, "adv/mean_abs_reasoning": 0.45337072014808655, "adv/mean_abs_step_conf": 0.7674804329872131, "adv/ratio_final_to_reasoning": 1.2883896155271846, "adv/ratio_step_to_reasoning": 1.6928319339557867, "adv/std_final_conf": 0.8002000451087952, "adv/std_reasoning": 0.7206096649169922, "adv/std_step_conf": 0.9355928897857666, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.8007994670219853, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.2229249011857706, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7549407114624506, "calib/gap": 0.307707528314457, "calib/mean_conf": 0.8455335968379447, "calib/mu_c": 0.9610759493670886, "calib/mu_w": 0.6533684210526316, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2219762845849801, "calib/std_conf": 0.2814786828007629, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5822375365497076, "calib/step_q_c_n": 912.0, "calib/step_q_gap": 0.11479371632498842, "calib/step_q_w": 0.46744382022471914, "calib/step_q_w_n": 534.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2290.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 539.8984375, "completions/mean_terminated_length": 539.8984375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.09386666666666667, "grad_norm": 0.05166981369256973, "kl": 0.069366455078125, "learning_rate": 3.1111111111111116e-06, "loss": 0.1237, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.033254824578762054, "mask/share_reasoning": 0.8517872095108032, "mask/share_step_conf": 0.11495798081159592, "num_tokens": 21143311.0, "reward": 0.8919699192047119, "reward_std": 0.20028287172317505, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7725710868835449, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.6902748942375183, "step": 88 }, { "adv/mean_abs_final_conf": 0.6227092742919922, "adv/mean_abs_reasoning": 0.4595007300376892, "adv/mean_abs_step_conf": 0.7697288393974304, "adv/ratio_final_to_reasoning": 1.355186691957848, "adv/ratio_step_to_reasoning": 1.6751417116014062, "adv/std_final_conf": 0.8114991188049316, "adv/std_reasoning": 0.7205544710159302, "adv/std_step_conf": 0.9357718825340271, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8098837209302324, "calib/avg_num_step_conf": 5.1328125, "calib/ece": 0.2583132530120481, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6184738955823293, "calib/gap": 0.2777732558139536, "calib/mean_conf": 0.7691566265060241, "calib/mu_c": 0.9030232558139536, "calib/mu_w": 0.62525, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.2546987951807228, "calib/std_conf": 0.3165065515196875, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5568376281112738, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.09803667723964155, "calib/step_q_w": 0.45880095087163225, "calib/step_q_w_n": 631.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2147.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 534.1015625, "completions/mean_terminated_length": 536.1961059570312, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.09493333333333333, "grad_norm": 0.05365041643381119, "kl": 0.0618743896484375, "learning_rate": 3.0833333333333336e-06, "loss": -0.0563, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.035472556948661804, "mask/share_reasoning": 0.8467456102371216, "mask/share_step_conf": 0.1138756275177002, "num_tokens": 21388929.0, "reward": 0.8298825025558472, "reward_std": 0.20616307854652405, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7020609378814697, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6631728410720825, "step": 89 }, { "adv/mean_abs_final_conf": 0.6249933838844299, "adv/mean_abs_reasoning": 0.47696006298065186, "adv/mean_abs_step_conf": 0.7670803666114807, "adv/ratio_final_to_reasoning": 1.3103683775506862, "adv/ratio_step_to_reasoning": 1.6082695935122722, "adv/std_final_conf": 0.8583157062530518, "adv/std_reasoning": 0.7575711011886597, "adv/std_step_conf": 0.9356687068939209, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6634831460674158, "calib/avg_num_step_conf": 6.26953125, "calib/ece": 0.22144578313253022, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7068273092369478, "calib/gap": 0.2344143258426965, "calib/mean_conf": 0.8220883534136546, "calib/mu_c": 0.905875, "calib/mu_w": 0.6714606741573035, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20048192771084344, "calib/std_conf": 0.30331047658418014, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5344715644820296, "calib/step_q_c_n": 946.0, "calib/step_q_gap": 0.12081450833635432, "calib/step_q_w": 0.4136570561456753, "calib/step_q_w_n": 659.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 516.94921875, "completions/mean_terminated_length": 518.9765014648438, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.096, "grad_norm": 0.04983136057853699, "kl": 0.06966400146484375, "learning_rate": 3.055555555555556e-06, "loss": -0.0064, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03583477437496185, "mask/share_reasoning": 0.8315213918685913, "mask/share_step_conf": 0.12873761355876923, "num_tokens": 21624588.0, "reward": 0.8576638698577881, "reward_std": 0.21779102087020874, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7331687808036804, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6626277565956116, "step": 90 }, { "adv/mean_abs_final_conf": 0.6391729116439819, "adv/mean_abs_reasoning": 0.5117773413658142, "adv/mean_abs_step_conf": 0.7805782556533813, "adv/ratio_final_to_reasoning": 1.2489277269255, "adv/ratio_step_to_reasoning": 1.5252301979024712, "adv/std_final_conf": 0.8471426963806152, "adv/std_reasoning": 0.7576175928115845, "adv/std_step_conf": 0.9355592131614685, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7152430555555555, "calib/avg_num_step_conf": 6.44140625, "calib/ece": 0.26849593495934976, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.7195121951219512, "calib/gap": 0.21103333333333363, "calib/mean_conf": 0.8178455284552846, "calib/mu_c": 0.9002000000000002, "calib/mu_w": 0.6891666666666666, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.23829268292682942, "calib/std_conf": 0.3099013620288274, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5096382388419782, "calib/step_q_c_n": 829.0, "calib/step_q_gap": 0.16013738518344162, "calib/step_q_w": 0.3495008536585366, "calib/step_q_w_n": 820.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 525.984375, "completions/mean_terminated_length": 534.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.09706666666666666, "grad_norm": 0.03875572606921196, "kl": 0.0771942138671875, "learning_rate": 3.0277777777777776e-06, "loss": -0.0082, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.032178137451410294, "mask/share_reasoning": 0.8356429934501648, "mask/share_step_conf": 0.11655385792255402, "num_tokens": 21866952.0, "reward": 0.8358221054077148, "reward_std": 0.22954022884368896, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6948910355567932, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6673781871795654, "step": 91 }, { "adv/mean_abs_final_conf": 0.5705046653747559, "adv/mean_abs_reasoning": 0.41283679008483887, "adv/mean_abs_step_conf": 0.7558197975158691, "adv/ratio_final_to_reasoning": 1.3819133349465194, "adv/ratio_step_to_reasoning": 1.8307956453215968, "adv/std_final_conf": 0.7968953251838684, "adv/std_reasoning": 0.6817342638969421, "adv/std_step_conf": 0.9357613921165466, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8106280193236715, "calib/avg_num_step_conf": 5.16015625, "calib/ece": 0.168804780876494, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6454183266932271, "calib/gap": 0.3728233264320222, "calib/mean_conf": 0.7656972111553785, "calib/mu_c": 0.8993788819875778, "calib/mu_w": 0.5265555555555556, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14653386454183268, "calib/std_conf": 0.3355791389793453, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5625102319236017, "calib/step_q_c_n": 733.0, "calib/step_q_gap": 0.1794626128759827, "calib/step_q_w": 0.383047619047619, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2109.0, "completions/max_terminated_length": 2109.0, "completions/mean_length": 449.39453125, "completions/mean_terminated_length": 456.5278015136719, "completions/min_length": 0.0, "completions/min_terminated_length": 164.0, "epoch": 0.09813333333333334, "grad_norm": 0.6035517454147339, "kl": 5.547882080078125, "learning_rate": 3e-06, "loss": -0.0138, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.037832967936992645, "mask/share_reasoning": 0.8288391828536987, "mask/share_step_conf": 0.11770287156105042, "num_tokens": 22088717.0, "reward": 0.9051172733306885, "reward_std": 0.20287597179412842, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7898840308189392, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.700037956237793, "step": 92 }, { "adv/mean_abs_final_conf": 0.6488316655158997, "adv/mean_abs_reasoning": 0.519194483757019, "adv/mean_abs_step_conf": 0.761212944984436, "adv/ratio_final_to_reasoning": 1.2496890583674813, "adv/ratio_step_to_reasoning": 1.4661422045090153, "adv/std_final_conf": 0.8455647826194763, "adv/std_reasoning": 0.77541583776474, "adv/std_step_conf": 0.9357032179832458, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7870209339774557, "calib/avg_num_step_conf": 6.67578125, "calib/ece": 0.13904000000000008, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.408, "calib/gap": 0.3577520128824478, "calib/mean_conf": 0.60736, "calib/mu_c": 0.771925925925926, "calib/mu_w": 0.4141739130434782, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10320000000000007, "calib/std_conf": 0.3631708005883733, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5020725868725868, "calib/step_q_c_n": 777.0, "calib/step_q_gap": 0.13572602034898174, "calib/step_q_w": 0.3663465665236051, "calib/step_q_w_n": 932.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 554.30859375, "completions/mean_terminated_length": 556.4823608398438, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.0992, "grad_norm": 0.03496658056974411, "kl": 0.074615478515625, "learning_rate": 2.9722222222222225e-06, "loss": 0.0333, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.034261010587215424, "mask/share_reasoning": 0.832213282585144, "mask/share_step_conf": 0.12961949408054352, "num_tokens": 22336396.0, "reward": 0.8794798851013184, "reward_std": 0.16742658615112305, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7743171453475952, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6838613152503967, "step": 93 }, { "adv/mean_abs_final_conf": 0.6393656134605408, "adv/mean_abs_reasoning": 0.4309445023536682, "adv/mean_abs_step_conf": 0.7628836035728455, "adv/ratio_final_to_reasoning": 1.4836379393832602, "adv/ratio_step_to_reasoning": 1.7702595099977885, "adv/std_final_conf": 0.8436124920845032, "adv/std_reasoning": 0.701392412185669, "adv/std_step_conf": 0.9355144500732422, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6926949587782133, "calib/avg_num_step_conf": 5.78125, "calib/ece": 0.25032128514056223, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.4497991967871486, "calib/gap": 0.24862211109609406, "calib/mean_conf": 0.6090361445783131, "calib/mu_c": 0.7068874172185431, "calib/mu_w": 0.45826530612244903, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12646586345381525, "calib/std_conf": 0.38961812564886106, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5151922305764411, "calib/step_q_c_n": 798.0, "calib/step_q_gap": 0.16981099890488688, "calib/step_q_w": 0.34538123167155427, "calib/step_q_w_n": 682.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2864.0, "completions/max_terminated_length": 2864.0, "completions/mean_length": 499.0078125, "completions/mean_terminated_length": 502.93701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.10026666666666667, "grad_norm": 0.05602293834090233, "kl": 0.08460235595703125, "learning_rate": 2.944444444444445e-06, "loss": -0.0119, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03713148459792137, "mask/share_reasoning": 0.830633282661438, "mask/share_step_conf": 0.12442268431186676, "num_tokens": 22572822.0, "reward": 0.8528326749801636, "reward_std": 0.16480697691440582, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7082847356796265, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6848805546760559, "step": 94 }, { "adv/mean_abs_final_conf": 0.5959144830703735, "adv/mean_abs_reasoning": 0.39803582429885864, "adv/mean_abs_step_conf": 0.756955623626709, "adv/ratio_final_to_reasoning": 1.4971378119546872, "adv/ratio_step_to_reasoning": 1.901727375821231, "adv/std_final_conf": 0.8272056579589844, "adv/std_reasoning": 0.6816222667694092, "adv/std_step_conf": 0.9357096552848816, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8125456404264643, "calib/avg_num_step_conf": 5.44921875, "calib/ece": 0.1490763052208835, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5140562248995983, "calib/gap": 0.4050452753030522, "calib/mean_conf": 0.6608032128514056, "calib/mu_c": 0.7941916167664669, "calib/mu_w": 0.3891463414634147, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.06959839357429713, "calib/std_conf": 0.3806348499358165, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4964188097768331, "calib/step_q_c_n": 941.0, "calib/step_q_gap": 0.1183862106578904, "calib/step_q_w": 0.3780325991189427, "calib/step_q_w_n": 454.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2878.0, "completions/max_terminated_length": 2878.0, "completions/mean_length": 510.45703125, "completions/mean_terminated_length": 512.4588623046875, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.10133333333333333, "grad_norm": 0.03427678346633911, "kl": 0.0807037353515625, "learning_rate": 2.916666666666667e-06, "loss": -0.0039, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.036947742104530334, "mask/share_reasoning": 0.8355151414871216, "mask/share_step_conf": 0.12363088130950928, "num_tokens": 22809627.0, "reward": 0.9095442295074463, "reward_std": 0.1789747178554535, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.7908421754837036, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.7032462954521179, "step": 95 }, { "adv/mean_abs_final_conf": 0.6383320093154907, "adv/mean_abs_reasoning": 0.5042709708213806, "adv/mean_abs_step_conf": 0.7976299524307251, "adv/ratio_final_to_reasoning": 1.2658511916237118, "adv/ratio_step_to_reasoning": 1.5817486997744632, "adv/std_final_conf": 0.8464713096618652, "adv/std_reasoning": 0.7574781775474548, "adv/std_step_conf": 0.9355773329734802, "calib/answer_extract_rate": 1.0, "calib/auroc": 0.8643814187580766, "calib/avg_num_step_conf": 5.1015625, "calib/ece": 0.10957031250000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.40625, "calib/gap": 0.48501870366591865, "calib/mean_conf": 0.5958203125000001, "calib/mu_c": 0.7606508875739646, "calib/mu_w": 0.27563218390804595, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02261718750000005, "calib/std_conf": 0.3817805106830394, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.5401399491094147, "calib/step_q_c_n": 786.0, "calib/step_q_gap": 0.11918398757095305, "calib/step_q_w": 0.4209559615384616, "calib/step_q_w_n": 520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1418.0, "completions/max_terminated_length": 1418.0, "completions/mean_length": 449.87109375, "completions/mean_terminated_length": 451.63531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.1024, "grad_norm": 0.11715222895145416, "kl": 0.101593017578125, "learning_rate": 2.888888888888889e-06, "loss": 0.0658, "mask/has_final_conf_rate": 0.99609375, "mask/share_final_conf": 0.038984205573797226, "mask/share_reasoning": 0.8376288414001465, "mask/share_step_conf": 0.1194806918501854, "num_tokens": 23030610.0, "reward": 0.9290471076965332, "reward_std": 0.12138018012046814, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.8433824181556702, "rewards/format_reward_step": 1.0, "rewards/step_l1_reward": 0.6826804280281067, "step": 96 }, { "adv/mean_abs_final_conf": 0.7232212424278259, "adv/mean_abs_reasoning": 0.5229663848876953, "adv/mean_abs_step_conf": 0.73717200756073, "adv/ratio_final_to_reasoning": 1.3829210888633587, "adv/ratio_step_to_reasoning": 1.409597306563087, "adv/std_final_conf": 0.9249638915061951, "adv/std_reasoning": 0.7927162647247314, "adv/std_step_conf": 0.9359372854232788, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7628742901393908, "calib/avg_num_step_conf": 5.421875, "calib/ece": 0.18264426877470355, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.3241106719367589, "calib/gap": 0.3215271037687146, "calib/mean_conf": 0.5374347826086957, "calib/mu_c": 0.6696040268456377, "calib/mu_w": 0.3480769230769231, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.06557312252964426, "calib/std_conf": 0.3752260776959079, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5125435185185185, "calib/step_q_c_n": 756.0, "calib/step_q_gap": 0.1586303856071261, "calib/step_q_w": 0.3539131329113924, "calib/step_q_w_n": 632.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 454.6875, "completions/mean_terminated_length": 456.4706115722656, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.10346666666666667, "grad_norm": 0.07992636412382126, "kl": 0.101898193359375, "learning_rate": 2.861111111111111e-06, "loss": -0.0563, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.037653982639312744, "mask/share_reasoning": 0.83205246925354, "mask/share_step_conf": 0.1263873130083084, "num_tokens": 23252082.0, "reward": 0.8832191824913025, "reward_std": 0.17623098194599152, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7600910067558289, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6930661201477051, "step": 97 }, { "adv/mean_abs_final_conf": 0.7379151582717896, "adv/mean_abs_reasoning": 0.5539554357528687, "adv/mean_abs_step_conf": 0.7956494092941284, "adv/ratio_final_to_reasoning": 1.332083973991347, "adv/ratio_step_to_reasoning": 1.436305807186058, "adv/std_final_conf": 0.9205290079116821, "adv/std_reasoning": 0.8098165392875671, "adv/std_step_conf": 0.9357894659042358, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7368261455525607, "calib/avg_num_step_conf": 4.78515625, "calib/ece": 0.176869918699187, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5, "calib/gap": 0.33664016172506755, "calib/mean_conf": 0.65630081300813, "calib/mu_c": 0.801357142857143, "calib/mu_w": 0.46471698113207544, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.13203252032520327, "calib/std_conf": 0.3736903077374365, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5414263565891473, "calib/step_q_c_n": 645.0, "calib/step_q_gap": 0.10310928762363003, "calib/step_q_w": 0.43831706896551725, "calib/step_q_w_n": 580.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 497.50390625, "completions/mean_terminated_length": 503.4031677246094, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.10453333333333334, "grad_norm": 0.04371804744005203, "kl": 0.0875244140625, "learning_rate": 2.8333333333333335e-06, "loss": 0.0235, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03780476376414299, "mask/share_reasoning": 0.8403627872467041, "mask/share_step_conf": 0.11011366546154022, "num_tokens": 23485627.0, "reward": 0.8494217395782471, "reward_std": 0.21939392387866974, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7424519062042236, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6548289060592651, "step": 98 }, { "adv/mean_abs_final_conf": 0.6894935369491577, "adv/mean_abs_reasoning": 0.6670982837677002, "adv/mean_abs_step_conf": 0.7432233095169067, "adv/ratio_final_to_reasoning": 1.0335711449517926, "adv/ratio_step_to_reasoning": 1.1141136585140956, "adv/std_final_conf": 0.8914725184440613, "adv/std_reasoning": 0.8590838313102722, "adv/std_step_conf": 0.9360917210578918, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7185814455231931, "calib/avg_num_step_conf": 6.04296875, "calib/ece": 0.19858299595141699, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3319838056680162, "calib/gap": 0.30917003775620283, "calib/mean_conf": 0.4873279352226721, "calib/mu_c": 0.6675728155339806, "calib/mu_w": 0.3584027777777778, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13445344129554654, "calib/std_conf": 0.3916488369532454, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.47593595706618963, "calib/step_q_c_n": 559.0, "calib/step_q_gap": 0.07857472224837586, "calib/step_q_w": 0.39736123481781377, "calib/step_q_w_n": 988.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2768.0, "completions/max_terminated_length": 2768.0, "completions/mean_length": 574.546875, "completions/mean_terminated_length": 576.800048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.1056, "grad_norm": 0.03644208237528801, "kl": 0.0891571044921875, "learning_rate": 2.805555555555556e-06, "loss": -0.0077, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03330254554748535, "mask/share_reasoning": 0.8504179120063782, "mask/share_step_conf": 0.11237329244613647, "num_tokens": 23738511.0, "reward": 0.8194519877433777, "reward_std": 0.21536259353160858, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.7225519418716431, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6429144740104675, "step": 99 }, { "adv/mean_abs_final_conf": 0.621369481086731, "adv/mean_abs_reasoning": 0.5159175395965576, "adv/mean_abs_step_conf": 0.7591180801391602, "adv/ratio_final_to_reasoning": 1.2043968917448236, "adv/ratio_step_to_reasoning": 1.4713942091071044, "adv/std_final_conf": 0.8657611012458801, "adv/std_reasoning": 0.7576115727424622, "adv/std_step_conf": 0.935834527015686, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8504812479256555, "calib/avg_num_step_conf": 6.140625, "calib/ece": 0.15922764227642283, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5447154471544715, "calib/gap": 0.49087620311981434, "calib/mean_conf": 0.6692276422764227, "calib/mu_c": 0.8987022900763361, "calib/mu_w": 0.4078260869565217, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.1479674796747968, "calib/std_conf": 0.3852871026014581, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5043648648648649, "calib/step_q_c_n": 740.0, "calib/step_q_gap": 0.12756198024948023, "calib/step_q_w": 0.37680288461538464, "calib/step_q_w_n": 832.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 557.296875, "completions/mean_terminated_length": 563.9051513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.10666666666666667, "grad_norm": 0.02691764384508133, "kl": 0.08896636962890625, "learning_rate": 2.7777777777777783e-06, "loss": 0.0165, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03223732113838196, "mask/share_reasoning": 0.8407446146011353, "mask/share_step_conf": 0.11529930680990219, "num_tokens": 23988587.0, "reward": 0.8852555751800537, "reward_std": 0.20348386466503143, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7959659695625305, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6800137758255005, "step": 100 }, { "adv/mean_abs_final_conf": 0.6966441869735718, "adv/mean_abs_reasoning": 0.5274724364280701, "adv/mean_abs_step_conf": 0.7686508297920227, "adv/ratio_final_to_reasoning": 1.3207214991006855, "adv/ratio_step_to_reasoning": 1.4572341163401084, "adv/std_final_conf": 0.8765040636062622, "adv/std_reasoning": 0.7576013803482056, "adv/std_step_conf": 0.935871422290802, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7394955194158646, "calib/avg_num_step_conf": 5.48828125, "calib/ece": 0.21447154471544716, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.4024390243902439, "calib/gap": 0.3306777298373713, "calib/mean_conf": 0.5500813008130081, "calib/mu_c": 0.7261739130434782, "calib/mu_w": 0.3954961832061069, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.14853658536585368, "calib/std_conf": 0.4048837953031966, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4583184895833334, "calib/step_q_c_n": 640.0, "calib/step_q_gap": 0.05150646343954257, "calib/step_q_w": 0.4068120261437908, "calib/step_q_w_n": 765.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2652.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 563.3359375, "completions/mean_terminated_length": 565.5451049804688, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.10773333333333333, "grad_norm": 0.05202191695570946, "kl": 0.09320831298828125, "learning_rate": 2.7500000000000004e-06, "loss": -0.0365, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03161200135946274, "mask/share_reasoning": 0.8537149429321289, "mask/share_step_conf": 0.11076676100492477, "num_tokens": 24239793.0, "reward": 0.8232483863830566, "reward_std": 0.20141300559043884, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7158437371253967, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.648621678352356, "step": 101 }, { "adv/mean_abs_final_conf": 0.5397753715515137, "adv/mean_abs_reasoning": 0.3822072744369507, "adv/mean_abs_step_conf": 0.7508090734481812, "adv/ratio_final_to_reasoning": 1.412258236964968, "adv/ratio_step_to_reasoning": 1.9644028872926003, "adv/std_final_conf": 0.7761178016662598, "adv/std_reasoning": 0.6815133690834045, "adv/std_step_conf": 0.9356717467308044, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7669100964265456, "calib/avg_num_step_conf": 5.44140625, "calib/ece": 0.18171999999999988, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.62, "calib/gap": 0.37247022121384005, "calib/mean_conf": 0.71748, "calib/mu_c": 0.8456097560975611, "calib/mu_w": 0.473139534883721, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1215999999999999, "calib/std_conf": 0.37968572477774304, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5301996259351621, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.13019099649353777, "calib/step_q_w": 0.40000862944162435, "calib/step_q_w_n": 591.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 465.5, "completions/mean_terminated_length": 469.16534423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 78.0, "epoch": 0.1088, "grad_norm": 0.02946024015545845, "kl": 0.103668212890625, "learning_rate": 2.7222222222222224e-06, "loss": -0.0641, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03844807296991348, "mask/share_reasoning": 0.8282647132873535, "mask/share_step_conf": 0.1254746913909912, "num_tokens": 24465657.0, "reward": 0.8995387554168701, "reward_std": 0.16266554594039917, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7758800983428955, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.699759840965271, "step": 102 }, { "adv/mean_abs_final_conf": 0.5946629047393799, "adv/mean_abs_reasoning": 0.4869500994682312, "adv/mean_abs_step_conf": 0.7678142786026001, "adv/ratio_final_to_reasoning": 1.2211988566975864, "adv/ratio_step_to_reasoning": 1.576782260525429, "adv/std_final_conf": 0.8273619413375854, "adv/std_reasoning": 0.7576767206192017, "adv/std_step_conf": 0.9359101057052612, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7894394188596492, "calib/avg_num_step_conf": 5.53125, "calib/ece": 0.18274193548387105, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6290322580645161, "calib/gap": 0.3444901315789475, "calib/mean_conf": 0.7289516129032259, "calib/mu_c": 0.8623026315789475, "calib/mu_w": 0.5178125, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14939516129032265, "calib/std_conf": 0.3704853367617443, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5283688311688313, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.13278059587471358, "calib/step_q_w": 0.3955882352941177, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2409.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 590.29296875, "completions/mean_terminated_length": 592.6078491210938, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.10986666666666667, "grad_norm": 0.026862915605306625, "kl": 0.09345245361328125, "learning_rate": 2.6944444444444444e-06, "loss": -0.0181, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03263404965400696, "mask/share_reasoning": 0.8558740615844727, "mask/share_step_conf": 0.1075856164097786, "num_tokens": 24721324.0, "reward": 0.8752179145812988, "reward_std": 0.22170045971870422, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7512491941452026, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6866865158081055, "step": 103 }, { "adv/mean_abs_final_conf": 0.6674306392669678, "adv/mean_abs_reasoning": 0.4978955090045929, "adv/mean_abs_step_conf": 0.7701559662818909, "adv/ratio_final_to_reasoning": 1.3405034333435029, "adv/ratio_step_to_reasoning": 1.5468224805273076, "adv/std_final_conf": 0.8744555115699768, "adv/std_reasoning": 0.7575566172599792, "adv/std_step_conf": 0.9357998967170715, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7863015431317987, "calib/avg_num_step_conf": 5.9921875, "calib/ece": 0.20384920634920628, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5277777777777778, "calib/gap": 0.4040817100935998, "calib/mean_conf": 0.6436904761904761, "calib/mu_c": 0.8585593220338983, "calib/mu_w": 0.4544776119402985, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18964285714285709, "calib/std_conf": 0.4054855401153417, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5442792792792793, "calib/step_q_c_n": 666.0, "calib/step_q_gap": 0.14303119940216724, "calib/step_q_w": 0.4012480798771121, "calib/step_q_w_n": 868.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1875.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 547.1171875, "completions/mean_terminated_length": 549.2627563476562, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.11093333333333333, "grad_norm": 0.034476689994335175, "kl": 0.10207366943359375, "learning_rate": 2.666666666666667e-06, "loss": -0.0127, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03161388635635376, "mask/share_reasoning": 0.846795916557312, "mask/share_step_conf": 0.11768393218517303, "num_tokens": 24968066.0, "reward": 0.8389008045196533, "reward_std": 0.2200300395488739, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.7377249598503113, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6525766253471375, "step": 104 }, { "adv/mean_abs_final_conf": 0.6893397569656372, "adv/mean_abs_reasoning": 0.6556515693664551, "adv/mean_abs_step_conf": 0.772680401802063, "adv/ratio_final_to_reasoning": 1.0513812353591017, "adv/ratio_step_to_reasoning": 1.1784924156418795, "adv/std_final_conf": 0.876616358757019, "adv/std_reasoning": 0.8590664863586426, "adv/std_step_conf": 0.9359455704689026, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7068999345977762, "calib/avg_num_step_conf": 5.79296875, "calib/ece": 0.26803212851405617, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6104417670682731, "calib/gap": 0.30777697841726614, "calib/mean_conf": 0.6828112449799196, "calib/mu_c": 0.8187769784172662, "calib/mu_w": 0.511, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19630522088353408, "calib/std_conf": 0.4171495938493178, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5334794908062234, "calib/step_q_c_n": 707.0, "calib/step_q_gap": 0.20042317637323365, "calib/step_q_w": 0.3330563144329897, "calib/step_q_w_n": 776.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2749.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 524.859375, "completions/mean_terminated_length": 528.9921264648438, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.112, "grad_norm": 0.0510830283164978, "kl": 0.1021575927734375, "learning_rate": 2.6388888888888893e-06, "loss": 0.0018, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03460601717233658, "mask/share_reasoning": 0.8448066115379333, "mask/share_step_conf": 0.11277486383914948, "num_tokens": 25208190.0, "reward": 0.8309906721115112, "reward_std": 0.2768874168395996, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6843714714050293, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6768285036087036, "step": 105 }, { "adv/mean_abs_final_conf": 0.5826190114021301, "adv/mean_abs_reasoning": 0.4672355651855469, "adv/mean_abs_step_conf": 0.7615571022033691, "adv/ratio_final_to_reasoning": 1.2469491939697754, "adv/ratio_step_to_reasoning": 1.629921091090192, "adv/std_final_conf": 0.8441154956817627, "adv/std_reasoning": 0.7573754787445068, "adv/std_step_conf": 0.9359279274940491, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8174759127789046, "calib/avg_num_step_conf": 5.3203125, "calib/ece": 0.23849206349206362, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6785714285714286, "calib/gap": 0.41696247464503045, "calib/mean_conf": 0.7496825396825397, "calib/mu_c": 0.9416176470588236, "calib/mu_w": 0.5246551724137931, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.22424603174603186, "calib/std_conf": 0.381475698757806, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5276293103448276, "calib/step_q_c_n": 696.0, "calib/step_q_gap": 0.08791459563011289, "calib/step_q_w": 0.43971471471471474, "calib/step_q_w_n": 666.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 529.5, "completions/mean_terminated_length": 531.5764770507812, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.11306666666666666, "grad_norm": 0.03778756782412529, "kl": 0.10107421875, "learning_rate": 2.6111111111111113e-06, "loss": 0.006, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03254499286413193, "mask/share_reasoning": 0.854480504989624, "mask/share_step_conf": 0.10906830430030823, "num_tokens": 25448326.0, "reward": 0.8657881021499634, "reward_std": 0.20610877871513367, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7496660351753235, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.680347740650177, "step": 106 }, { "adv/mean_abs_final_conf": 0.6514444351196289, "adv/mean_abs_reasoning": 0.5541516542434692, "adv/mean_abs_step_conf": 0.7585681676864624, "adv/ratio_final_to_reasoning": 1.1755706766029315, "adv/ratio_step_to_reasoning": 1.3688818969999532, "adv/std_final_conf": 0.8437116146087646, "adv/std_reasoning": 0.7929292321205139, "adv/std_step_conf": 0.9359145164489746, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6583306886702633, "calib/avg_num_step_conf": 5.21484375, "calib/ece": 0.3572354497354498, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8174603174603174, "calib/gap": 0.162450227441024, "calib/mean_conf": 0.8686640211640212, "calib/mu_c": 0.9427980535279805, "calib/mu_w": 0.7803478260869565, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.34112433862433866, "calib/std_conf": 0.2926518147032169, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5499251497005988, "calib/step_q_c_n": 668.0, "calib/step_q_gap": 0.10364329063013411, "calib/step_q_w": 0.4462818590704647, "calib/step_q_w_n": 667.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1777.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 473.02734375, "completions/mean_terminated_length": 474.88238525390625, "completions/min_length": 0.0, "completions/min_terminated_length": 84.0, "epoch": 0.11413333333333334, "grad_norm": 0.03718942403793335, "kl": 0.1115875244140625, "learning_rate": 2.5833333333333337e-06, "loss": -0.1106, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.034642137587070465, "mask/share_reasoning": 0.8416838049888611, "mask/share_step_conf": 0.11976781487464905, "num_tokens": 25674037.0, "reward": 0.7891925573348999, "reward_std": 0.25099611282348633, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6162525415420532, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6613513231277466, "step": 107 }, { "adv/mean_abs_final_conf": 0.5130915641784668, "adv/mean_abs_reasoning": 0.46088606119155884, "adv/mean_abs_step_conf": 0.7698599100112915, "adv/ratio_final_to_reasoning": 1.1132720370235056, "adv/ratio_step_to_reasoning": 1.670390959581035, "adv/std_final_conf": 0.7588806748390198, "adv/std_reasoning": 0.7206546068191528, "adv/std_step_conf": 0.9357091188430786, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6596387847248795, "calib/avg_num_step_conf": 6.12890625, "calib/ece": 0.24246825396825394, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8452380952380952, "calib/gap": 0.19065386087089609, "calib/mean_conf": 0.8751507936507937, "calib/mu_c": 0.9303798882681563, "calib/mu_w": 0.7397260273972602, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20365079365079364, "calib/std_conf": 0.2976943027607272, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5298139079333987, "calib/step_q_c_n": 1021.0, "calib/step_q_gap": 0.07169346997719428, "calib/step_q_w": 0.4581204379562044, "calib/step_q_w_n": 548.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2742.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 543.625, "completions/mean_terminated_length": 543.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.1152, "grad_norm": 0.03079112060368061, "kl": 0.1038818359375, "learning_rate": 2.5555555555555557e-06, "loss": 0.0951, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03367979824542999, "mask/share_reasoning": 0.8438863754272461, "mask/share_step_conf": 0.1224338486790657, "num_tokens": 25916437.0, "reward": 0.8804038763046265, "reward_std": 0.2148403525352478, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7413252592086792, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.683544933795929, "step": 108 }, { "adv/mean_abs_final_conf": 0.5654588341712952, "adv/mean_abs_reasoning": 0.40117618441581726, "adv/mean_abs_step_conf": 0.7693681716918945, "adv/ratio_final_to_reasoning": 1.409502498247004, "adv/ratio_step_to_reasoning": 1.917781268128439, "adv/std_final_conf": 0.7718923091888428, "adv/std_reasoning": 0.7012916207313538, "adv/std_step_conf": 0.9355076551437378, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.8424151627064316, "calib/avg_num_step_conf": 7.265625, "calib/ece": 0.16341666666666663, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6083333333333333, "calib/gap": 0.5384042923838059, "calib/mean_conf": 0.6636666666666666, "calib/mu_c": 0.9171653543307087, "calib/mu_w": 0.3787610619469027, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14895833333333328, "calib/std_conf": 0.433065494148659, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5212273901808786, "calib/step_q_c_n": 774.0, "calib/step_q_gap": 0.24687195740003148, "calib/step_q_w": 0.2743554327808471, "calib/step_q_w_n": 1086.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 595.25390625, "completions/mean_terminated_length": 602.312255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.11626666666666667, "grad_norm": 0.04381285607814789, "kl": 0.098236083984375, "learning_rate": 2.5277777777777778e-06, "loss": 0.0054, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.029191289097070694, "mask/share_reasoning": 0.8358380794525146, "mask/share_step_conf": 0.12325184047222137, "num_tokens": 26173422.0, "reward": 0.873216450214386, "reward_std": 0.1828155517578125, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7626570463180542, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.6970571279525757, "step": 109 }, { "adv/mean_abs_final_conf": 0.6914485692977905, "adv/mean_abs_reasoning": 0.507434606552124, "adv/mean_abs_step_conf": 0.7679909467697144, "adv/ratio_final_to_reasoning": 1.3626358162601282, "adv/ratio_step_to_reasoning": 1.5134776715131781, "adv/std_final_conf": 0.8608611822128296, "adv/std_reasoning": 0.7576548457145691, "adv/std_step_conf": 0.9358069896697998, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.6680944881889763, "calib/avg_num_step_conf": 5.2265625, "calib/ece": 0.28321428571428575, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.6111111111111112, "calib/gap": 0.2855200000000001, "calib/mean_conf": 0.6983730158730159, "calib/mu_c": 0.8400000000000002, "calib/mu_w": 0.5544800000000001, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.23880952380952386, "calib/std_conf": 0.4011661223041588, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.510073775671406, "calib/step_q_c_n": 633.0, "calib/step_q_gap": 0.08099576148700882, "calib/step_q_w": 0.42907801418439717, "calib/step_q_w_n": 705.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2154.0, "completions/max_terminated_length": 2154.0, "completions/mean_length": 516.265625, "completions/mean_terminated_length": 516.265625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.11733333333333333, "grad_norm": 0.036357905715703964, "kl": 0.10791015625, "learning_rate": 2.5e-06, "loss": -0.0157, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033511631190776825, "mask/share_reasoning": 0.8503848910331726, "mask/share_step_conf": 0.11610350012779236, "num_tokens": 26410506.0, "reward": 0.8202331066131592, "reward_std": 0.23582850396633148, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.6793886423110962, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.665765106678009, "step": 110 }, { "adv/mean_abs_final_conf": 0.6397824883460999, "adv/mean_abs_reasoning": 0.47608935832977295, "adv/mean_abs_step_conf": 0.7391402721405029, "adv/ratio_final_to_reasoning": 1.3438285841771358, "adv/ratio_step_to_reasoning": 1.5525242461490232, "adv/std_final_conf": 0.8426072001457214, "adv/std_reasoning": 0.7394782304763794, "adv/std_step_conf": 0.9360306262969971, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7719921104536489, "calib/avg_num_step_conf": 5.2734375, "calib/ece": 0.2665182186234818, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6356275303643725, "calib/gap": 0.32824786324786315, "calib/mean_conf": 0.7125910931174089, "calib/mu_c": 0.8680769230769231, "calib/mu_w": 0.5398290598290599, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.22639676113360324, "calib/std_conf": 0.4041492373796017, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.553573943661972, "calib/step_q_c_n": 568.0, "calib/step_q_gap": 0.16092176974892852, "calib/step_q_w": 0.39265217391304347, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2088.0, "completions/max_terminated_length": 2088.0, "completions/mean_length": 540.1640625, "completions/mean_terminated_length": 540.1640625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.1184, "grad_norm": 0.04719334468245506, "kl": 0.1068572998046875, "learning_rate": 2.4722222222222226e-06, "loss": -0.0502, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.0356113538146019, "mask/share_reasoning": 0.8533041477203369, "mask/share_step_conf": 0.11108453571796417, "num_tokens": 26656196.0, "reward": 0.8155786991119385, "reward_std": 0.2449261099100113, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6872370839118958, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6501702070236206, "step": 111 }, { "adv/mean_abs_final_conf": 0.6649261116981506, "adv/mean_abs_reasoning": 0.48851847648620605, "adv/mean_abs_step_conf": 0.750723123550415, "adv/ratio_final_to_reasoning": 1.3611073965529443, "adv/ratio_step_to_reasoning": 1.5367343502546371, "adv/std_final_conf": 0.8596379160881042, "adv/std_reasoning": 0.7754268050193787, "adv/std_step_conf": 0.9359248280525208, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.8349869091911257, "calib/avg_num_step_conf": 5.37890625, "calib/ece": 0.14709543568464734, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.48132780082987553, "calib/gap": 0.5595528455284552, "calib/mean_conf": 0.5505809128630705, "calib/mu_c": 0.8245528455284552, "calib/mu_w": 0.265, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.09365145228215771, "calib/std_conf": 0.44771211237724085, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.5092366412213741, "calib/step_q_c_n": 655.0, "calib/step_q_gap": 0.18262777695544613, "calib/step_q_w": 0.326608864265928, "calib/step_q_w_n": 722.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3054.0, "completions/max_terminated_length": 3054.0, "completions/mean_length": 622.4140625, "completions/mean_terminated_length": 627.31494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.11946666666666667, "grad_norm": 0.03650563210248947, "kl": 0.09206390380859375, "learning_rate": 2.4444444444444447e-06, "loss": -0.0677, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.028805337846279144, "mask/share_reasoning": 0.8638450503349304, "mask/share_step_conf": 0.09953711181879044, "num_tokens": 26923454.0, "reward": 0.8434610366821289, "reward_std": 0.22578881680965424, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7792026996612549, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.6233442425727844, "step": 112 }, { "adv/mean_abs_final_conf": 0.6491919755935669, "adv/mean_abs_reasoning": 0.49580565094947815, "adv/mean_abs_step_conf": 0.7402918338775635, "adv/ratio_final_to_reasoning": 1.3093678427229516, "adv/ratio_step_to_reasoning": 1.4931089076130721, "adv/std_final_conf": 0.8549534678459167, "adv/std_reasoning": 0.757546067237854, "adv/std_step_conf": 0.9357084035873413, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.8036998972250771, "calib/avg_num_step_conf": 5.484375, "calib/ece": 0.20167330677290835, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.4900398406374502, "calib/gap": 0.42600077081192184, "calib/mean_conf": 0.5850199203187251, "calib/mu_c": 0.7751079136690647, "calib/mu_w": 0.3491071428571429, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11645418326693224, "calib/std_conf": 0.43168748333547907, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5039972337482711, "calib/step_q_c_n": 723.0, "calib/step_q_gap": 0.1472865142181683, "calib/step_q_w": 0.3567107195301028, "calib/step_q_w_n": 681.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2282.0, "completions/max_terminated_length": 2282.0, "completions/mean_length": 485.78125, "completions/mean_terminated_length": 485.78125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.12053333333333334, "grad_norm": 0.04201961308717728, "kl": 0.1237335205078125, "learning_rate": 2.4166666666666667e-06, "loss": 0.0239, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03510171175003052, "mask/share_reasoning": 0.8447432518005371, "mask/share_step_conf": 0.12015500664710999, "num_tokens": 27153014.0, "reward": 0.8813836574554443, "reward_std": 0.1917453110218048, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.7570343613624573, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.7018265724182129, "step": 113 }, { "adv/mean_abs_final_conf": 0.6625143885612488, "adv/mean_abs_reasoning": 0.4924757778644562, "adv/mean_abs_step_conf": 0.7456798553466797, "adv/ratio_final_to_reasoning": 1.3452730435477218, "adv/ratio_step_to_reasoning": 1.5141452409704355, "adv/std_final_conf": 0.8566796183586121, "adv/std_reasoning": 0.7393878698348999, "adv/std_step_conf": 0.9358442425727844, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8234196961475855, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.16908969210174024, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5542168674698795, "calib/gap": 0.4935476578043046, "calib/mean_conf": 0.6339625167336012, "calib/mu_c": 0.8262280701754386, "calib/mu_w": 0.33268041237113405, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.0963052208835341, "calib/std_conf": 0.4255745586569815, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5245025510204081, "calib/step_q_c_n": 784.0, "calib/step_q_gap": 0.20095960623513204, "calib/step_q_w": 0.32354294478527607, "calib/step_q_w_n": 652.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2319.0, "completions/max_terminated_length": 2319.0, "completions/mean_length": 481.01171875, "completions/mean_terminated_length": 482.8980712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.1216, "grad_norm": 0.06588571518659592, "kl": 0.1134490966796875, "learning_rate": 2.388888888888889e-06, "loss": 0.0127, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03660762310028076, "mask/share_reasoning": 0.8350028991699219, "mask/share_step_conf": 0.12448324263095856, "num_tokens": 27381177.0, "reward": 0.8956842422485352, "reward_std": 0.21398276090621948, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.7890973091125488, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6897711753845215, "step": 114 }, { "adv/mean_abs_final_conf": 0.6562278270721436, "adv/mean_abs_reasoning": 0.49689021706581116, "adv/mean_abs_step_conf": 0.7802358269691467, "adv/ratio_final_to_reasoning": 1.3206696459979383, "adv/ratio_step_to_reasoning": 1.5702378516858737, "adv/std_final_conf": 0.8748510479927063, "adv/std_reasoning": 0.7753586173057556, "adv/std_step_conf": 0.9357790946960449, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7208734787018255, "calib/avg_num_step_conf": 5.69140625, "calib/ece": 0.25587301587301586, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.44047619047619047, "calib/gap": 0.3375532454361055, "calib/mean_conf": 0.5238095238095238, "calib/mu_c": 0.6791911764705882, "calib/mu_w": 0.34163793103448276, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12000000000000001, "calib/std_conf": 0.4412039033754591, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4502941176470588, "calib/step_q_c_n": 748.0, "calib/step_q_gap": 0.061746867999668165, "calib/step_q_w": 0.3885472496473906, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2156.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 514.38671875, "completions/mean_terminated_length": 516.4039306640625, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.12266666666666666, "grad_norm": 0.03526785224676132, "kl": 0.1184539794921875, "learning_rate": 2.361111111111111e-06, "loss": 0.0283, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033889539539813995, "mask/share_reasoning": 0.8409953117370605, "mask/share_step_conf": 0.12120887637138367, "num_tokens": 27618124.0, "reward": 0.8284285068511963, "reward_std": 0.20196843147277832, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7062526941299438, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6490417718887329, "step": 115 }, { "adv/mean_abs_final_conf": 0.7022169828414917, "adv/mean_abs_reasoning": 0.4930022358894348, "adv/mean_abs_step_conf": 0.7640174627304077, "adv/ratio_final_to_reasoning": 1.4243687588446907, "adv/ratio_step_to_reasoning": 1.5497241333034304, "adv/std_final_conf": 0.8844742178916931, "adv/std_reasoning": 0.7393848896026611, "adv/std_step_conf": 0.9359100461006165, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7323863636363637, "calib/avg_num_step_conf": 5.6015625, "calib/ece": 0.24416666666666656, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.3412698412698413, "calib/gap": 0.355151515151515, "calib/mean_conf": 0.42686507936507934, "calib/mu_c": 0.5959848484848483, "calib/mu_w": 0.24083333333333332, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.07361111111111097, "calib/std_conf": 0.4298662740230431, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4934696308724832, "calib/step_q_c_n": 596.0, "calib/step_q_gap": 0.1733264327817911, "calib/step_q_w": 0.3201431980906921, "calib/step_q_w_n": 838.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2884.0, "completions/max_terminated_length": 2884.0, "completions/mean_length": 583.46875, "completions/mean_terminated_length": 583.46875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.12373333333333333, "grad_norm": 0.05140653997659683, "kl": 0.10097503662109375, "learning_rate": 2.3333333333333336e-06, "loss": 0.0501, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.033677708357572556, "mask/share_reasoning": 0.8585555553436279, "mask/share_step_conf": 0.10776673257350922, "num_tokens": 27872012.0, "reward": 0.8470515608787537, "reward_std": 0.20287998020648956, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7065671682357788, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6906609535217285, "step": 116 }, { "adv/mean_abs_final_conf": 0.7353163361549377, "adv/mean_abs_reasoning": 0.581294059753418, "adv/mean_abs_step_conf": 0.7533575892448425, "adv/ratio_final_to_reasoning": 1.2649644767862505, "adv/ratio_step_to_reasoning": 1.296000839169788, "adv/std_final_conf": 0.9155080318450928, "adv/std_reasoning": 0.809903085231781, "adv/std_step_conf": 0.9358757138252258, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.676923076923077, "calib/avg_num_step_conf": 5.8828125, "calib/ece": 0.28400793650793654, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.23015873015873015, "calib/gap": 0.24174928774928775, "calib/mean_conf": 0.32416666666666666, "calib/mu_c": 0.4536752136752137, "calib/mu_w": 0.21192592592592593, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07194444444444445, "calib/std_conf": 0.39142558436468883, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.44987281399046103, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.10096734078635616, "calib/step_q_w": 0.34890547320410487, "calib/step_q_w_n": 877.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 554.640625, "completions/mean_terminated_length": 556.8157348632812, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.1248, "grad_norm": 0.03409578278660774, "kl": 0.1085052490234375, "learning_rate": 2.305555555555556e-06, "loss": 0.0262, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.031194612383842468, "mask/share_reasoning": 0.8501709699630737, "mask/share_step_conf": 0.11472815275192261, "num_tokens": 28120600.0, "reward": 0.8299675583839417, "reward_std": 0.1901315152645111, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.6877691745758057, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6838846206665039, "step": 117 }, { "adv/mean_abs_final_conf": 0.6877869367599487, "adv/mean_abs_reasoning": 0.3961462378501892, "adv/mean_abs_step_conf": 0.7762961387634277, "adv/ratio_final_to_reasoning": 1.7361945439452826, "adv/ratio_step_to_reasoning": 1.9596201215395612, "adv/std_final_conf": 0.8664116859436035, "adv/std_reasoning": 0.6614351868629456, "adv/std_step_conf": 0.9355314373970032, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7724089635854342, "calib/avg_num_step_conf": 6.54296875, "calib/ece": 0.23402489626556028, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.34024896265560167, "calib/gap": 0.4088928571428571, "calib/mean_conf": 0.41560165975103736, "calib/mu_c": 0.59375, "calib/mu_w": 0.1848571428571429, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.042655601659751116, "calib/std_conf": 0.42794179694674456, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.48573924731182794, "calib/step_q_c_n": 744.0, "calib/step_q_gap": 0.15860713130753146, "calib/step_q_w": 0.3271321160042965, "calib/step_q_w_n": 931.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2951.0, "completions/max_terminated_length": 2951.0, "completions/mean_length": 591.2109375, "completions/mean_terminated_length": 593.5294189453125, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.12586666666666665, "grad_norm": 0.030181964859366417, "kl": 0.10308837890625, "learning_rate": 2.277777777777778e-06, "loss": 0.0364, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03168809413909912, "mask/share_reasoning": 0.8426321148872375, "mask/share_step_conf": 0.12177351117134094, "num_tokens": 28375958.0, "reward": 0.8295300006866455, "reward_std": 0.20747120678424835, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7060078382492065, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.657739520072937, "step": 118 }, { "adv/mean_abs_final_conf": 0.7348015308380127, "adv/mean_abs_reasoning": 0.5914027094841003, "adv/mean_abs_step_conf": 0.789761483669281, "adv/ratio_final_to_reasoning": 1.242472378050151, "adv/ratio_step_to_reasoning": 1.3354038982307257, "adv/std_final_conf": 0.88929283618927, "adv/std_reasoning": 0.8099509477615356, "adv/std_step_conf": 0.9357774257659912, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7302684028700505, "calib/avg_num_step_conf": 5.83203125, "calib/ece": 0.3320564516129032, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.23790322580645162, "calib/gap": 0.2736526707414297, "calib/mean_conf": 0.3336693548387097, "calib/mu_c": 0.45063380281690135, "calib/mu_w": 0.17698113207547167, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.046572580645161316, "calib/std_conf": 0.4023607798051274, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.47201560468140447, "calib/step_q_c_n": 769.0, "calib/step_q_gap": 0.17291339473665307, "calib/step_q_w": 0.2991022099447514, "calib/step_q_w_n": 724.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2757.0, "completions/max_terminated_length": 2757.0, "completions/mean_length": 594.43359375, "completions/mean_terminated_length": 594.43359375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.12693333333333334, "grad_norm": 0.03562173619866371, "kl": 0.10428619384765625, "learning_rate": 2.25e-06, "loss": 0.0115, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03042715974152088, "mask/share_reasoning": 0.8605513572692871, "mask/share_step_conf": 0.10902152210474014, "num_tokens": 28633197.0, "reward": 0.8282876014709473, "reward_std": 0.21210943162441254, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6453093886375427, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.7081408500671387, "step": 119 }, { "adv/mean_abs_final_conf": 0.7406454682350159, "adv/mean_abs_reasoning": 0.4430517256259918, "adv/mean_abs_step_conf": 0.7665520310401917, "adv/ratio_final_to_reasoning": 1.6716907426295455, "adv/ratio_step_to_reasoning": 1.7301637409427157, "adv/std_final_conf": 0.9075197577476501, "adv/std_reasoning": 0.720545768737793, "adv/std_step_conf": 0.935765266418457, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7524282169443459, "calib/avg_num_step_conf": 5.53125, "calib/ece": 0.3556097560975611, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.21951219512195122, "calib/gap": 0.2853626373626374, "calib/mean_conf": 0.3224390243902439, "calib/mu_c": 0.42800000000000005, "calib/mu_w": 0.14263736263736265, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.023983739837398453, "calib/std_conf": 0.39028411886271586, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4414182558139535, "calib/step_q_c_n": 860.0, "calib/step_q_gap": 0.14240746444704705, "calib/step_q_w": 0.29901079136690645, "calib/step_q_w_n": 556.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2805.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 538.8515625, "completions/mean_terminated_length": 538.8515625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.128, "grad_norm": 0.10469574481248856, "kl": 0.1149749755859375, "learning_rate": 2.222222222222222e-06, "loss": -0.0268, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03278191387653351, "mask/share_reasoning": 0.8496057987213135, "mask/share_step_conf": 0.11761230230331421, "num_tokens": 28877831.0, "reward": 0.8052427768707275, "reward_std": 0.20154544711112976, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.6247480511665344, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6732374429702759, "step": 120 }, { "adv/mean_abs_final_conf": 0.8187373876571655, "adv/mean_abs_reasoning": 0.5719882249832153, "adv/mean_abs_step_conf": 0.7808330655097961, "adv/ratio_final_to_reasoning": 1.4313885354566362, "adv/ratio_step_to_reasoning": 1.365120873830417, "adv/std_final_conf": 0.9337377548217773, "adv/std_reasoning": 0.7929165363311768, "adv/std_step_conf": 0.935745358467102, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6220058878629733, "calib/avg_num_step_conf": 6.1171875, "calib/ece": 0.35663967611336034, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.1417004048582996, "calib/gap": 0.17057607386591722, "calib/mean_conf": 0.2710526315789473, "calib/mu_c": 0.3442553191489361, "calib/mu_w": 0.17367924528301887, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.02842105263157897, "calib/std_conf": 0.34524488941498305, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4288122605363985, "calib/step_q_c_n": 783.0, "calib/step_q_gap": 0.09837803320561939, "calib/step_q_w": 0.3304342273307791, "calib/step_q_w_n": 783.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 590.8671875, "completions/mean_terminated_length": 593.184326171875, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.12906666666666666, "grad_norm": 0.047843992710113525, "kl": 0.10814666748046875, "learning_rate": 2.1944444444444445e-06, "loss": -0.0028, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.029814988374710083, "mask/share_reasoning": 0.8531795740127563, "mask/share_step_conf": 0.11309921741485596, "num_tokens": 29134149.0, "reward": 0.7928335666656494, "reward_std": 0.1947779357433319, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6073909997940063, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6751511096954346, "step": 121 }, { "adv/mean_abs_final_conf": 0.7172421216964722, "adv/mean_abs_reasoning": 0.5039461851119995, "adv/mean_abs_step_conf": 0.764214038848877, "adv/ratio_final_to_reasoning": 1.4232514162937233, "adv/ratio_step_to_reasoning": 1.516459616970876, "adv/std_final_conf": 0.8973472714424133, "adv/std_reasoning": 0.7576039433479309, "adv/std_step_conf": 0.9354971647262573, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8712351478308926, "calib/avg_num_step_conf": 6.01953125, "calib/ece": 0.19612903225806452, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.32661290322580644, "calib/gap": 0.5359353412544903, "calib/mean_conf": 0.4505645161290323, "calib/mu_c": 0.6537012987012988, "calib/mu_w": 0.11776595744680854, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.012862903225806459, "calib/std_conf": 0.42767055156554645, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5368765133171913, "calib/step_q_c_n": 826.0, "calib/step_q_gap": 0.22497441541509344, "calib/step_q_w": 0.3119020979020979, "calib/step_q_w_n": 715.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2607.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 572.37109375, "completions/mean_terminated_length": 572.37109375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.13013333333333332, "grad_norm": 0.08367012441158295, "kl": 0.10848236083984375, "learning_rate": 2.166666666666667e-06, "loss": 0.0144, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.03154144436120987, "mask/share_reasoning": 0.8526613116264343, "mask/share_step_conf": 0.115797258913517, "num_tokens": 29388020.0, "reward": 0.9030140042304993, "reward_std": 0.16899213194847107, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7720438241958618, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.7214842438697815, "step": 122 }, { "adv/mean_abs_final_conf": 0.7326878309249878, "adv/mean_abs_reasoning": 0.6410530805587769, "adv/mean_abs_step_conf": 0.7618309855461121, "adv/ratio_final_to_reasoning": 1.1429440917534273, "adv/ratio_step_to_reasoning": 1.1884054669576793, "adv/std_final_conf": 0.9054855108261108, "adv/std_reasoning": 0.8590341210365295, "adv/std_step_conf": 0.9356116056442261, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.734200867534201, "calib/avg_num_step_conf": 6.18359375, "calib/ece": 0.26967479674796746, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.2682926829268293, "calib/gap": 0.3152672672672672, "calib/mean_conf": 0.37796747967479677, "calib/mu_c": 0.5202222222222221, "calib/mu_w": 0.20495495495495492, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.049430894308943076, "calib/std_conf": 0.40186649757009824, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4807588739290085, "calib/step_q_c_n": 817.0, "calib/step_q_gap": 0.13929673293684142, "calib/step_q_w": 0.3414621409921671, "calib/step_q_w_n": 766.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2247.0, "completions/max_terminated_length": 2247.0, "completions/mean_length": 629.16015625, "completions/mean_terminated_length": 631.6275024414062, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.1312, "grad_norm": 0.045245952904224396, "kl": 0.0968475341796875, "learning_rate": 2.138888888888889e-06, "loss": 0.0366, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.027621716260910034, "mask/share_reasoning": 0.8611097931861877, "mask/share_step_conf": 0.10736224055290222, "num_tokens": 29654373.0, "reward": 0.8298928737640381, "reward_std": 0.23200906813144684, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6820992231369019, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6815928220748901, "step": 123 }, { "adv/mean_abs_final_conf": 0.6971818208694458, "adv/mean_abs_reasoning": 0.4460069239139557, "adv/mean_abs_step_conf": 0.7492355108261108, "adv/ratio_final_to_reasoning": 1.5631636718804554, "adv/ratio_step_to_reasoning": 1.679874169331628, "adv/std_final_conf": 0.8757473230361938, "adv/std_reasoning": 0.7206626534461975, "adv/std_step_conf": 0.9358493685722351, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7534274193548387, "calib/avg_num_step_conf": 5.921875, "calib/ece": 0.24800796812749007, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.40239043824701193, "calib/gap": 0.3463111559139785, "calib/mean_conf": 0.5208366533864541, "calib/mu_c": 0.6532903225806451, "calib/mu_w": 0.30697916666666664, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.0756573705179283, "calib/std_conf": 0.42253137888986314, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5294457274826789, "calib/step_q_c_n": 866.0, "calib/step_q_gap": 0.11970726594421732, "calib/step_q_w": 0.4097384615384616, "calib/step_q_w_n": 650.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2554.0, "completions/max_terminated_length": 2554.0, "completions/mean_length": 556.14453125, "completions/mean_terminated_length": 556.14453125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.13226666666666667, "grad_norm": 0.042626217007637024, "kl": 0.1062164306640625, "learning_rate": 2.1111111111111114e-06, "loss": 0.0339, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03107757493853569, "mask/share_reasoning": 0.8505281805992126, "mask/share_step_conf": 0.11839421093463898, "num_tokens": 29903562.0, "reward": 0.8588284850120544, "reward_std": 0.18197014927864075, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7211695313453674, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6800811290740967, "step": 124 }, { "adv/mean_abs_final_conf": 0.705986499786377, "adv/mean_abs_reasoning": 0.5254706144332886, "adv/mean_abs_step_conf": 0.7537710666656494, "adv/ratio_final_to_reasoning": 1.3435318367855296, "adv/ratio_step_to_reasoning": 1.434468542981379, "adv/std_final_conf": 0.8920004963874817, "adv/std_reasoning": 0.7754695415496826, "adv/std_step_conf": 0.9357609748840332, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.6819216757741349, "calib/avg_num_step_conf": 5.6796875, "calib/ece": 0.29302419354838705, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.43548387096774194, "calib/gap": 0.24824225865209465, "calib/mean_conf": 0.5389919354838709, "calib/mu_c": 0.6611111111111111, "calib/mu_w": 0.41286885245901644, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.16197580645161283, "calib/std_conf": 0.429372422675605, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.564992, "calib/step_q_c_n": 625.0, "calib/step_q_gap": 0.13887619782870936, "calib/step_q_w": 0.4261158021712907, "calib/step_q_w_n": 829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 604.16796875, "completions/mean_terminated_length": 604.16796875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.13333333333333333, "grad_norm": 0.04363800212740898, "kl": 0.09326934814453125, "learning_rate": 2.0833333333333334e-06, "loss": 0.0111, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.031416282057762146, "mask/share_reasoning": 0.8578252792358398, "mask/share_step_conf": 0.11075843870639801, "num_tokens": 30163037.0, "reward": 0.7977668642997742, "reward_std": 0.2162882685661316, "rewards/accuracy_reward_step": 0.4921875, "rewards/final_brier_reward_step": 0.6658421754837036, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.638285219669342, "step": 125 }, { "adv/mean_abs_final_conf": 0.6187704205513, "adv/mean_abs_reasoning": 0.5242427587509155, "adv/mean_abs_step_conf": 0.7606955766677856, "adv/ratio_final_to_reasoning": 1.1803127658369768, "adv/ratio_step_to_reasoning": 1.4510368793271524, "adv/std_final_conf": 0.8281337022781372, "adv/std_reasoning": 0.7754493355751038, "adv/std_step_conf": 0.935357928276062, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8422611213477231, "calib/avg_num_step_conf": 6.15625, "calib/ece": 0.16449392712550603, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.5060728744939271, "calib/gap": 0.536800473808897, "calib/mean_conf": 0.5879757085020242, "calib/mu_c": 0.8400763358778626, "calib/mu_w": 0.30327586206896556, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11105263157894732, "calib/std_conf": 0.4373752585810382, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5522022160664821, "calib/step_q_c_n": 722.0, "calib/step_q_gap": 0.18518816454423387, "calib/step_q_w": 0.36701405152224825, "calib/step_q_w_n": 854.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2614.0, "completions/max_terminated_length": 2614.0, "completions/mean_length": 574.4765625, "completions/mean_terminated_length": 579.0, "completions/min_length": 0.0, "completions/min_terminated_length": 63.0, "epoch": 0.1344, "grad_norm": 0.02868361584842205, "kl": 0.09455108642578125, "learning_rate": 2.0555555555555555e-06, "loss": -0.0941, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.032516445964574814, "mask/share_reasoning": 0.8370121121406555, "mask/share_step_conf": 0.12265896797180176, "num_tokens": 30415567.0, "reward": 0.881715714931488, "reward_std": 0.19162803888320923, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7947573661804199, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6733614206314087, "step": 126 }, { "adv/mean_abs_final_conf": 0.681225061416626, "adv/mean_abs_reasoning": 0.49039721488952637, "adv/mean_abs_step_conf": 0.7797641754150391, "adv/ratio_final_to_reasoning": 1.3891291400790033, "adv/ratio_step_to_reasoning": 1.590066484351261, "adv/std_final_conf": 0.879097044467926, "adv/std_reasoning": 0.7752722501754761, "adv/std_step_conf": 0.935668408870697, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7891156462585034, "calib/avg_num_step_conf": 6.9375, "calib/ece": 0.18991795918367346, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4489795918367347, "calib/gap": 0.4317468720821663, "calib/mean_conf": 0.5536738775510205, "calib/mu_c": 0.7757151260504203, "calib/mu_w": 0.343968253968254, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.12893877551020408, "calib/std_conf": 0.4321931973173728, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5909212283044059, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.23538841428298424, "calib/step_q_w": 0.3555328140214216, "calib/step_q_w_n": 1027.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2663.0, "completions/max_terminated_length": 2663.0, "completions/mean_length": 574.3828125, "completions/mean_terminated_length": 578.905517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.13546666666666668, "grad_norm": 0.03852643445134163, "kl": 0.097259521484375, "learning_rate": 2.027777777777778e-06, "loss": -0.0136, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.03190770372748375, "mask/share_reasoning": 0.8286213874816895, "mask/share_step_conf": 0.1316584348678589, "num_tokens": 30666281.0, "reward": 0.8436447381973267, "reward_std": 0.20062381029129028, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.7412128448486328, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.661701500415802, "step": 127 }, { "adv/mean_abs_final_conf": 0.7218127250671387, "adv/mean_abs_reasoning": 0.6118108034133911, "adv/mean_abs_step_conf": 0.7718486189842224, "adv/ratio_final_to_reasoning": 1.179797285435349, "adv/ratio_step_to_reasoning": 1.2615805649033238, "adv/std_final_conf": 0.8918989896774292, "adv/std_reasoning": 0.8266298174858093, "adv/std_step_conf": 0.9357960224151611, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7920240730492529, "calib/avg_num_step_conf": 5.49609375, "calib/ece": 0.23065843621399176, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.5061728395061729, "calib/gap": 0.3997516602102934, "calib/mean_conf": 0.6077777777777778, "calib/mu_c": 0.8364423076923078, "calib/mu_w": 0.43669064748201436, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.20522633744855967, "calib/std_conf": 0.4181716031869383, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.6208582089552239, "calib/step_q_c_n": 536.0, "calib/step_q_gap": 0.18634615384615383, "calib/step_q_w": 0.4345120551090701, "calib/step_q_w_n": 871.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2411.0, "completions/max_terminated_length": 2411.0, "completions/mean_length": 592.40234375, "completions/mean_terminated_length": 592.40234375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.13653333333333334, "grad_norm": 0.06992755830287933, "kl": 0.5743637084960938, "learning_rate": 2.0000000000000003e-06, "loss": 0.0527, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03100384771823883, "mask/share_reasoning": 0.8571256399154663, "mask/share_step_conf": 0.11187045276165009, "num_tokens": 30924600.0, "reward": 0.7991189360618591, "reward_std": 0.23056471347808838, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.6868827939033508, "rewards/format_reward_step": 0.9296875, "rewards/step_l1_reward": 0.6433862447738647, "step": 128 }, { "adv/mean_abs_final_conf": 0.6912192702293396, "adv/mean_abs_reasoning": 0.4884779453277588, "adv/mean_abs_step_conf": 0.7703871726989746, "adv/ratio_final_to_reasoning": 1.4150470391565897, "adv/ratio_step_to_reasoning": 1.577117616194648, "adv/std_final_conf": 0.8760667443275452, "adv/std_reasoning": 0.7575657367706299, "adv/std_step_conf": 0.9357084035873413, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.6890708173303686, "calib/avg_num_step_conf": 6.42578125, "calib/ece": 0.25760000000000005, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.616, "calib/gap": 0.22011859039148318, "calib/mean_conf": 0.7429600000000001, "calib/mu_c": 0.8283660130718955, "calib/mu_w": 0.6082474226804123, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.19428000000000006, "calib/std_conf": 0.35702554306379813, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5938338495575222, "calib/step_q_c_n": 904.0, "calib/step_q_gap": 0.11958283741177317, "calib/step_q_w": 0.474251012145749, "calib/step_q_w_n": 741.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2828.0, "completions/max_terminated_length": 2828.0, "completions/mean_length": 520.86328125, "completions/mean_terminated_length": 522.9058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 4.0, "epoch": 0.1376, "grad_norm": 0.053724076598882675, "kl": 0.09871673583984375, "learning_rate": 1.9722222222222224e-06, "loss": 0.0358, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03388860076665878, "mask/share_reasoning": 0.8344084024429321, "mask/share_step_conf": 0.12779675424098969, "num_tokens": 31160325.0, "reward": 0.840374231338501, "reward_std": 0.21307358145713806, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7047882676124573, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.661116361618042, "step": 129 }, { "adv/mean_abs_final_conf": 0.6372058987617493, "adv/mean_abs_reasoning": 0.400245726108551, "adv/mean_abs_step_conf": 0.7534694075584412, "adv/ratio_final_to_reasoning": 1.5920367344258213, "adv/ratio_step_to_reasoning": 1.8825170599175667, "adv/std_final_conf": 0.8489394187927246, "adv/std_reasoning": 0.6816495060920715, "adv/std_step_conf": 0.9351107478141785, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7861244658119658, "calib/avg_num_step_conf": 5.54296875, "calib/ece": 0.18436468253968263, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6904761904761905, "calib/gap": 0.40012083333333326, "calib/mean_conf": 0.7673813492063493, "calib/mu_c": 0.9198083333333332, "calib/mu_w": 0.5196875, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.16634920634920644, "calib/std_conf": 0.369600384598021, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6010465393794749, "calib/step_q_c_n": 838.0, "calib/step_q_gap": 0.16238905228825296, "calib/step_q_w": 0.438657487091222, "calib/step_q_w_n": 581.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 476.19140625, "completions/mean_terminated_length": 478.058837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.13866666666666666, "grad_norm": 0.03426535055041313, "kl": 0.1034393310546875, "learning_rate": 1.944444444444445e-06, "loss": -0.0046, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.035158127546310425, "mask/share_reasoning": 0.837215781211853, "mask/share_step_conf": 0.12371985614299774, "num_tokens": 31387518.0, "reward": 0.9131213426589966, "reward_std": 0.1750873327255249, "rewards/accuracy_reward_step": 0.609375, "rewards/final_brier_reward_step": 0.7818734645843506, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.7256191968917847, "step": 130 }, { "adv/mean_abs_final_conf": 0.6618838310241699, "adv/mean_abs_reasoning": 0.42829570174217224, "adv/mean_abs_step_conf": 0.757628321647644, "adv/ratio_final_to_reasoning": 1.5453898517585738, "adv/ratio_step_to_reasoning": 1.7689374854005078, "adv/std_final_conf": 0.8466708660125732, "adv/std_reasoning": 0.7392298579216003, "adv/std_step_conf": 0.9358780384063721, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8254754621625217, "calib/avg_num_step_conf": 5.74609375, "calib/ece": 0.2733333333333334, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5421686746987951, "calib/gap": 0.3904661524138847, "calib/mean_conf": 0.6777510040160643, "calib/mu_c": 0.9066990291262135, "calib/mu_w": 0.5162328767123288, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.26871485943775103, "calib/std_conf": 0.38695568357785987, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.6225497287522603, "calib/step_q_c_n": 553.0, "calib/step_q_gap": 0.180175001083415, "calib/step_q_w": 0.44237472766884534, "calib/step_q_w_n": 918.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2869.0, "completions/max_terminated_length": 2869.0, "completions/mean_length": 537.8984375, "completions/mean_terminated_length": 540.0078735351562, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.13973333333333332, "grad_norm": 0.05444134399294853, "kl": 0.092529296875, "learning_rate": 1.916666666666667e-06, "loss": -0.0149, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03179455175995827, "mask/share_reasoning": 0.8473207950592041, "mask/share_step_conf": 0.11697839200496674, "num_tokens": 31631428.0, "reward": 0.7956714034080505, "reward_std": 0.22978679835796356, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.6960210800170898, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6226654052734375, "step": 131 }, { "adv/mean_abs_final_conf": 0.6360141038894653, "adv/mean_abs_reasoning": 0.5169456601142883, "adv/mean_abs_step_conf": 0.744135856628418, "adv/ratio_final_to_reasoning": 1.2303306768236586, "adv/ratio_step_to_reasoning": 1.4394856443207233, "adv/std_final_conf": 0.8145498037338257, "adv/std_reasoning": 0.7576601505279541, "adv/std_step_conf": 0.9350570440292358, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8388744307091738, "calib/avg_num_step_conf": 6.29296875, "calib/ece": 0.24513944223107575, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.6972111553784861, "calib/gap": 0.3549986987638256, "calib/mean_conf": 0.7664940239043824, "calib/mu_c": 0.9164137931034483, "calib/mu_w": 0.5614150943396227, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.21697211155378493, "calib/std_conf": 0.3704853860532522, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.6456609195402299, "calib/step_q_c_n": 870.0, "calib/step_q_gap": 0.2348107171110801, "calib/step_q_w": 0.41085020242914977, "calib/step_q_w_n": 741.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2875.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 562.328125, "completions/mean_terminated_length": 562.328125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1408, "grad_norm": 0.04439416155219078, "kl": 0.08860015869140625, "learning_rate": 1.888888888888889e-06, "loss": -0.0004, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.033396653831005096, "mask/share_reasoning": 0.8381592035293579, "mask/share_step_conf": 0.128444105386734, "num_tokens": 31880976.0, "reward": 0.8613244891166687, "reward_std": 0.22873060405254364, "rewards/accuracy_reward_step": 0.56640625, "rewards/final_brier_reward_step": 0.7376258373260498, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6772106885910034, "step": 132 }, { "adv/mean_abs_final_conf": 0.7602120637893677, "adv/mean_abs_reasoning": 0.5774301290512085, "adv/mean_abs_step_conf": 0.794319748878479, "adv/ratio_final_to_reasoning": 1.3165438129085736, "adv/ratio_step_to_reasoning": 1.3756118860365805, "adv/std_final_conf": 0.9064504504203796, "adv/std_reasoning": 0.8099159002304077, "adv/std_step_conf": 0.9358553290367126, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7286092715231787, "calib/avg_num_step_conf": 6.80078125, "calib/ece": 0.34693227091633466, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5378486055776892, "calib/gap": 0.24443841059602656, "calib/mean_conf": 0.6531474103585657, "calib/mu_c": 0.8002000000000001, "calib/mu_w": 0.5557615894039736, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.30083665338645416, "calib/std_conf": 0.4015726730447699, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5384963503649635, "calib/step_q_c_n": 685.0, "calib/step_q_gap": 0.08137198167809473, "calib/step_q_w": 0.45712436868686873, "calib/step_q_w_n": 1056.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 618.08984375, "completions/mean_terminated_length": 618.08984375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.14186666666666667, "grad_norm": 0.06433872133493423, "kl": 0.097198486328125, "learning_rate": 1.8611111111111113e-06, "loss": -0.0833, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.027483396232128143, "mask/share_reasoning": 0.8573780059814453, "mask/share_step_conf": 0.11513862758874893, "num_tokens": 32145551.0, "reward": 0.7753483057022095, "reward_std": 0.26082277297973633, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.6386195421218872, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.6378582715988159, "step": 133 }, { "adv/mean_abs_final_conf": 0.6745326519012451, "adv/mean_abs_reasoning": 0.5636543035507202, "adv/mean_abs_step_conf": 0.7634190917015076, "adv/ratio_final_to_reasoning": 1.1967133891324002, "adv/ratio_step_to_reasoning": 1.3544101178548202, "adv/std_final_conf": 0.845694363117218, "adv/std_reasoning": 0.775534987449646, "adv/std_step_conf": 0.9360291957855225, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7445090439276486, "calib/avg_num_step_conf": 5.9765625, "calib/ece": 0.274738955823293, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6305220883534136, "calib/gap": 0.3084282945736434, "calib/mean_conf": 0.7245381526104419, "calib/mu_c": 0.8731782945736435, "calib/mu_w": 0.5647500000000001, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.24060240963855406, "calib/std_conf": 0.38394994176497294, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5048567870485678, "calib/step_q_c_n": 803.0, "calib/step_q_gap": 0.09562707590688974, "calib/step_q_w": 0.4092297111416781, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 621.55859375, "completions/mean_terminated_length": 623.99609375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.14293333333333333, "grad_norm": 0.029332423582673073, "kl": 0.08277130126953125, "learning_rate": 1.8333333333333333e-06, "loss": -0.1807, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.028533540666103363, "mask/share_reasoning": 0.8640395402908325, "mask/share_step_conf": 0.10352067649364471, "num_tokens": 32413622.0, "reward": 0.8205505609512329, "reward_std": 0.2598622143268585, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6908574104309082, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6557124257087708, "step": 134 }, { "adv/mean_abs_final_conf": 0.7182911038398743, "adv/mean_abs_reasoning": 0.6433115005493164, "adv/mean_abs_step_conf": 0.733476996421814, "adv/ratio_final_to_reasoning": 1.1165525615919094, "adv/ratio_step_to_reasoning": 1.1401583770778327, "adv/std_final_conf": 0.9089246392250061, "adv/std_reasoning": 0.8748298287391663, "adv/std_step_conf": 0.9359358549118042, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.6704176804541767, "calib/avg_num_step_conf": 6.7265625, "calib/ece": 0.26257142857142846, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6122448979591837, "calib/gap": 0.2661023249526898, "calib/mean_conf": 0.7120408163265307, "calib/mu_c": 0.8293430656934306, "calib/mu_w": 0.5632407407407408, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.2077142857142856, "calib/std_conf": 0.38777218007504094, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4887920298879203, "calib/step_q_c_n": 803.0, "calib/step_q_gap": 0.10829148581827946, "calib/step_q_w": 0.38050054406964084, "calib/step_q_w_n": 919.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 620.19921875, "completions/mean_terminated_length": 625.0827026367188, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.144, "grad_norm": 0.04864854738116264, "kl": 0.084930419921875, "learning_rate": 1.8055555555555557e-06, "loss": 0.0746, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.030507385730743408, "mask/share_reasoning": 0.8417672514915466, "mask/share_step_conf": 0.11991286277770996, "num_tokens": 32678273.0, "reward": 0.8297168016433716, "reward_std": 0.2450205683708191, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6764241456985474, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.686134397983551, "step": 135 }, { "adv/mean_abs_final_conf": 0.6761151552200317, "adv/mean_abs_reasoning": 0.5215020179748535, "adv/mean_abs_step_conf": 0.7722547650337219, "adv/ratio_final_to_reasoning": 1.2964765847802215, "adv/ratio_step_to_reasoning": 1.4808279516014442, "adv/std_final_conf": 0.8727582693099976, "adv/std_reasoning": 0.7928408980369568, "adv/std_step_conf": 0.9357433915138245, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.8062989997296567, "calib/avg_num_step_conf": 6.703125, "calib/ece": 0.21040816326530615, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4775510204081633, "calib/gap": 0.45645782643957816, "calib/mean_conf": 0.5883673469387755, "calib/mu_c": 0.843611111111111, "calib/mu_w": 0.3871532846715328, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.17897959183673473, "calib/std_conf": 0.4278686949968032, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5145760233918129, "calib/step_q_c_n": 684.0, "calib/step_q_gap": 0.19562253501971988, "calib/step_q_w": 0.318953488372093, "calib/step_q_w_n": 1032.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2955.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 595.6875, "completions/mean_terminated_length": 598.0235595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.14506666666666668, "grad_norm": 0.05059678107500076, "kl": 0.0946502685546875, "learning_rate": 1.777777777777778e-06, "loss": -0.0393, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.029627911746501923, "mask/share_reasoning": 0.8413301706314087, "mask/share_step_conf": 0.1251356303691864, "num_tokens": 32939257.0, "reward": 0.8537254929542542, "reward_std": 0.24107038974761963, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.7365402579307556, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.695910632610321, "step": 136 }, { "adv/mean_abs_final_conf": 0.6564263105392456, "adv/mean_abs_reasoning": 0.5123791098594666, "adv/mean_abs_step_conf": 0.759476900100708, "adv/ratio_final_to_reasoning": 1.2811340234368411, "adv/ratio_step_to_reasoning": 1.4822557857774774, "adv/std_final_conf": 0.8764358758926392, "adv/std_reasoning": 0.7753708362579346, "adv/std_step_conf": 0.9357487559318542, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.82174636706997, "calib/avg_num_step_conf": 6.73046875, "calib/ece": 0.18956000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.508, "calib/gap": 0.4352736700595353, "calib/mean_conf": 0.6286799999999999, "calib/mu_c": 0.8428346456692914, "calib/mu_w": 0.4075609756097561, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.15512000000000004, "calib/std_conf": 0.4126386525763189, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5036849132176234, "calib/step_q_c_n": 749.0, "calib/step_q_gap": 0.1654302930944201, "calib/step_q_w": 0.33825462012320334, "calib/step_q_w_n": 974.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2864.0, "completions/max_terminated_length": 2864.0, "completions/mean_length": 577.31640625, "completions/mean_terminated_length": 581.8621826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.14613333333333334, "grad_norm": 0.04172493517398834, "kl": 0.0917816162109375, "learning_rate": 1.75e-06, "loss": 0.0307, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.02957267314195633, "mask/share_reasoning": 0.8387045860290527, "mask/share_step_conf": 0.12391026318073273, "num_tokens": 33194034.0, "reward": 0.8764292001724243, "reward_std": 0.21637991070747375, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7532156109809875, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.7074552774429321, "step": 137 }, { "adv/mean_abs_final_conf": 0.5944216251373291, "adv/mean_abs_reasoning": 0.509716272354126, "adv/mean_abs_step_conf": 0.7596113681793213, "adv/ratio_final_to_reasoning": 1.1661813784990447, "adv/ratio_step_to_reasoning": 1.4902631314300683, "adv/std_final_conf": 0.813575804233551, "adv/std_reasoning": 0.757535457611084, "adv/std_step_conf": 0.9354429841041565, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7679584120982987, "calib/avg_num_step_conf": 6.1171875, "calib/ece": 0.20209486166007917, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5731225296442688, "calib/gap": 0.37669254658385093, "calib/mean_conf": 0.6783003952569171, "calib/mu_c": 0.81527950310559, "calib/mu_w": 0.4385869565217391, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12201581027667995, "calib/std_conf": 0.40390568093399337, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5056498388829216, "calib/step_q_c_n": 931.0, "calib/step_q_gap": 0.13876794911914203, "calib/step_q_w": 0.36688188976377956, "calib/step_q_w_n": 635.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2309.0, "completions/max_terminated_length": 2309.0, "completions/mean_length": 542.578125, "completions/mean_terminated_length": 542.578125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.1472, "grad_norm": 0.04046575725078583, "kl": 0.10699462890625, "learning_rate": 1.7222222222222224e-06, "loss": 0.0195, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.0330255962908268, "mask/share_reasoning": 0.8443292379379272, "mask/share_step_conf": 0.12264513969421387, "num_tokens": 33437270.0, "reward": 0.9063873291015625, "reward_std": 0.18781498074531555, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7689160108566284, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.7204210758209229, "step": 138 }, { "adv/mean_abs_final_conf": 0.6607985496520996, "adv/mean_abs_reasoning": 0.436411052942276, "adv/mean_abs_step_conf": 0.7534776926040649, "adv/ratio_final_to_reasoning": 1.5141654758673202, "adv/ratio_step_to_reasoning": 1.7265321020724176, "adv/std_final_conf": 0.868923544883728, "adv/std_reasoning": 0.7205129861831665, "adv/std_step_conf": 0.9353315830230713, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.7728598561931898, "calib/avg_num_step_conf": 6.15234375, "calib/ece": 0.18521739130434786, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.4268774703557312, "calib/gap": 0.36057522724189406, "calib/mean_conf": 0.6115415019762845, "calib/mu_c": 0.7412345679012347, "calib/mu_w": 0.3806593406593406, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.07822134387351784, "calib/std_conf": 0.3977282267437555, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.5056736842105263, "calib/step_q_c_n": 950.0, "calib/step_q_gap": 0.13288968421052627, "calib/step_q_w": 0.372784, "calib/step_q_w_n": 625.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 544.03515625, "completions/mean_terminated_length": 544.03515625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.14826666666666666, "grad_norm": 0.059178274124860764, "kl": 0.09747314453125, "learning_rate": 1.6944444444444446e-06, "loss": 0.099, "mask/has_final_conf_rate": 0.98828125, "mask/share_final_conf": 0.03357715159654617, "mask/share_reasoning": 0.8442497253417969, "mask/share_step_conf": 0.12217306345701218, "num_tokens": 33679639.0, "reward": 0.9130136370658875, "reward_std": 0.17258858680725098, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7638003826141357, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.7387893199920654, "step": 139 }, { "adv/mean_abs_final_conf": 0.6083686947822571, "adv/mean_abs_reasoning": 0.526530921459198, "adv/mean_abs_step_conf": 0.7288892865180969, "adv/ratio_final_to_reasoning": 1.1554282379015055, "adv/ratio_step_to_reasoning": 1.384323800961384, "adv/std_final_conf": 0.8327970504760742, "adv/std_reasoning": 0.7753971219062805, "adv/std_step_conf": 0.9354557991027832, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8040061204618167, "calib/avg_num_step_conf": 6.13671875, "calib/ece": 0.13642570281124494, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6144578313253012, "calib/gap": 0.4835206565586313, "calib/mean_conf": 0.7058232931726908, "calib/mu_c": 0.8825316455696203, "calib/mu_w": 0.39901098901098897, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10385542168674697, "calib/std_conf": 0.39604093042751776, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5222881355932203, "calib/step_q_c_n": 944.0, "calib/step_q_gap": 0.14488781661395395, "calib/step_q_w": 0.37740031897926635, "calib/step_q_w_n": 627.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 530.8203125, "completions/mean_terminated_length": 532.9019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.14933333333333335, "grad_norm": 0.04341341555118561, "kl": 0.10687255859375, "learning_rate": 1.6666666666666667e-06, "loss": -0.0318, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03205030411481857, "mask/share_reasoning": 0.8420544862747192, "mask/share_step_conf": 0.12198895215988159, "num_tokens": 33920545.0, "reward": 0.9148607850074768, "reward_std": 0.2135268598794937, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.803813636302948, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.708720326423645, "step": 140 }, { "adv/mean_abs_final_conf": 0.6026667952537537, "adv/mean_abs_reasoning": 0.47053685784339905, "adv/mean_abs_step_conf": 0.7592720985412598, "adv/ratio_final_to_reasoning": 1.280806774661485, "adv/ratio_step_to_reasoning": 1.6136293807486504, "adv/std_final_conf": 0.7864569425582886, "adv/std_reasoning": 0.7015140652656555, "adv/std_step_conf": 0.9356001615524292, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8802285318559556, "calib/avg_num_step_conf": 6.234375, "calib/ece": 0.13603238866396755, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.6396761133603239, "calib/gap": 0.5843026315789475, "calib/mean_conf": 0.6912550607287449, "calib/mu_c": 0.9159868421052633, "calib/mu_w": 0.33168421052631575, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.10595141700404852, "calib/std_conf": 0.4246890309399393, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5249605411499436, "calib/step_q_c_n": 887.0, "calib/step_q_gap": 0.21411427880861783, "calib/step_q_w": 0.3108462623413258, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2256.0, "completions/max_terminated_length": 2256.0, "completions/mean_length": 594.671875, "completions/mean_terminated_length": 594.671875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.1504, "grad_norm": 0.02849111706018448, "kl": 0.09429931640625, "learning_rate": 1.638888888888889e-06, "loss": 0.0315, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.02957671508193016, "mask/share_reasoning": 0.8577046394348145, "mask/share_step_conf": 0.11271867156028748, "num_tokens": 34179877.0, "reward": 0.9285903573036194, "reward_std": 0.20410549640655518, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.8198671936988831, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.7255947589874268, "step": 141 }, { "adv/mean_abs_final_conf": 0.6020634174346924, "adv/mean_abs_reasoning": 0.47709208726882935, "adv/mean_abs_step_conf": 0.7400637865066528, "adv/ratio_final_to_reasoning": 1.2619438332781336, "adv/ratio_step_to_reasoning": 1.5511969413352387, "adv/std_final_conf": 0.8434348106384277, "adv/std_reasoning": 0.7574495673179626, "adv/std_step_conf": 0.9355960488319397, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8185070903361344, "calib/avg_num_step_conf": 6.65625, "calib/ece": 0.1876113360323887, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5141700404858299, "calib/gap": 0.47222623424369753, "calib/mean_conf": 0.6019433198380567, "calib/mu_c": 0.829453125, "calib/mu_w": 0.3572268907563025, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.13566801619433203, "calib/std_conf": 0.4341461311403343, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5000493218249075, "calib/step_q_c_n": 811.0, "calib/step_q_gap": 0.18078840357182796, "calib/step_q_w": 0.3192609182530795, "calib/step_q_w_n": 893.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2266.0, "completions/max_terminated_length": 2266.0, "completions/mean_length": 591.73046875, "completions/mean_terminated_length": 596.3897705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.15146666666666667, "grad_norm": 0.04204293340444565, "kl": 0.09209442138671875, "learning_rate": 1.6111111111111113e-06, "loss": -0.0561, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.031102092936635017, "mask/share_reasoning": 0.8385723829269409, "mask/share_step_conf": 0.12251304090023041, "num_tokens": 34436520.0, "reward": 0.8704932928085327, "reward_std": 0.20925664901733398, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.7569445371627808, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6926357746124268, "step": 142 }, { "adv/mean_abs_final_conf": 0.6329153776168823, "adv/mean_abs_reasoning": 0.5149741172790527, "adv/mean_abs_step_conf": 0.772438108921051, "adv/ratio_final_to_reasoning": 1.2290236662009169, "adv/ratio_step_to_reasoning": 1.4999552074623674, "adv/std_final_conf": 0.8444722294807434, "adv/std_reasoning": 0.7753442525863647, "adv/std_step_conf": 0.9353972673416138, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8235011001100111, "calib/avg_num_step_conf": 7.10546875, "calib/ece": 0.15020408163265314, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.46530612244897956, "calib/gap": 0.4852901540154015, "calib/mean_conf": 0.5814693877551019, "calib/mu_c": 0.7815277777777777, "calib/mu_w": 0.2962376237623762, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.07195918367346948, "calib/std_conf": 0.42237458479318685, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45192, "calib/step_q_c_n": 875.0, "calib/step_q_gap": 0.21121025423728815, "calib/step_q_w": 0.24070974576271184, "calib/step_q_w_n": 944.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2770.0, "completions/max_terminated_length": 2770.0, "completions/mean_length": 596.1015625, "completions/mean_terminated_length": 596.1015625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.15253333333333333, "grad_norm": 0.042096517980098724, "kl": 0.1017608642578125, "learning_rate": 1.5833333333333333e-06, "loss": 0.0124, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.029082752764225006, "mask/share_reasoning": 0.8452706336975098, "mask/share_step_conf": 0.12564660608768463, "num_tokens": 34696458.0, "reward": 0.887694239616394, "reward_std": 0.1788651943206787, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7755320072174072, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6967315077781677, "step": 143 }, { "adv/mean_abs_final_conf": 0.6178838014602661, "adv/mean_abs_reasoning": 0.47210583090782166, "adv/mean_abs_step_conf": 0.7773245573043823, "adv/ratio_final_to_reasoning": 1.3087823979469289, "adv/ratio_step_to_reasoning": 1.646504886011786, "adv/std_final_conf": 0.8332252502441406, "adv/std_reasoning": 0.7391970157623291, "adv/std_step_conf": 0.9355929493904114, "calib/answer_extract_rate": 0.98828125, "calib/auroc": 0.728921568627451, "calib/avg_num_step_conf": 5.84375, "calib/ece": 0.2555731225296442, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5138339920948617, "calib/gap": 0.372467086834734, "calib/mean_conf": 0.5895652173913043, "calib/mu_c": 0.714702380952381, "calib/mu_w": 0.342235294117647, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.09055335968379442, "calib/std_conf": 0.4395355623482885, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.47385786802030455, "calib/step_q_c_n": 985.0, "calib/step_q_gap": 0.13800659600464898, "calib/step_q_w": 0.33585127201565557, "calib/step_q_w_n": 511.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 519.9765625, "completions/mean_terminated_length": 522.0157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 81.0, "epoch": 0.1536, "grad_norm": 0.038866207003593445, "kl": 0.109161376953125, "learning_rate": 1.5555555555555558e-06, "loss": -0.0062, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03383968397974968, "mask/share_reasoning": 0.8380635976791382, "mask/share_step_conf": 0.12419050931930542, "num_tokens": 34933700.0, "reward": 0.8812453746795654, "reward_std": 0.18394407629966736, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7239183187484741, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.7120097875595093, "step": 144 }, { "adv/mean_abs_final_conf": 0.6504254341125488, "adv/mean_abs_reasoning": 0.5688995122909546, "adv/mean_abs_step_conf": 0.748096764087677, "adv/ratio_final_to_reasoning": 1.1433046083890808, "adv/ratio_step_to_reasoning": 1.3149892870800615, "adv/std_final_conf": 0.8487396836280823, "adv/std_reasoning": 0.8098734021186829, "adv/std_step_conf": 0.9354998469352722, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7931662087912088, "calib/avg_num_step_conf": 6.59765625, "calib/ece": 0.17147410358565737, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.46215139442231074, "calib/gap": 0.4005467032967033, "calib/mean_conf": 0.6120318725099602, "calib/mu_c": 0.75725, "calib/mu_w": 0.3567032967032967, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.07302788844621516, "calib/std_conf": 0.4032125167272188, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.45734328358208953, "calib/step_q_c_n": 1005.0, "calib/step_q_gap": 0.13039649995635855, "calib/step_q_w": 0.326946783625731, "calib/step_q_w_n": 684.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 518.21484375, "completions/mean_terminated_length": 522.2952880859375, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.15466666666666667, "grad_norm": 0.0429336316883564, "kl": 0.10005950927734375, "learning_rate": 1.527777777777778e-06, "loss": 0.005, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03339899703860283, "mask/share_reasoning": 0.8209314346313477, "mask/share_step_conf": 0.1378570944070816, "num_tokens": 35169067.0, "reward": 0.9023834466934204, "reward_std": 0.18495270609855652, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.7753593325614929, "rewards/format_reward_step": 0.98046875, "rewards/step_l1_reward": 0.7083138227462769, "step": 145 }, { "adv/mean_abs_final_conf": 0.7168145179748535, "adv/mean_abs_reasoning": 0.5483123660087585, "adv/mean_abs_step_conf": 0.7604609727859497, "adv/ratio_final_to_reasoning": 1.307310508410827, "adv/ratio_step_to_reasoning": 1.3869119500649787, "adv/std_final_conf": 0.8841872811317444, "adv/std_reasoning": 0.7928834557533264, "adv/std_step_conf": 0.9359769821166992, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8001698946653075, "calib/avg_num_step_conf": 6.75390625, "calib/ece": 0.18754098360655735, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.36885245901639346, "calib/gap": 0.4223730886850153, "calib/mean_conf": 0.504016393442623, "calib/mu_c": 0.7377064220183487, "calib/mu_w": 0.3153333333333334, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.1224180327868852, "calib/std_conf": 0.42553757676549764, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.4633579881656805, "calib/step_q_c_n": 676.0, "calib/step_q_gap": 0.17284516765286, "calib/step_q_w": 0.2905128205128205, "calib/step_q_w_n": 1053.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 595.3359375, "completions/mean_terminated_length": 600.0236206054688, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.15573333333333333, "grad_norm": 0.03687906637787819, "kl": 0.1004638671875, "learning_rate": 1.5e-06, "loss": -0.0411, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.03055627830326557, "mask/share_reasoning": 0.8380646705627441, "mask/share_step_conf": 0.12356653809547424, "num_tokens": 35428689.0, "reward": 0.851368248462677, "reward_std": 0.23046618700027466, "rewards/accuracy_reward_step": 0.42578125, "rewards/final_brier_reward_step": 0.7408281564712524, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6861271858215332, "step": 146 }, { "adv/mean_abs_final_conf": 0.684159517288208, "adv/mean_abs_reasoning": 0.5179585814476013, "adv/mean_abs_step_conf": 0.7577897310256958, "adv/ratio_final_to_reasoning": 1.3208768843564767, "adv/ratio_step_to_reasoning": 1.4630315206050055, "adv/std_final_conf": 0.8728747963905334, "adv/std_reasoning": 0.7753680348396301, "adv/std_step_conf": 0.935628354549408, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7412220702234382, "calib/avg_num_step_conf": 6.61328125, "calib/ece": 0.2665322580645161, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5201612903225806, "calib/gap": 0.369658002735978, "calib/mean_conf": 0.5983064516129032, "calib/mu_c": 0.7905882352941175, "calib/mu_w": 0.4209302325581395, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.1924999999999999, "calib/std_conf": 0.44023677971796177, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5238210227272727, "calib/step_q_c_n": 704.0, "calib/step_q_gap": 0.2095540864279805, "calib/step_q_w": 0.3142669362992922, "calib/step_q_w_n": 989.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 592.8515625, "completions/mean_terminated_length": 595.176513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.1568, "grad_norm": 0.03164289519190788, "kl": 0.0979461669921875, "learning_rate": 1.4722222222222225e-06, "loss": 0.002, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.029957056045532227, "mask/share_reasoning": 0.8438076972961426, "mask/share_step_conf": 0.12232896685600281, "num_tokens": 35684139.0, "reward": 0.8342704772949219, "reward_std": 0.22487780451774597, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.6990612745285034, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6827608942985535, "step": 147 }, { "adv/mean_abs_final_conf": 0.6066899299621582, "adv/mean_abs_reasoning": 0.5051259994506836, "adv/mean_abs_step_conf": 0.7660642862319946, "adv/ratio_final_to_reasoning": 1.201066527206922, "adv/ratio_step_to_reasoning": 1.5165805899222713, "adv/std_final_conf": 0.8223289847373962, "adv/std_reasoning": 0.7752719521522522, "adv/std_step_conf": 0.9354250431060791, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.8528516057585825, "calib/avg_num_step_conf": 6.49609375, "calib/ece": 0.15503937007874008, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.531496062992126, "calib/gap": 0.5558693244739756, "calib/mean_conf": 0.6141732283464567, "calib/mu_c": 0.8023809523809524, "calib/mu_w": 0.24651162790697678, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05389763779527553, "calib/std_conf": 0.43869516786363244, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4502458210422812, "calib/step_q_c_n": 1017.0, "calib/step_q_gap": 0.1794099077295877, "calib/step_q_w": 0.2708359133126935, "calib/step_q_w_n": 646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2995.0, "completions/max_terminated_length": 2995.0, "completions/mean_length": 552.46484375, "completions/mean_terminated_length": 552.46484375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.15786666666666666, "grad_norm": 0.03762984275817871, "kl": 0.1083526611328125, "learning_rate": 1.4444444444444445e-06, "loss": 0.0936, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03496871143579483, "mask/share_reasoning": 0.8342556953430176, "mask/share_step_conf": 0.130775585770607, "num_tokens": 35930682.0, "reward": 0.9441393613815308, "reward_std": 0.16690000891685486, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.8203698992729187, "rewards/format_reward_step": 0.98828125, "rewards/step_l1_reward": 0.7390023469924927, "step": 148 }, { "adv/mean_abs_final_conf": 0.6033927202224731, "adv/mean_abs_reasoning": 0.5049916505813599, "adv/mean_abs_step_conf": 0.765459418296814, "adv/ratio_final_to_reasoning": 1.1948568249154838, "adv/ratio_step_to_reasoning": 1.515786285606102, "adv/std_final_conf": 0.8198185563087463, "adv/std_reasoning": 0.7393207550048828, "adv/std_step_conf": 0.9351623058319092, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.872257053291536, "calib/avg_num_step_conf": 6.3515625, "calib/ece": 0.15314516129032252, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4959677419354839, "calib/gap": 0.5660292580982238, "calib/mean_conf": 0.5940322580645161, "calib/mu_c": 0.8587878787878789, "calib/mu_w": 0.29275862068965514, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.10745967741935478, "calib/std_conf": 0.43786214385801087, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5007593123209169, "calib/step_q_c_n": 698.0, "calib/step_q_gap": 0.2164920709416065, "calib/step_q_w": 0.28426724137931036, "calib/step_q_w_n": 928.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2536.0, "completions/max_terminated_length": 2536.0, "completions/mean_length": 581.6328125, "completions/mean_terminated_length": 586.2125854492188, "completions/min_length": 0.0, "completions/min_terminated_length": 84.0, "epoch": 0.15893333333333334, "grad_norm": 0.03899293765425682, "kl": 0.091217041015625, "learning_rate": 1.4166666666666667e-06, "loss": 0.0468, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.032176218926906586, "mask/share_reasoning": 0.8431887626647949, "mask/share_step_conf": 0.1168224886059761, "num_tokens": 36184036.0, "reward": 0.9130155444145203, "reward_std": 0.1807914674282074, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.8074538707733154, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.7224833965301514, "step": 149 }, { "adv/mean_abs_final_conf": 0.6292825937271118, "adv/mean_abs_reasoning": 0.5423662662506104, "adv/mean_abs_step_conf": 0.781609296798706, "adv/ratio_final_to_reasoning": 1.160253933337993, "adv/ratio_step_to_reasoning": 1.4411097176120262, "adv/std_final_conf": 0.832348108291626, "adv/std_reasoning": 0.7927838563919067, "adv/std_step_conf": 0.9355080723762512, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.8042444821731748, "calib/avg_num_step_conf": 6.5390625, "calib/ece": 0.21875999999999995, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.552, "calib/gap": 0.4037657045840408, "calib/mean_conf": 0.63644, "calib/mu_c": 0.7898709677419355, "calib/mu_w": 0.38610526315789473, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11759999999999995, "calib/std_conf": 0.4271989307102722, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.4305185185185186, "calib/step_q_c_n": 945.0, "calib/step_q_gap": 0.13014814814814823, "calib/step_q_w": 0.30037037037037034, "calib/step_q_w_n": 729.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2513.0, "completions/max_terminated_length": 2513.0, "completions/mean_length": 503.78125, "completions/mean_terminated_length": 503.78125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.16, "grad_norm": 0.03334632143378258, "kl": 0.1125946044921875, "learning_rate": 1.3888888888888892e-06, "loss": -0.0875, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.03699849545955658, "mask/share_reasoning": 0.8212940692901611, "mask/share_step_conf": 0.1417074352502823, "num_tokens": 36417964.0, "reward": 0.8815261721611023, "reward_std": 0.19953811168670654, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7500602006912231, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6973670721054077, "step": 150 }, { "adv/mean_abs_final_conf": 0.6751225590705872, "adv/mean_abs_reasoning": 0.5508012771606445, "adv/mean_abs_step_conf": 0.7583919763565063, "adv/ratio_final_to_reasoning": 1.2257098650003377, "adv/ratio_step_to_reasoning": 1.376888558185599, "adv/std_final_conf": 0.8787547945976257, "adv/std_reasoning": 0.8098519444465637, "adv/std_step_conf": 0.9357379674911499, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.8040110930735931, "calib/avg_num_step_conf": 6.73828125, "calib/ece": 0.19200819672131153, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.4180327868852459, "calib/gap": 0.47400703463203464, "calib/mean_conf": 0.5265163934426229, "calib/mu_c": 0.7829464285714286, "calib/mu_w": 0.30893939393939396, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.1297540983606558, "calib/std_conf": 0.44338923309699013, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.45865693430656934, "calib/step_q_c_n": 685.0, "calib/step_q_gap": 0.16194539584503087, "calib/step_q_w": 0.29671153846153847, "calib/step_q_w_n": 1040.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2921.0, "completions/max_terminated_length": 2921.0, "completions/mean_length": 649.10546875, "completions/mean_terminated_length": 649.10546875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.16106666666666666, "grad_norm": 0.025989269837737083, "kl": 0.0850067138671875, "learning_rate": 1.3611111111111112e-06, "loss": 0.0793, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.029520340263843536, "mask/share_reasoning": 0.8527331948280334, "mask/share_step_conf": 0.11774645745754242, "num_tokens": 36691159.0, "reward": 0.8346282839775085, "reward_std": 0.24052713811397552, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.7384581565856934, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.6557983160018921, "step": 151 }, { "adv/mean_abs_final_conf": 0.7776981592178345, "adv/mean_abs_reasoning": 0.6137273907661438, "adv/mean_abs_step_conf": 0.7873239517211914, "adv/ratio_final_to_reasoning": 1.267171990233316, "adv/ratio_step_to_reasoning": 1.2828561403106664, "adv/std_final_conf": 0.9161269664764404, "adv/std_reasoning": 0.8266661167144775, "adv/std_step_conf": 0.9355749487876892, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7055150884495317, "calib/avg_num_step_conf": 6.5546875, "calib/ece": 0.22153225806451612, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.3387096774193548, "calib/gap": 0.3346774193548387, "calib/mean_conf": 0.4889516129032258, "calib/mu_c": 0.6562903225806451, "calib/mu_w": 0.32161290322580643, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.10524193548387095, "calib/std_conf": 0.42316171420435267, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4089871794871795, "calib/step_q_c_n": 780.0, "calib/step_q_gap": 0.07697158928673403, "calib/step_q_w": 0.33201559020044547, "calib/step_q_w_n": 898.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2708.0, "completions/max_terminated_length": 2708.0, "completions/mean_length": 582.03125, "completions/mean_terminated_length": 582.03125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.16213333333333332, "grad_norm": 0.04250745102763176, "kl": 0.0987548828125, "learning_rate": 1.3333333333333334e-06, "loss": -0.0271, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.031988441944122314, "mask/share_reasoning": 0.8357316255569458, "mask/share_step_conf": 0.13227994740009308, "num_tokens": 36945551.0, "reward": 0.8413926362991333, "reward_std": 0.2079649418592453, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.7150835990905762, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6778579354286194, "step": 152 }, { "adv/mean_abs_final_conf": 0.7094208002090454, "adv/mean_abs_reasoning": 0.5205680727958679, "adv/mean_abs_step_conf": 0.7477751970291138, "adv/ratio_final_to_reasoning": 1.3627820015908523, "adv/ratio_step_to_reasoning": 1.4364599676906067, "adv/std_final_conf": 0.886049211025238, "adv/std_reasoning": 0.7575827836990356, "adv/std_step_conf": 0.9357052445411682, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.6473487876365573, "calib/avg_num_step_conf": 6.23046875, "calib/ece": 0.3198785425101213, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.4331983805668016, "calib/gap": 0.20837063682387424, "calib/mean_conf": 0.5426315789473685, "calib/mu_c": 0.6337410071942446, "calib/mu_w": 0.4253703703703704, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.14987854251012137, "calib/std_conf": 0.4436852911076501, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.43919047619047613, "calib/step_q_c_n": 840.0, "calib/step_q_gap": 0.12442226426994635, "calib/step_q_w": 0.3147682119205298, "calib/step_q_w_n": 755.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2501.0, "completions/max_terminated_length": 2501.0, "completions/mean_length": 566.20703125, "completions/mean_terminated_length": 570.6653442382812, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.1632, "grad_norm": 0.062288716435432434, "kl": 0.09494781494140625, "learning_rate": 1.3055555555555556e-06, "loss": -0.0485, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.029957033693790436, "mask/share_reasoning": 0.844646692276001, "mask/share_step_conf": 0.11758378893136978, "num_tokens": 37197820.0, "reward": 0.7996716499328613, "reward_std": 0.21979312598705292, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6360449194908142, "rewards/format_reward_step": 0.96484375, "rewards/step_l1_reward": 0.6617358326911926, "step": 153 }, { "adv/mean_abs_final_conf": 0.6398130059242249, "adv/mean_abs_reasoning": 0.40170079469680786, "adv/mean_abs_step_conf": 0.7534546852111816, "adv/ratio_final_to_reasoning": 1.59276011989754, "adv/ratio_step_to_reasoning": 1.8756614255141502, "adv/std_final_conf": 0.8723781704902649, "adv/std_reasoning": 0.7013341188430786, "adv/std_step_conf": 0.9358360767364502, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.7958110236220471, "calib/avg_num_step_conf": 5.28125, "calib/ece": 0.22142857142857147, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.5714285714285714, "calib/gap": 0.4411199999999998, "calib/mean_conf": 0.6611904761904762, "calib/mu_c": 0.8799999999999999, "calib/mu_w": 0.4388800000000001, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18932539682539687, "calib/std_conf": 0.4256351130171318, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5416639209225701, "calib/step_q_c_n": 607.0, "calib/step_q_gap": 0.20727465917760363, "calib/step_q_w": 0.33438926174496647, "calib/step_q_w_n": 745.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1883.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 503.796875, "completions/mean_terminated_length": 503.796875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.16426666666666667, "grad_norm": 0.05955752730369568, "kl": 0.1015777587890625, "learning_rate": 1.2777777777777779e-06, "loss": 0.0488, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03422338888049126, "mask/share_reasoning": 0.8485907316207886, "mask/share_step_conf": 0.11718593537807465, "num_tokens": 37431232.0, "reward": 0.8689066171646118, "reward_std": 0.21552817523479462, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7449566125869751, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6983253359794617, "step": 154 }, { "adv/mean_abs_final_conf": 0.6755253076553345, "adv/mean_abs_reasoning": 0.5433062314987183, "adv/mean_abs_step_conf": 0.7571421265602112, "adv/ratio_final_to_reasoning": 1.2433601318944714, "adv/ratio_step_to_reasoning": 1.393582629213774, "adv/std_final_conf": 0.8848652243614197, "adv/std_reasoning": 0.7928366661071777, "adv/std_step_conf": 0.9357208013534546, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7252258064516128, "calib/avg_num_step_conf": 5.94921875, "calib/ece": 0.23799196787148594, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.39357429718875503, "calib/gap": 0.3751412903225807, "calib/mean_conf": 0.47566265060240964, "calib/mu_c": 0.6624800000000001, "calib/mu_w": 0.28733870967741937, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10582329317269074, "calib/std_conf": 0.44708837460779516, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.40711621233859396, "calib/step_q_c_n": 697.0, "calib/step_q_gap": 0.11015495325869085, "calib/step_q_w": 0.2969612590799031, "calib/step_q_w_n": 826.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2336.0, "completions/max_terminated_length": 2336.0, "completions/mean_length": 500.46875, "completions/mean_terminated_length": 502.431396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.16533333333333333, "grad_norm": 3373.44287109375, "kl": 18688.10890197754, "learning_rate": 1.25e-06, "loss": 203.8649, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03441904857754707, "mask/share_reasoning": 0.833466112613678, "mask/share_step_conf": 0.12820863723754883, "num_tokens": 37666568.0, "reward": 0.8419536352157593, "reward_std": 0.19931158423423767, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7168375253677368, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6748822927474976, "step": 155 }, { "adv/mean_abs_final_conf": 0.6437397003173828, "adv/mean_abs_reasoning": 0.5139199495315552, "adv/mean_abs_step_conf": 0.7718127369880676, "adv/ratio_final_to_reasoning": 1.2526069495923637, "adv/ratio_step_to_reasoning": 1.5018150933653873, "adv/std_final_conf": 0.8419256210327148, "adv/std_reasoning": 0.7393944263458252, "adv/std_step_conf": 0.9355224370956421, "calib/answer_extract_rate": 0.9921875, "calib/auroc": 0.7397759103641457, "calib/avg_num_step_conf": 6.015625, "calib/ece": 0.2540944881889763, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5118110236220472, "calib/gap": 0.3686212262682851, "calib/mean_conf": 0.6069291338582677, "calib/mu_c": 0.7796296296296297, "calib/mu_w": 0.4110084033613446, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.16476377952755897, "calib/std_conf": 0.43801091528308267, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4493654822335026, "calib/step_q_c_n": 788.0, "calib/step_q_gap": 0.0800702694675452, "calib/step_q_w": 0.3692952127659574, "calib/step_q_w_n": 752.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2624.0, "completions/max_terminated_length": 2624.0, "completions/mean_length": 527.34375, "completions/mean_terminated_length": 527.34375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.1664, "grad_norm": 0.03064139373600483, "kl": 0.10507965087890625, "learning_rate": 1.2222222222222223e-06, "loss": 0.0278, "mask/has_final_conf_rate": 0.9921875, "mask/share_final_conf": 0.03395324945449829, "mask/share_reasoning": 0.83623206615448, "mask/share_step_conf": 0.12981468439102173, "num_tokens": 37906328.0, "reward": 0.8754494190216064, "reward_std": 0.1909467875957489, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7312695384025574, "rewards/format_reward_step": 0.9921875, "rewards/step_l1_reward": 0.7157229781150818, "step": 156 }, { "adv/mean_abs_final_conf": 0.6759968996047974, "adv/mean_abs_reasoning": 0.5088943243026733, "adv/mean_abs_step_conf": 0.744426965713501, "adv/ratio_final_to_reasoning": 1.3283639988146871, "adv/ratio_step_to_reasoning": 1.4628321247904912, "adv/std_final_conf": 0.8722103238105774, "adv/std_reasoning": 0.7576680183410645, "adv/std_step_conf": 0.9357429146766663, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.756754130223518, "calib/avg_num_step_conf": 6.58984375, "calib/ece": 0.2149999999999999, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6031746031746031, "calib/gap": 0.42416326530612236, "calib/mean_conf": 0.661904761904762, "calib/mu_c": 0.8386394557823128, "calib/mu_w": 0.41447619047619044, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1467857142857142, "calib/std_conf": 0.4313723289315165, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.4958779443254818, "calib/step_q_c_n": 934.0, "calib/step_q_gap": 0.15291592573318435, "calib/step_q_w": 0.34296201859229747, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 542.71875, "completions/mean_terminated_length": 542.71875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.16746666666666668, "grad_norm": 0.03399444371461868, "kl": 0.114013671875, "learning_rate": 1.1944444444444446e-06, "loss": 0.0575, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03503589332103729, "mask/share_reasoning": 0.8307565450668335, "mask/share_step_conf": 0.1342075616121292, "num_tokens": 38148992.0, "reward": 0.8802298903465271, "reward_std": 0.21261295676231384, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7588343620300293, "rewards/format_reward_step": 0.984375, "rewards/step_l1_reward": 0.6899067163467407, "step": 157 }, { "adv/mean_abs_final_conf": 0.7352586388587952, "adv/mean_abs_reasoning": 0.5799189805984497, "adv/mean_abs_step_conf": 0.7681926488876343, "adv/ratio_final_to_reasoning": 1.2678644146119207, "adv/ratio_step_to_reasoning": 1.324655123539662, "adv/std_final_conf": 0.8943493366241455, "adv/std_reasoning": 0.8099675178527832, "adv/std_step_conf": 0.935797393321991, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.6640706465067778, "calib/avg_num_step_conf": 5.77734375, "calib/ece": 0.3078313253012047, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5783132530120482, "calib/gap": 0.26410192909280505, "calib/mean_conf": 0.6776305220883535, "calib/mu_c": 0.7964233576642337, "calib/mu_w": 0.5323214285714286, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.21763052208835335, "calib/std_conf": 0.41419832089017633, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.4758634020618556, "calib/step_q_c_n": 776.0, "calib/step_q_gap": 0.11411375768063231, "calib/step_q_w": 0.3617496443812233, "calib/step_q_w_n": 703.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 533.03125, "completions/mean_terminated_length": 533.03125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.16853333333333334, "grad_norm": 0.03445930778980255, "kl": 0.1155548095703125, "learning_rate": 1.1666666666666668e-06, "loss": -0.0607, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03543959930539131, "mask/share_reasoning": 0.8357149362564087, "mask/share_step_conf": 0.1288454830646515, "num_tokens": 38390688.0, "reward": 0.8163944482803345, "reward_std": 0.243827685713768, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.6710984706878662, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6624716520309448, "step": 158 }, { "adv/mean_abs_final_conf": 0.7034273743629456, "adv/mean_abs_reasoning": 0.5286320447921753, "adv/mean_abs_step_conf": 0.759345531463623, "adv/ratio_final_to_reasoning": 1.3306559473508435, "adv/ratio_step_to_reasoning": 1.4364349247162074, "adv/std_final_conf": 0.9008255004882812, "adv/std_reasoning": 0.8097714185714722, "adv/std_step_conf": 0.935985267162323, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.7885853658536586, "calib/avg_num_step_conf": 5.3359375, "calib/ece": 0.23326612903225813, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.5604838709677419, "calib/gap": 0.4060260162601626, "calib/mean_conf": 0.636975806451613, "calib/mu_c": 0.8416260162601626, "calib/mu_w": 0.43560000000000004, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1871370967741936, "calib/std_conf": 0.4311651125188376, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5488432267884323, "calib/step_q_c_n": 657.0, "calib/step_q_gap": 0.18952023666149298, "calib/step_q_w": 0.35932299012693936, "calib/step_q_w_n": 709.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 521.828125, "completions/mean_terminated_length": 521.828125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.1696, "grad_norm": 0.033365968614816666, "kl": 0.125640869140625, "learning_rate": 1.138888888888889e-06, "loss": 0.1105, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.0347648486495018, "mask/share_reasoning": 0.8454828262329102, "mask/share_step_conf": 0.11975230276584625, "num_tokens": 38629060.0, "reward": 0.8489425182342529, "reward_std": 0.2489396631717682, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7197418212890625, "rewards/format_reward_step": 0.9609375, "rewards/step_l1_reward": 0.6898620128631592, "step": 159 }, { "adv/mean_abs_final_conf": 0.6693143248558044, "adv/mean_abs_reasoning": 0.5256766080856323, "adv/mean_abs_step_conf": 0.7664551734924316, "adv/ratio_final_to_reasoning": 1.273243500967754, "adv/ratio_step_to_reasoning": 1.4580355330697474, "adv/std_final_conf": 0.8575978875160217, "adv/std_reasoning": 0.7755852341651917, "adv/std_step_conf": 0.9358670115470886, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7649155592469545, "calib/avg_num_step_conf": 6.0390625, "calib/ece": 0.24713692946058086, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5228215767634855, "calib/gap": 0.39020487264673315, "calib/mean_conf": 0.5867219917012448, "calib/mu_c": 0.768062015503876, "calib/mu_w": 0.37785714285714284, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.14929460580912857, "calib/std_conf": 0.44766943927496705, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5154703328509408, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.2108036661842741, "calib/step_q_w": 0.3046666666666667, "calib/step_q_w_n": 855.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 588.36328125, "completions/mean_terminated_length": 590.6705932617188, "completions/min_length": 0.0, "completions/min_terminated_length": 186.0, "epoch": 0.17066666666666666, "grad_norm": 0.03893861174583435, "kl": 0.122528076171875, "learning_rate": 1.111111111111111e-06, "loss": 0.0047, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03165167197585106, "mask/share_reasoning": 0.8518185615539551, "mask/share_step_conf": 0.11262348294258118, "num_tokens": 38884521.0, "reward": 0.8104245662689209, "reward_std": 0.23924857378005981, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6910691261291504, "rewards/format_reward_step": 0.93359375, "rewards/step_l1_reward": 0.642279863357544, "step": 160 }, { "adv/mean_abs_final_conf": 0.6290079951286316, "adv/mean_abs_reasoning": 0.49614065885543823, "adv/mean_abs_step_conf": 0.7665145993232727, "adv/ratio_final_to_reasoning": 1.2678017491646603, "adv/ratio_step_to_reasoning": 1.5449542093396824, "adv/std_final_conf": 0.8233130574226379, "adv/std_reasoning": 0.7394353747367859, "adv/std_step_conf": 0.9357913732528687, "calib/answer_extract_rate": 0.984375, "calib/auroc": 0.754988913525499, "calib/avg_num_step_conf": 5.390625, "calib/ece": 0.24591269841269853, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5476190476190477, "calib/gap": 0.36416851441241677, "calib/mean_conf": 0.6231349206349206, "calib/mu_c": 0.7503048780487804, "calib/mu_w": 0.38613636363636367, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.10912698412698423, "calib/std_conf": 0.43738319716411334, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.527593896713615, "calib/step_q_c_n": 852.0, "calib/step_q_gap": 0.17391965428937256, "calib/step_q_w": 0.35367424242424245, "calib/step_q_w_n": 528.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 492.62109375, "completions/mean_terminated_length": 494.552978515625, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.17173333333333332, "grad_norm": 0.03384341672062874, "kl": 0.1311798095703125, "learning_rate": 1.0833333333333335e-06, "loss": -0.0703, "mask/has_final_conf_rate": 0.984375, "mask/share_final_conf": 0.03554953634738922, "mask/share_reasoning": 0.8357522487640381, "mask/share_step_conf": 0.1247919574379921, "num_tokens": 39114552.0, "reward": 0.8582545518875122, "reward_std": 0.20523208379745483, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7181113362312317, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.6788665652275085, "step": 161 }, { "adv/mean_abs_final_conf": 0.6663964986801147, "adv/mean_abs_reasoning": 0.5564135313034058, "adv/mean_abs_step_conf": 0.7464975118637085, "adv/ratio_final_to_reasoning": 1.1976640775055785, "adv/ratio_step_to_reasoning": 1.3416235764702353, "adv/std_final_conf": 0.8759920001029968, "adv/std_reasoning": 0.7928615808486938, "adv/std_step_conf": 0.9360087513923645, "calib/answer_extract_rate": 0.9765625, "calib/auroc": 0.7336182336182338, "calib/avg_num_step_conf": 5.15234375, "calib/ece": 0.23295999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.576, "calib/gap": 0.3530294396961064, "calib/mean_conf": 0.6640799999999999, "calib/mu_c": 0.7784615384615384, "calib/mu_w": 0.42543209876543203, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.11051999999999997, "calib/std_conf": 0.4227171082414337, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.5009223367697595, "calib/step_q_c_n": 873.0, "calib/step_q_gap": 0.1033438614334366, "calib/step_q_w": 0.3975784753363229, "calib/step_q_w_n": 446.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 469.33984375, "completions/mean_terminated_length": 469.33984375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.1728, "grad_norm": 0.038525257259607315, "kl": 0.1505126953125, "learning_rate": 1.0555555555555557e-06, "loss": 0.0249, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.037024784833192825, "mask/share_reasoning": 0.8409038186073303, "mask/share_step_conf": 0.12207139283418655, "num_tokens": 39338847.0, "reward": 0.8713865280151367, "reward_std": 0.23926329612731934, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7351444959640503, "rewards/format_reward_step": 0.97265625, "rewards/step_l1_reward": 0.6810659766197205, "step": 162 }, { "adv/mean_abs_final_conf": 0.6915171146392822, "adv/mean_abs_reasoning": 0.543194055557251, "adv/mean_abs_step_conf": 0.7450501918792725, "adv/ratio_final_to_reasoning": 1.2730572206462567, "adv/ratio_step_to_reasoning": 1.3716096195400034, "adv/std_final_conf": 0.8479636311531067, "adv/std_reasoning": 0.7756072282791138, "adv/std_step_conf": 0.9357462525367737, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7947831978319784, "calib/avg_num_step_conf": 6.13671875, "calib/ece": 0.22395061728395066, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.45267489711934156, "calib/gap": 0.4066382113821139, "calib/mean_conf": 0.5469958847736626, "calib/mu_c": 0.7478048780487806, "calib/mu_w": 0.3411666666666667, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.13238683127572023, "calib/std_conf": 0.44145246143070666, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5358978328173374, "calib/step_q_c_n": 646.0, "calib/step_q_gap": 0.21395188687139144, "calib/step_q_w": 0.32194594594594594, "calib/step_q_w_n": 925.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2536.0, "completions/max_terminated_length": 2536.0, "completions/mean_length": 588.9609375, "completions/mean_terminated_length": 591.2706298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 54.0, "epoch": 0.17386666666666667, "grad_norm": 0.02460322342813015, "kl": 0.12744140625, "learning_rate": 1.0277777777777777e-06, "loss": 0.001, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.031804271042346954, "mask/share_reasoning": 0.841753363609314, "mask/share_step_conf": 0.12253613770008087, "num_tokens": 39594453.0, "reward": 0.8449864387512207, "reward_std": 0.22924767434597015, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7183492183685303, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.6856861710548401, "step": 163 }, { "adv/mean_abs_final_conf": 0.7495858669281006, "adv/mean_abs_reasoning": 0.6183942556381226, "adv/mean_abs_step_conf": 0.7639873027801514, "adv/ratio_final_to_reasoning": 1.212148819452731, "adv/ratio_step_to_reasoning": 1.2354372567574887, "adv/std_final_conf": 0.8925396203994751, "adv/std_reasoning": 0.8267449736595154, "adv/std_step_conf": 0.9359915852546692, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7543032786885246, "calib/avg_num_step_conf": 5.83203125, "calib/ece": 0.24876033057851243, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.39669421487603307, "calib/gap": 0.3457527322404372, "calib/mean_conf": 0.5266115702479339, "calib/mu_c": 0.7009166666666667, "calib/mu_w": 0.35516393442622957, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.1397520661157025, "calib/std_conf": 0.437897028763628, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.45381872213967306, "calib/step_q_c_n": 673.0, "calib/step_q_gap": 0.10157481970064863, "calib/step_q_w": 0.3522439024390244, "calib/step_q_w_n": 820.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2227.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 582.76953125, "completions/mean_terminated_length": 587.3582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 89.0, "epoch": 0.17493333333333333, "grad_norm": 0.0385562889277935, "kl": 0.1408843994140625, "learning_rate": 1.0000000000000002e-06, "loss": -0.0825, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.028835684061050415, "mask/share_reasoning": 0.8567764759063721, "mask/share_step_conf": 0.10657534003257751, "num_tokens": 39849778.0, "reward": 0.8096024990081787, "reward_std": 0.2659996747970581, "rewards/accuracy_reward_step": 0.46875, "rewards/final_brier_reward_step": 0.6846359372138977, "rewards/format_reward_step": 0.93359375, "rewards/step_l1_reward": 0.6541002988815308, "step": 164 }, { "adv/mean_abs_final_conf": 0.7020846605300903, "adv/mean_abs_reasoning": 0.5991629362106323, "adv/mean_abs_step_conf": 0.7744486331939697, "adv/ratio_final_to_reasoning": 1.1717758527761746, "adv/ratio_step_to_reasoning": 1.29255096800867, "adv/std_final_conf": 0.8907216787338257, "adv/std_reasoning": 0.826683759689331, "adv/std_step_conf": 0.9358740448951721, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.7471147540983606, "calib/avg_num_step_conf": 5.35546875, "calib/ece": 0.23390013495276651, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.43724696356275305, "calib/gap": 0.38368830601092885, "calib/mean_conf": 0.5483940620782726, "calib/mu_c": 0.7425683060109288, "calib/mu_w": 0.35888, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.14418353576248313, "calib/std_conf": 0.4392057029267829, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.46690032000000004, "calib/step_q_c_n": 625.0, "calib/step_q_gap": 0.09279844332439674, "calib/step_q_w": 0.3741018766756033, "calib/step_q_w_n": 746.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2394.0, "completions/max_terminated_length": 2394.0, "completions/mean_length": 538.7109375, "completions/mean_terminated_length": 540.8235473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 92.0, "epoch": 0.176, "grad_norm": 0.05522003397345543, "kl": 0.1431732177734375, "learning_rate": 9.722222222222224e-07, "loss": -0.1268, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03285511955618858, "mask/share_reasoning": 0.8516615629196167, "mask/share_step_conf": 0.11157705634832382, "num_tokens": 40093264.0, "reward": 0.8300672769546509, "reward_std": 0.22351546585559845, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7081432342529297, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6660536527633667, "step": 165 }, { "adv/mean_abs_final_conf": 0.6692671179771423, "adv/mean_abs_reasoning": 0.5687819123268127, "adv/mean_abs_step_conf": 0.7794054746627808, "adv/ratio_final_to_reasoning": 1.1766673719268914, "adv/ratio_step_to_reasoning": 1.3703063648320575, "adv/std_final_conf": 0.8801870346069336, "adv/std_reasoning": 0.8265607357025146, "adv/std_step_conf": 0.935825765132904, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.8490291262135923, "calib/avg_num_step_conf": 5.93359375, "calib/ece": 0.1401646090534978, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.51440329218107, "calib/gap": 0.6078966712898752, "calib/mean_conf": 0.5744032921810699, "calib/mu_c": 0.8320714285714287, "calib/mu_w": 0.22417475728155342, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.06921810699588467, "calib/std_conf": 0.4604640193061574, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5104418282548476, "calib/step_q_c_n": 722.0, "calib/step_q_gap": 0.1837843627592391, "calib/step_q_w": 0.32665746549560853, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 553.51171875, "completions/mean_terminated_length": 553.51171875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.17706666666666668, "grad_norm": 0.028190821409225464, "kl": 0.1366119384765625, "learning_rate": 9.444444444444445e-07, "loss": -0.034, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.031847476959228516, "mask/share_reasoning": 0.8460642099380493, "mask/share_step_conf": 0.12208834290504456, "num_tokens": 40341147.0, "reward": 0.8893702030181885, "reward_std": 0.2390143871307373, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7901648283004761, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.6909192800521851, "step": 166 }, { "adv/mean_abs_final_conf": 0.6949819922447205, "adv/mean_abs_reasoning": 0.5812476873397827, "adv/mean_abs_step_conf": 0.7511816620826721, "adv/ratio_final_to_reasoning": 1.1956727009538217, "adv/ratio_step_to_reasoning": 1.2923606896753987, "adv/std_final_conf": 0.8732189536094666, "adv/std_reasoning": 0.8099851012229919, "adv/std_step_conf": 0.9360182881355286, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.6898390057901425, "calib/avg_num_step_conf": 5.98828125, "calib/ece": 0.25744855967078195, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.6584362139917695, "calib/gap": 0.291127665583957, "calib/mean_conf": 0.7356378600823045, "calib/mu_c": 0.8518493150684932, "calib/mu_w": 0.5607216494845362, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.19613168724279842, "calib/std_conf": 0.3930391038655533, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5163614744351962, "calib/step_q_c_n": 841.0, "calib/step_q_gap": 0.10021985593808636, "calib/step_q_w": 0.41614161849710984, "calib/step_q_w_n": 692.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2596.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 541.13671875, "completions/mean_terminated_length": 541.13671875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.17813333333333334, "grad_norm": 0.03131222724914551, "kl": 0.13812255859375, "learning_rate": 9.166666666666666e-07, "loss": -0.0563, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03239912539720535, "mask/share_reasoning": 0.8487462997436523, "mask/share_step_conf": 0.11885453760623932, "num_tokens": 40585286.0, "reward": 0.8169732689857483, "reward_std": 0.2587045431137085, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6863234043121338, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.6437168717384338, "step": 167 }, { "adv/mean_abs_final_conf": 0.6552596092224121, "adv/mean_abs_reasoning": 0.5942199230194092, "adv/mean_abs_step_conf": 0.7558906078338623, "adv/ratio_final_to_reasoning": 1.1027223824688375, "adv/ratio_step_to_reasoning": 1.2720721378592559, "adv/std_final_conf": 0.8636217713356018, "adv/std_reasoning": 0.8267137408256531, "adv/std_step_conf": 0.9357888698577881, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.8008602574205411, "calib/avg_num_step_conf": 5.9140625, "calib/ece": 0.18473895582329308, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5180722891566265, "calib/gap": 0.484554767533491, "calib/mean_conf": 0.6106827309236947, "calib/mu_c": 0.8208510638297873, "calib/mu_w": 0.3362962962962963, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.11457831325301197, "calib/std_conf": 0.4380927261319581, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5236814621409922, "calib/step_q_c_n": 766.0, "calib/step_q_gap": 0.19397557978805102, "calib/step_q_w": 0.3297058823529412, "calib/step_q_w_n": 748.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2547.0, "completions/max_terminated_length": 2547.0, "completions/mean_length": 557.58203125, "completions/mean_terminated_length": 564.1937255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.1792, "grad_norm": 0.033638909459114075, "kl": 0.1472320556640625, "learning_rate": 8.88888888888889e-07, "loss": -0.1233, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03107985481619835, "mask/share_reasoning": 0.8415172696113586, "mask/share_step_conf": 0.11568412184715271, "num_tokens": 40832699.0, "reward": 0.8927489519119263, "reward_std": 0.22894792258739471, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7766804695129395, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.7049111723899841, "step": 168 }, { "adv/mean_abs_final_conf": 0.7353278398513794, "adv/mean_abs_reasoning": 0.5933083295822144, "adv/mean_abs_step_conf": 0.75636225938797, "adv/ratio_final_to_reasoning": 1.2393688124506357, "adv/ratio_step_to_reasoning": 1.274821575352856, "adv/std_final_conf": 0.9213799834251404, "adv/std_reasoning": 0.8266744017601013, "adv/std_step_conf": 0.9360849857330322, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7169069462647444, "calib/avg_num_step_conf": 5.10546875, "calib/ece": 0.25710743801652886, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.5371900826446281, "calib/gap": 0.33034420914672, "calib/mean_conf": 0.6325619834710743, "calib/mu_c": 0.7813533834586467, "calib/mu_w": 0.45100917431192666, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.17004132231404953, "calib/std_conf": 0.42736368263158425, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.5288538677918425, "calib/step_q_c_n": 711.0, "calib/step_q_gap": 0.1270753443019096, "calib/step_q_w": 0.4017785234899329, "calib/step_q_w_n": 596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 530.4296875, "completions/mean_terminated_length": 534.6063232421875, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.18026666666666666, "grad_norm": 0.038262739777565, "kl": 0.1501617431640625, "learning_rate": 8.611111111111112e-07, "loss": -0.1143, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.0322631374001503, "mask/share_reasoning": 0.8498676419258118, "mask/share_step_conf": 0.11005674302577972, "num_tokens": 41072673.0, "reward": 0.800749659538269, "reward_std": 0.28891515731811523, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6555206775665283, "rewards/format_reward_step": 0.9140625, "rewards/step_l1_reward": 0.6584785580635071, "step": 169 }, { "adv/mean_abs_final_conf": 0.7044241428375244, "adv/mean_abs_reasoning": 0.6058946251869202, "adv/mean_abs_step_conf": 0.7594249844551086, "adv/ratio_final_to_reasoning": 1.1626182401274936, "adv/ratio_step_to_reasoning": 1.2533944895464684, "adv/std_final_conf": 0.8785427212715149, "adv/std_reasoning": 0.8267114758491516, "adv/std_step_conf": 0.9360155463218689, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.8251050420168068, "calib/avg_num_step_conf": 5.66796875, "calib/ece": 0.1887136929460581, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.5850622406639004, "calib/gap": 0.48409453781512624, "calib/mean_conf": 0.664896265560166, "calib/mu_c": 0.8758088235294119, "calib/mu_w": 0.3917142857142857, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.953125, "calib/pce": 0.14464730290456432, "calib/std_conf": 0.42813511656467346, "calib/step_conf_rate": 0.953125, "calib/step_q_c": 0.5469767441860466, "calib/step_q_c_n": 817.0, "calib/step_q_gap": 0.15737106595260808, "calib/step_q_w": 0.3896056782334385, "calib/step_q_w_n": 634.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 561.98828125, "completions/mean_terminated_length": 564.1921997070312, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.18133333333333335, "grad_norm": 0.02597671188414097, "kl": 0.134185791015625, "learning_rate": 8.333333333333333e-07, "loss": -0.0605, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.029216032475233078, "mask/share_reasoning": 0.8554987907409668, "mask/share_step_conf": 0.11137893795967102, "num_tokens": 41320694.0, "reward": 0.8461229205131531, "reward_std": 0.28903552889823914, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.7363605499267578, "rewards/format_reward_step": 0.92578125, "rewards/step_l1_reward": 0.6644790172576904, "step": 170 }, { "adv/mean_abs_final_conf": 0.7089108824729919, "adv/mean_abs_reasoning": 0.5430863499641418, "adv/mean_abs_step_conf": 0.7859926819801331, "adv/ratio_final_to_reasoning": 1.3053373234657784, "adv/ratio_step_to_reasoning": 1.4472701846253906, "adv/std_final_conf": 0.8812278509140015, "adv/std_reasoning": 0.7929053902626038, "adv/std_step_conf": 0.9361142516136169, "calib/answer_extract_rate": 0.9375, "calib/auroc": 0.7451990468180544, "calib/avg_num_step_conf": 5.44921875, "calib/ece": 0.25966527196652717, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.5271966527196653, "calib/gap": 0.3731476030277545, "calib/mean_conf": 0.6218410041841005, "calib/mu_c": 0.8138793103448276, "calib/mu_w": 0.4407317073170731, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.94140625, "calib/pce": 0.19807531380753135, "calib/std_conf": 0.4357660016489742, "calib/step_conf_rate": 0.94140625, "calib/step_q_c": 0.5213980263157895, "calib/step_q_c_n": 608.0, "calib/step_q_gap": 0.15785291831070686, "calib/step_q_w": 0.3635451080050826, "calib/step_q_w_n": 787.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 560.30859375, "completions/mean_terminated_length": 560.30859375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.1824, "grad_norm": 0.03308688476681709, "kl": 0.138214111328125, "learning_rate": 8.055555555555557e-07, "loss": -0.0429, "mask/has_final_conf_rate": 0.93359375, "mask/share_final_conf": 0.031629081815481186, "mask/share_reasoning": 0.8561517000198364, "mask/share_step_conf": 0.11221922934055328, "num_tokens": 41571029.0, "reward": 0.7820635437965393, "reward_std": 0.27486109733581543, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.666046142578125, "rewards/format_reward_step": 0.91796875, "rewards/step_l1_reward": 0.6230810284614563, "step": 171 }, { "adv/mean_abs_final_conf": 0.6573461294174194, "adv/mean_abs_reasoning": 0.5512431263923645, "adv/mean_abs_step_conf": 0.7631179094314575, "adv/ratio_final_to_reasoning": 1.192479503045146, "adv/ratio_step_to_reasoning": 1.3843581405281498, "adv/std_final_conf": 0.8288174271583557, "adv/std_reasoning": 0.775492250919342, "adv/std_step_conf": 0.9357848763465881, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7014899149624007, "calib/avg_num_step_conf": 5.796875, "calib/ece": 0.27681571815718153, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5975609756097561, "calib/gap": 0.20806732728933852, "calib/mean_conf": 0.7315582655826558, "calib/mu_c": 0.8102178649237473, "calib/mu_w": 0.6021505376344087, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.19321138211382113, "calib/std_conf": 0.37416871063941554, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.5152008928571428, "calib/step_q_c_n": 896.0, "calib/step_q_gap": 0.09906143707482984, "calib/step_q_w": 0.41613945578231293, "calib/step_q_w_n": 588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2755.0, "completions/max_terminated_length": 2755.0, "completions/mean_length": 529.6796875, "completions/mean_terminated_length": 531.7568969726562, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.18346666666666667, "grad_norm": 0.029622117057442665, "kl": 0.1427001953125, "learning_rate": 7.777777777777779e-07, "loss": -0.088, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03298965096473694, "mask/share_reasoning": 0.837346076965332, "mask/share_step_conf": 0.12575796246528625, "num_tokens": 41809979.0, "reward": 0.8365813493728638, "reward_std": 0.2355513572692871, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6782691478729248, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6847372055053711, "step": 172 }, { "adv/mean_abs_final_conf": 0.7394602298736572, "adv/mean_abs_reasoning": 0.6103619337081909, "adv/mean_abs_step_conf": 0.7688398361206055, "adv/ratio_final_to_reasoning": 1.2115110544020053, "adv/ratio_step_to_reasoning": 1.2596457833626655, "adv/std_final_conf": 0.8896702527999878, "adv/std_reasoning": 0.8101977109909058, "adv/std_step_conf": 0.9361121654510498, "calib/answer_extract_rate": 0.953125, "calib/auroc": 0.7302925639983747, "calib/avg_num_step_conf": 6.15625, "calib/ece": 0.2732269387755103, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.6816326530612244, "calib/gap": 0.27601488554788045, "calib/mean_conf": 0.7906097959183673, "calib/mu_c": 0.9111550724637683, "calib/mu_w": 0.6351401869158878, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.2502857142857144, "calib/std_conf": 0.350853962622219, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4903967327887981, "calib/step_q_c_n": 857.0, "calib/step_q_gap": 0.042302156989076245, "calib/step_q_w": 0.44809457579972184, "calib/step_q_w_n": 719.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2467.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 561.82421875, "completions/mean_terminated_length": 561.82421875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.18453333333333333, "grad_norm": 0.03154466301202774, "kl": 0.15692138671875, "learning_rate": 7.5e-07, "loss": 0.0707, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.035199228674173355, "mask/share_reasoning": 0.8431538343429565, "mask/share_step_conf": 0.12164688855409622, "num_tokens": 42056966.0, "reward": 0.7900242209434509, "reward_std": 0.28265976905822754, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6709481477737427, "rewards/format_reward_step": 0.9375, "rewards/step_l1_reward": 0.6137877702713013, "step": 173 }, { "adv/mean_abs_final_conf": 0.815439760684967, "adv/mean_abs_reasoning": 0.6352990865707397, "adv/mean_abs_step_conf": 0.7776522040367126, "adv/ratio_final_to_reasoning": 1.2835525470162452, "adv/ratio_step_to_reasoning": 1.2240725989932966, "adv/std_final_conf": 0.9358423948287964, "adv/std_reasoning": 0.8431596159934998, "adv/std_step_conf": 0.935970664024353, "calib/answer_extract_rate": 0.921875, "calib/auroc": 0.6586296296296296, "calib/avg_num_step_conf": 6.09765625, "calib/ece": 0.34808510638297874, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.4808510638297872, "calib/gap": 0.19981851851851845, "calib/mean_conf": 0.5805106382978724, "calib/mu_c": 0.6953, "calib/mu_w": 0.4954814814814816, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.2515319148936171, "calib/std_conf": 0.4372785213179074, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.488593220338983, "calib/step_q_c_n": 590.0, "calib/step_q_gap": 0.1217010473214753, "calib/step_q_w": 0.3668921730175077, "calib/step_q_w_n": 971.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 602.16015625, "completions/mean_terminated_length": 606.9015502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.1856, "grad_norm": 0.04018768295645714, "kl": 0.131622314453125, "learning_rate": 7.222222222222222e-07, "loss": -0.1433, "mask/has_final_conf_rate": 0.91796875, "mask/share_final_conf": 0.028805209323763847, "mask/share_reasoning": 0.8497390747070312, "mask/share_step_conf": 0.11364327371120453, "num_tokens": 42315351.0, "reward": 0.7123181223869324, "reward_std": 0.29617708921432495, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.5734190940856934, "rewards/format_reward_step": 0.8984375, "rewards/step_l1_reward": 0.5934045910835266, "step": 174 }, { "adv/mean_abs_final_conf": 0.7533254027366638, "adv/mean_abs_reasoning": 0.5710115432739258, "adv/mean_abs_step_conf": 0.7993781566619873, "adv/ratio_final_to_reasoning": 1.3192822660246615, "adv/ratio_step_to_reasoning": 1.3999334445652518, "adv/std_final_conf": 0.8914464116096497, "adv/std_reasoning": 0.8100007176399231, "adv/std_step_conf": 0.9359956383705139, "calib/answer_extract_rate": 0.91015625, "calib/auroc": 0.7732487922705313, "calib/avg_num_step_conf": 5.81640625, "calib/ece": 0.23139601139601135, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.89453125, "calib/frac_conf_gt_0.9": 0.42735042735042733, "calib/gap": 0.3904830917874397, "calib/mean_conf": 0.5352706552706552, "calib/mu_c": 0.7655555555555557, "calib/mu_w": 0.375072463768116, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 0.9453125, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.17820512820512818, "calib/std_conf": 0.4406554815628469, "calib/step_conf_rate": 0.9296875, "calib/step_q_c": 0.5300852878464818, "calib/step_q_c_n": 469.0, "calib/step_q_gap": 0.1929135231405994, "calib/step_q_w": 0.3371717647058824, "calib/step_q_w_n": 1020.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 589.83984375, "completions/mean_terminated_length": 596.8340454101562, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.18666666666666668, "grad_norm": 0.027495726943016052, "kl": 0.1334686279296875, "learning_rate": 6.944444444444446e-07, "loss": -0.204, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.030761398375034332, "mask/share_reasoning": 0.8459650278091431, "mask/share_step_conf": 0.11155480146408081, "num_tokens": 42572174.0, "reward": 0.7611997127532959, "reward_std": 0.2747557759284973, "rewards/accuracy_reward_step": 0.375, "rewards/final_brier_reward_step": 0.6588290929794312, "rewards/format_reward_step": 0.89453125, "rewards/step_l1_reward": 0.6096640825271606, "step": 175 }, { "adv/mean_abs_final_conf": 0.7055671215057373, "adv/mean_abs_reasoning": 0.5384682416915894, "adv/mean_abs_step_conf": 0.7872297763824463, "adv/ratio_final_to_reasoning": 1.3103226279217683, "adv/ratio_step_to_reasoning": 1.4619799561611593, "adv/std_final_conf": 0.8565211892127991, "adv/std_reasoning": 0.7757197618484497, "adv/std_step_conf": 0.9361661076545715, "calib/answer_extract_rate": 0.92578125, "calib/auroc": 0.8024532876907717, "calib/avg_num_step_conf": 5.4921875, "calib/ece": 0.23059071729957803, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.90234375, "calib/frac_conf_gt_0.9": 0.5569620253164557, "calib/gap": 0.407507488232777, "calib/mean_conf": 0.6566666666666666, "calib/mu_c": 0.8526829268292683, "calib/mu_w": 0.4451754385964913, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.9375, "calib/pce": 0.18413502109704638, "calib/std_conf": 0.4232557488957203, "calib/step_conf_rate": 0.9375, "calib/step_q_c": 0.5499999999999999, "calib/step_q_c_n": 634.0, "calib/step_q_gap": 0.179062176165803, "calib/step_q_w": 0.37093782383419693, "calib/step_q_w_n": 772.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2774.0, "completions/max_terminated_length": 2774.0, "completions/mean_length": 558.26171875, "completions/mean_terminated_length": 558.26171875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.18773333333333334, "grad_norm": 0.029045021161437035, "kl": 0.1388397216796875, "learning_rate": 6.666666666666667e-07, "loss": -0.0591, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.032933156937360764, "mask/share_reasoning": 0.8460503816604614, "mask/share_step_conf": 0.1210164874792099, "num_tokens": 42819153.0, "reward": 0.7955324649810791, "reward_std": 0.28861433267593384, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6817914247512817, "rewards/format_reward_step": 0.90234375, "rewards/step_l1_reward": 0.6327109336853027, "step": 176 }, { "adv/mean_abs_final_conf": 0.7384334802627563, "adv/mean_abs_reasoning": 0.6062425374984741, "adv/mean_abs_step_conf": 0.7673832774162292, "adv/ratio_final_to_reasoning": 1.2180495999336156, "adv/ratio_step_to_reasoning": 1.2658024304639968, "adv/std_final_conf": 0.8960546255111694, "adv/std_reasoning": 0.8267843723297119, "adv/std_step_conf": 0.9361924529075623, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7267643789941651, "calib/avg_num_step_conf": 5.421875, "calib/ece": 0.25808333333333344, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.475, "calib/gap": 0.33623228674631855, "calib/mean_conf": 0.5758333333333334, "calib/mu_c": 0.7411475409836067, "calib/mu_w": 0.40491525423728814, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.16279166666666672, "calib/std_conf": 0.4374482509077185, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5583333333333333, "calib/step_q_c_n": 558.0, "calib/step_q_gap": 0.20558634538152615, "calib/step_q_w": 0.3527469879518072, "calib/step_q_w_n": 830.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 570.81640625, "completions/mean_terminated_length": 573.054931640625, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.1888, "grad_norm": 0.02761814184486866, "kl": 0.1354217529296875, "learning_rate": 6.388888888888889e-07, "loss": 0.0049, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.03270846605300903, "mask/share_reasoning": 0.8531739115715027, "mask/share_step_conf": 0.11021138727664948, "num_tokens": 43069114.0, "reward": 0.8086408376693726, "reward_std": 0.2879253327846527, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6769281029701233, "rewards/format_reward_step": 0.93359375, "rewards/step_l1_reward": 0.658322274684906, "step": 177 }, { "adv/mean_abs_final_conf": 0.7103442549705505, "adv/mean_abs_reasoning": 0.6379184722900391, "adv/mean_abs_step_conf": 0.7597620487213135, "adv/ratio_final_to_reasoning": 1.1135345437176525, "adv/ratio_step_to_reasoning": 1.1910017999539546, "adv/std_final_conf": 0.8917309641838074, "adv/std_reasoning": 0.8592365980148315, "adv/std_step_conf": 0.9360631704330444, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.773105625717566, "calib/avg_num_step_conf": 5.52734375, "calib/ece": 0.19252100840336128, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.4831932773109244, "calib/gap": 0.4285275545350173, "calib/mean_conf": 0.6018487394957982, "calib/mu_c": 0.7891044776119404, "calib/mu_w": 0.3605769230769231, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 0.96875, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.11567226890756296, "calib/std_conf": 0.43054287869801167, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5323484848484848, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.21071404776239222, "calib/step_q_w": 0.32163443708609263, "calib/step_q_w_n": 755.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2438.0, "completions/max_terminated_length": 2438.0, "completions/mean_length": 541.71875, "completions/mean_terminated_length": 543.8431396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.18986666666666666, "grad_norm": 0.04544935002923012, "kl": 0.1381683349609375, "learning_rate": 6.111111111111112e-07, "loss": -0.0587, "mask/has_final_conf_rate": 0.9296875, "mask/share_final_conf": 0.032741475850343704, "mask/share_reasoning": 0.8514248132705688, "mask/share_step_conf": 0.11192752420902252, "num_tokens": 43313866.0, "reward": 0.8213447332382202, "reward_std": 0.2749481797218323, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7128449082374573, "rewards/format_reward_step": 0.91015625, "rewards/step_l1_reward": 0.6431257128715515, "step": 178 }, { "adv/mean_abs_final_conf": 0.683972954750061, "adv/mean_abs_reasoning": 0.5911738872528076, "adv/mean_abs_step_conf": 0.7689919471740723, "adv/ratio_final_to_reasoning": 1.1569742329595982, "adv/ratio_step_to_reasoning": 1.3007880824161355, "adv/std_final_conf": 0.8711940050125122, "adv/std_reasoning": 0.8099489212036133, "adv/std_step_conf": 0.9359811544418335, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7785773309461506, "calib/avg_num_step_conf": 5.64453125, "calib/ece": 0.2004074074074073, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.5308641975308642, "calib/gap": 0.41167291006748385, "calib/mean_conf": 0.6398312757201645, "calib/mu_c": 0.8194087591240876, "calib/mu_w": 0.40773584905660376, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.1382263374485596, "calib/std_conf": 0.4182487203904503, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.5527034120734907, "calib/step_q_c_n": 762.0, "calib/step_q_gap": 0.21847207971624327, "calib/step_q_w": 0.33423133235724745, "calib/step_q_w_n": 683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2959.0, "completions/max_terminated_length": 2959.0, "completions/mean_length": 525.5546875, "completions/mean_terminated_length": 529.6929321289062, "completions/min_length": 0.0, "completions/min_terminated_length": 82.0, "epoch": 0.19093333333333334, "grad_norm": 0.029601961374282837, "kl": 0.158111572265625, "learning_rate": 5.833333333333334e-07, "loss": -0.079, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.03268842399120331, "mask/share_reasoning": 0.8430576324462891, "mask/share_step_conf": 0.11644147336483002, "num_tokens": 43554672.0, "reward": 0.8535597324371338, "reward_std": 0.24574843049049377, "rewards/accuracy_reward_step": 0.53515625, "rewards/final_brier_reward_step": 0.7325922250747681, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.6784335374832153, "step": 179 }, { "adv/mean_abs_final_conf": 0.709583044052124, "adv/mean_abs_reasoning": 0.5634148120880127, "adv/mean_abs_step_conf": 0.7664583921432495, "adv/ratio_final_to_reasoning": 1.2594327107276653, "adv/ratio_step_to_reasoning": 1.360380266366726, "adv/std_final_conf": 0.8815131783485413, "adv/std_reasoning": 0.8267897963523865, "adv/std_step_conf": 0.9360949397087097, "calib/answer_extract_rate": 0.93359375, "calib/auroc": 0.7363108206245462, "calib/avg_num_step_conf": 5.67578125, "calib/ece": 0.23623502109704642, "calib/final_conf_rate": 0.92578125, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.510548523206751, "calib/gap": 0.3471128976034858, "calib/mean_conf": 0.6423303797468355, "calib/mu_c": 0.7917207407407407, "calib/mu_w": 0.44460784313725493, "calib/nonempty_final_conf_rate": 0.92578125, "calib/nonempty_reasoning_rate": 0.94921875, "calib/nonempty_step_conf_rate": 0.93359375, "calib/pce": 0.15447257383966245, "calib/std_conf": 0.4174147894107768, "calib/step_conf_rate": 0.93359375, "calib/step_q_c": 0.49290909090909085, "calib/step_q_c_n": 770.0, "calib/step_q_gap": 0.10334832956209239, "calib/step_q_w": 0.38956076134699846, "calib/step_q_w_n": 683.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2764.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 622.33203125, "completions/mean_terminated_length": 624.7725830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.192, "grad_norm": 0.02795577608048916, "kl": 0.1415863037109375, "learning_rate": 5.555555555555555e-07, "loss": -0.0733, "mask/has_final_conf_rate": 0.92578125, "mask/share_final_conf": 0.028850296512246132, "mask/share_reasoning": 0.8604491949081421, "mask/share_step_conf": 0.10679426789283752, "num_tokens": 43817845.0, "reward": 0.7983482480049133, "reward_std": 0.27285927534103394, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.6823587417602539, "rewards/format_reward_step": 0.91015625, "rewards/step_l1_reward": 0.6268377304077148, "step": 180 }, { "adv/mean_abs_final_conf": 0.7173429727554321, "adv/mean_abs_reasoning": 0.6155725121498108, "adv/mean_abs_step_conf": 0.7480303049087524, "adv/ratio_final_to_reasoning": 1.165326518967198, "adv/ratio_step_to_reasoning": 1.2151782123869521, "adv/std_final_conf": 0.8929460048675537, "adv/std_reasoning": 0.843059778213501, "adv/std_step_conf": 0.9359552264213562, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8446787800129787, "calib/avg_num_step_conf": 5.3359375, "calib/ece": 0.16776439089692097, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.4538152610441767, "calib/gap": 0.5058304131516331, "calib/mean_conf": 0.5534672021419009, "calib/mu_c": 0.8256811594202899, "calib/mu_w": 0.3198507462686567, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.12969210174029447, "calib/std_conf": 0.440777658405739, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.569655172413793, "calib/step_q_c_n": 580.0, "calib/step_q_gap": 0.21992234798631205, "calib/step_q_w": 0.349732824427481, "calib/step_q_w_n": 786.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 503.71875, "completions/mean_terminated_length": 505.69415283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.19306666666666666, "grad_norm": 0.028606941923499107, "kl": 0.1645660400390625, "learning_rate": 5.277777777777779e-07, "loss": 0.0363, "mask/has_final_conf_rate": 0.96875, "mask/share_final_conf": 0.032713860273361206, "mask/share_reasoning": 0.8471143245697021, "mask/share_step_conf": 0.11626553535461426, "num_tokens": 44053061.0, "reward": 0.8522889018058777, "reward_std": 0.2574465274810791, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.7553721070289612, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.6695181131362915, "step": 181 }, { "adv/mean_abs_final_conf": 0.7431403398513794, "adv/mean_abs_reasoning": 0.6147758364677429, "adv/mean_abs_step_conf": 0.7546775937080383, "adv/ratio_final_to_reasoning": 1.2087988755725465, "adv/ratio_step_to_reasoning": 1.2275654782466976, "adv/std_final_conf": 0.8893713355064392, "adv/std_reasoning": 0.8430184125900269, "adv/std_step_conf": 0.9359642267227173, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.747278911564626, "calib/avg_num_step_conf": 5.4296875, "calib/ece": 0.26951020408163273, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5836734693877551, "calib/gap": 0.2914285714285715, "calib/mean_conf": 0.7079591836734694, "calib/mu_c": 0.832857142857143, "calib/mu_w": 0.5414285714285715, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.20302040816326533, "calib/std_conf": 0.39359452663274164, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.5378107843137254, "calib/step_q_c_n": 714.0, "calib/step_q_gap": 0.17727823993502717, "calib/step_q_w": 0.36053254437869825, "calib/step_q_w_n": 676.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1493.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 533.83203125, "completions/mean_terminated_length": 533.83203125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.19413333333333332, "grad_norm": 0.028639815747737885, "kl": 0.1571044921875, "learning_rate": 5.000000000000001e-07, "loss": -0.0498, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.032081544399261475, "mask/share_reasoning": 0.8519595861434937, "mask/share_step_conf": 0.11595889925956726, "num_tokens": 44295882.0, "reward": 0.8192135691642761, "reward_std": 0.2845459580421448, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.678500771522522, "rewards/format_reward_step": 0.93359375, "rewards/step_l1_reward": 0.6638326048851013, "step": 182 }, { "adv/mean_abs_final_conf": 0.7434015870094299, "adv/mean_abs_reasoning": 0.6174652576446533, "adv/mean_abs_step_conf": 0.7496039867401123, "adv/ratio_final_to_reasoning": 1.203956947869692, "adv/ratio_step_to_reasoning": 1.21400188506072, "adv/std_final_conf": 0.8804036378860474, "adv/std_reasoning": 0.8591917753219604, "adv/std_step_conf": 0.9359599351882935, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.7269551195915077, "calib/avg_num_step_conf": 5.33984375, "calib/ece": 0.2995491803278688, "calib/final_conf_rate": 0.953125, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.4713114754098361, "calib/gap": 0.29139344262295075, "calib/mean_conf": 0.576516393442623, "calib/mu_c": 0.7222131147540983, "calib/mu_w": 0.4308196721311476, "calib/nonempty_final_conf_rate": 0.953125, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9609375, "calib/pce": 0.18803278688524583, "calib/std_conf": 0.4391335495284083, "calib/step_conf_rate": 0.9609375, "calib/step_q_c": 0.514108527131783, "calib/step_q_c_n": 645.0, "calib/step_q_gap": 0.14065977366917914, "calib/step_q_w": 0.37344875346260387, "calib/step_q_w_n": 722.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 583.421875, "completions/mean_terminated_length": 588.0157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.1952, "grad_norm": 0.027015017345547676, "kl": 0.1434173583984375, "learning_rate": 4.7222222222222226e-07, "loss": -0.0312, "mask/has_final_conf_rate": 0.953125, "mask/share_final_conf": 0.031002890318632126, "mask/share_reasoning": 0.8562546968460083, "mask/share_step_conf": 0.10492990911006927, "num_tokens": 44551918.0, "reward": 0.7831747531890869, "reward_std": 0.2825014591217041, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6438746452331543, "rewards/format_reward_step": 0.92578125, "rewards/step_l1_reward": 0.6420061588287354, "step": 183 }, { "adv/mean_abs_final_conf": 0.682061493396759, "adv/mean_abs_reasoning": 0.5985027551651001, "adv/mean_abs_step_conf": 0.7494857311248779, "adv/ratio_final_to_reasoning": 1.139612955012394, "adv/ratio_step_to_reasoning": 1.2522678043781577, "adv/std_final_conf": 0.8821538090705872, "adv/std_reasoning": 0.8428606986999512, "adv/std_step_conf": 0.9358847141265869, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7734153263954588, "calib/avg_num_step_conf": 5.6484375, "calib/ece": 0.19544364123159297, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5823293172690763, "calib/gap": 0.4104044780826238, "calib/mean_conf": 0.683994109772423, "calib/mu_c": 0.8455187637969096, "calib/mu_w": 0.4351142857142858, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.13650602409638546, "calib/std_conf": 0.408365922521205, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.4911140583554376, "calib/step_q_c_n": 754.0, "calib/step_q_gap": 0.10839729534965725, "calib/step_q_w": 0.38271676300578034, "calib/step_q_w_n": 692.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2608.0, "completions/max_terminated_length": 2608.0, "completions/mean_length": 537.29296875, "completions/mean_terminated_length": 539.4000244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.19626666666666667, "grad_norm": 0.02635062113404274, "kl": 0.150238037109375, "learning_rate": 4.444444444444445e-07, "loss": 0.0339, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.03211846947669983, "mask/share_reasoning": 0.8551608324050903, "mask/share_step_conf": 0.10881443321704865, "num_tokens": 44794745.0, "reward": 0.8776298761367798, "reward_std": 0.23959048092365265, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7629246711730957, "rewards/format_reward_step": 0.96875, "rewards/step_l1_reward": 0.6806163787841797, "step": 184 }, { "adv/mean_abs_final_conf": 0.6604286432266235, "adv/mean_abs_reasoning": 0.5685111880302429, "adv/mean_abs_step_conf": 0.7494895458221436, "adv/ratio_final_to_reasoning": 1.1616809961380934, "adv/ratio_step_to_reasoning": 1.3183373724252427, "adv/std_final_conf": 0.8744280934333801, "adv/std_reasoning": 0.7929867506027222, "adv/std_step_conf": 0.935917317867279, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7766541562413122, "calib/avg_num_step_conf": 5.67578125, "calib/ece": 0.242613692946058, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.92578125, "calib/frac_conf_gt_0.9": 0.5726141078838174, "calib/gap": 0.35443431331665287, "calib/mean_conf": 0.6847721991701246, "calib/mu_c": 0.8450765151515153, "calib/mu_w": 0.4906422018348624, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.1898340248962655, "calib/std_conf": 0.41007274682895484, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.4762269129287599, "calib/step_q_c_n": 758.0, "calib/step_q_gap": 0.1444486395474649, "calib/step_q_w": 0.331778273381295, "calib/step_q_w_n": 695.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2926.0, "completions/max_terminated_length": 2926.0, "completions/mean_length": 553.40234375, "completions/mean_terminated_length": 562.1865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.19733333333333333, "grad_norm": 0.03410667926073074, "kl": 0.135406494140625, "learning_rate": 4.1666666666666667e-07, "loss": -0.1516, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.03099917247891426, "mask/share_reasoning": 0.8441491723060608, "mask/share_step_conf": 0.10922666639089584, "num_tokens": 45043336.0, "reward": 0.8032524585723877, "reward_std": 0.26167821884155273, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.6864601373672485, "rewards/format_reward_step": 0.92578125, "rewards/step_l1_reward": 0.6317634582519531, "step": 185 }, { "adv/mean_abs_final_conf": 0.7115479707717896, "adv/mean_abs_reasoning": 0.5765695571899414, "adv/mean_abs_step_conf": 0.7557822465896606, "adv/ratio_final_to_reasoning": 1.2341060361211227, "adv/ratio_step_to_reasoning": 1.310825792248134, "adv/std_final_conf": 0.8902884125709534, "adv/std_reasoning": 0.8266717791557312, "adv/std_step_conf": 0.9360302090644836, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.769036519036519, "calib/avg_num_step_conf": 5.80078125, "calib/ece": 0.25370816599732265, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.5863453815261044, "calib/gap": 0.4026961926961926, "calib/mean_conf": 0.6599732262382865, "calib/mu_c": 0.8491919191919192, "calib/mu_w": 0.44649572649572655, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.19178045515394918, "calib/std_conf": 0.43245312642725137, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.5278228438228438, "calib/step_q_c_n": 715.0, "calib/step_q_gap": 0.13078388278388275, "calib/step_q_w": 0.3970389610389611, "calib/step_q_w_n": 770.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2541.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 564.03125, "completions/mean_terminated_length": 564.03125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.1984, "grad_norm": 0.02527988888323307, "kl": 0.146881103515625, "learning_rate": 3.8888888888888895e-07, "loss": -0.0027, "mask/has_final_conf_rate": 0.97265625, "mask/share_final_conf": 0.031393036246299744, "mask/share_reasoning": 0.853333592414856, "mask/share_step_conf": 0.11527342349290848, "num_tokens": 45292768.0, "reward": 0.8366233706474304, "reward_std": 0.26257753372192383, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7131576538085938, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6663390398025513, "step": 186 }, { "adv/mean_abs_final_conf": 0.7757323980331421, "adv/mean_abs_reasoning": 0.6955157518386841, "adv/mean_abs_step_conf": 0.7686678171157837, "adv/ratio_final_to_reasoning": 1.115334046687505, "adv/ratio_step_to_reasoning": 1.1051767196986018, "adv/std_final_conf": 0.9219383597373962, "adv/std_reasoning": 0.8903457522392273, "adv/std_step_conf": 0.9359862208366394, "calib/answer_extract_rate": 0.95703125, "calib/auroc": 0.6583033453285353, "calib/avg_num_step_conf": 5.6796875, "calib/ece": 0.3204489795918367, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.5142857142857142, "calib/gap": 0.2174770091963214, "calib/mean_conf": 0.6295102040816327, "calib/mu_c": 0.7378048780487805, "calib/mu_w": 0.5203278688524591, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 0.98046875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.2239591836734694, "calib/std_conf": 0.42237492400293963, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.49464228934817167, "calib/step_q_c_n": 629.0, "calib/step_q_gap": 0.13767259237847468, "calib/step_q_w": 0.356969696969697, "calib/step_q_w_n": 825.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2469.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 563.265625, "completions/mean_terminated_length": 563.265625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.19946666666666665, "grad_norm": 0.02938619814813137, "kl": 0.1455841064453125, "learning_rate": 3.611111111111111e-07, "loss": -0.1021, "mask/has_final_conf_rate": 0.95703125, "mask/share_final_conf": 0.030438825488090515, "mask/share_reasoning": 0.8618665933609009, "mask/share_step_conf": 0.10769452899694443, "num_tokens": 45538508.0, "reward": 0.7799279689788818, "reward_std": 0.30152833461761475, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6231152415275574, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.6515845060348511, "step": 187 }, { "adv/mean_abs_final_conf": 0.753190279006958, "adv/mean_abs_reasoning": 0.5860728621482849, "adv/mean_abs_step_conf": 0.7702962160110474, "adv/ratio_final_to_reasoning": 1.2851478504670806, "adv/ratio_step_to_reasoning": 1.3143352401397341, "adv/std_final_conf": 0.8905980587005615, "adv/std_reasoning": 0.8267278671264648, "adv/std_step_conf": 0.9361079931259155, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.7764330795757693, "calib/avg_num_step_conf": 6.0546875, "calib/ece": 0.22419999999999995, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.58, "calib/gap": 0.40732188170993544, "calib/mean_conf": 0.65028, "calib/mu_c": 0.8278723404255318, "calib/mu_w": 0.4205504587155964, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.15524, "calib/std_conf": 0.4295526994444337, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.5022386223862239, "calib/step_q_c_n": 813.0, "calib/step_q_gap": 0.12204866309178697, "calib/step_q_w": 0.3801899592944369, "calib/step_q_w_n": 737.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 584.0390625, "completions/mean_terminated_length": 586.3294677734375, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.20053333333333334, "grad_norm": 0.026707028970122337, "kl": 0.13092041015625, "learning_rate": 3.3333333333333335e-07, "loss": 0.0319, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.031184760853648186, "mask/share_reasoning": 0.8455798625946045, "mask/share_step_conf": 0.11932916939258575, "num_tokens": 45792094.0, "reward": 0.8444327116012573, "reward_std": 0.2778102159500122, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7186073660850525, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.6718204021453857, "step": 188 }, { "adv/mean_abs_final_conf": 0.6530822515487671, "adv/mean_abs_reasoning": 0.4331696927547455, "adv/mean_abs_step_conf": 0.7734417915344238, "adv/ratio_final_to_reasoning": 1.507682237405591, "adv/ratio_step_to_reasoning": 1.7855399499806086, "adv/std_final_conf": 0.8334683179855347, "adv/std_reasoning": 0.7207141518592834, "adv/std_step_conf": 0.9360926151275635, "calib/answer_extract_rate": 0.96875, "calib/auroc": 0.8179279989488897, "calib/avg_num_step_conf": 5.64453125, "calib/ece": 0.1900404858299595, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4089068825910931, "calib/gap": 0.49601169360136654, "calib/mean_conf": 0.5221862348178137, "calib/mu_c": 0.7591472868217055, "calib/mu_w": 0.26313559322033897, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.97265625, "calib/pce": 0.09497975708502024, "calib/std_conf": 0.4422852846397903, "calib/step_conf_rate": 0.97265625, "calib/step_q_c": 0.539089376053963, "calib/step_q_c_n": 593.0, "calib/step_q_gap": 0.22370205211030098, "calib/step_q_w": 0.31538732394366203, "calib/step_q_w_n": 852.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2144.0, "completions/max_terminated_length": 2144.0, "completions/mean_length": 531.21484375, "completions/mean_terminated_length": 533.298095703125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.2016, "grad_norm": 0.0228364747017622, "kl": 0.1593017578125, "learning_rate": 3.055555555555556e-07, "loss": -0.1226, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03335583209991455, "mask/share_reasoning": 0.848726749420166, "mask/share_step_conf": 0.11401111632585526, "num_tokens": 46035853.0, "reward": 0.8595134615898132, "reward_std": 0.24013186991214752, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7632609009742737, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6643596887588501, "step": 189 }, { "adv/mean_abs_final_conf": 0.7142746448516846, "adv/mean_abs_reasoning": 0.570991039276123, "adv/mean_abs_step_conf": 0.7811095714569092, "adv/ratio_final_to_reasoning": 1.2509384486264619, "adv/ratio_step_to_reasoning": 1.367989193748408, "adv/std_final_conf": 0.9002556204795837, "adv/std_reasoning": 0.8100013136863708, "adv/std_step_conf": 0.9358389973640442, "calib/answer_extract_rate": 0.94140625, "calib/auroc": 0.7874912648497554, "calib/avg_num_step_conf": 6.046875, "calib/ece": 0.19755186721991694, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.48132780082987553, "calib/gap": 0.4253892382948987, "calib/mean_conf": 0.6052697095435685, "calib/mu_c": 0.7923703703703704, "calib/mu_w": 0.36698113207547167, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.12132780082987545, "calib/std_conf": 0.4304942998397557, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.48983488681757653, "calib/step_q_c_n": 751.0, "calib/step_q_gap": 0.15336060827303455, "calib/step_q_w": 0.336474278544542, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2952.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 582.25, "completions/mean_terminated_length": 589.1541748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 89.0, "epoch": 0.20266666666666666, "grad_norm": 0.028145916759967804, "kl": 0.1420440673828125, "learning_rate": 2.7777777777777776e-07, "loss": 0.0116, "mask/has_final_conf_rate": 0.94140625, "mask/share_final_conf": 0.02930673211812973, "mask/share_reasoning": 0.8495047688484192, "mask/share_step_conf": 0.10946974903345108, "num_tokens": 46290517.0, "reward": 0.8393450975418091, "reward_std": 0.2597062587738037, "rewards/accuracy_reward_step": 0.52734375, "rewards/final_brier_reward_step": 0.7265636920928955, "rewards/format_reward_step": 0.9296875, "rewards/step_l1_reward": 0.6607202291488647, "step": 190 }, { "adv/mean_abs_final_conf": 0.6771456003189087, "adv/mean_abs_reasoning": 0.6281393766403198, "adv/mean_abs_step_conf": 0.7667137980461121, "adv/ratio_final_to_reasoning": 1.0780180729008022, "adv/ratio_step_to_reasoning": 1.2206109448940687, "adv/std_final_conf": 0.8823111653327942, "adv/std_reasoning": 0.8430548310279846, "adv/std_step_conf": 0.9359553456306458, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7153972153972153, "calib/avg_num_step_conf": 6.15625, "calib/ece": 0.2737037037037037, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.551440329218107, "calib/gap": 0.3602129402129402, "calib/mean_conf": 0.6509053497942388, "calib/mu_c": 0.8465765765765766, "calib/mu_w": 0.4863636363636364, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.23390946502057613, "calib/std_conf": 0.42241126773226845, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.5262015503875969, "calib/step_q_c_n": 645.0, "calib/step_q_gap": 0.15451519163356897, "calib/step_q_w": 0.3716863587540279, "calib/step_q_w_n": 931.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2873.0, "completions/max_terminated_length": 2873.0, "completions/mean_length": 545.640625, "completions/mean_terminated_length": 552.1107177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.20373333333333332, "grad_norm": 0.029416009783744812, "kl": 0.141448974609375, "learning_rate": 2.5000000000000004e-07, "loss": -0.0805, "mask/has_final_conf_rate": 0.94921875, "mask/share_final_conf": 0.034083664417266846, "mask/share_reasoning": 0.8317482471466064, "mask/share_step_conf": 0.12244933843612671, "num_tokens": 46534369.0, "reward": 0.8029336929321289, "reward_std": 0.2478676438331604, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.6782324314117432, "rewards/format_reward_step": 0.9453125, "rewards/step_l1_reward": 0.651853621006012, "step": 191 }, { "adv/mean_abs_final_conf": 0.6828257441520691, "adv/mean_abs_reasoning": 0.5440605282783508, "adv/mean_abs_step_conf": 0.7629486918449402, "adv/ratio_final_to_reasoning": 1.2550547386939337, "adv/ratio_step_to_reasoning": 1.402323183156199, "adv/std_final_conf": 0.8883445858955383, "adv/std_reasoning": 0.7928472757339478, "adv/std_step_conf": 0.936056911945343, "calib/answer_extract_rate": 0.96484375, "calib/auroc": 0.8442028985507246, "calib/avg_num_step_conf": 5.390625, "calib/ece": 0.17631578947368423, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.5101214574898786, "calib/gap": 0.4841383399209487, "calib/mean_conf": 0.6177732793522267, "calib/mu_c": 0.8431818181818183, "calib/mu_w": 0.3590434782608696, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.12983805668016196, "calib/std_conf": 0.4270872788312106, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.512940251572327, "calib/step_q_c_n": 636.0, "calib/step_q_gap": 0.16678259028200448, "calib/step_q_w": 0.34615766129032255, "calib/step_q_w_n": 744.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 523.09375, "completions/mean_terminated_length": 525.1451416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.2048, "grad_norm": 0.0521027036011219, "kl": 0.18511962890625, "learning_rate": 2.2222222222222224e-07, "loss": 0.0458, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.03493378311395645, "mask/share_reasoning": 0.8442932367324829, "mask/share_step_conf": 0.11686672270298004, "num_tokens": 46773257.0, "reward": 0.8638187646865845, "reward_std": 0.24488694965839386, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7593094110488892, "rewards/format_reward_step": 0.94921875, "rewards/step_l1_reward": 0.6745781898498535, "step": 192 }, { "adv/mean_abs_final_conf": 0.7857605218887329, "adv/mean_abs_reasoning": 0.6942075490951538, "adv/mean_abs_step_conf": 0.7512802481651306, "adv/ratio_final_to_reasoning": 1.1318812693882563, "adv/ratio_step_to_reasoning": 1.0822127318326726, "adv/std_final_conf": 0.9214059710502625, "adv/std_reasoning": 0.8749935030937195, "adv/std_step_conf": 0.9361399412155151, "calib/answer_extract_rate": 0.94921875, "calib/auroc": 0.7118202652962011, "calib/avg_num_step_conf": 5.71875, "calib/ece": 0.269875, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.4583333333333333, "calib/gap": 0.2741461212584207, "calib/mean_conf": 0.597375, "calib/mu_c": 0.7333057851239669, "calib/mu_w": 0.45915966386554624, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.95703125, "calib/pce": 0.18154166666666668, "calib/std_conf": 0.4123997163452791, "calib/step_conf_rate": 0.95703125, "calib/step_q_c": 0.4712759643916914, "calib/step_q_c_n": 674.0, "calib/step_q_gap": 0.1188202681891598, "calib/step_q_w": 0.3524556962025316, "calib/step_q_w_n": 790.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 540.078125, "completions/mean_terminated_length": 544.3306884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.20586666666666667, "grad_norm": 0.027009624987840652, "kl": 0.1503753662109375, "learning_rate": 1.9444444444444447e-07, "loss": -0.1179, "mask/has_final_conf_rate": 0.9375, "mask/share_final_conf": 0.03015628084540367, "mask/share_reasoning": 0.8478561639785767, "mask/share_step_conf": 0.11417503654956818, "num_tokens": 47017229.0, "reward": 0.7857017517089844, "reward_std": 0.28552713990211487, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.6603667736053467, "rewards/format_reward_step": 0.91796875, "rewards/step_l1_reward": 0.6313490867614746, "step": 193 }, { "adv/mean_abs_final_conf": 0.7223851084709167, "adv/mean_abs_reasoning": 0.6534860134124756, "adv/mean_abs_step_conf": 0.7698065042495728, "adv/ratio_final_to_reasoning": 1.1054331594622095, "adv/ratio_step_to_reasoning": 1.1779999700830268, "adv/std_final_conf": 0.8887658715248108, "adv/std_reasoning": 0.8432233333587646, "adv/std_step_conf": 0.9361362457275391, "calib/answer_extract_rate": 0.91796875, "calib/auroc": 0.7728372434017595, "calib/avg_num_step_conf": 5.0390625, "calib/ece": 0.26772820512820517, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.8984375, "calib/frac_conf_gt_0.9": 0.5512820512820513, "calib/gap": 0.35449970674486797, "calib/mean_conf": 0.6374, "calib/mu_c": 0.8040451612903226, "calib/mu_w": 0.4495454545454546, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 0.9609375, "calib/nonempty_step_conf_rate": 0.9453125, "calib/pce": 0.18760683760683766, "calib/std_conf": 0.4288700355007649, "calib/step_conf_rate": 0.9453125, "calib/step_q_c": 0.5302444625407167, "calib/step_q_c_n": 614.0, "calib/step_q_gap": 0.17290718443420777, "calib/step_q_w": 0.3573372781065089, "calib/step_q_w_n": 676.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2737.0, "completions/max_terminated_length": 2737.0, "completions/mean_length": 543.15234375, "completions/mean_terminated_length": 547.4291381835938, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.20693333333333333, "grad_norm": 0.032942239195108414, "kl": 0.13763427734375, "learning_rate": 1.6666666666666668e-07, "loss": -0.1222, "mask/has_final_conf_rate": 0.9140625, "mask/share_final_conf": 0.03134291246533394, "mask/share_reasoning": 0.8551046848297119, "mask/share_step_conf": 0.10573991388082504, "num_tokens": 47262220.0, "reward": 0.7879054546356201, "reward_std": 0.30618467926979065, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6582788228988647, "rewards/format_reward_step": 0.8984375, "rewards/step_l1_reward": 0.6409695744514465, "step": 194 }, { "adv/mean_abs_final_conf": 0.7022555470466614, "adv/mean_abs_reasoning": 0.6497544050216675, "adv/mean_abs_step_conf": 0.7761138677597046, "adv/ratio_final_to_reasoning": 1.080801517649185, "adv/ratio_step_to_reasoning": 1.194472652684553, "adv/std_final_conf": 0.8913254737854004, "adv/std_reasoning": 0.8589892387390137, "adv/std_step_conf": 0.9360651969909668, "calib/answer_extract_rate": 0.97265625, "calib/auroc": 0.7623004880622608, "calib/avg_num_step_conf": 5.921875, "calib/ece": 0.2168016194331983, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.4939271255060729, "calib/gap": 0.39781954887218046, "calib/mean_conf": 0.6047368421052631, "calib/mu_c": 0.7883458646616541, "calib/mu_w": 0.39052631578947367, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1415384615384615, "calib/std_conf": 0.43081392648096545, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4987890625, "calib/step_q_c_n": 768.0, "calib/step_q_gap": 0.15608852774064175, "calib/step_q_w": 0.34270053475935824, "calib/step_q_w_n": 748.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 531.73828125, "completions/mean_terminated_length": 533.8235473632812, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.208, "grad_norm": 0.03202909603714943, "kl": 0.1470947265625, "learning_rate": 1.3888888888888888e-07, "loss": -0.0239, "mask/has_final_conf_rate": 0.96484375, "mask/share_final_conf": 0.032903797924518585, "mask/share_reasoning": 0.8412511944770813, "mask/share_step_conf": 0.12193876504898071, "num_tokens": 47504329.0, "reward": 0.836263120174408, "reward_std": 0.2455168217420578, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7245702743530273, "rewards/format_reward_step": 0.953125, "rewards/step_l1_reward": 0.6526434421539307, "step": 195 }, { "adv/mean_abs_final_conf": 0.6467857360839844, "adv/mean_abs_reasoning": 0.5239002704620361, "adv/mean_abs_step_conf": 0.7513079047203064, "adv/ratio_final_to_reasoning": 1.2345588894496533, "adv/ratio_step_to_reasoning": 1.4340666479475488, "adv/std_final_conf": 0.8463580012321472, "adv/std_reasoning": 0.792838990688324, "adv/std_step_conf": 0.9359694123268127, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.726465307169424, "calib/avg_num_step_conf": 5.38671875, "calib/ece": 0.25298764940239044, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6254980079681275, "calib/gap": 0.31832956906502496, "calib/mean_conf": 0.7194426294820718, "calib/mu_c": 0.862754347826087, "calib/mu_w": 0.544424778761062, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.2113147410358566, "calib/std_conf": 0.39718020052452685, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5708648275862069, "calib/step_q_c_n": 725.0, "calib/step_q_gap": 0.1408648275862069, "calib/step_q_w": 0.43000000000000005, "calib/step_q_w_n": 654.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 439.1953125, "completions/mean_terminated_length": 440.91766357421875, "completions/min_length": 0.0, "completions/min_terminated_length": 99.0, "epoch": 0.20906666666666668, "grad_norm": 0.046255625784397125, "kl": 0.1627349853515625, "learning_rate": 1.1111111111111112e-07, "loss": 0.0433, "mask/has_final_conf_rate": 0.9765625, "mask/share_final_conf": 0.037858519703149796, "mask/share_reasoning": 0.8283717632293701, "mask/share_step_conf": 0.129863440990448, "num_tokens": 47719307.0, "reward": 0.8447595238685608, "reward_std": 0.23569031059741974, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.705741822719574, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.680652379989624, "step": 196 }, { "adv/mean_abs_final_conf": 0.7253849506378174, "adv/mean_abs_reasoning": 0.5409168601036072, "adv/mean_abs_step_conf": 0.7447569370269775, "adv/ratio_final_to_reasoning": 1.3410285464181635, "adv/ratio_step_to_reasoning": 1.3768417883745143, "adv/std_final_conf": 0.8878090381622314, "adv/std_reasoning": 0.7930150628089905, "adv/std_step_conf": 0.9359817504882812, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.7492561983471074, "calib/avg_num_step_conf": 5.953125, "calib/ece": 0.25386178861788616, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.4105691056910569, "calib/gap": 0.3757229752066114, "calib/mean_conf": 0.5420731707317072, "calib/mu_c": 0.7268799999999999, "calib/mu_w": 0.35115702479338845, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.14390243902439023, "calib/std_conf": 0.4326419250332202, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.5155270655270655, "calib/step_q_c_n": 702.0, "calib/step_q_gap": 0.1652472601742675, "calib/step_q_w": 0.350279805352798, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 551.171875, "completions/mean_terminated_length": 551.171875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.21013333333333334, "grad_norm": 0.02587934210896492, "kl": 0.1564178466796875, "learning_rate": 8.333333333333334e-08, "loss": -0.0362, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.031424008309841156, "mask/share_reasoning": 0.8499119281768799, "mask/share_step_conf": 0.11866404861211777, "num_tokens": 47965463.0, "reward": 0.8559994697570801, "reward_std": 0.24890324473381042, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.7198866605758667, "rewards/format_reward_step": 0.95703125, "rewards/step_l1_reward": 0.7030494809150696, "step": 197 }, { "adv/mean_abs_final_conf": 0.6916540861129761, "adv/mean_abs_reasoning": 0.5280545353889465, "adv/mean_abs_step_conf": 0.7727373838424683, "adv/ratio_final_to_reasoning": 1.3098156341059126, "adv/ratio_step_to_reasoning": 1.4633666260877711, "adv/std_final_conf": 0.8943579792976379, "adv/std_reasoning": 0.7753936052322388, "adv/std_step_conf": 0.935874342918396, "calib/answer_extract_rate": 0.98046875, "calib/auroc": 0.8191516383032766, "calib/avg_num_step_conf": 6.359375, "calib/ece": 0.1817928286852589, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.47808764940239046, "calib/gap": 0.4929203708407418, "calib/mean_conf": 0.5857768924302789, "calib/mu_c": 0.8292913385826772, "calib/mu_w": 0.3363709677419354, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.13079681274900393, "calib/std_conf": 0.4360659125676879, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.5150514138817481, "calib/step_q_c_n": 778.0, "calib/step_q_gap": 0.12894553152880694, "calib/step_q_w": 0.3861058823529412, "calib/step_q_w_n": 850.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2007.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 484.390625, "completions/mean_terminated_length": 488.2047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.2112, "grad_norm": 0.03680581599473953, "kl": 0.1578521728515625, "learning_rate": 5.555555555555556e-08, "loss": -0.0457, "mask/has_final_conf_rate": 0.98046875, "mask/share_final_conf": 0.03651454299688339, "mask/share_reasoning": 0.8229556083679199, "mask/share_step_conf": 0.1327173113822937, "num_tokens": 48194851.0, "reward": 0.8810112476348877, "reward_std": 0.19196817278862, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7804093360900879, "rewards/format_reward_step": 0.9765625, "rewards/step_l1_reward": 0.6870818138122559, "step": 198 }, { "adv/mean_abs_final_conf": 0.7874263525009155, "adv/mean_abs_reasoning": 0.6962288618087769, "adv/mean_abs_step_conf": 0.7540781497955322, "adv/ratio_final_to_reasoning": 1.1309878054397386, "adv/ratio_step_to_reasoning": 1.0830894712357444, "adv/std_final_conf": 0.9218591451644897, "adv/std_reasoning": 0.8749672770500183, "adv/std_step_conf": 0.9361175298690796, "calib/answer_extract_rate": 0.9453125, "calib/auroc": 0.7220511767925561, "calib/avg_num_step_conf": 5.6796875, "calib/ece": 0.2828512396694215, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.48760330578512395, "calib/gap": 0.3130925013683635, "calib/mean_conf": 0.5909504132231405, "calib/mu_c": 0.7539655172413794, "calib/mu_w": 0.4408730158730159, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 0.984375, "calib/nonempty_step_conf_rate": 0.98046875, "calib/pce": 0.1972314049586777, "calib/std_conf": 0.430134907748099, "calib/step_conf_rate": 0.98046875, "calib/step_q_c": 0.4932019704433498, "calib/step_q_c_n": 609.0, "calib/step_q_gap": 0.10533853849068708, "calib/step_q_w": 0.3878634319526627, "calib/step_q_w_n": 845.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 597.0390625, "completions/mean_terminated_length": 604.1185913085938, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.21226666666666666, "grad_norm": 0.03494660183787346, "kl": 0.1394805908203125, "learning_rate": 2.777777777777778e-08, "loss": -0.0588, "mask/has_final_conf_rate": 0.9453125, "mask/share_final_conf": 0.03298765793442726, "mask/share_reasoning": 0.8413237929344177, "mask/share_step_conf": 0.11396980285644531, "num_tokens": 48451893.0, "reward": 0.7908999919891357, "reward_std": 0.2822682857513428, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6665651798248291, "rewards/format_reward_step": 0.94140625, "rewards/step_l1_reward": 0.6363283395767212, "step": 199 }, { "adv/mean_abs_final_conf": 0.642303466796875, "adv/mean_abs_reasoning": 0.5176878571510315, "adv/mean_abs_step_conf": 0.7742708921432495, "adv/ratio_final_to_reasoning": 1.2407157284538892, "adv/ratio_step_to_reasoning": 1.4956327088764647, "adv/std_final_conf": 0.847298800945282, "adv/std_reasoning": 0.7576335668563843, "adv/std_step_conf": 0.9359710216522217, "calib/answer_extract_rate": 0.9609375, "calib/auroc": 0.8972982100641674, "calib/avg_num_step_conf": 5.3671875, "calib/ece": 0.12780487804878057, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.5650406504065041, "calib/gap": 0.6300871327254306, "calib/mean_conf": 0.6388617886178862, "calib/mu_c": 0.9078014184397163, "calib/mu_w": 0.2777142857142857, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.94921875, "calib/pce": 0.0967479674796749, "calib/std_conf": 0.4316647736757926, "calib/step_conf_rate": 0.94921875, "calib/step_q_c": 0.5627881934566145, "calib/step_q_c_n": 703.0, "calib/step_q_gap": 0.2601503395072851, "calib/step_q_w": 0.30263785394932935, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2605.0, "completions/max_terminated_length": 2605.0, "completions/mean_length": 543.4140625, "completions/mean_terminated_length": 545.5451049804688, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.21333333333333335, "grad_norm": 0.028668925166130066, "kl": 0.147674560546875, "learning_rate": 0.0, "loss": -0.0904, "mask/has_final_conf_rate": 0.9609375, "mask/share_final_conf": 0.03222372382879257, "mask/share_reasoning": 0.8570770621299744, "mask/share_step_conf": 0.10679294914007187, "num_tokens": 48699055.0, "reward": 0.8991020917892456, "reward_std": 0.24127095937728882, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.8118078112602234, "rewards/format_reward_step": 0.93359375, "rewards/step_l1_reward": 0.689521312713623, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 1.0105193024349866, "train_runtime": 14782.0565, "train_samples_per_second": 3.464, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 48699055, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }