{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "aux_distill/lambda": 0.09999999999999999, "aux_distill/loss": 1.884798833302089, "aux_distill/mean_u": 0.2680448395277542, "aux_distill/n_active_tok": 22.285714285714285, "calib/answer_extract_rate": 0.0546875, "calib/avg_num_step_conf": 0.44921875, "calib/ece": 0.16135714285714264, "calib/final_conf_rate": 0.0546875, "calib/format_rate": 0.05078125, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.12215151515151501, "calib/mean_conf": 0.9256428571428571, "calib/mu_c": 0.9518181818181817, "calib/mu_w": 0.8296666666666667, "calib/nonempty_final_conf_rate": 0.0546875, "calib/nonempty_reasoning_rate": 0.08203125, "calib/nonempty_step_conf_rate": 0.078125, "calib/pce": 0.15064285714285694, "calib/std_conf": 0.1237495207164497, "calib/step_conf_rate": 0.078125, "calib/step_q_c": 0.8832727272727273, "calib/step_q_c_n": 55.0, "calib/step_q_gap": 0.042872727272727285, "calib/step_q_w": 0.8404, "calib/step_q_w_n": 60.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 2909.0, "completions/max_terminated_length": 2909.0, "completions/mean_length": 579.68359375, "completions/mean_terminated_length": 648.0305786132812, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.0010666666666666667, "grad_norm": 1.1307653188705444, "learning_rate": 0.0, "loss": 0.0331, "num_tokens": 255983.0, "reward": 0.06787636131048203, "reward_std": 0.1474149525165558, "rewards/accuracy_reward_step": 0.04296875, "rewards/final_brier_reward_step": 0.04200273007154465, "rewards/format_reward_step": 0.05078125, "step": 1 }, { "aux_distill/lambda": 0.09999999999999999, "aux_distill/loss": 1.3874275883038838, "aux_distill/mean_u": 0.329474540462036, "aux_distill/n_active_tok": 27.333333333333332, "calib/answer_extract_rate": 0.0546875, "calib/avg_num_step_conf": 0.359375, "calib/ece": 0.6484615384615385, "calib/final_conf_rate": 0.05078125, "calib/format_rate": 0.046875, "calib/frac_conf_gt_0.9": 0.9230769230769231, "calib/gap": 0.016388888888888786, "calib/mean_conf": 0.9561538461538462, "calib/mu_c": 0.9674999999999999, "calib/mu_w": 0.9511111111111111, "calib/nonempty_final_conf_rate": 0.05078125, "calib/nonempty_reasoning_rate": 0.07421875, "calib/nonempty_step_conf_rate": 0.06640625, "calib/pce": 0.6484615384615385, "calib/std_conf": 0.024663414679817527, "calib/step_conf_rate": 0.06640625, "calib/step_q_c": 0.8756521739130435, "calib/step_q_c_n": 23.0, "calib/step_q_gap": 0.015072463768115885, "calib/step_q_w": 0.8605797101449276, "calib/step_q_w_n": 69.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2949.0, "completions/max_terminated_length": 2949.0, "completions/mean_length": 785.5390625, "completions/mean_terminated_length": 827.5637817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0021333333333333334, "grad_norm": 0.7146468162536621, "learning_rate": 2.5000000000000004e-07, "loss": 0.0274, "num_tokens": 560369.0, "reward": 0.040345899760723114, "reward_std": 0.1000000610947609, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.01819179579615593, "rewards/format_reward_step": 0.046875, "step": 2 }, { "aux_distill/lambda": 0.10000000000000002, "aux_distill/loss": 1.3222839037577312, "aux_distill/mean_u": 0.11826397114259928, "aux_distill/n_active_tok": 32.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.125, "calib/ece": 0.7699999999999998, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/mean_conf": 0.77, "calib/mu_c": NaN, "calib/mu_w": 0.77, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.015625, "calib/pce": 0.7699999999999998, "calib/std_conf": 0.3528928071430945, "calib/step_conf_rate": 0.015625, "calib/step_q_w": 0.863125, "calib/step_q_w_n": 32.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 3032.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 699.63671875, "completions/mean_terminated_length": 778.72607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0032, "grad_norm": 0.3191400170326233, "learning_rate": 5.000000000000001e-07, "loss": 0.0012, "num_tokens": 844732.0, "reward": 0.006911718752235174, "reward_std": 0.0195492934435606, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0021046875044703484, "rewards/format_reward_step": 0.01171875, "step": 3 }, { "aux_distill/lambda": 0.1, "aux_distill/loss": 2.1405008137226105, "aux_distill/mean_u": 0.22794245170372168, "aux_distill/n_active_tok": 12.0, "calib/answer_extract_rate": 0.046875, "calib/avg_num_step_conf": 0.1953125, "calib/ece": 0.361, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.9, "calib/gap": 0.026666666666666727, "calib/mean_conf": 0.961, "calib/mu_c": 0.9716666666666667, "calib/mu_w": 0.945, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.0390625, "calib/pce": 0.361, "calib/std_conf": 0.02913760456866692, "calib/step_conf_rate": 0.0390625, "calib/step_q_c": 0.9222222222222222, "calib/step_q_c_n": 27.0, "calib/step_q_gap": 0.027004830917874534, "calib/step_q_w": 0.8952173913043476, "calib/step_q_w_n": 23.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 740.88671875, "completions/mean_terminated_length": 807.0935668945312, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.004266666666666667, "grad_norm": 0.6574472784996033, "learning_rate": 7.5e-07, "loss": 0.0335, "num_tokens": 1140567.0, "reward": 0.043781835585832596, "reward_std": 0.10839106887578964, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.02506367117166519, "rewards/format_reward_step": 0.0390625, "step": 4 }, { "aux_distill/lambda": 0.1, "aux_distill/loss": 1.5584832191467286, "aux_distill/mean_u": 0.555644570917228, "aux_distill/n_active_tok": 23.2, "calib/answer_extract_rate": 0.05078125, "calib/avg_num_step_conf": 0.21875, "calib/ece": 0.22153846153846146, "calib/final_conf_rate": 0.05078125, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.6923076923076923, "calib/gap": 0.1283333333333332, "calib/mean_conf": 0.9138461538461538, "calib/mu_c": 0.9533333333333331, "calib/mu_w": 0.825, "calib/nonempty_final_conf_rate": 0.05078125, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.22153846153846146, "calib/std_conf": 0.12344656031837367, "calib/step_conf_rate": 0.04296875, "calib/step_q_c": 0.8882857142857142, "calib/step_q_c_n": 35.0, "calib/step_q_gap": 0.04019047619047622, "calib/step_q_w": 0.848095238095238, "calib/step_q_w_n": 21.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2855.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 686.07421875, "completions/mean_terminated_length": 734.8744506835938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.005333333333333333, "grad_norm": 0.7364365458488464, "learning_rate": 1.0000000000000002e-06, "loss": 0.0574, "num_tokens": 1422890.0, "reward": 0.05148027092218399, "reward_std": 0.1402149796485901, "rewards/accuracy_reward_step": 0.03515625, "rewards/final_brier_reward_step": 0.028741799294948578, "rewards/format_reward_step": 0.0390625, "step": 5 }, { "aux_distill/lambda": 0.1, "aux_distill/loss": 1.6745985746383667, "aux_distill/mean_u": 0.28617723428204433, "aux_distill/n_active_tok": 26.0, "calib/answer_extract_rate": 0.046875, "calib/avg_num_step_conf": 0.15234375, "calib/ece": 0.6709999999999998, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.010952380952381047, "calib/mean_conf": 0.9709999999999999, "calib/mu_c": 0.9633333333333333, "calib/mu_w": 0.9742857142857143, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.02734375, "calib/pce": 0.6709999999999998, "calib/std_conf": 0.018681541692269422, "calib/step_conf_rate": 0.02734375, "calib/step_q_c": 0.8683333333333332, "calib/step_q_c_n": 12.0, "calib/step_q_gap": 0.008703703703703547, "calib/step_q_w": 0.8596296296296296, "calib/step_q_w_n": 27.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 638.53515625, "completions/mean_terminated_length": 683.9539794921875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0064, "grad_norm": 0.46913886070251465, "learning_rate": 1.25e-06, "loss": 0.007, "num_tokens": 1692307.0, "reward": 0.02002226561307907, "reward_std": 0.05123838037252426, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.008794531226158142, "rewards/format_reward_step": 0.01953125, "step": 6 }, { "aux_distill/lambda": 0.10000000000000002, "aux_distill/loss": 1.4568543831507366, "aux_distill/mean_u": 0.3033000773576174, "aux_distill/n_active_tok": 17.333333333333332, "calib/answer_extract_rate": 0.0625, "calib/avg_num_step_conf": 0.30859375, "calib/ece": 0.6199999999999999, "calib/final_conf_rate": 0.05859375, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.9333333333333333, "calib/gap": 0.006999999999999895, "calib/mean_conf": 0.9533333333333333, "calib/mu_c": 0.9579999999999999, "calib/mu_w": 0.951, "calib/nonempty_final_conf_rate": 0.05859375, "calib/nonempty_reasoning_rate": 0.07421875, "calib/nonempty_step_conf_rate": 0.0546875, "calib/pce": 0.6199999999999999, "calib/std_conf": 0.035150470203904174, "calib/step_conf_rate": 0.0546875, "calib/step_q_c": 0.89, "calib/step_q_c_n": 25.0, "calib/step_q_gap": 0.026296296296296262, "calib/step_q_w": 0.8637037037037038, "calib/step_q_w_n": 54.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 764.57421875, "completions/mean_terminated_length": 808.8057250976562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.007466666666666667, "grad_norm": 0.7070335149765015, "learning_rate": 1.5e-06, "loss": 0.0243, "num_tokens": 1995462.0, "reward": 0.0416390635073185, "reward_std": 0.09732498973608017, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.020778125151991844, "rewards/format_reward_step": 0.0390625, "step": 7 }, { "aux_distill/lambda": 0.1, "aux_distill/loss": 1.481102204322815, "aux_distill/mean_u": 0.2925329127684373, "aux_distill/n_active_tok": 59.2, "calib/answer_extract_rate": 0.0546875, "calib/avg_num_step_conf": 0.3984375, "calib/ece": 0.537857142857143, "calib/final_conf_rate": 0.0546875, "calib/format_rate": 0.03125, "calib/frac_conf_gt_0.9": 0.9285714285714286, "calib/gap": -0.028749999999999942, "calib/mean_conf": 0.9664285714285716, "calib/mu_c": 0.9500000000000001, "calib/mu_w": 0.97875, "calib/nonempty_final_conf_rate": 0.0546875, "calib/nonempty_reasoning_rate": 0.05859375, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.537857142857143, "calib/std_conf": 0.024671308627386493, "calib/step_conf_rate": 0.04296875, "calib/step_q_c": 0.8899999999999999, "calib/step_q_c_n": 23.0, "calib/step_q_gap": -0.01430379746835464, "calib/step_q_w": 0.9043037974683545, "calib/step_q_w_n": 79.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 646.93359375, "completions/mean_terminated_length": 698.7974243164062, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.008533333333333334, "grad_norm": 0.6346028447151184, "learning_rate": 1.75e-06, "loss": 0.0211, "num_tokens": 2267589.0, "reward": 0.03550625219941139, "reward_std": 0.09643617272377014, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.016325000673532486, "rewards/format_reward_step": 0.03125, "step": 8 }, { "aux_distill/lambda": 0.1, "aux_distill/loss": 1.111382430791855, "aux_distill/mean_u": 0.26760089571160023, "aux_distill/n_active_tok": 18.4, "calib/answer_extract_rate": 0.0625, "calib/avg_num_step_conf": 0.33984375, "calib/ece": 0.4546111111111111, "calib/final_conf_rate": 0.0703125, "calib/format_rate": 0.05078125, "calib/frac_conf_gt_0.9": 0.7777777777777778, "calib/gap": 0.07819999999999983, "calib/mean_conf": 0.8990555555555555, "calib/mu_c": 0.9424999999999999, "calib/mu_w": 0.8643000000000001, "calib/nonempty_final_conf_rate": 0.0703125, "calib/nonempty_reasoning_rate": 0.0703125, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.4546111111111111, "calib/std_conf": 0.2067651572362074, "calib/step_conf_rate": 0.05859375, "calib/step_q_c": 0.8628571428571428, "calib/step_q_c_n": 35.0, "calib/step_q_gap": 0.020260989010988828, "calib/step_q_w": 0.8425961538461539, "calib/step_q_w_n": 52.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2991.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 629.04296875, "completions/mean_terminated_length": 668.195068359375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.0096, "grad_norm": 0.7474179863929749, "learning_rate": 2.0000000000000003e-06, "loss": 0.0211, "num_tokens": 2536160.0, "reward": 0.05555380508303642, "reward_std": 0.10311586409807205, "rewards/accuracy_reward_step": 0.03125, "rewards/final_brier_reward_step": 0.02907637134194374, "rewards/format_reward_step": 0.05078125, "step": 9 }, { "aux_distill/lambda": 0.09999999999999999, "aux_distill/loss": 1.1272154109818595, "aux_distill/mean_u": 0.17926448979161314, "aux_distill/n_active_tok": 26.285714285714285, "calib/answer_extract_rate": 0.06640625, "calib/avg_num_step_conf": 0.28515625, "calib/ece": 0.7218749999999999, "calib/final_conf_rate": 0.0625, "calib/format_rate": 0.0546875, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.0541025641025642, "calib/mean_conf": 0.909375, "calib/mu_c": 0.9533333333333333, "calib/mu_w": 0.8992307692307691, "calib/nonempty_final_conf_rate": 0.0625, "calib/nonempty_reasoning_rate": 0.0703125, "calib/nonempty_step_conf_rate": 0.05859375, "calib/pce": 0.7218749999999999, "calib/std_conf": 0.15469602249249975, "calib/step_conf_rate": 0.05859375, "calib/step_q_c": 0.6654545454545455, "calib/step_q_c_n": 11.0, "calib/step_q_gap": -0.14260997067448677, "calib/step_q_w": 0.8080645161290323, "calib/step_q_w_n": 62.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3028.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 657.84765625, "completions/mean_terminated_length": 690.2008056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.010666666666666666, "grad_norm": 0.6247566938400269, "learning_rate": 2.25e-06, "loss": 0.0382, "num_tokens": 2811369.0, "reward": 0.04085097461938858, "reward_std": 0.08594364672899246, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.015295703895390034, "rewards/format_reward_step": 0.0546875, "step": 10 }, { "aux_distill/lambda": 0.09999999999999999, "aux_distill/loss": 1.1807828417846136, "aux_distill/mean_u": 0.3193847890032968, "aux_distill/n_active_tok": 25.714285714285715, "calib/answer_extract_rate": 0.08984375, "calib/avg_num_step_conf": 0.43359375, "calib/ece": 0.5734782608695652, "calib/final_conf_rate": 0.08984375, "calib/format_rate": 0.06640625, "calib/frac_conf_gt_0.9": 0.6956521739130435, "calib/gap": 0.012333333333333307, "calib/mean_conf": 0.8569565217391302, "calib/mu_c": 0.865, "calib/mu_w": 0.8526666666666667, "calib/nonempty_final_conf_rate": 0.08984375, "calib/nonempty_reasoning_rate": 0.09765625, "calib/nonempty_step_conf_rate": 0.078125, "calib/pce": 0.5413043478260869, "calib/std_conf": 0.23582297676543393, "calib/step_conf_rate": 0.078125, "calib/step_q_c": 0.8226470588235294, "calib/step_q_c_n": 34.0, "calib/step_q_gap": -0.006573720397249794, "calib/step_q_w": 0.8292207792207792, "calib/step_q_w_n": 77.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 704.5546875, "completions/mean_terminated_length": 761.0379638671875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.011733333333333333, "grad_norm": 0.8381440043449402, "learning_rate": 2.5e-06, "loss": 0.0418, "num_tokens": 3096215.0, "reward": 0.06206425651907921, "reward_std": 0.14835157990455627, "rewards/accuracy_reward_step": 0.03125, "rewards/final_brier_reward_step": 0.026472264900803566, "rewards/format_reward_step": 0.06640625, "step": 11 }, { "aux_distill/lambda": 0.10000000000000003, "aux_distill/loss": 1.1680403964387045, "aux_distill/mean_u": 0.39082487489789425, "aux_distill/n_active_tok": 38.22222222222222, "calib/answer_extract_rate": 0.109375, "calib/avg_num_step_conf": 0.9609375, "calib/ece": 0.45625000000000004, "calib/final_conf_rate": 0.09375, "calib/format_rate": 0.0703125, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": 0.07741258741258716, "calib/mean_conf": 0.8562500000000001, "calib/mu_c": 0.8981818181818181, "calib/mu_w": 0.8207692307692309, "calib/nonempty_final_conf_rate": 0.09375, "calib/nonempty_reasoning_rate": 0.16796875, "calib/nonempty_step_conf_rate": 0.1328125, "calib/pce": 0.42708333333333337, "calib/std_conf": 0.22052612279123154, "calib/step_conf_rate": 0.1328125, "calib/step_q_c": 0.8590789473684209, "calib/step_q_c_n": 76.0, "calib/step_q_gap": 0.005726006191950406, "calib/step_q_w": 0.8533529411764705, "calib/step_q_w_n": 170.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 582.1171875, "completions/mean_terminated_length": 631.4491577148438, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0128, "grad_norm": 1.0448517799377441, "learning_rate": 2.7500000000000004e-06, "loss": 0.0591, "num_tokens": 3349413.0, "reward": 0.07992050796747208, "reward_std": 0.18436968326568604, "rewards/accuracy_reward_step": 0.04296875, "rewards/final_brier_reward_step": 0.04655976593494415, "rewards/format_reward_step": 0.0703125, "step": 12 }, { "aux_distill/lambda": 0.10000000000000003, "aux_distill/loss": 1.087590326865514, "aux_distill/mean_u": 0.2842842258965096, "aux_distill/n_active_tok": 26.88888888888889, "calib/answer_extract_rate": 0.09375, "calib/avg_num_step_conf": 0.71484375, "calib/ece": 0.4649230769230769, "calib/final_conf_rate": 0.1015625, "calib/format_rate": 0.0859375, "calib/frac_conf_gt_0.9": 0.5769230769230769, "calib/gap": 0.12395151515151503, "calib/mean_conf": 0.8603076923076922, "calib/mu_c": 0.9318181818181818, "calib/mu_w": 0.8078666666666667, "calib/nonempty_final_conf_rate": 0.1015625, "calib/nonempty_reasoning_rate": 0.1484375, "calib/nonempty_step_conf_rate": 0.140625, "calib/pce": 0.45107692307692304, "calib/std_conf": 0.15624826626257063, "calib/step_conf_rate": 0.140625, "calib/step_q_c": 0.8850000000000001, "calib/step_q_c_n": 48.0, "calib/step_q_gap": 0.1614444444444445, "calib/step_q_w": 0.7235555555555556, "calib/step_q_w_n": 135.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 657.3046875, "completions/mean_terminated_length": 686.8162841796875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.013866666666666666, "grad_norm": 1.8521384000778198, "learning_rate": 3e-06, "loss": 0.0762, "num_tokens": 3622275.0, "reward": 0.0911479964852333, "reward_std": 0.21495230495929718, "rewards/accuracy_reward_step": 0.04296875, "rewards/final_brier_reward_step": 0.053389742970466614, "rewards/format_reward_step": 0.0859375, "step": 13 }, { "aux_distill/lambda": 0.10000000000000002, "aux_distill/loss": 0.9630577005445957, "aux_distill/mean_u": 0.2799823846927252, "aux_distill/n_active_tok": 30.5, "calib/answer_extract_rate": 0.140625, "calib/avg_num_step_conf": 1.26953125, "calib/ece": 0.4797850574712644, "calib/final_conf_rate": 0.11328125, "calib/format_rate": 0.08203125, "calib/frac_conf_gt_0.9": 0.3103448275862069, "calib/gap": -0.0819315359477123, "calib/mean_conf": 0.6202149425287355, "calib/mu_c": 0.5721861111111112, "calib/mu_w": 0.6541176470588235, "calib/nonempty_final_conf_rate": 0.11328125, "calib/nonempty_reasoning_rate": 0.21875, "calib/nonempty_step_conf_rate": 0.1640625, "calib/pce": 0.3431034482758621, "calib/std_conf": 0.36152277783771075, "calib/step_conf_rate": 0.1640625, "calib/step_q_c": 0.5964516129032258, "calib/step_q_c_n": 62.0, "calib/step_q_gap": 0.09865338730119794, "calib/step_q_w": 0.49779822560202786, "calib/step_q_w_n": 263.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2949.0, "completions/max_terminated_length": 2949.0, "completions/mean_length": 585.1875, "completions/mean_terminated_length": 619.0413208007812, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.014933333333333333, "grad_norm": 1.0055991411209106, "learning_rate": 3.2500000000000002e-06, "loss": 0.0355, "num_tokens": 3877483.0, "reward": 0.09005921334028244, "reward_std": 0.18428009748458862, "rewards/accuracy_reward_step": 0.05078125, "rewards/final_brier_reward_step": 0.04730593040585518, "rewards/format_reward_step": 0.08203125, "step": 14 }, { "aux_distill/lambda": 0.10000000000000003, "aux_distill/loss": 0.9747249976448391, "aux_distill/mean_u": 0.39136178132639804, "aux_distill/n_active_tok": 33.04347826086956, "calib/answer_extract_rate": 0.14453125, "calib/avg_num_step_conf": 1.41796875, "calib/ece": 0.526875, "calib/final_conf_rate": 0.125, "calib/format_rate": 0.109375, "calib/frac_conf_gt_0.9": 0.3125, "calib/gap": -0.17142857142857143, "calib/mean_conf": 0.5725, "calib/mu_c": 0.43857142857142856, "calib/mu_w": 0.61, "calib/nonempty_final_conf_rate": 0.125, "calib/nonempty_reasoning_rate": 0.24609375, "calib/nonempty_step_conf_rate": 0.2109375, "calib/pce": 0.4403125, "calib/std_conf": 0.37339657202497184, "calib/step_conf_rate": 0.2109375, "calib/step_q_c": 0.42551724137931035, "calib/step_q_c_n": 58.0, "calib/step_q_gap": 0.011806287248435299, "calib/step_q_w": 0.41371095413087505, "calib/step_q_w_n": 305.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 540.75, "completions/mean_terminated_length": 555.9517822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.016, "grad_norm": 1.5617997646331787, "learning_rate": 3.5e-06, "loss": 0.0336, "num_tokens": 4123795.0, "reward": 0.0969601571559906, "reward_std": 0.2092396765947342, "rewards/accuracy_reward_step": 0.02734375, "rewards/final_brier_reward_step": 0.0572015605866909, "rewards/format_reward_step": 0.109375, "step": 15 }, { "aux_distill/lambda": 0.10000000000000003, "aux_distill/loss": 0.9529272094368935, "aux_distill/mean_u": 0.37173205164412104, "aux_distill/n_active_tok": 56.0, "calib/answer_extract_rate": 0.16015625, "calib/avg_num_step_conf": 1.6328125, "calib/ece": 0.42747368421052634, "calib/final_conf_rate": 0.1484375, "calib/format_rate": 0.10546875, "calib/frac_conf_gt_0.9": 0.2894736842105263, "calib/gap": 0.060500000000000054, "calib/mean_conf": 0.4624210526315789, "calib/mu_c": 0.507, "calib/mu_w": 0.44649999999999995, "calib/nonempty_final_conf_rate": 0.1484375, "calib/nonempty_reasoning_rate": 0.265625, "calib/nonempty_step_conf_rate": 0.2265625, "calib/pce": 0.3133684210526316, "calib/std_conf": 0.4219147227118263, "calib/step_conf_rate": 0.2265625, "calib/step_q_c": 0.4690909090909091, "calib/step_q_c_n": 44.0, "calib/step_q_gap": 0.07918449197860966, "calib/step_q_w": 0.38990641711229945, "calib/step_q_w_n": 374.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 575.58984375, "completions/mean_terminated_length": 608.888427734375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.017066666666666667, "grad_norm": 0.7805458307266235, "learning_rate": 3.7500000000000005e-06, "loss": 0.0193, "num_tokens": 4379994.0, "reward": 0.10980231314897537, "reward_std": 0.21534091234207153, "rewards/accuracy_reward_step": 0.04296875, "rewards/final_brier_reward_step": 0.07116712629795074, "rewards/format_reward_step": 0.10546875, "step": 16 }, { "aux_distill/lambda": 0.10000000000000003, "aux_distill/loss": 0.9404510505821394, "aux_distill/mean_u": 0.3841350244727926, "aux_distill/n_active_tok": 52.69565217391305, "calib/answer_extract_rate": 0.25390625, "calib/avg_num_step_conf": 2.19921875, "calib/ece": 0.27664382713964286, "calib/final_conf_rate": 0.21875, "calib/format_rate": 0.16015625, "calib/frac_conf_gt_0.9": 0.16071428571428573, "calib/gap": 0.21649394476619044, "calib/mean_conf": 0.3804866842825, "calib/mu_c": 0.5428571428571428, "calib/mu_w": 0.3263631980909524, "calib/nonempty_final_conf_rate": 0.21875, "calib/nonempty_reasoning_rate": 0.3984375, "calib/nonempty_step_conf_rate": 0.32421875, "calib/pce": 0.20356525571107142, "calib/std_conf": 0.36673813544202716, "calib/step_conf_rate": 0.32421875, "calib/step_q_c": 0.27522388059701497, "calib/step_q_c_n": 67.0, "calib/step_q_gap": -0.06842597302122227, "calib/step_q_w": 0.34364985361823724, "calib/step_q_w_n": 496.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 498.5, "completions/mean_terminated_length": 514.5806274414062, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.018133333333333335, "grad_norm": 1.1857376098632812, "learning_rate": 4.000000000000001e-06, "loss": 0.0915, "num_tokens": 4611138.0, "reward": 0.1654895544052124, "reward_std": 0.3047766089439392, "rewards/accuracy_reward_step": 0.0546875, "rewards/final_brier_reward_step": 0.11613535135984421, "rewards/format_reward_step": 0.16015625, "step": 17 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.9438614919781685, "aux_distill/mean_u": 0.3519507888561258, "aux_distill/n_active_tok": 69.125, "calib/answer_extract_rate": 0.3359375, "calib/avg_num_step_conf": 3.0390625, "calib/ece": 0.3567121111111111, "calib/final_conf_rate": 0.328125, "calib/format_rate": 0.25, "calib/frac_conf_gt_0.9": 0.14285714285714285, "calib/gap": -0.17255727187591455, "calib/mean_conf": 0.33294084920634925, "calib/mu_c": 0.1953058823529412, "calib/mu_w": 0.36786315422885574, "calib/nonempty_final_conf_rate": 0.328125, "calib/nonempty_reasoning_rate": 0.5078125, "calib/nonempty_step_conf_rate": 0.453125, "calib/pce": 0.24363600396825394, "calib/std_conf": 0.34057311000666146, "calib/step_conf_rate": 0.453125, "calib/step_q_c": 0.19010934579439256, "calib/step_q_c_n": 107.0, "calib/step_q_gap": -0.0462484400259055, "calib/step_q_w": 0.23635778582029807, "calib/step_q_w_n": 671.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2714.0, "completions/max_terminated_length": 2714.0, "completions/mean_length": 514.56640625, "completions/mean_terminated_length": 526.916015625, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.0192, "grad_norm": 1.1566716432571411, "learning_rate": 4.25e-06, "loss": 0.0994, "num_tokens": 4853587.0, "reward": 0.24126005172729492, "reward_std": 0.35394883155822754, "rewards/accuracy_reward_step": 0.0703125, "rewards/final_brier_reward_step": 0.16220760345458984, "rewards/format_reward_step": 0.25, "step": 18 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.9123309293100911, "aux_distill/mean_u": 0.35264843125580947, "aux_distill/n_active_tok": 64.51612903225806, "calib/answer_extract_rate": 0.65625, "calib/avg_num_step_conf": 3.90234375, "calib/ece": 0.1993193862581379, "calib/final_conf_rate": 0.66015625, "calib/format_rate": 0.53515625, "calib/frac_conf_gt_0.9": 0.05325443786982249, "calib/gap": -0.06653554882625726, "calib/mean_conf": 0.21027278051141096, "calib/mu_c": 0.15239866407673747, "calib/mu_w": 0.21893421290299472, "calib/nonempty_final_conf_rate": 0.66015625, "calib/nonempty_reasoning_rate": 0.81640625, "calib/nonempty_step_conf_rate": 0.72265625, "calib/pce": 0.13970732598832475, "calib/std_conf": 0.2727893687990524, "calib/step_conf_rate": 0.72265625, "calib/step_q_c": 0.16477064220183493, "calib/step_q_c_n": 109.0, "calib/step_q_gap": -0.04536888836948741, "calib/step_q_w": 0.21013953057132234, "calib/step_q_w_n": 890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2972.0, "completions/max_terminated_length": 2972.0, "completions/mean_length": 407.1015625, "completions/mean_terminated_length": 408.69805908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.020266666666666665, "grad_norm": 1.1330541372299194, "learning_rate": 4.5e-06, "loss": 0.128, "num_tokens": 5062565.0, "reward": 0.5311158299446106, "reward_std": 0.45769965648651123, "rewards/accuracy_reward_step": 0.09765625, "rewards/final_brier_reward_step": 0.4294191896915436, "rewards/format_reward_step": 0.53515625, "step": 19 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.9665737859904766, "aux_distill/mean_u": 0.446485798432125, "aux_distill/n_active_tok": 63.875, "calib/answer_extract_rate": 0.78515625, "calib/avg_num_step_conf": 3.921875, "calib/ece": 0.1426651822136941, "calib/final_conf_rate": 0.76171875, "calib/format_rate": 0.6640625, "calib/frac_conf_gt_0.9": 0.010256410256410256, "calib/gap": 0.020352225624694464, "calib/mean_conf": 0.1951296895811777, "calib/mu_c": 0.2115158404687522, "calib/mu_w": 0.19116361484405772, "calib/nonempty_final_conf_rate": 0.76171875, "calib/nonempty_reasoning_rate": 0.90625, "calib/nonempty_step_conf_rate": 0.81640625, "calib/pce": 0.07146153846153845, "calib/std_conf": 0.19727081252457762, "calib/step_conf_rate": 0.81640625, "calib/step_q_c": 0.23115456434267534, "calib/step_q_c_n": 149.0, "calib/step_q_gap": 0.031229714981662104, "calib/step_q_w": 0.19992484936101323, "calib/step_q_w_n": 855.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2996.0, "completions/max_terminated_length": 2996.0, "completions/mean_length": 338.4453125, "completions/mean_terminated_length": 342.4585266113281, "completions/min_length": 0.0, "completions/min_terminated_length": 5.0, "epoch": 0.021333333333333333, "grad_norm": 1.3762656450271606, "learning_rate": 4.75e-06, "loss": 0.1177, "num_tokens": 5254079.0, "reward": 0.6778804063796997, "reward_std": 0.46077030897140503, "rewards/accuracy_reward_step": 0.15234375, "rewards/final_brier_reward_step": 0.5393545627593994, "rewards/format_reward_step": 0.6640625, "step": 20 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.9324986916035414, "aux_distill/mean_u": 0.32273819262569403, "aux_distill/n_active_tok": 61.75, "calib/answer_extract_rate": 0.890625, "calib/avg_num_step_conf": 3.81640625, "calib/ece": 0.1712819804076931, "calib/final_conf_rate": 0.8984375, "calib/format_rate": 0.78125, "calib/frac_conf_gt_0.9": 0.030434782608695653, "calib/gap": -0.01407527168420239, "calib/mean_conf": 0.24356141418077143, "calib/mu_c": 0.23223999999999997, "calib/mu_w": 0.24631527168420236, "calib/nonempty_final_conf_rate": 0.8984375, "calib/nonempty_reasoning_rate": 0.97265625, "calib/nonempty_step_conf_rate": 0.90234375, "calib/pce": 0.10959561033771054, "calib/std_conf": 0.2399938362844077, "calib/step_conf_rate": 0.90234375, "calib/step_q_c": 0.2613429411764706, "calib/step_q_c_n": 170.0, "calib/step_q_gap": 0.012324789999271113, "calib/step_q_w": 0.2490181511771995, "calib/step_q_w_n": 807.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 297.56640625, "completions/mean_terminated_length": 298.73333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0224, "grad_norm": 0.912643313407898, "learning_rate": 5e-06, "loss": 0.1296, "num_tokens": 5433216.0, "reward": 0.7877582907676697, "reward_std": 0.39034831523895264, "rewards/accuracy_reward_step": 0.18359375, "rewards/final_brier_reward_step": 0.6106728911399841, "rewards/format_reward_step": 0.78125, "step": 21 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.9481630343943834, "aux_distill/mean_u": 0.3599154632144565, "aux_distill/n_active_tok": 57.625, "calib/answer_extract_rate": 0.91015625, "calib/avg_num_step_conf": 3.5390625, "calib/ece": 0.19719833333333334, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.8515625, "calib/frac_conf_gt_0.9": 0.025, "calib/gap": -0.0323800114220445, "calib/mean_conf": 0.2848016666666667, "calib/mu_c": 0.2570088235294118, "calib/mu_w": 0.2893888349514563, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.921875, "calib/pce": 0.1701666666666667, "calib/std_conf": 0.24047366504579154, "calib/step_conf_rate": 0.921875, "calib/step_q_c": 0.3154104575163399, "calib/step_q_c_n": 153.0, "calib/step_q_gap": -0.01267838710517405, "calib/step_q_w": 0.32808884462151394, "calib/step_q_w_n": 753.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2532.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 292.546875, "completions/mean_terminated_length": 293.6941223144531, "completions/min_length": 0.0, "completions/min_terminated_length": 23.0, "epoch": 0.023466666666666667, "grad_norm": 1.1575355529785156, "learning_rate": 4.9722222222222224e-06, "loss": 0.0719, "num_tokens": 5609924.0, "reward": 0.8310613632202148, "reward_std": 0.34224647283554077, "rewards/accuracy_reward_step": 0.13671875, "rewards/final_brier_reward_step": 0.6738414764404297, "rewards/format_reward_step": 0.8515625, "step": 22 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.9651179648935795, "aux_distill/mean_u": 0.3030497133401629, "aux_distill/n_active_tok": 66.0, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 4.62109375, "calib/ece": 0.2240234610788449, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.028112449799196786, "calib/gap": -0.005876867276822795, "calib/mean_conf": 0.3204090032475196, "calib/mu_c": 0.31540540540540546, "calib/mu_w": 0.32128227268222825, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.96875, "calib/pce": 0.1979190434081622, "calib/std_conf": 0.24102069592003547, "calib/step_conf_rate": 0.96875, "calib/step_q_c": 0.4652601156069365, "calib/step_q_c_n": 173.0, "calib/step_q_gap": 0.13476971956733252, "calib/step_q_w": 0.33049039603960395, "calib/step_q_w_n": 1010.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2817.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 330.66796875, "completions/mean_terminated_length": 331.9647216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 25.0, "epoch": 0.024533333333333334, "grad_norm": 1.0878534317016602, "learning_rate": 4.944444444444445e-06, "loss": 0.0965, "num_tokens": 5798511.0, "reward": 0.90683913230896, "reward_std": 0.24767723679542542, "rewards/accuracy_reward_step": 0.14453125, "rewards/final_brier_reward_step": 0.7355532646179199, "rewards/format_reward_step": 0.93359375, "step": 23 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.96570317260921, "aux_distill/mean_u": 0.3328278768185367, "aux_distill/n_active_tok": 78.375, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 5.0546875, "calib/ece": 0.21902016129032253, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.036290322580645164, "calib/gap": -0.04096695273472528, "calib/mean_conf": 0.3345846774193549, "calib/mu_c": 0.2997297297297297, "calib/mu_w": 0.340696682464455, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.20220564516129028, "calib/std_conf": 0.23942838701197067, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.36037037037037034, "calib/step_q_c_n": 189.0, "calib/step_q_gap": -0.026124652254064074, "calib/step_q_w": 0.3864950226244344, "calib/step_q_w_n": 1105.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 345.12109375, "completions/mean_terminated_length": 346.4745178222656, "completions/min_length": 0.0, "completions/min_terminated_length": 61.0, "epoch": 0.0256, "grad_norm": 1.195889949798584, "learning_rate": 4.9166666666666665e-06, "loss": 0.1758, "num_tokens": 5991374.0, "reward": 0.8862032294273376, "reward_std": 0.26511549949645996, "rewards/accuracy_reward_step": 0.14453125, "rewards/final_brier_reward_step": 0.7099063396453857, "rewards/format_reward_step": 0.91796875, "step": 24 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8784007225185633, "aux_distill/mean_u": 0.3437077893096381, "aux_distill/n_active_tok": 96.875, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 6.0546875, "calib/ece": 0.22140242914979763, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.94140625, "calib/frac_conf_gt_0.9": 0.016194331983805668, "calib/gap": -0.028569543147208054, "calib/mean_conf": 0.3309862348178138, "calib/mu_c": 0.30820000000000003, "calib/mu_w": 0.3367695431472081, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.984375, "calib/pce": 0.17497975708502028, "calib/std_conf": 0.21689816182725485, "calib/step_conf_rate": 0.984375, "calib/step_q_c": 0.3235438596491228, "calib/step_q_c_n": 285.0, "calib/step_q_gap": -0.04858870952083766, "calib/step_q_w": 0.37213256916996046, "calib/step_q_w_n": 1265.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 333.0390625, "completions/mean_terminated_length": 333.0390625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.02666666666666667, "grad_norm": 0.9269714951515198, "learning_rate": 4.888888888888889e-06, "loss": 0.1322, "num_tokens": 6179856.0, "reward": 0.9290375113487244, "reward_std": 0.24123907089233398, "rewards/accuracy_reward_step": 0.1953125, "rewards/final_brier_reward_step": 0.7213562726974487, "rewards/format_reward_step": 0.94140625, "step": 25 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.9190471321344376, "aux_distill/mean_u": 0.3904257065951485, "aux_distill/n_active_tok": 92.0, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 5.9453125, "calib/ece": 0.22666126482213436, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.023715415019762844, "calib/gap": -0.01109248826291076, "calib/mean_conf": 0.3333387351778656, "calib/mu_c": 0.324, "calib/mu_w": 0.33509248826291077, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.20094861660079053, "calib/std_conf": 0.22792036210925268, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.26669642857142856, "calib/step_q_c_n": 224.0, "calib/step_q_gap": -0.1110853896103896, "calib/step_q_w": 0.37778181818181816, "calib/step_q_w_n": 1298.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 313.1171875, "completions/mean_terminated_length": 315.5826721191406, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.027733333333333332, "grad_norm": 0.9309002161026001, "learning_rate": 4.861111111111111e-06, "loss": 0.1212, "num_tokens": 6365254.0, "reward": 0.9508861303329468, "reward_std": 0.17603799700737, "rewards/accuracy_reward_step": 0.15625, "rewards/final_brier_reward_step": 0.7650535702705383, "rewards/format_reward_step": 0.98046875, "step": 26 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.880526764318347, "aux_distill/mean_u": 0.38630980740941306, "aux_distill/n_active_tok": 89.625, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.5234375, "calib/ece": 0.16436765534052672, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.011904761904761904, "calib/gap": 0.0424597766099421, "calib/mean_conf": 0.26168194105481246, "calib/mu_c": 0.29875, "calib/mu_w": 0.2562902233900579, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1495327347056061, "calib/std_conf": 0.18758624185278192, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.35888888888888887, "calib/step_q_c_n": 162.0, "calib/step_q_gap": 0.06739253610374296, "calib/step_q_w": 0.2914963527851459, "calib/step_q_w_n": 1508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2837.0, "completions/max_terminated_length": 2837.0, "completions/mean_length": 310.01953125, "completions/mean_terminated_length": 313.6956787109375, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.0288, "grad_norm": 0.6328495740890503, "learning_rate": 4.833333333333333e-06, "loss": 0.0679, "num_tokens": 6549835.0, "reward": 0.9706956148147583, "reward_std": 0.1332382708787918, "rewards/accuracy_reward_step": 0.125, "rewards/final_brier_reward_step": 0.8320162296295166, "rewards/format_reward_step": 0.984375, "step": 27 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8589735962450504, "aux_distill/mean_u": 0.3424635989604109, "aux_distill/n_active_tok": 99.125, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 6.57421875, "calib/ece": 0.11967999992745071, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.011764705882352941, "calib/gap": 0.029057711345396253, "calib/mean_conf": 0.2874447058098037, "calib/mu_c": 0.31057692307692303, "calib/mu_w": 0.2815192117315268, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.10160156855490168, "calib/std_conf": 0.196537932612208, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.34105919003115265, "calib/step_q_c_n": 321.0, "calib/step_q_gap": 0.032916616935873055, "calib/step_q_w": 0.3081425730952796, "calib/step_q_w_n": 1362.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 316.87890625, "completions/mean_terminated_length": 319.3740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.029866666666666666, "grad_norm": 43.48025894165039, "learning_rate": 4.805555555555556e-06, "loss": 0.0589, "num_tokens": 6737900.0, "reward": 0.9931570887565613, "reward_std": 0.14155761897563934, "rewards/accuracy_reward_step": 0.203125, "rewards/final_brier_reward_step": 0.7949079275131226, "rewards/format_reward_step": 0.98828125, "step": 28 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8502343278378248, "aux_distill/mean_u": 0.3553671577059875, "aux_distill/n_active_tok": 101.5, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 6.75, "calib/ece": 0.15306274509803927, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": 0.057138248847926315, "calib/mean_conf": 0.2413764705882353, "calib/mu_c": 0.29000000000000004, "calib/mu_w": 0.23286175115207372, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12270980392156865, "calib/std_conf": 0.17569065794601613, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3440454545454546, "calib/step_q_c_n": 220.0, "calib/step_q_gap": 0.05622582589823971, "calib/step_q_w": 0.2878196286472149, "calib/step_q_w_n": 1508.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 304.12109375, "completions/mean_terminated_length": 306.5157470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.030933333333333334, "grad_norm": 10.140074729919434, "learning_rate": 4.777777777777778e-06, "loss": 0.0734, "num_tokens": 6922883.0, "reward": 0.9947497844696045, "reward_std": 0.09830287843942642, "rewards/accuracy_reward_step": 0.1484375, "rewards/final_brier_reward_step": 0.8449683785438538, "rewards/format_reward_step": 0.99609375, "step": 29 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8902030065655708, "aux_distill/mean_u": 0.298180368346847, "aux_distill/n_active_tok": 103.125, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 7.24609375, "calib/ece": 0.15161501976284586, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.007905138339920948, "calib/gap": 0.04919317754499064, "calib/mean_conf": 0.2650648221343873, "calib/mu_c": 0.30764705882352944, "calib/mu_w": 0.2584538812785388, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.14114624505928855, "calib/std_conf": 0.20693506549564705, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3071291866028708, "calib/step_q_c_n": 209.0, "calib/step_q_gap": 0.021537023783915754, "calib/step_q_w": 0.28559216281895505, "calib/step_q_w_n": 1646.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 324.484375, "completions/mean_terminated_length": 328.33203125, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.032, "grad_norm": 4.368605613708496, "learning_rate": 4.75e-06, "loss": 0.0494, "num_tokens": 7112935.0, "reward": 0.9643255472183228, "reward_std": 0.15291661024093628, "rewards/accuracy_reward_step": 0.1328125, "rewards/final_brier_reward_step": 0.8192760944366455, "rewards/format_reward_step": 0.9765625, "step": 30 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8743925169110298, "aux_distill/mean_u": 0.3390971488901282, "aux_distill/n_active_tok": 103.625, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 7.328125, "calib/ece": 0.1774110671936759, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.007905138339920948, "calib/gap": -0.05309655172413791, "calib/mean_conf": 0.27140316205533593, "calib/mu_c": 0.2288, "calib/mu_w": 0.2818965517241379, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12559288537549407, "calib/std_conf": 0.19619026497013756, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.30663663663663665, "calib/step_q_c_n": 333.0, "calib/step_q_gap": 0.03359710326009746, "calib/step_q_w": 0.2730395333765392, "calib/step_q_w_n": 1543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 291.5625, "completions/mean_terminated_length": 293.8582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.03306666666666667, "grad_norm": 24.946491241455078, "learning_rate": 4.722222222222222e-06, "loss": 0.059, "num_tokens": 7293487.0, "reward": 0.9699900150299072, "reward_std": 0.15348196029663086, "rewards/accuracy_reward_step": 0.1953125, "rewards/final_brier_reward_step": 0.7641987204551697, "rewards/format_reward_step": 0.98046875, "step": 31 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8389797694981098, "aux_distill/mean_u": 0.305542944566539, "aux_distill/n_active_tok": 96.0, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 6.23046875, "calib/ece": 0.15460937500000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": -0.014638684061259205, "calib/mean_conf": 0.24156249999999999, "calib/mu_c": 0.22926829268292684, "calib/mu_w": 0.24390697674418604, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11800781249999998, "calib/std_conf": 0.17999104795447463, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24836909871244636, "calib/step_q_c_n": 233.0, "calib/step_q_gap": -0.01399507162529226, "calib/step_q_w": 0.2623641703377386, "calib/step_q_w_n": 1362.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 261.515625, "completions/mean_terminated_length": 263.5747985839844, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.034133333333333335, "grad_norm": 10.89183235168457, "learning_rate": 4.694444444444445e-06, "loss": 0.1066, "num_tokens": 7467139.0, "reward": 0.9876136779785156, "reward_std": 0.10019958019256592, "rewards/accuracy_reward_step": 0.16015625, "rewards/final_brier_reward_step": 0.8189772963523865, "rewards/format_reward_step": 0.99609375, "step": 32 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.875149991363287, "aux_distill/mean_u": 0.32872740744453877, "aux_distill/n_active_tok": 108.75, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 6.609375, "calib/ece": 0.15937007874015743, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": -0.0203460038986355, "calib/mean_conf": 0.2351968503937008, "calib/mu_c": 0.21789473684210525, "calib/mu_w": 0.23824074074074075, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12248031496062992, "calib/std_conf": 0.17148655461952939, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24656387665198237, "calib/step_q_c_n": 227.0, "calib/step_q_gap": -0.02186615747771048, "calib/step_q_w": 0.26843003412969285, "calib/step_q_w_n": 1465.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 268.30078125, "completions/mean_terminated_length": 270.41339111328125, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.0352, "grad_norm": 2.0930428504943848, "learning_rate": 4.666666666666667e-06, "loss": 0.0706, "num_tokens": 7642696.0, "reward": 0.9824995994567871, "reward_std": 0.09917615354061127, "rewards/accuracy_reward_step": 0.1484375, "rewards/final_brier_reward_step": 0.8243741989135742, "rewards/format_reward_step": 0.9921875, "step": 33 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8333712462335825, "aux_distill/mean_u": 0.3157383508748938, "aux_distill/n_active_tok": 89.625, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.6796875, "calib/ece": 0.15984313725490196, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03704761904761905, "calib/mean_conf": 0.1971764705882353, "calib/mu_c": 0.16666666666666666, "calib/mu_w": 0.2037142857142857, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09027450980392157, "calib/std_conf": 0.14941148679918573, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.22977973568281937, "calib/step_q_c_n": 227.0, "calib/step_q_gap": -0.008932570755648434, "calib/step_q_w": 0.2387123064384678, "calib/step_q_w_n": 1227.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 232.14453125, "completions/mean_terminated_length": 233.97244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.03626666666666667, "grad_norm": 4.866479396820068, "learning_rate": 4.638888888888889e-06, "loss": 0.0829, "num_tokens": 7807237.0, "reward": 0.991145133972168, "reward_std": 0.08340153098106384, "rewards/accuracy_reward_step": 0.17578125, "rewards/final_brier_reward_step": 0.8143214583396912, "rewards/format_reward_step": 0.9921875, "step": 34 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8673066068440676, "aux_distill/mean_u": 0.3331934416142454, "aux_distill/n_active_tok": 95.375, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 6.0078125, "calib/ece": 0.13542890625, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.008839642684693394, "calib/mean_conf": 0.21121171875, "calib/mu_c": 0.2036842105263158, "calib/mu_w": 0.2125238532110092, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0991015625, "calib/std_conf": 0.17462509132115264, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.21622727272727277, "calib/step_q_c_n": 220.0, "calib/step_q_gap": -0.017437446544350893, "calib/step_q_w": 0.23366471927162366, "calib/step_q_w_n": 1318.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 257.53125, "completions/mean_terminated_length": 259.5590515136719, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.037333333333333336, "grad_norm": 20.04242706298828, "learning_rate": 4.611111111111112e-06, "loss": 0.0487, "num_tokens": 7982421.0, "reward": 0.9849402904510498, "reward_std": 0.1031317412853241, "rewards/accuracy_reward_step": 0.1484375, "rewards/final_brier_reward_step": 0.8292554616928101, "rewards/format_reward_step": 0.9921875, "step": 35 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.9580904059112072, "aux_distill/mean_u": 0.3945864164823561, "aux_distill/n_active_tok": 106.25, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.58203125, "calib/ece": 0.13430830039525696, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": 0.016415094339622627, "calib/mean_conf": 0.1934387351778656, "calib/mu_c": 0.20641509433962263, "calib/mu_w": 0.19, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.059130434782608696, "calib/std_conf": 0.16373250502921927, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22345323741007192, "calib/step_q_c_n": 278.0, "calib/step_q_gap": 0.03788820542712951, "calib/step_q_w": 0.1855650319829424, "calib/step_q_w_n": 1407.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2971.0, "completions/max_terminated_length": 2971.0, "completions/mean_length": 262.90234375, "completions/mean_terminated_length": 266.019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.0384, "grad_norm": 77.7988510131836, "learning_rate": 4.583333333333333e-06, "loss": 0.1055, "num_tokens": 8152436.0, "reward": 0.9992785453796387, "reward_std": 0.11146703362464905, "rewards/accuracy_reward_step": 0.20703125, "rewards/final_brier_reward_step": 0.8032445311546326, "rewards/format_reward_step": 0.98828125, "step": 36 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8078534454107285, "aux_distill/mean_u": 0.2900667277152228, "aux_distill/n_active_tok": 104.875, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 7.70703125, "calib/ece": 0.11493769841269842, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.003968253968253968, "calib/gap": 0.014841203703703704, "calib/mean_conf": 0.20589007936507936, "calib/mu_c": 0.21861111111111112, "calib/mu_w": 0.2037699074074074, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08898531746031746, "calib/std_conf": 0.15356669412485946, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22843478260869562, "calib/step_q_c_n": 230.0, "calib/step_q_gap": 0.005715333383222304, "calib/step_q_w": 0.22271944922547332, "calib/step_q_w_n": 1743.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 262.359375, "completions/mean_terminated_length": 265.4703674316406, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.039466666666666664, "grad_norm": 12.64301872253418, "learning_rate": 4.555555555555556e-06, "loss": 0.0472, "num_tokens": 8326696.0, "reward": 0.9826458692550659, "reward_std": 0.10451576858758926, "rewards/accuracy_reward_step": 0.140625, "rewards/final_brier_reward_step": 0.8402917385101318, "rewards/format_reward_step": 0.984375, "step": 37 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8974905293434858, "aux_distill/mean_u": 0.3690486426132548, "aux_distill/n_active_tok": 103.625, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 6.98828125, "calib/ece": 0.20847222222222223, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.011904761904761904, "calib/gap": -0.09633992094861665, "calib/mean_conf": 0.2448611111111111, "calib/mu_c": 0.15693181818181817, "calib/mu_w": 0.2532717391304348, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.18301587301587302, "calib/std_conf": 0.20042620894904456, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.1854905511811024, "calib/step_q_c_n": 127.0, "calib/step_q_gap": -0.07555920814501069, "calib/step_q_w": 0.2610497593261131, "calib/step_q_w_n": 1662.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1125.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 256.73046875, "completions/mean_terminated_length": 259.77471923828125, "completions/min_length": 0.0, "completions/min_terminated_length": 49.0, "epoch": 0.04053333333333333, "grad_norm": 30.5622615814209, "learning_rate": 4.527777777777778e-06, "loss": 0.0467, "num_tokens": 8499307.0, "reward": 0.9338328838348389, "reward_std": 0.14586186408996582, "rewards/accuracy_reward_step": 0.0859375, "rewards/final_brier_reward_step": 0.8129782676696777, "rewards/format_reward_step": 0.96875, "step": 38 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8440385889261961, "aux_distill/mean_u": 0.27543343141819476, "aux_distill/n_active_tok": 90.375, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 6.4140625, "calib/ece": 0.10633858267716535, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.004351851851851857, "calib/mean_conf": 0.1812992125984252, "calib/mu_c": 0.185, "calib/mu_w": 0.18064814814814814, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06901574803149607, "calib/std_conf": 0.14902664851590786, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23062801932367155, "calib/step_q_c_n": 207.0, "calib/step_q_gap": 0.04025171270346245, "calib/step_q_w": 0.1903763066202091, "calib/step_q_w_n": 1435.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 242.8828125, "completions/mean_terminated_length": 244.79527282714844, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.0416, "grad_norm": 3.65295147895813, "learning_rate": 4.5e-06, "loss": 0.0743, "num_tokens": 8667573.0, "reward": 0.9884814023971558, "reward_std": 0.09546145796775818, "rewards/accuracy_reward_step": 0.1484375, "rewards/final_brier_reward_step": 0.8402441740036011, "rewards/format_reward_step": 0.98828125, "step": 39 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8871491681784391, "aux_distill/mean_u": 0.344563438691629, "aux_distill/n_active_tok": 92.625, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.74609375, "calib/ece": 0.12785156250000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0375840707964602, "calib/mean_conf": 0.2108203125, "calib/mu_c": 0.24400000000000005, "calib/mu_w": 0.20641592920353985, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11074218750000002, "calib/std_conf": 0.1723587253735718, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24912499999999999, "calib/step_q_c_n": 160.0, "calib/step_q_gap": 0.015387395118230285, "calib/step_q_w": 0.2337376048817697, "calib/step_q_w_n": 1311.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 232.140625, "completions/mean_terminated_length": 233.968505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.042666666666666665, "grad_norm": 2.471982955932617, "learning_rate": 4.472222222222223e-06, "loss": 0.0831, "num_tokens": 8833761.0, "reward": 0.9915173649787903, "reward_std": 0.07971757650375366, "rewards/accuracy_reward_step": 0.1171875, "rewards/final_brier_reward_step": 0.8658472299575806, "rewards/format_reward_step": 1.0, "step": 40 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8687476757913828, "aux_distill/mean_u": 0.3529711466997749, "aux_distill/n_active_tok": 80.875, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.265625, "calib/ece": 0.20875, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0024202464202464635, "calib/mean_conf": 0.197890625, "calib/mu_c": 0.19945054945054944, "calib/mu_w": 0.19703030303030297, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.025585937499999992, "calib/std_conf": 0.15192335826695438, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2507223194748359, "calib/step_q_c_n": 457.0, "calib/step_q_gap": 0.02810728019312997, "calib/step_q_w": 0.22261503928170595, "calib/step_q_w_n": 891.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 200.63671875, "completions/mean_terminated_length": 202.21653747558594, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.04373333333333333, "grad_norm": 13.761474609375, "learning_rate": 4.444444444444444e-06, "loss": 0.1044, "num_tokens": 8992372.0, "reward": 1.035874605178833, "reward_std": 0.10525625944137573, "rewards/accuracy_reward_step": 0.35546875, "rewards/final_brier_reward_step": 0.720186710357666, "rewards/format_reward_step": 0.99609375, "step": 41 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8690756633877754, "aux_distill/mean_u": 0.31748371317377855, "aux_distill/n_active_tok": 86.0, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 5.609375, "calib/ece": 0.106, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": 0.0146590909090909, "calib/mean_conf": 0.17941176470588233, "calib/mu_c": 0.1909090909090909, "calib/mu_w": 0.17625, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0348627450980392, "calib/std_conf": 0.140377806255272, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22679531772575254, "calib/step_q_c_n": 299.0, "calib/step_q_gap": 0.0047460653071069725, "calib/step_q_w": 0.22204925241864557, "calib/step_q_w_n": 1137.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 210.7716522216797, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.0448, "grad_norm": 28.75571060180664, "learning_rate": 4.416666666666667e-06, "loss": 0.0837, "num_tokens": 9150276.0, "reward": 1.007413625717163, "reward_std": 0.09442334622144699, "rewards/accuracy_reward_step": 0.21484375, "rewards/final_brier_reward_step": 0.8077961206436157, "rewards/format_reward_step": 0.9921875, "step": 42 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8431951850652695, "aux_distill/mean_u": 0.28573219911255926, "aux_distill/n_active_tok": 86.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.27734375, "calib/ece": 0.1001953125, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.01317864077669903, "calib/mean_conf": 0.1701953125, "calib/mu_c": 0.18080000000000002, "calib/mu_w": 0.167621359223301, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03753906250000001, "calib/std_conf": 0.13265159715596092, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.21855513307984792, "calib/step_q_c_n": 263.0, "calib/step_q_gap": -0.007391558096622675, "calib/step_q_w": 0.2259466911764706, "calib/step_q_w_n": 1088.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 197.3515625, "completions/mean_terminated_length": 198.905517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.04586666666666667, "grad_norm": 38.969566345214844, "learning_rate": 4.388888888888889e-06, "loss": 0.099, "num_tokens": 9306022.0, "reward": 1.012031078338623, "reward_std": 0.05379870533943176, "rewards/accuracy_reward_step": 0.1953125, "rewards/final_brier_reward_step": 0.8287496566772461, "rewards/format_reward_step": 1.0, "step": 43 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8739052787423134, "aux_distill/mean_u": 0.306426789185847, "aux_distill/n_active_tok": 73.0, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 4.73046875, "calib/ece": 0.0817578125, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.042027777777777775, "calib/mean_conf": 0.2287890625, "calib/mu_c": 0.26425, "calib/mu_w": 0.2222222222222222, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0771484375, "calib/std_conf": 0.1638731003104875, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27608465608465604, "calib/step_q_c_n": 189.0, "calib/step_q_gap": 0.007816554323403557, "calib/step_q_w": 0.2682681017612525, "calib/step_q_w_n": 1022.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 179.6640625, "completions/mean_terminated_length": 181.0787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.046933333333333334, "grad_norm": 6.218605995178223, "learning_rate": 4.361111111111112e-06, "loss": 0.0856, "num_tokens": 9458336.0, "reward": 0.9939841628074646, "reward_std": 0.10380810499191284, "rewards/accuracy_reward_step": 0.15625, "rewards/final_brier_reward_step": 0.8395308256149292, "rewards/format_reward_step": 0.9921875, "step": 44 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8563596252351999, "aux_distill/mean_u": 0.30218692557266297, "aux_distill/n_active_tok": 75.5, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 4.83203125, "calib/ece": 0.13445312499999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": 0.011745819397993296, "calib/mean_conf": 0.22406250000000003, "calib/mu_c": 0.23461538461538461, "calib/mu_w": 0.22286956521739132, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1284765625, "calib/std_conf": 0.17754153343302517, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3034090909090909, "calib/step_q_c_n": 132.0, "calib/step_q_gap": 0.03712646647470175, "calib/step_q_w": 0.26628262443438916, "calib/step_q_w_n": 1105.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 181.8203125, "completions/mean_terminated_length": 183.25196838378906, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.048, "grad_norm": 7.898387908935547, "learning_rate": 4.333333333333334e-06, "loss": 0.0952, "num_tokens": 9609930.0, "reward": 0.9790830612182617, "reward_std": 0.08909302949905396, "rewards/accuracy_reward_step": 0.1015625, "rewards/final_brier_reward_step": 0.8605098128318787, "rewards/format_reward_step": 0.99609375, "step": 45 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8764547742903233, "aux_distill/mean_u": 0.37825321606828577, "aux_distill/n_active_tok": 77.625, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 4.80078125, "calib/ece": 0.14183125, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": 0.027243571210424533, "calib/mean_conf": 0.22371562500000003, "calib/mu_c": 0.24595744680851064, "calib/mu_w": 0.2187138755980861, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0909765625, "calib/std_conf": 0.1743838162813837, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2587906976744186, "calib/step_q_c_n": 215.0, "calib/step_q_gap": -0.010274095224989732, "calib/step_q_w": 0.2690647928994083, "calib/step_q_w_n": 1014.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 171.25390625, "completions/mean_terminated_length": 172.60235595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 105.0, "epoch": 0.04906666666666667, "grad_norm": 9.550283432006836, "learning_rate": 4.305555555555556e-06, "loss": 0.0684, "num_tokens": 9758539.0, "reward": 0.9972727298736572, "reward_std": 0.09377779811620712, "rewards/accuracy_reward_step": 0.18359375, "rewards/final_brier_reward_step": 0.8187642693519592, "rewards/format_reward_step": 0.9921875, "step": 46 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8316536583006382, "aux_distill/mean_u": 0.3116091956334562, "aux_distill/n_active_tok": 92.0, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.70703125, "calib/ece": 0.14671875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.00349587702331261, "calib/mean_conf": 0.182890625, "calib/mu_c": 0.18574468085106383, "calib/mu_w": 0.18224880382775122, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0730078125, "calib/std_conf": 0.15703628334594963, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.20154440154440156, "calib/step_q_c_n": 259.0, "calib/step_q_gap": -0.033763418755099306, "calib/step_q_w": 0.23530782029950087, "calib/step_q_w_n": 1202.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 197.16796875, "completions/mean_terminated_length": 198.72047424316406, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.050133333333333335, "grad_norm": 2.7363805770874023, "learning_rate": 4.277777777777778e-06, "loss": 0.0948, "num_tokens": 9914990.0, "reward": 1.0050468444824219, "reward_std": 0.0753011554479599, "rewards/accuracy_reward_step": 0.18359375, "rewards/final_brier_reward_step": 0.8264999985694885, "rewards/format_reward_step": 1.0, "step": 47 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8357707466930151, "aux_distill/mean_u": 0.3321379008617859, "aux_distill/n_active_tok": 97.625, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.16015625, "calib/ece": 0.148203125, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0078125, "calib/gap": 0.0021084137013634985, "calib/mean_conf": 0.19453125000000002, "calib/mu_c": 0.19612903225806452, "calib/mu_w": 0.19402061855670102, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.050273437500000004, "calib/std_conf": 0.16847365973183315, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23870712401055408, "calib/step_q_c_n": 379.0, "calib/step_q_gap": -0.0011092365904475932, "calib/step_q_w": 0.23981636060100167, "calib/step_q_w_n": 1198.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 201.66796875, "completions/mean_terminated_length": 203.2559051513672, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.0512, "grad_norm": 1.6862789392471313, "learning_rate": 4.25e-06, "loss": 0.1036, "num_tokens": 10070305.0, "reward": 1.0143871307373047, "reward_std": 0.08464100956916809, "rewards/accuracy_reward_step": 0.2421875, "rewards/final_brier_reward_step": 0.7865867018699646, "rewards/format_reward_step": 1.0, "step": 48 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.843103788793087, "aux_distill/mean_u": 0.36999305802485677, "aux_distill/n_active_tok": 95.875, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.1796875, "calib/ece": 0.14652343750000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": -0.016364903801468572, "calib/mean_conf": 0.1754296875, "calib/mu_c": 0.1624528301886792, "calib/mu_w": 0.17881773399014778, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.057460937499999996, "calib/std_conf": 0.1460649568467822, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2170418006430868, "calib/step_q_c_n": 311.0, "calib/step_q_gap": -0.0011414408990060299, "calib/step_q_w": 0.21818324154209284, "calib/step_q_w_n": 1271.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 201.6953125, "completions/mean_terminated_length": 203.28346252441406, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.05226666666666667, "grad_norm": 0.8191512823104858, "learning_rate": 4.222222222222223e-06, "loss": 0.0786, "num_tokens": 10226475.0, "reward": 1.0075775384902954, "reward_std": 0.07214483618736267, "rewards/accuracy_reward_step": 0.20703125, "rewards/final_brier_reward_step": 0.8081238269805908, "rewards/format_reward_step": 1.0, "step": 49 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8194594420492649, "aux_distill/mean_u": 0.37592474525324987, "aux_distill/n_active_tok": 114.875, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 6.9921875, "calib/ece": 0.1366796875, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": -0.012047077492726793, "calib/mean_conf": 0.19410156250000002, "calib/mu_c": 0.18473684210526314, "calib/mu_w": 0.19678391959798994, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.054062500000000006, "calib/std_conf": 0.15603977557039295, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.20697740112994353, "calib/step_q_c_n": 354.0, "calib/step_q_gap": -0.058934855137465963, "calib/step_q_w": 0.2659122562674095, "calib/step_q_w_n": 1436.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 216.328125, "completions/mean_terminated_length": 218.031494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.05333333333333334, "grad_norm": 1.4355971813201904, "learning_rate": 4.194444444444445e-06, "loss": 0.0926, "num_tokens": 10387215.0, "reward": 1.0024211406707764, "reward_std": 0.0937405452132225, "rewards/accuracy_reward_step": 0.22265625, "rewards/final_brier_reward_step": 0.7899988293647766, "rewards/format_reward_step": 0.9921875, "step": 50 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8286003153771162, "aux_distill/mean_u": 0.3526831447679025, "aux_distill/n_active_tok": 97.5, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 6.375, "calib/ece": 0.16179687499999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0013156440022111249, "calib/mean_conf": 0.213359375, "calib/mu_c": 0.21238805970149258, "calib/mu_w": 0.2137037037037037, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.056718749999999984, "calib/std_conf": 0.16138452403997533, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.29283333333333333, "calib/step_q_c_n": 420.0, "calib/step_q_gap": 0.014935313531353134, "calib/step_q_w": 0.2778980198019802, "calib/step_q_w_n": 1212.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 195.0234375, "completions/mean_terminated_length": 196.55905151367188, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.0544, "grad_norm": 1.832342505455017, "learning_rate": 4.166666666666667e-06, "loss": 0.0536, "num_tokens": 10546437.0, "reward": 1.0162572860717773, "reward_std": 0.09342889487743378, "rewards/accuracy_reward_step": 0.26171875, "rewards/final_brier_reward_step": 0.7747019529342651, "rewards/format_reward_step": 0.99609375, "step": 51 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7696451377123594, "aux_distill/mean_u": 0.3815194753008485, "aux_distill/n_active_tok": 107.0, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.2890625, "calib/ece": 0.12675781249999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.007892783054918667, "calib/mean_conf": 0.2121484375, "calib/mu_c": 0.2055813953488372, "calib/mu_w": 0.21347417840375588, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08546875000000001, "calib/std_conf": 0.16614562465291885, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28106761565836297, "calib/step_q_c_n": 281.0, "calib/step_q_gap": -0.003657936392110206, "calib/step_q_w": 0.2847255520504732, "calib/step_q_w_n": 1585.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 205.12109375, "completions/mean_terminated_length": 206.73622131347656, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.055466666666666664, "grad_norm": 0.7373725175857544, "learning_rate": 4.138888888888889e-06, "loss": 0.0685, "num_tokens": 10706900.0, "reward": 0.9982255697250366, "reward_std": 0.07990739494562149, "rewards/accuracy_reward_step": 0.16796875, "rewards/final_brier_reward_step": 0.8284823894500732, "rewards/format_reward_step": 1.0, "step": 52 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7436798084527254, "aux_distill/mean_u": 0.3827627011267692, "aux_distill/n_active_tok": 108.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.9921875, "calib/ece": 0.174921875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.01013415892672867, "calib/mean_conf": 0.203828125, "calib/mu_c": 0.19705882352941173, "calib/mu_w": 0.2071929824561404, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.023359374999999998, "calib/std_conf": 0.14288665248715282, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3082001655629139, "calib/step_q_c_n": 604.0, "calib/step_q_gap": -0.006960036797962976, "calib/step_q_w": 0.3151602023608769, "calib/step_q_w_n": 1186.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 197.81640625, "completions/mean_terminated_length": 199.37400817871094, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.05653333333333333, "grad_norm": 0.8482697010040283, "learning_rate": 4.111111111111111e-06, "loss": 0.0551, "num_tokens": 10863365.0, "reward": 1.034448504447937, "reward_std": 0.09234391897916794, "rewards/accuracy_reward_step": 0.33203125, "rewards/final_brier_reward_step": 0.7368656396865845, "rewards/format_reward_step": 1.0, "step": 53 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7325126752257347, "aux_distill/mean_u": 0.29912072543487983, "aux_distill/n_active_tok": 115.5, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.32421875, "calib/ece": 0.19642734375, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.028345783132530084, "calib/mean_conf": 0.16661953125, "calib/mu_c": 0.185, "calib/mu_w": 0.15665421686746991, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.005742187499999999, "calib/std_conf": 0.14143963628445977, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.311476148409894, "calib/step_q_c_n": 566.0, "calib/step_q_gap": 0.023771335575669406, "calib/step_q_w": 0.2877048128342246, "calib/step_q_w_n": 1309.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 201.3359375, "completions/mean_terminated_length": 202.9212646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.0576, "grad_norm": 1.2834187746047974, "learning_rate": 4.083333333333334e-06, "loss": 0.0697, "num_tokens": 11021139.0, "reward": 1.041155457496643, "reward_std": 0.06632301211357117, "rewards/accuracy_reward_step": 0.3515625, "rewards/final_brier_reward_step": 0.7307484149932861, "rewards/format_reward_step": 1.0, "step": 54 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7187511073425412, "aux_distill/mean_u": 0.2917415166701098, "aux_distill/n_active_tok": 131.0, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.2890625, "calib/ece": 0.090625, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.01761941747572815, "calib/mean_conf": 0.13742187500000003, "calib/mu_c": 0.1516, "calib/mu_w": 0.13398058252427186, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.016367187499999995, "calib/std_conf": 0.12388720987851964, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.306264367816092, "calib/step_q_c_n": 348.0, "calib/step_q_gap": 0.008286295662766174, "calib/step_q_w": 0.29797807215332583, "calib/step_q_w_n": 1774.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 223.890625, "completions/mean_terminated_length": 225.65354919433594, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.058666666666666666, "grad_norm": 0.8885535001754761, "learning_rate": 4.055555555555556e-06, "loss": 0.075, "num_tokens": 11186279.0, "reward": 1.0124928951263428, "reward_std": 0.053454600274562836, "rewards/accuracy_reward_step": 0.1953125, "rewards/final_brier_reward_step": 0.8296734690666199, "rewards/format_reward_step": 1.0, "step": 55 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7426415104418993, "aux_distill/mean_u": 0.34381790715359856, "aux_distill/n_active_tok": 122.625, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.2421875, "calib/ece": 0.11554687500000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.021708273553323762, "calib/mean_conf": 0.1028125, "calib/mu_c": 0.12019607843137255, "calib/mu_w": 0.09848780487804878, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.009570312500000002, "calib/std_conf": 0.0958998297378572, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3300759493670886, "calib/step_q_c_n": 395.0, "calib/step_q_gap": 0.04942871904638885, "calib/step_q_w": 0.28064723032069977, "calib/step_q_w_n": 1715.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 229.9921875, "completions/mean_terminated_length": 231.8031463623047, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.05973333333333333, "grad_norm": 1.1358617544174194, "learning_rate": 4.027777777777779e-06, "loss": 0.0676, "num_tokens": 11351997.0, "reward": 1.014061689376831, "reward_std": 0.03824378177523613, "rewards/accuracy_reward_step": 0.19921875, "rewards/final_brier_reward_step": 0.8289046883583069, "rewards/format_reward_step": 1.0, "step": 56 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7687519304454327, "aux_distill/mean_u": 0.3781342782712104, "aux_distill/n_active_tok": 113.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.078125, "calib/ece": 0.21359375000000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.003723165155390215, "calib/mean_conf": 0.06953125, "calib/mu_c": 0.06681159420289856, "calib/mu_w": 0.07053475935828878, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.006796875000000001, "calib/std_conf": 0.06692298949865809, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.30905154639175253, "calib/step_q_c_n": 485.0, "calib/step_q_gap": 0.058286587838625226, "calib/step_q_w": 0.2507649585531273, "calib/step_q_w_n": 1327.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 199.1015625, "completions/mean_terminated_length": 200.66929626464844, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.0608, "grad_norm": 0.972671389579773, "learning_rate": 4.000000000000001e-06, "loss": 0.0829, "num_tokens": 11509759.0, "reward": 1.0133512020111084, "reward_std": 0.026614950969815254, "rewards/accuracy_reward_step": 0.26953125, "rewards/final_brier_reward_step": 0.7571711540222168, "rewards/format_reward_step": 1.0, "step": 57 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7665616478770971, "aux_distill/mean_u": 0.3432783958461975, "aux_distill/n_active_tok": 97.875, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.95703125, "calib/ece": 0.1603515625, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.01513244725346221, "calib/mean_conf": 0.046679687500000004, "calib/mu_c": 0.058679245283018856, "calib/mu_w": 0.043546798029556646, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.044224115592087804, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27230750853242325, "calib/step_q_c_n": 293.0, "calib/step_q_gap": 0.03393088515579992, "calib/step_q_w": 0.23837662337662333, "calib/step_q_w_n": 1232.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 176.640625, "completions/mean_terminated_length": 178.031494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.06186666666666667, "grad_norm": 0.8129911422729492, "learning_rate": 3.972222222222223e-06, "loss": 0.0853, "num_tokens": 11661299.0, "reward": 1.0100810527801514, "reward_std": 0.014807004481554031, "rewards/accuracy_reward_step": 0.20703125, "rewards/final_brier_reward_step": 0.8131308555603027, "rewards/format_reward_step": 1.0, "step": 58 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7977718394249678, "aux_distill/mean_u": 0.3172356907549245, "aux_distill/n_active_tok": 85.625, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.47265625, "calib/ece": 0.2777734375, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0054064327485380155, "calib/mean_conf": 0.019882812500000003, "calib/mu_c": 0.023684210526315794, "calib/mu_w": 0.018277777777777778, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.000390625, "calib/std_conf": 0.019970337806102422, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26880778588807785, "calib/step_q_c_n": 411.0, "calib/step_q_gap": 0.004716876797168701, "calib/step_q_w": 0.26409090909090915, "calib/step_q_w_n": 990.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 167.96875, "completions/mean_terminated_length": 169.2913360595703, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.06293333333333333, "grad_norm": 1.336860179901123, "learning_rate": 3.944444444444445e-06, "loss": 0.0646, "num_tokens": 11810547.0, "reward": 1.0066341161727905, "reward_std": 0.01024669036269188, "rewards/accuracy_reward_step": 0.296875, "rewards/final_brier_reward_step": 0.7163933515548706, "rewards/format_reward_step": 1.0, "step": 59 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8566927462816238, "aux_distill/mean_u": 0.34854639763211626, "aux_distill/n_active_tok": 82.125, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.1953125, "calib/ece": 0.18334375, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0006563106796116502, "calib/mean_conf": 0.015328125000000001, "calib/mu_c": 0.0148, "calib/mu_w": 0.01545631067961165, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0016796875, "calib/std_conf": 0.03026371315592941, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2923557692307692, "calib/step_q_c_n": 260.0, "calib/step_q_gap": 0.004906329978432766, "calib/step_q_w": 0.28744943925233646, "calib/step_q_w_n": 1070.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 159.59765625, "completions/mean_terminated_length": 160.8543243408203, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.064, "grad_norm": 1.2972129583358765, "learning_rate": 3.916666666666667e-06, "loss": 0.0789, "num_tokens": 11960260.0, "reward": 1.0023151636123657, "reward_std": 0.0054235332645475864, "rewards/accuracy_reward_step": 0.1953125, "rewards/final_brier_reward_step": 0.8093178868293762, "rewards/format_reward_step": 1.0, "step": 60 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8228755127638578, "aux_distill/mean_u": 0.3034650743786119, "aux_distill/n_active_tok": 81.0, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.015625, "calib/ece": 0.315859375, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0015278945892907205, "calib/mean_conf": 0.004453125, "calib/mu_c": 0.003414634146341464, "calib/mu_w": 0.004942528735632184, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.008082445653041843, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26985330073349634, "calib/step_q_c_n": 409.0, "calib/step_q_gap": -0.035374813552218, "calib/step_q_w": 0.30522811428571434, "calib/step_q_w_n": 875.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 160.98046875, "completions/mean_terminated_length": 162.24803161621094, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.06506666666666666, "grad_norm": 1.093929648399353, "learning_rate": 3.88888888888889e-06, "loss": 0.0848, "num_tokens": 12105535.0, "reward": 1.0010511875152588, "reward_std": 0.0020470181480050087, "rewards/accuracy_reward_step": 0.3203125, "rewards/final_brier_reward_step": 0.6817898154258728, "rewards/format_reward_step": 1.0, "step": 61 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8450021678581834, "aux_distill/mean_u": 0.306607903342974, "aux_distill/n_active_tok": 80.875, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.09375, "calib/ece": 0.29804375, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0013003845316694475, "calib/mean_conf": 0.0027375, "calib/mu_c": 0.0036467532467532467, "calib/mu_w": 0.002346368715083799, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.005092910206846376, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2560384020618557, "calib/step_q_c_n": 388.0, "calib/step_q_gap": -0.005121095754738181, "calib/step_q_w": 0.26115949781659387, "calib/step_q_w_n": 916.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 175.73828125, "completions/mean_terminated_length": 177.12203979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.06613333333333334, "grad_norm": 0.7330139875411987, "learning_rate": 3.861111111111112e-06, "loss": 0.0822, "num_tokens": 12257604.0, "reward": 1.0010801553726196, "reward_std": 0.002245472278445959, "rewards/accuracy_reward_step": 0.30078125, "rewards/final_brier_reward_step": 0.7013790607452393, "rewards/format_reward_step": 1.0, "step": 62 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8771042171865702, "aux_distill/mean_u": 0.3877518793078786, "aux_distill/n_active_tok": 80.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.27490150170769134, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0025972584145657392, "calib/mean_conf": 0.006348498292308652, "calib/mu_c": 0.008215277777777778, "calib/mu_w": 0.005618019363212039, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.00949263115705469, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.263, "calib/step_q_c_n": 364.0, "calib/step_q_gap": -0.01867301587301584, "calib/step_q_w": 0.28167301587301585, "calib/step_q_w_n": 945.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 173.4453125, "completions/mean_terminated_length": 174.81101989746094, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.0672, "grad_norm": 0.8441920280456543, "learning_rate": 3.833333333333334e-06, "loss": 0.0789, "num_tokens": 12410646.0, "reward": 1.0022454261779785, "reward_std": 0.004115952178835869, "rewards/accuracy_reward_step": 0.28125, "rewards/final_brier_reward_step": 0.7232406139373779, "rewards/format_reward_step": 1.0, "step": 63 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7863265555351973, "aux_distill/mean_u": 0.2677313118227581, "aux_distill/n_active_tok": 82.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.19921875, "calib/ece": 0.236480078125, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0026867901731776125, "calib/mean_conf": 0.017426171875, "calib/mu_c": 0.019430769230769234, "calib/mu_w": 0.01674397905759162, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.017271152638734306, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3150280346820809, "calib/step_q_c_n": 346.0, "calib/step_q_gap": -0.03254881810979726, "calib/step_q_w": 0.34757685279187817, "calib/step_q_w_n": 985.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 183.87890625, "completions/mean_terminated_length": 185.32676696777344, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.06826666666666667, "grad_norm": 0.8380021452903748, "learning_rate": 3.8055555555555556e-06, "loss": 0.0648, "num_tokens": 12561495.0, "reward": 1.0007264614105225, "reward_std": 0.017975719645619392, "rewards/accuracy_reward_step": 0.25390625, "rewards/final_brier_reward_step": 0.7514531016349792, "rewards/format_reward_step": 0.99609375, "step": 64 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8005730956792831, "aux_distill/mean_u": 0.2784840038174951, "aux_distill/n_active_tok": 87.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.53125, "calib/ece": 0.23990404668604787, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.003159976684758207, "calib/mean_conf": 0.02392407831395213, "calib/mu_c": 0.02625702985074627, "calib/mu_w": 0.023097053165988063, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0010546875, "calib/std_conf": 0.025948336423693166, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32175229110512127, "calib/step_q_c_n": 371.0, "calib/step_q_gap": -0.0038337376030127546, "calib/step_q_w": 0.325586028708134, "calib/step_q_w_n": 1045.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 199.19140625, "completions/mean_terminated_length": 200.7598419189453, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.06933333333333333, "grad_norm": 2.7181830406188965, "learning_rate": 3.777777777777778e-06, "loss": 0.0959, "num_tokens": 12717512.0, "reward": 0.9650967121124268, "reward_std": 0.10888336598873138, "rewards/accuracy_reward_step": 0.26171875, "rewards/final_brier_reward_step": 0.7153497338294983, "rewards/format_reward_step": 0.953125, "step": 65 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8654420170933008, "aux_distill/mean_u": 0.3045119087832663, "aux_distill/n_active_tok": 92.5, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.90234375, "calib/ece": 0.24938525315024132, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0068889948120064645, "calib/mean_conf": 0.0288178718497587, "calib/mu_c": 0.03382315714285714, "calib/mu_w": 0.026934162330850678, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0023828125, "calib/std_conf": 0.026407395186875748, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3251741293532339, "calib/step_q_c_n": 402.0, "calib/step_q_gap": 0.02805050446594809, "calib/step_q_w": 0.29712362488728583, "calib/step_q_w_n": 1109.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 218.2890625, "completions/mean_terminated_length": 220.00787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.0704, "grad_norm": 0.7378050684928894, "learning_rate": 3.7500000000000005e-06, "loss": 0.0816, "num_tokens": 12879746.0, "reward": 1.0084846019744873, "reward_std": 0.01123389508575201, "rewards/accuracy_reward_step": 0.2734375, "rewards/final_brier_reward_step": 0.7435317039489746, "rewards/format_reward_step": 1.0, "step": 66 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8232519458979368, "aux_distill/mean_u": 0.2802565965867831, "aux_distill/n_active_tok": 99.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.30859375, "calib/ece": 0.35173385302248666, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.004611640985949166, "calib/mean_conf": 0.03156026462457222, "calib/mu_c": 0.028684770833333335, "calib/mu_w": 0.0332964118192825, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.003411764705882353, "calib/std_conf": 0.046358818478338545, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3011433172302737, "calib/step_q_c_n": 621.0, "calib/step_q_gap": -0.003653161642965752, "calib/step_q_w": 0.30479647887323946, "calib/step_q_w_n": 994.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 238.3671875, "completions/mean_terminated_length": 240.2440948486328, "completions/min_length": 0.0, "completions/min_terminated_length": 34.0, "epoch": 0.07146666666666666, "grad_norm": 0.6210587024688721, "learning_rate": 3.7222222222222225e-06, "loss": 0.1009, "num_tokens": 13045776.0, "reward": 1.0013848543167114, "reward_std": 0.038151249289512634, "rewards/accuracy_reward_step": 0.375, "rewards/final_brier_reward_step": 0.6355822086334229, "rewards/format_reward_step": 0.9921875, "step": 67 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8016217239201069, "aux_distill/mean_u": 0.2958295942596198, "aux_distill/n_active_tok": 111.125, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 7.79296875, "calib/ece": 0.2818664538399066, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.007091782746271546, "calib/mean_conf": 0.032306774506550104, "calib/mu_c": 0.037220765700817005, "calib/mu_w": 0.03012898295454546, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.003543307086614173, "calib/std_conf": 0.03559904487719555, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23322369477911645, "calib/step_q_c_n": 498.0, "calib/step_q_gap": -0.06897630522088358, "calib/step_q_w": 0.3022, "calib/step_q_w_n": 1497.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 261.87890625, "completions/mean_terminated_length": 263.9409484863281, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.07253333333333334, "grad_norm": 0.5835959911346436, "learning_rate": 3.694444444444445e-06, "loss": 0.0625, "num_tokens": 13216905.0, "reward": 1.0023818016052246, "reward_std": 0.03678155690431595, "rewards/accuracy_reward_step": 0.3046875, "rewards/final_brier_reward_step": 0.7078884840011597, "rewards/format_reward_step": 0.9921875, "step": 68 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7951194914057851, "aux_distill/mean_u": 0.26222809183075535, "aux_distill/n_active_tok": 111.75, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.0390625, "calib/ece": 0.2928776784204442, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.006307577900800849, "calib/mean_conf": 0.031341071579555756, "calib/mu_c": 0.03560361445783132, "calib/mu_w": 0.02929603655703047, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.02452726551621772, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25492882562277575, "calib/step_q_c_n": 562.0, "calib/step_q_gap": 0.04194164820342089, "calib/step_q_w": 0.21298717741935486, "calib/step_q_w_n": 1240.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1160.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 267.80859375, "completions/mean_terminated_length": 269.9173278808594, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.0736, "grad_norm": 0.6954154968261719, "learning_rate": 3.6666666666666666e-06, "loss": 0.0831, "num_tokens": 13389960.0, "reward": 1.0107513666152954, "reward_std": 0.015205792151391506, "rewards/accuracy_reward_step": 0.32421875, "rewards/final_brier_reward_step": 0.6972841024398804, "rewards/format_reward_step": 1.0, "step": 69 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8393227551132441, "aux_distill/mean_u": 0.31709665386517005, "aux_distill/n_active_tok": 134.5, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 10.6953125, "calib/ece": 0.319916, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0001261223344556639, "calib/mean_conf": 0.038484000000000004, "calib/mu_c": 0.03840227272727273, "calib/mu_w": 0.03852839506172839, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0032, "calib/std_conf": 0.056075315638879825, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25926243567753, "calib/step_q_c_n": 583.0, "calib/step_q_gap": 0.08081201668311658, "calib/step_q_w": 0.1784504189944134, "calib/step_q_w_n": 2148.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 338.50390625, "completions/mean_terminated_length": 345.24700927734375, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.07466666666666667, "grad_norm": 0.8931414484977722, "learning_rate": 3.638888888888889e-06, "loss": 0.1097, "num_tokens": 13583609.0, "reward": 0.9836143255233765, "reward_std": 0.08913453668355942, "rewards/accuracy_reward_step": 0.34375, "rewards/final_brier_reward_step": 0.6508224010467529, "rewards/format_reward_step": 0.97265625, "step": 70 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8577479831874371, "aux_distill/mean_u": 0.36445067461176467, "aux_distill/n_active_tok": 126.25, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 8.2421875, "calib/ece": 0.2426550980392157, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0017006375227686665, "calib/mean_conf": 0.041501764705882356, "calib/mu_c": 0.04272222222222222, "calib/mu_w": 0.04102158469945355, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0009019607843137256, "calib/std_conf": 0.03134084547240277, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2365654648956357, "calib/step_q_c_n": 527.0, "calib/step_q_gap": 0.030497240006185283, "calib/step_q_w": 0.20606822488945042, "calib/step_q_w_n": 1583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 323.2265625, "completions/mean_terminated_length": 325.77166748046875, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.07573333333333333, "grad_norm": 0.8447468876838684, "learning_rate": 3.6111111111111115e-06, "loss": 0.0698, "num_tokens": 13770763.0, "reward": 1.0067623853683472, "reward_std": 0.027596548199653625, "rewards/accuracy_reward_step": 0.28125, "rewards/final_brier_reward_step": 0.7361809015274048, "rewards/format_reward_step": 0.99609375, "step": 71 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8418899774551392, "aux_distill/mean_u": 0.2976037554519489, "aux_distill/n_active_tok": 126.0, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 8.8046875, "calib/ece": 0.2510117647058823, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0034823055099298186, "calib/mean_conf": 0.04420392156862745, "calib/mu_c": 0.046675675675675675, "calib/mu_w": 0.043193370165745856, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0025098039215686275, "calib/std_conf": 0.038148254454884734, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3083068783068783, "calib/step_q_c_n": 567.0, "calib/step_q_gap": 0.0850304704823377, "calib/step_q_w": 0.2232764078245406, "calib/step_q_w_n": 1687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 332.85040283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.0768, "grad_norm": 0.6339616775512695, "learning_rate": 3.5833333333333335e-06, "loss": 0.0704, "num_tokens": 13959715.0, "reward": 1.0078879594802856, "reward_std": 0.030957698822021484, "rewards/accuracy_reward_step": 0.2890625, "rewards/final_brier_reward_step": 0.7306196689605713, "rewards/format_reward_step": 0.99609375, "step": 72 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8132553435862064, "aux_distill/mean_u": 0.2690545730196228, "aux_distill/n_active_tok": 137.875, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 10.86328125, "calib/ece": 0.2577836653386454, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.004715340909090905, "calib/mean_conf": 0.049706374501992036, "calib/mu_c": 0.046400000000000004, "calib/mu_w": 0.05111534090909091, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.004342629482071713, "calib/std_conf": 0.047360112733088204, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24456730769230767, "calib/step_q_c_n": 624.0, "calib/step_q_gap": 0.02669062711743514, "calib/step_q_w": 0.21787668057487253, "calib/step_q_w_n": 2157.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1120.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 355.0390625, "completions/mean_terminated_length": 363.5600280761719, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.07786666666666667, "grad_norm": 0.5692130923271179, "learning_rate": 3.555555555555556e-06, "loss": 0.0473, "num_tokens": 14157637.0, "reward": 0.9917516708374023, "reward_std": 0.06734663248062134, "rewards/accuracy_reward_step": 0.29296875, "rewards/final_brier_reward_step": 0.7100658416748047, "rewards/format_reward_step": 0.98046875, "step": 73 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8305688640102744, "aux_distill/mean_u": 0.28209760097768316, "aux_distill/n_active_tok": 128.0, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 9.8125, "calib/ece": 0.22303107569721117, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.007932596685082872, "calib/mean_conf": 0.0592796812749004, "calib/mu_c": 0.065, "calib/mu_w": 0.05706740331491713, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0017131474103585656, "calib/std_conf": 0.05748070891965741, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.29422413793103447, "calib/step_q_c_n": 580.0, "calib/step_q_gap": 0.07312682944242166, "calib/step_q_w": 0.2210973084886128, "calib/step_q_w_n": 1932.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 344.22265625, "completions/mean_terminated_length": 349.6865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.07893333333333333, "grad_norm": 0.7870764136314392, "learning_rate": 3.5277777777777784e-06, "loss": 0.0325, "num_tokens": 14349686.0, "reward": 0.9948996901512146, "reward_std": 0.08115790784358978, "rewards/accuracy_reward_step": 0.2734375, "rewards/final_brier_reward_step": 0.735893189907074, "rewards/format_reward_step": 0.98046875, "step": 74 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8252325486391783, "aux_distill/mean_u": 0.2802878247564068, "aux_distill/n_active_tok": 136.25, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 11.30078125, "calib/ece": 0.43045702811244974, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.0024954674586776804, "calib/mean_conf": 0.060948594377510046, "calib/mu_c": 0.06223140495867769, "calib/mu_w": 0.05973593750000001, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.002730923694779116, "calib/std_conf": 0.04042357964353128, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23575879396984922, "calib/step_q_c_n": 995.0, "calib/step_q_gap": -0.031047844597063334, "calib/step_q_w": 0.26680663856691256, "calib/step_q_w_n": 1898.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2618.0, "completions/max_terminated_length": 2618.0, "completions/mean_length": 366.7421875, "completions/mean_terminated_length": 377.05218505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.08, "grad_norm": 0.5298400521278381, "learning_rate": 3.5e-06, "loss": 0.0148, "num_tokens": 14548324.0, "reward": 0.9994690418243408, "reward_std": 0.10342110693454742, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.5536255836486816, "rewards/format_reward_step": 0.97265625, "step": 75 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8440918866544962, "aux_distill/mean_u": 0.3047469412064998, "aux_distill/n_active_tok": 144.5, "calib/answer_extract_rate": 0.93359375, "calib/avg_num_step_conf": 14.56640625, "calib/ece": 0.321673640167364, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.008939708939708968, "calib/mean_conf": 0.059079497907949786, "calib/mu_c": 0.06461538461538463, "calib/mu_w": 0.05567567567567566, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.03790527113297334, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24815864022662887, "calib/step_q_c_n": 706.0, "calib/step_q_gap": 0.03367302990575552, "calib/step_q_w": 0.21448561032087335, "calib/step_q_w_n": 3023.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 358.36328125, "completions/mean_terminated_length": 383.8535461425781, "completions/min_length": 0.0, "completions/min_terminated_length": 159.0, "epoch": 0.08106666666666666, "grad_norm": 1.081568717956543, "learning_rate": 3.4722222222222224e-06, "loss": -0.0567, "num_tokens": 14743121.0, "reward": 0.9542624950408936, "reward_std": 0.16976533830165863, "rewards/accuracy_reward_step": 0.35546875, "rewards/final_brier_reward_step": 0.6194624900817871, "rewards/format_reward_step": 0.93359375, "step": 76 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7728911470621824, "aux_distill/mean_u": 0.25746192158660536, "aux_distill/n_active_tok": 178.375, "calib/answer_extract_rate": 0.8828125, "calib/avg_num_step_conf": 19.1015625, "calib/ece": 0.30475770925110135, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.8828125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.0009261083743842269, "calib/mean_conf": 0.0785022026431718, "calib/mu_c": 0.07793103448275862, "calib/mu_w": 0.07885714285714285, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.0533086246916388, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2515760869565218, "calib/step_q_c_n": 736.0, "calib/step_q_gap": 0.07540889870423481, "calib/step_q_w": 0.17616718825228697, "calib/step_q_w_n": 4154.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 381.4921875, "completions/mean_terminated_length": 430.22906494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 176.0, "epoch": 0.08213333333333334, "grad_norm": 0.4085995554924011, "learning_rate": 3.444444444444445e-06, "loss": -0.135, "num_tokens": 14945447.0, "reward": 0.9053117036819458, "reward_std": 0.27409547567367554, "rewards/accuracy_reward_step": 0.33984375, "rewards/final_brier_reward_step": 0.5879671573638916, "rewards/format_reward_step": 0.8828125, "step": 77 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7340011494234204, "aux_distill/mean_u": 0.2834807404634447, "aux_distill/n_active_tok": 208.375, "calib/answer_extract_rate": 0.8203125, "calib/avg_num_step_conf": 27.3125, "calib/ece": 0.3156872037914692, "calib/final_conf_rate": 0.82421875, "calib/format_rate": 0.8203125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.02651637801204819, "calib/mean_conf": 0.0822274881516588, "calib/mu_c": 0.0983132530120482, "calib/mu_w": 0.07179687500000001, "calib/nonempty_final_conf_rate": 0.82421875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.002274881516587678, "calib/std_conf": 0.07320856010629861, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2361008230452675, "calib/step_q_c_n": 972.0, "calib/step_q_gap": 0.053976304772842265, "calib/step_q_w": 0.18212451827242523, "calib/step_q_w_n": 6020.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17578125, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 426.140625, "completions/mean_terminated_length": 517.0237426757812, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.0832, "grad_norm": 0.7913491129875183, "learning_rate": 3.416666666666667e-06, "loss": -0.2688, "num_tokens": 15162563.0, "reward": 0.8471955060958862, "reward_std": 0.3558165431022644, "rewards/accuracy_reward_step": 0.32421875, "rewards/final_brier_reward_step": 0.5498597621917725, "rewards/format_reward_step": 0.8203125, "step": 78 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8018683372065425, "aux_distill/mean_u": 0.3224408325214972, "aux_distill/n_active_tok": 233.5, "calib/answer_extract_rate": 0.79296875, "calib/avg_num_step_conf": 29.0078125, "calib/ece": 0.27322205882352935, "calib/final_conf_rate": 0.796875, "calib/format_rate": 0.7890625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.003921666840949509, "calib/mean_conf": 0.09638578431372549, "calib/mu_c": 0.09890410958904111, "calib/mu_w": 0.0949824427480916, "calib/nonempty_final_conf_rate": 0.796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0058823529411764705, "calib/std_conf": 0.08280033060860721, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2973452380952381, "calib/step_q_c_n": 840.0, "calib/step_q_gap": 0.11331534134455484, "calib/step_q_w": 0.18402989675068326, "calib/step_q_w_n": 6586.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2227.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 437.3046875, "completions/mean_terminated_length": 548.7745361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 34.0, "epoch": 0.08426666666666667, "grad_norm": 0.5817393064498901, "learning_rate": 3.3888888888888893e-06, "loss": -0.3343, "num_tokens": 15380889.0, "reward": 0.811467170715332, "reward_std": 0.3905557692050934, "rewards/accuracy_reward_step": 0.28515625, "rewards/final_brier_reward_step": 0.5487155914306641, "rewards/format_reward_step": 0.7890625, "step": 79 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.735305467620492, "aux_distill/mean_u": 0.2073347987640853, "aux_distill/n_active_tok": 170.125, "calib/answer_extract_rate": 0.86328125, "calib/avg_num_step_conf": 22.1328125, "calib/ece": 0.31369369369369365, "calib/final_conf_rate": 0.8671875, "calib/format_rate": 0.86328125, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.010581327069876675, "calib/mean_conf": 0.1300900900900901, "calib/mu_c": 0.12384615384615384, "calib/mu_w": 0.13442748091603052, "calib/nonempty_final_conf_rate": 0.8671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.016936936936936934, "calib/std_conf": 0.11121763013634592, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2721401673640168, "calib/step_q_c_n": 956.0, "calib/step_q_gap": 0.06341783190754122, "calib/step_q_w": 0.2087223354564756, "calib/step_q_w_n": 4710.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 2560.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 389.6640625, "completions/mean_terminated_length": 449.34234619140625, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.08533333333333333, "grad_norm": 2.6019041538238525, "learning_rate": 3.3611111111111117e-06, "loss": -0.2047, "num_tokens": 15582803.0, "reward": 0.8946083784103394, "reward_std": 0.3351396322250366, "rewards/accuracy_reward_step": 0.35546875, "rewards/final_brier_reward_step": 0.5704667568206787, "rewards/format_reward_step": 0.86328125, "step": 80 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7553578931838274, "aux_distill/mean_u": 0.20833043231980844, "aux_distill/n_active_tok": 177.5, "calib/answer_extract_rate": 0.91015625, "calib/avg_num_step_conf": 17.73046875, "calib/ece": 0.28060085836909876, "calib/final_conf_rate": 0.91015625, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.008583690987124463, "calib/gap": 0.012084093673965901, "calib/mean_conf": 0.15424892703862664, "calib/mu_c": 0.16135416666666666, "calib/mu_w": 0.14927007299270076, "calib/nonempty_final_conf_rate": 0.91015625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.011416309012875537, "calib/std_conf": 0.14436621236072153, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27140337922403, "calib/step_q_c_n": 799.0, "calib/step_q_gap": 0.019378860507452467, "calib/step_q_w": 0.25202451871657755, "calib/step_q_w_n": 3740.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 385.62890625, "completions/mean_terminated_length": 423.6952819824219, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.0864, "grad_norm": 0.4589557945728302, "learning_rate": 3.3333333333333333e-06, "loss": -0.0633, "num_tokens": 15787772.0, "reward": 0.9503519535064697, "reward_std": 0.24027007818222046, "rewards/accuracy_reward_step": 0.375, "rewards/final_brier_reward_step": 0.6155476570129395, "rewards/format_reward_step": 0.91015625, "step": 81 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7371373372152448, "aux_distill/mean_u": 0.21507642836182117, "aux_distill/n_active_tok": 143.75, "calib/answer_extract_rate": 0.9453125, "calib/avg_num_step_conf": 13.14453125, "calib/ece": 0.30004115226337447, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.012345679012345678, "calib/gap": 0.048083049693669166, "calib/mean_conf": 0.18020576131687246, "calib/mu_c": 0.205929203539823, "calib/mu_w": 0.15784615384615383, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.007613168724279836, "calib/std_conf": 0.16698105794269957, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3053854625550661, "calib/step_q_c_n": 908.0, "calib/step_q_gap": -0.011894920025316502, "calib/step_q_w": 0.3172803825803826, "calib/step_q_w_n": 2457.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 369.72265625, "completions/mean_terminated_length": 389.5020446777344, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.08746666666666666, "grad_norm": 2.2132375240325928, "learning_rate": 3.3055555555555558e-06, "loss": -0.0156, "num_tokens": 15987973.0, "reward": 1.007659673690796, "reward_std": 0.20080611109733582, "rewards/accuracy_reward_step": 0.44140625, "rewards/final_brier_reward_step": 0.6286004185676575, "rewards/format_reward_step": 0.9453125, "step": 82 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.794050301425159, "aux_distill/mean_u": 0.25047651542267396, "aux_distill/n_active_tok": 146.5, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 11.8515625, "calib/ece": 0.20485714285714285, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.00816326530612245, "calib/gap": 0.07423508771929829, "calib/mean_conf": 0.1979183673469388, "calib/mu_c": 0.24336842105263162, "calib/mu_w": 0.16913333333333333, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.007510204081632653, "calib/std_conf": 0.18803862355472306, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.306963216957606, "calib/step_q_c_n": 802.0, "calib/step_q_gap": 0.035586156025706306, "calib/step_q_w": 0.27137706093189967, "calib/step_q_w_n": 2232.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 400.6875, "completions/mean_terminated_length": 418.6775207519531, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.08853333333333334, "grad_norm": 0.5713662505149841, "learning_rate": 3.277777777777778e-06, "loss": -0.0261, "num_tokens": 16197813.0, "reward": 1.0116798877716064, "reward_std": 0.18111923336982727, "rewards/accuracy_reward_step": 0.37109375, "rewards/final_brier_reward_step": 0.6952347755432129, "rewards/format_reward_step": 0.95703125, "step": 83 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8229641187936068, "aux_distill/mean_u": 0.2732668140313402, "aux_distill/n_active_tok": 147.875, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 9.39453125, "calib/ece": 0.2481746031746032, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.011904761904761904, "calib/gap": 0.02588289292505408, "calib/mean_conf": 0.17785714285714288, "calib/mu_c": 0.19336633663366334, "calib/mu_w": 0.16748344370860926, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.012619047619047615, "calib/std_conf": 0.1744565582113434, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25671852825229957, "calib/step_q_c_n": 761.0, "calib/step_q_gap": 0.017268345770547727, "calib/step_q_w": 0.23945018248175184, "calib/step_q_w_n": 1644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 352.953125, "completions/mean_terminated_length": 359.98406982421875, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.0896, "grad_norm": 1.077708125114441, "learning_rate": 3.2500000000000002e-06, "loss": 0.0584, "num_tokens": 16394089.0, "reward": 1.0301148891448975, "reward_std": 0.13591772317886353, "rewards/accuracy_reward_step": 0.39453125, "rewards/final_brier_reward_step": 0.6813234090805054, "rewards/format_reward_step": 0.984375, "step": 84 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8132430296391249, "aux_distill/mean_u": 0.2754530526020663, "aux_distill/n_active_tok": 135.875, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 9.79296875, "calib/ece": 0.3206719367588933, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.015810276679841896, "calib/gap": -0.010492700729926974, "calib/mean_conf": 0.18818181818181817, "calib/mu_c": 0.18250000000000002, "calib/mu_w": 0.192992700729927, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.025177865612648224, "calib/std_conf": 0.18922312191444773, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.33029307359307364, "calib/step_q_c_n": 924.0, "calib/step_q_gap": 0.01789042040292832, "calib/step_q_w": 0.3124026531901453, "calib/step_q_w_n": 1583.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 375.21484375, "completions/mean_terminated_length": 379.6640319824219, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.09066666666666667, "grad_norm": 0.6015828847885132, "learning_rate": 3.2222222222222227e-06, "loss": 0.0769, "num_tokens": 16597968.0, "reward": 1.0357849597930908, "reward_std": 0.12696963548660278, "rewards/accuracy_reward_step": 0.453125, "rewards/final_brier_reward_step": 0.6301636695861816, "rewards/format_reward_step": 0.98828125, "step": 85 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7791156647726893, "aux_distill/mean_u": 0.22830810682716612, "aux_distill/n_active_tok": 148.125, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 11.9296875, "calib/ece": 0.2980566801619433, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.008097165991902834, "calib/gap": -0.009987755102040835, "calib/mean_conf": 0.16534412955465586, "calib/mu_c": 0.1594, "calib/mu_w": 0.16938775510204082, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.029271255060728744, "calib/std_conf": 0.17124391001302797, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2538078590785908, "calib/step_q_c_n": 738.0, "calib/step_q_gap": 0.021425907437830904, "calib/step_q_w": 0.23238195164075992, "calib/step_q_w_n": 2316.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 383.796875, "completions/mean_terminated_length": 397.7813720703125, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.09173333333333333, "grad_norm": 1.1985909938812256, "learning_rate": 3.1944444444444443e-06, "loss": 0.0005, "num_tokens": 16801732.0, "reward": 0.9997738003730774, "reward_std": 0.16255082190036774, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.6440788507461548, "rewards/format_reward_step": 0.96484375, "step": 86 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7769611058756709, "aux_distill/mean_u": 0.2535630387433568, "aux_distill/n_active_tok": 129.75, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 8.62109375, "calib/ece": 0.4245238095238096, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.007936507936507936, "calib/gap": -0.016798926586160584, "calib/mean_conf": 0.17222222222222225, "calib/mu_c": 0.16482269503546101, "calib/mu_w": 0.1816216216216216, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.018611111111111113, "calib/std_conf": 0.18810833120326595, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24014472876151483, "calib/step_q_c_n": 977.0, "calib/step_q_gap": 0.04641269624118963, "calib/step_q_w": 0.1937320325203252, "calib/step_q_w_n": 1230.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1153.0, "completions/max_terminated_length": 1153.0, "completions/mean_length": 331.5390625, "completions/mean_terminated_length": 335.4703674316406, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.0928, "grad_norm": 0.9272819757461548, "learning_rate": 3.1666666666666667e-06, "loss": 0.0383, "num_tokens": 16992102.0, "reward": 1.0431418418884277, "reward_std": 0.1451120376586914, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.5511273145675659, "rewards/format_reward_step": 0.984375, "step": 87 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8187052253633738, "aux_distill/mean_u": 0.23038298876880164, "aux_distill/n_active_tok": 117.375, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 8.796875, "calib/ece": 0.2947628458498024, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": 0.041013986013985954, "calib/mean_conf": 0.1631818181818182, "calib/mu_c": 0.18636363636363634, "calib/mu_w": 0.1453496503496504, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.011581027667984188, "calib/std_conf": 0.17293974865677617, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2335702380952381, "calib/step_q_c_n": 840.0, "calib/step_q_gap": 0.024612731013085143, "calib/step_q_w": 0.20895750708215297, "calib/step_q_w_n": 1412.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 370.21875, "completions/mean_terminated_length": 376.0952453613281, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.09386666666666667, "grad_norm": 0.47781553864479065, "learning_rate": 3.138888888888889e-06, "loss": 0.0596, "num_tokens": 17196726.0, "reward": 1.0404224395751953, "reward_std": 0.12067631632089615, "rewards/accuracy_reward_step": 0.4296875, "rewards/final_brier_reward_step": 0.6628760695457458, "rewards/format_reward_step": 0.98828125, "step": 88 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8449487295001745, "aux_distill/mean_u": 0.30785709829699454, "aux_distill/n_active_tok": 117.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.51953125, "calib/ece": 0.23152343750000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0078125, "calib/gap": 0.04241266088783821, "calib/mean_conf": 0.19082031249999998, "calib/mu_c": 0.21765957446808512, "calib/mu_w": 0.1752469135802469, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.027578125000000002, "calib/std_conf": 0.18813982223974365, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2421522693997072, "calib/step_q_c_n": 683.0, "calib/step_q_gap": 0.019251142185536474, "calib/step_q_w": 0.2229011272141707, "calib/step_q_w_n": 1242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1279.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 357.10546875, "completions/mean_terminated_length": 359.9173278808594, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.09493333333333333, "grad_norm": 0.6366252899169922, "learning_rate": 3.1111111111111116e-06, "loss": 0.0585, "num_tokens": 17397033.0, "reward": 1.0440173149108887, "reward_std": 0.10052931308746338, "rewards/accuracy_reward_step": 0.3671875, "rewards/final_brier_reward_step": 0.7208472490310669, "rewards/format_reward_step": 1.0, "step": 89 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8226105701178312, "aux_distill/mean_u": 0.2654090760485537, "aux_distill/n_active_tok": 115.5, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 7.99609375, "calib/ece": 0.32437007874015744, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.011811023622047244, "calib/gap": -0.01987114845938376, "calib/mean_conf": 0.2060236220472441, "calib/mu_c": 0.1954621848739496, "calib/mu_w": 0.21533333333333335, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.030944881889763784, "calib/std_conf": 0.1961006074975203, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2793063829787234, "calib/step_q_c_n": 799.0, "calib/step_q_gap": 0.004115036824877294, "calib/step_q_w": 0.2751913461538461, "calib/step_q_w_n": 1248.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 357.85546875, "completions/mean_terminated_length": 360.6732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.096, "grad_norm": 0.4627319872379303, "learning_rate": 3.0833333333333336e-06, "loss": 0.0536, "num_tokens": 17591964.0, "reward": 1.042912244796753, "reward_std": 0.13357305526733398, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6287933588027954, "rewards/format_reward_step": 0.9921875, "step": 90 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8454980216920376, "aux_distill/mean_u": 0.241689415357186, "aux_distill/n_active_tok": 106.75, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.84375, "calib/ece": 0.33191406250000005, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.015625, "calib/gap": -0.007789215686274442, "calib/mean_conf": 0.2439453125, "calib/mu_c": 0.24029411764705888, "calib/mu_w": 0.24808333333333332, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.022304687500000007, "calib/std_conf": 0.2144680486326048, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3091839393939394, "calib/step_q_c_n": 990.0, "calib/step_q_gap": 0.03560769267477931, "calib/step_q_w": 0.2735762467191601, "calib/step_q_w_n": 762.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 339.36328125, "completions/mean_terminated_length": 342.0354309082031, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.09706666666666666, "grad_norm": 0.7517727613449097, "learning_rate": 3.055555555555556e-06, "loss": 0.1172, "num_tokens": 17786553.0, "reward": 1.0749033689498901, "reward_std": 0.13493086397647858, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.6185566186904907, "rewards/format_reward_step": 1.0, "step": 91 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8148021893575788, "aux_distill/mean_u": 0.21645658987542504, "aux_distill/n_active_tok": 86.5, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.10546875, "calib/ece": 0.3235976562499999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.015625, "calib/gap": -0.006712854349951114, "calib/mean_conf": 0.23257421875, "calib/mu_c": 0.22911290322580646, "calib/mu_w": 0.23582575757575758, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0358984375, "calib/std_conf": 0.2086469112018607, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28883021390374336, "calib/step_q_c_n": 748.0, "calib/step_q_gap": -0.03785101309012162, "calib/step_q_w": 0.326681226993865, "calib/step_q_w_n": 815.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 300.97265625, "completions/mean_terminated_length": 303.342529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.09813333333333334, "grad_norm": 4.081338405609131, "learning_rate": 3.0277777777777776e-06, "loss": 0.0894, "num_tokens": 17970322.0, "reward": 1.0621644258499146, "reward_std": 0.13498783111572266, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6399537920951843, "rewards/format_reward_step": 1.0, "step": 92 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8725457116961479, "aux_distill/mean_u": 0.2621401066955912, "aux_distill/n_active_tok": 100.5, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.234375, "calib/ece": 0.26855468749999994, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0390625, "calib/gap": 0.04151270278543004, "calib/mean_conf": 0.2576953125, "calib/mu_c": 0.2795867768595041, "calib/mu_w": 0.23807407407407408, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.026796875, "calib/std_conf": 0.21198043315487244, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3198336434108527, "calib/step_q_c_n": 645.0, "calib/step_q_gap": -0.02491346489619256, "calib/step_q_w": 0.34474710830704525, "calib/step_q_w_n": 951.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 302.5078125, "completions/mean_terminated_length": 304.8897705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 88.0, "epoch": 0.0992, "grad_norm": 0.8950026035308838, "learning_rate": 3e-06, "loss": 0.101, "num_tokens": 18153540.0, "reward": 1.0764771699905396, "reward_std": 0.13514947891235352, "rewards/accuracy_reward_step": 0.47265625, "rewards/final_brier_reward_step": 0.6802980899810791, "rewards/format_reward_step": 1.0, "step": 93 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8754433235153556, "aux_distill/mean_u": 0.2528739918905104, "aux_distill/n_active_tok": 77.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 4.98046875, "calib/ece": 0.32996093749999994, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.01171875, "calib/gap": 0.02433778107239931, "calib/mean_conf": 0.25042968749999994, "calib/mu_c": 0.2612676056338028, "calib/mu_w": 0.2369298245614035, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0128515625, "calib/std_conf": 0.17700252397537256, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27266932952924394, "calib/step_q_c_n": 701.0, "calib/step_q_gap": -0.04155540914671424, "calib/step_q_w": 0.3142247386759582, "calib/step_q_w_n": 574.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 257.109375, "completions/mean_terminated_length": 259.13385009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.10026666666666667, "grad_norm": 2.6950621604919434, "learning_rate": 2.9722222222222225e-06, "loss": 0.0702, "num_tokens": 18328040.0, "reward": 1.0978994369506836, "reward_std": 0.11955973505973816, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6411113739013672, "rewards/format_reward_step": 1.0, "step": 94 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.9074766356498003, "aux_distill/mean_u": 0.2802716825590698, "aux_distill/n_active_tok": 85.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.55078125, "calib/ece": 0.2603125, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.01171875, "calib/gap": 0.0869743589743589, "calib/mean_conf": 0.2475, "calib/mu_c": 0.29030769230769227, "calib/mu_w": 0.20333333333333337, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.17327047137351478, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2969854466858789, "calib/step_q_c_n": 694.0, "calib/step_q_gap": 0.055014607621229644, "calib/step_q_w": 0.24197083906464925, "calib/step_q_w_n": 727.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 285.51953125, "completions/mean_terminated_length": 287.7677307128906, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.10133333333333333, "grad_norm": 1.4201782941818237, "learning_rate": 2.944444444444445e-06, "loss": 0.0709, "num_tokens": 18507261.0, "reward": 1.1017823219299316, "reward_std": 0.12060728669166565, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6957523226737976, "rewards/format_reward_step": 1.0, "step": 95 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8319747727364302, "aux_distill/mean_u": 0.25121979181888454, "aux_distill/n_active_tok": 77.125, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.3125, "calib/ece": 0.31843338671875004, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.01171875, "calib/gap": 0.03609005367372353, "calib/mean_conf": 0.26641036328125, "calib/mu_c": 0.2819178082191781, "calib/mu_w": 0.24582775454545458, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.007265625000000001, "calib/std_conf": 0.18174814973820932, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3027449602122016, "calib/step_q_c_n": 754.0, "calib/step_q_gap": 0.02137933644982537, "calib/step_q_w": 0.2813656237623762, "calib/step_q_w_n": 606.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 270.23046875, "completions/mean_terminated_length": 272.3582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.1024, "grad_norm": 6.897700309753418, "learning_rate": 2.916666666666667e-06, "loss": 0.0658, "num_tokens": 18682256.0, "reward": 1.1087777614593506, "reward_std": 0.1243782639503479, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6472431421279907, "rewards/format_reward_step": 1.0, "step": 96 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.9163010641932487, "aux_distill/mean_u": 0.2991586962353635, "aux_distill/n_active_tok": 80.875, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.1640625, "calib/ece": 0.25729411764705884, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00784313725490196, "calib/gap": 0.02928615384615388, "calib/mean_conf": 0.25650980392156864, "calib/mu_c": 0.27144, "calib/mu_w": 0.24215384615384614, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.011803921568627454, "calib/std_conf": 0.16145167097539087, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2120297674418605, "calib/step_q_c_n": 645.0, "calib/step_q_gap": -0.03514866682697257, "calib/step_q_w": 0.24717843426883307, "calib/step_q_w_n": 677.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 265.76953125, "completions/mean_terminated_length": 267.8622131347656, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.10346666666666667, "grad_norm": 4.537506103515625, "learning_rate": 2.888888888888889e-06, "loss": 0.0987, "num_tokens": 18855365.0, "reward": 1.0828802585601807, "reward_std": 0.12252596020698547, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6813855171203613, "rewards/format_reward_step": 0.99609375, "step": 97 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8761290339753032, "aux_distill/mean_u": 0.27758280371669913, "aux_distill/n_active_tok": 86.875, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.71484375, "calib/ece": 0.23363281249999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0078125, "calib/gap": 0.03753549190535499, "calib/mean_conf": 0.3366796875, "calib/mu_c": 0.35280821917808225, "calib/mu_w": 0.31527272727272726, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0, "calib/std_conf": 0.1586894290742214, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25460158730158733, "calib/step_q_c_n": 819.0, "calib/step_q_gap": 0.05324690814891295, "calib/step_q_w": 0.20135467915267438, "calib/step_q_w_n": 644.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 284.09765625, "completions/mean_terminated_length": 286.33465576171875, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.10453333333333334, "grad_norm": 3.4571356773376465, "learning_rate": 2.861111111111111e-06, "loss": 0.0864, "num_tokens": 19034278.0, "reward": 1.1319432258605957, "reward_std": 0.12014371156692505, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6935738325119019, "rewards/format_reward_step": 1.0, "step": 98 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8914099540561438, "aux_distill/mean_u": 0.31211058932284613, "aux_distill/n_active_tok": 95.875, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 6.05859375, "calib/ece": 0.0665333333333333, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.08531102003642999, "calib/mean_conf": 0.32974901960784314, "calib/mu_c": 0.3909722222222223, "calib/mu_w": 0.3056612021857923, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05696470588235291, "calib/std_conf": 0.1499662698972071, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.18594666666666668, "calib/step_q_c_n": 375.0, "calib/step_q_gap": -0.021685221088435347, "calib/step_q_w": 0.20763188775510202, "calib/step_q_w_n": 1176.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1261.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 307.6875, "completions/mean_terminated_length": 310.1102294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.1056, "grad_norm": 1.3000514507293701, "learning_rate": 2.8333333333333335e-06, "loss": 0.0563, "num_tokens": 19218846.0, "reward": 1.040698766708374, "reward_std": 0.11782108247280121, "rewards/accuracy_reward_step": 0.28125, "rewards/final_brier_reward_step": 0.8040539026260376, "rewards/format_reward_step": 0.99609375, "step": 99 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.9092141855508089, "aux_distill/mean_u": 0.3021948987905085, "aux_distill/n_active_tok": 87.75, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.5390625, "calib/ece": 0.19819607843137255, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.08460488595288546, "calib/mean_conf": 0.36525490196078425, "calib/mu_c": 0.40274647887323944, "calib/mu_w": 0.318141592920354, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0032941176470588232, "calib/std_conf": 0.1601170947918998, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.20816040609137054, "calib/step_q_c_n": 788.0, "calib/step_q_gap": -0.0857559431149787, "calib/step_q_w": 0.29391634920634924, "calib/step_q_w_n": 630.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 288.59375, "completions/mean_terminated_length": 290.86614990234375, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.10666666666666667, "grad_norm": 0.8430328369140625, "learning_rate": 2.805555555555556e-06, "loss": 0.0924, "num_tokens": 19400134.0, "reward": 1.1422317028045654, "reward_std": 0.13820312917232513, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.7297757863998413, "rewards/format_reward_step": 0.99609375, "step": 100 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8671903610229492, "aux_distill/mean_u": 0.28633546362227774, "aux_distill/n_active_tok": 101.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.55859375, "calib/ece": 0.18968749999999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": 0.019789987789987862, "calib/mean_conf": 0.406875, "calib/mu_c": 0.4166153846153846, "calib/mu_w": 0.39682539682539675, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04437499999999998, "calib/std_conf": 0.1950210325195721, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25435709606986895, "calib/step_q_c_n": 916.0, "calib/step_q_gap": -0.06151865753432506, "calib/step_q_w": 0.315875753604194, "calib/step_q_w_n": 763.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 335.12109375, "completions/mean_terminated_length": 337.75982666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.10773333333333333, "grad_norm": 1.9569110870361328, "learning_rate": 2.7777777777777783e-06, "loss": 0.0976, "num_tokens": 19592917.0, "reward": 1.1097722053527832, "reward_std": 0.13801291584968567, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.711732029914856, "rewards/format_reward_step": 1.0, "step": 101 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8509639389812946, "aux_distill/mean_u": 0.25738222654461373, "aux_distill/n_active_tok": 87.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.234375, "calib/ece": 0.09972103409260036, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0390625, "calib/gap": 0.17604035958991537, "calib/mean_conf": 0.48027896590739966, "calib/mu_c": 0.5586719385372838, "calib/mu_w": 0.38263157894736843, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.01265625, "calib/std_conf": 0.23970905810145982, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26079555555555556, "calib/step_q_c_n": 720.0, "calib/step_q_gap": -0.07308089605734769, "calib/step_q_w": 0.33387645161290325, "calib/step_q_w_n": 620.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 266.01953125, "completions/mean_terminated_length": 268.1141662597656, "completions/min_length": 0.0, "completions/min_terminated_length": 66.0, "epoch": 0.1088, "grad_norm": 1.224706768989563, "learning_rate": 2.7500000000000004e-06, "loss": 0.0801, "num_tokens": 19767714.0, "reward": 1.1658241748809814, "reward_std": 0.1503104567527771, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.7769608497619629, "rewards/format_reward_step": 1.0, "step": 102 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8516613645479083, "aux_distill/mean_u": 0.23035033735915972, "aux_distill/n_active_tok": 108.75, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.08203125, "calib/ece": 0.09985117187500002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.08203125, "calib/gap": 0.1841112862862863, "calib/mean_conf": 0.5346800781250001, "calib/mu_c": 0.6123520270270271, "calib/mu_w": 0.42824074074074076, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.028203124999999996, "calib/std_conf": 0.2673069865020486, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28984356796116506, "calib/step_q_c_n": 824.0, "calib/step_q_gap": -0.02840131607703411, "calib/step_q_w": 0.31824488403819917, "calib/step_q_w_n": 733.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 334.1796875, "completions/mean_terminated_length": 336.81103515625, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.10986666666666667, "grad_norm": 1.4121969938278198, "learning_rate": 2.7222222222222224e-06, "loss": 0.0908, "num_tokens": 19957816.0, "reward": 1.1753480434417725, "reward_std": 0.15455801784992218, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7725712060928345, "rewards/format_reward_step": 1.0, "step": 103 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8432601001113653, "aux_distill/mean_u": 0.3180295561600359, "aux_distill/n_active_tok": 93.875, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.63671875, "calib/ece": 0.30630980392156865, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.24705882352941178, "calib/gap": 0.15246913496683379, "calib/mean_conf": 0.6391725490196078, "calib/mu_c": 0.738426966292135, "calib/mu_w": 0.5859578313253012, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29823137254901966, "calib/std_conf": 0.3083625724843869, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.30692622950819665, "calib/step_q_c_n": 488.0, "calib/step_q_gap": 0.002957014848510797, "calib/step_q_w": 0.30396921465968585, "calib/step_q_w_n": 955.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 296.39453125, "completions/mean_terminated_length": 298.72833251953125, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.11093333333333333, "grad_norm": 3.2531869411468506, "learning_rate": 2.6944444444444444e-06, "loss": 0.0498, "num_tokens": 20140373.0, "reward": 1.0019816160202026, "reward_std": 0.24969851970672607, "rewards/accuracy_reward_step": 0.34765625, "rewards/final_brier_reward_step": 0.6602132320404053, "rewards/format_reward_step": 0.99609375, "step": 104 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8673386499285698, "aux_distill/mean_u": 0.26861608940092646, "aux_distill/n_active_tok": 100.25, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 6.59765625, "calib/ece": 0.23828627450980394, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.3764705882352941, "calib/gap": 0.17266130820399095, "calib/mean_conf": 0.7040117647058823, "calib/mu_c": 0.7872954545454546, "calib/mu_w": 0.6146341463414636, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21232549019607844, "calib/std_conf": 0.3037210082328581, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2936667892156863, "calib/step_q_c_n": 816.0, "calib/step_q_gap": -0.002520381460144161, "calib/step_q_w": 0.29618717067583045, "calib/step_q_w_n": 873.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1256.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 345.3359375, "completions/mean_terminated_length": 348.05511474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.112, "grad_norm": 1.3178426027297974, "learning_rate": 2.666666666666667e-06, "loss": 0.0841, "num_tokens": 20334539.0, "reward": 1.102736473083496, "reward_std": 0.305696964263916, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7015665769577026, "rewards/format_reward_step": 0.98828125, "step": 105 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8582525728270411, "aux_distill/mean_u": 0.2746546685465814, "aux_distill/n_active_tok": 93.125, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 5.69921875, "calib/ece": 0.2409288884941433, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.453125, "calib/gap": 0.1835228672186019, "calib/mean_conf": 0.7755554865058568, "calib/mu_c": 0.8587142857142857, "calib/mu_w": 0.6751914184956838, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23480468750000005, "calib/std_conf": 0.261164994761232, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3307972053462941, "calib/step_q_c_n": 823.0, "calib/step_q_gap": 0.03137896635258336, "calib/step_q_w": 0.29941823899371073, "calib/step_q_w_n": 636.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 309.47265625, "completions/mean_terminated_length": 311.9094543457031, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.11306666666666666, "grad_norm": 0.6310628652572632, "learning_rate": 2.6388888888888893e-06, "loss": 0.0944, "num_tokens": 20518348.0, "reward": 1.1347626447677612, "reward_std": 0.25099045038223267, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7226502895355225, "rewards/format_reward_step": 1.0, "step": 106 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8792069256305695, "aux_distill/mean_u": 0.34678596406839074, "aux_distill/n_active_tok": 98.75, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.50390625, "calib/ece": 0.23453125000000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.46875, "calib/gap": 0.16238051187172375, "calib/mean_conf": 0.7342187499999999, "calib/mu_c": 0.8071631205673758, "calib/mu_w": 0.6447826086956521, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.208984375, "calib/std_conf": 0.3006714085150723, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32097870762711866, "calib/step_q_c_n": 944.0, "calib/step_q_gap": 0.033084255477326685, "calib/step_q_w": 0.287894452149792, "calib/step_q_w_n": 721.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 321.4921875, "completions/mean_terminated_length": 324.02362060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.11413333333333334, "grad_norm": 0.6019020080566406, "learning_rate": 2.6111111111111113e-06, "loss": 0.0717, "num_tokens": 20705266.0, "reward": 1.1298301219940186, "reward_std": 0.23322361707687378, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7088788747787476, "rewards/format_reward_step": 1.0, "step": 107 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7998718656599522, "aux_distill/mean_u": 0.25381538008380866, "aux_distill/n_active_tok": 105.5, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 6.96484375, "calib/ece": 0.16759842519685036, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5866141732283464, "calib/gap": 0.2511341772151898, "calib/mean_conf": 0.7742913385826772, "calib/mu_c": 0.8524, "calib/mu_w": 0.6012658227848102, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1264566929133858, "calib/std_conf": 0.30715498178991935, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.26813386243386245, "calib/step_q_c_n": 1134.0, "calib/step_q_gap": -0.08027908055535171, "calib/step_q_w": 0.34841294298921416, "calib/step_q_w_n": 649.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 353.328125, "completions/mean_terminated_length": 356.1102294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.1152, "grad_norm": 0.7394620776176453, "learning_rate": 2.5833333333333337e-06, "loss": 0.0762, "num_tokens": 20898950.0, "reward": 1.2267515659332275, "reward_std": 0.24268360435962677, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7816281318664551, "rewards/format_reward_step": 0.98828125, "step": 108 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8652790896594524, "aux_distill/mean_u": 0.2592223691426132, "aux_distill/n_active_tok": 120.0, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 7.63671875, "calib/ece": 0.2507929133858267, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5433070866141733, "calib/gap": 0.3318445442635224, "calib/mean_conf": 0.6992181102362205, "calib/mu_c": 0.8782051282051282, "calib/mu_w": 0.5463605839416058, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2446905511811023, "calib/std_conf": 0.36504522142847917, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24525593008739074, "calib/step_q_c_n": 801.0, "calib/step_q_gap": -0.012741115574985612, "calib/step_q_w": 0.25799704566237636, "calib/step_q_w_n": 1154.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2066.0, "completions/max_terminated_length": 2066.0, "completions/mean_length": 368.7734375, "completions/mean_terminated_length": 373.1462707519531, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.11626666666666667, "grad_norm": 0.8723954558372498, "learning_rate": 2.5555555555555557e-06, "loss": 0.0752, "num_tokens": 21097956.0, "reward": 1.0849031209945679, "reward_std": 0.17993712425231934, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.7205872535705566, "rewards/format_reward_step": 0.9921875, "step": 109 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8240192476660013, "aux_distill/mean_u": 0.19362646090396934, "aux_distill/n_active_tok": 96.125, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 6.3671875, "calib/ece": 0.3861378906250001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.7890625, "calib/gap": 0.10433952366412202, "calib/mean_conf": 0.856762890625, "calib/mu_c": 0.9077099236641221, "calib/mu_w": 0.8033704, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.36559101562500007, "calib/std_conf": 0.27331232918943993, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.275311004784689, "calib/step_q_c_n": 836.0, "calib/step_q_gap": -0.021312073734772063, "calib/step_q_w": 0.29662307851946107, "calib/step_q_w_n": 794.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 349.12109375, "completions/mean_terminated_length": 351.8700866699219, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.11733333333333333, "grad_norm": 0.7028835415840149, "learning_rate": 2.5277777777777778e-06, "loss": 0.0932, "num_tokens": 21292251.0, "reward": 1.060120940208435, "reward_std": 0.25176236033439636, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6085233688354492, "rewards/format_reward_step": 1.0, "step": 110 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8519136030226946, "aux_distill/mean_u": 0.2509307208367431, "aux_distill/n_active_tok": 110.0, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.21484375, "calib/ece": 0.26287401574803154, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7834645669291339, "calib/gap": 0.26822805682394724, "calib/mean_conf": 0.8201968503937009, "calib/mu_c": 0.9342465753424658, "calib/mu_w": 0.6660185185185186, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2541338582677166, "calib/std_conf": 0.32242370859491043, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.21969259259259255, "calib/step_q_c_n": 945.0, "calib/step_q_gap": -0.05836239632093293, "calib/step_q_w": 0.2780549889135255, "calib/step_q_w_n": 902.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 374.2421875, "completions/mean_terminated_length": 377.18896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.1184, "grad_norm": 1.3935075998306274, "learning_rate": 2.5e-06, "loss": 0.0767, "num_tokens": 21495465.0, "reward": 1.1396938562393188, "reward_std": 0.2531803250312805, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.716887891292572, "rewards/format_reward_step": 0.9921875, "step": 111 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7935940055176616, "aux_distill/mean_u": 0.27512142615491214, "aux_distill/n_active_tok": 119.0, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 7.546875, "calib/ece": 0.3260714285714286, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6944444444444444, "calib/gap": 0.23053679435483887, "calib/mean_conf": 0.7325793650793652, "calib/mu_c": 0.8496774193548389, "calib/mu_w": 0.619140625, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2832936507936508, "calib/std_conf": 0.38686993534751096, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.21454656862745097, "calib/step_q_c_n": 816.0, "calib/step_q_gap": -0.016405492304448682, "calib/step_q_w": 0.23095206093189966, "calib/step_q_w_n": 1116.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 401.26171875, "completions/mean_terminated_length": 406.019775390625, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.11946666666666667, "grad_norm": 0.6045711636543274, "learning_rate": 2.4722222222222226e-06, "loss": 0.0266, "num_tokens": 21706108.0, "reward": 1.058129072189331, "reward_std": 0.25581157207489014, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.6475081443786621, "rewards/format_reward_step": 0.984375, "step": 112 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7709256280213594, "aux_distill/mean_u": 0.19761347736346177, "aux_distill/n_active_tok": 100.375, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.0859375, "calib/ece": 0.34094117647058825, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6549019607843137, "calib/gap": 0.16162422360248452, "calib/mean_conf": 0.6900392156862746, "calib/mu_c": 0.7629285714285714, "calib/mu_w": 0.6013043478260869, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24098039215686273, "calib/std_conf": 0.4073862146340922, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.1886348054679285, "calib/step_q_c_n": 951.0, "calib/step_q_gap": 0.002134226093652686, "calib/step_q_w": 0.1865005793742758, "calib/step_q_w_n": 863.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 344.7734375, "completions/mean_terminated_length": 348.8616638183594, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.12053333333333334, "grad_norm": 0.6675346493721008, "learning_rate": 2.4444444444444447e-06, "loss": 0.0652, "num_tokens": 21899570.0, "reward": 1.0935156345367432, "reward_std": 0.25981682538986206, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.6440625190734863, "rewards/format_reward_step": 0.99609375, "step": 113 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7940104631707072, "aux_distill/mean_u": 0.22985165603566762, "aux_distill/n_active_tok": 125.0, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 7.71875, "calib/ece": 0.18881023622047247, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7244094488188977, "calib/gap": 0.37727772357723577, "calib/mean_conf": 0.7528433070866142, "calib/mu_c": 0.8865243902439024, "calib/mu_w": 0.5092466666666666, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.147992125984252, "calib/std_conf": 0.38016471463101426, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.21010340699815838, "calib/step_q_c_n": 1086.0, "calib/step_q_gap": 0.012003069919506698, "calib/step_q_w": 0.19810033707865168, "calib/step_q_w_n": 890.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2818.0, "completions/max_terminated_length": 2818.0, "completions/mean_length": 377.83203125, "completions/mean_terminated_length": 382.312255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.1216, "grad_norm": 0.6961753368377686, "learning_rate": 2.4166666666666667e-06, "loss": 0.0901, "num_tokens": 22101319.0, "reward": 1.2033731937408447, "reward_std": 0.18372264504432678, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7778403759002686, "rewards/format_reward_step": 0.98828125, "step": 114 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7925838343799114, "aux_distill/mean_u": 0.21411440185498998, "aux_distill/n_active_tok": 129.5, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 7.734375, "calib/ece": 0.3548208661417324, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7007874015748031, "calib/gap": 0.12842446727549484, "calib/mean_conf": 0.7268326771653543, "calib/mu_c": 0.7814383561643836, "calib/mu_w": 0.6530138888888888, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25342519685039383, "calib/std_conf": 0.3961359234375426, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23115589285714283, "calib/step_q_c_n": 1120.0, "calib/step_q_gap": -0.027523642026578088, "calib/step_q_w": 0.2586795348837209, "calib/step_q_w_n": 860.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 379.71484375, "completions/mean_terminated_length": 382.7047119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.12266666666666666, "grad_norm": 0.566737711429596, "learning_rate": 2.388888888888889e-06, "loss": 0.0575, "num_tokens": 22303790.0, "reward": 1.0979235172271729, "reward_std": 0.22625698149204254, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6333469152450562, "rewards/format_reward_step": 0.9921875, "step": 115 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8143215561285615, "aux_distill/mean_u": 0.22442365785776003, "aux_distill/n_active_tok": 110.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.9296875, "calib/ece": 0.3188671874999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.6796875, "calib/gap": 0.2612024427480917, "calib/mean_conf": 0.7157421875000001, "calib/mu_c": 0.8432824427480916, "calib/mu_w": 0.5820799999999999, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26144531249999997, "calib/std_conf": 0.4012434724175769, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.19985699693564862, "calib/step_q_c_n": 979.0, "calib/step_q_gap": -0.012093526375483638, "calib/step_q_w": 0.21195052331113226, "calib/step_q_w_n": 1051.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 416.99609375, "completions/mean_terminated_length": 420.279541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.12373333333333333, "grad_norm": 0.4892609119415283, "learning_rate": 2.361111111111111e-06, "loss": 0.0871, "num_tokens": 22515061.0, "reward": 1.0948817729949951, "reward_std": 0.2809078097343445, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6780449151992798, "rewards/format_reward_step": 1.0, "step": 116 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.814437004737556, "aux_distill/mean_u": 0.22517764619918906, "aux_distill/n_active_tok": 124.875, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 8.37109375, "calib/ece": 0.3279015450952209, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7193675889328063, "calib/gap": 0.25905888442679836, "calib/mean_conf": 0.7500431189363996, "calib/mu_c": 0.8841803278688525, "calib/mu_w": 0.6251214434420541, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.29786561264822126, "calib/std_conf": 0.38683644816285084, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.21326670366259712, "calib/step_q_c_n": 901.0, "calib/step_q_gap": -0.03654110632129984, "calib/step_q_w": 0.24980780998389696, "calib/step_q_w_n": 1242.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 394.45703125, "completions/mean_terminated_length": 400.7182922363281, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.1248, "grad_norm": 0.7132987380027771, "learning_rate": 2.3333333333333336e-06, "loss": 0.0144, "num_tokens": 22722642.0, "reward": 1.0538398027420044, "reward_std": 0.23345988988876343, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6467421650886536, "rewards/format_reward_step": 0.984375, "step": 117 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8741880189627409, "aux_distill/mean_u": 0.30376733348721374, "aux_distill/n_active_tok": 120.125, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.91796875, "calib/ece": 0.2701960784313725, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6862745098039216, "calib/gap": 0.2668129032258063, "calib/mean_conf": 0.7229803921568628, "calib/mu_c": 0.8276129032258063, "calib/mu_w": 0.5608, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1926666666666666, "calib/std_conf": 0.3975500210330464, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25283794037940377, "calib/step_q_c_n": 1107.0, "calib/step_q_gap": -0.0044661835336397004, "calib/step_q_w": 0.25730412391304347, "calib/step_q_w_n": 920.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1858.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 412.984375, "completions/mean_terminated_length": 416.2362060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.12586666666666665, "grad_norm": 0.6454364061355591, "learning_rate": 2.305555555555556e-06, "loss": 0.0796, "num_tokens": 22932374.0, "reward": 1.1581437587738037, "reward_std": 0.17849767208099365, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7147250175476074, "rewards/format_reward_step": 0.99609375, "step": 118 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7642288217321038, "aux_distill/mean_u": 0.1817720144921363, "aux_distill/n_active_tok": 118.5, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 8.58984375, "calib/ece": 0.23912350597609555, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5338645418326693, "calib/gap": 0.3801702544031311, "calib/mean_conf": 0.5819920318725099, "calib/mu_c": 0.741027397260274, "calib/mu_w": 0.3608571428571429, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11972111553784856, "calib/std_conf": 0.43989315971992726, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2825702912621359, "calib/step_q_c_n": 1030.0, "calib/step_q_gap": 0.032339324623983645, "calib/step_q_w": 0.25023096663815225, "calib/step_q_w_n": 1169.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1918.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 441.87890625, "completions/mean_terminated_length": 450.6812744140625, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.12693333333333334, "grad_norm": 0.6065315008163452, "learning_rate": 2.277777777777778e-06, "loss": 0.018, "num_tokens": 23150559.0, "reward": 1.142172932624817, "reward_std": 0.2332339584827423, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.7335648536682129, "rewards/format_reward_step": 0.98046875, "step": 119 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8176003079861403, "aux_distill/mean_u": 0.22354107810224982, "aux_distill/n_active_tok": 123.875, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 8.140625, "calib/ece": 0.19992094861660073, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6047430830039525, "calib/gap": 0.423583066425289, "calib/mean_conf": 0.6472727272727272, "calib/mu_c": 0.7912574850299401, "calib/mu_w": 0.3676744186046511, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09355731225296439, "calib/std_conf": 0.4286647076080423, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2459582322357019, "calib/step_q_c_n": 1154.0, "calib/step_q_gap": -0.01889520862451316, "calib/step_q_w": 0.26485344086021506, "calib/step_q_w_n": 930.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 411.6640625, "completions/mean_terminated_length": 418.19842529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.128, "grad_norm": 0.7400944828987122, "learning_rate": 2.25e-06, "loss": 0.0619, "num_tokens": 23362633.0, "reward": 1.2066268920898438, "reward_std": 0.21622836589813232, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.772628903388977, "rewards/format_reward_step": 0.98828125, "step": 120 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7588197402656078, "aux_distill/mean_u": 0.23710005069366824, "aux_distill/n_active_tok": 141.125, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 9.17578125, "calib/ece": 0.2766963855421687, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5542168674698795, "calib/gap": 0.2997629651390521, "calib/mean_conf": 0.6042859437751005, "calib/mu_c": 0.7379152173913044, "calib/mu_w": 0.4381522522522523, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1633827309236948, "calib/std_conf": 0.43625114971277235, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25048216007714563, "calib/step_q_c_n": 1037.0, "calib/step_q_gap": 0.0437382576381212, "calib/step_q_w": 0.20674390243902444, "calib/step_q_w_n": 1312.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 462.2578125, "completions/mean_terminated_length": 475.25299072265625, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.12906666666666666, "grad_norm": 0.8531492352485657, "learning_rate": 2.222222222222222e-06, "loss": 0.0536, "num_tokens": 23586027.0, "reward": 1.100294828414917, "reward_std": 0.25339704751968384, "rewards/accuracy_reward_step": 0.5390625, "rewards/final_brier_reward_step": 0.6888708472251892, "rewards/format_reward_step": 0.97265625, "step": 121 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8276457553729415, "aux_distill/mean_u": 0.2641006238337189, "aux_distill/n_active_tok": 126.375, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 9.01171875, "calib/ece": 0.21577510040160652, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7469879518072289, "calib/gap": 0.35342051649928263, "calib/mean_conf": 0.765269076305221, "calib/mu_c": 0.8859146341463416, "calib/mu_w": 0.532494117647059, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16120481927710853, "calib/std_conf": 0.3897101743185824, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23407108239095317, "calib/step_q_c_n": 1238.0, "calib/step_q_gap": 0.05060990371929741, "calib/step_q_w": 0.18346117867165576, "calib/step_q_w_n": 1069.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2604.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 418.12890625, "completions/mean_terminated_length": 429.8835144042969, "completions/min_length": 0.0, "completions/min_terminated_length": 182.0, "epoch": 0.13013333333333332, "grad_norm": 0.5496814250946045, "learning_rate": 2.1944444444444445e-06, "loss": 0.0207, "num_tokens": 23800412.0, "reward": 1.181523084640503, "reward_std": 0.2424289733171463, "rewards/accuracy_reward_step": 0.640625, "rewards/final_brier_reward_step": 0.7497649192810059, "rewards/format_reward_step": 0.97265625, "step": 122 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7614726750180125, "aux_distill/mean_u": 0.19446524486235212, "aux_distill/n_active_tok": 157.375, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 9.91015625, "calib/ece": 0.21352225086617072, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5991902834008097, "calib/gap": 0.41288853684169874, "calib/mean_conf": 0.6422671901374257, "calib/mu_c": 0.7977272465191181, "calib/mu_w": 0.38483870967741934, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.11615382981353915, "calib/std_conf": 0.4308034461532648, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.2534844621513944, "calib/step_q_c_n": 1255.0, "calib/step_q_gap": -0.07045063925266176, "calib/step_q_w": 0.3239351014040562, "calib/step_q_w_n": 1282.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2566.0, "completions/max_terminated_length": 2566.0, "completions/mean_length": 494.14453125, "completions/mean_terminated_length": 510.08465576171875, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.1312, "grad_norm": 1.6037333011627197, "learning_rate": 2.166666666666667e-06, "loss": 0.0292, "num_tokens": 24032201.0, "reward": 1.1502926349639893, "reward_std": 0.24385644495487213, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.741991400718689, "rewards/format_reward_step": 0.95703125, "step": 123 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.824720211327076, "aux_distill/mean_u": 0.24733909527073586, "aux_distill/n_active_tok": 115.125, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 7.8828125, "calib/ece": 0.24675849802371536, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7193675889328063, "calib/gap": 0.29872926829268287, "calib/mean_conf": 0.7500786561264823, "calib/mu_c": 0.8469000000000001, "calib/mu_w": 0.5481707317073172, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1604739130434782, "calib/std_conf": 0.3967623338775554, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24739656301145663, "calib/step_q_c_n": 1222.0, "calib/step_q_gap": -0.024907205832764512, "calib/step_q_w": 0.27230376884422114, "calib/step_q_w_n": 796.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 439.67578125, "completions/mean_terminated_length": 444.88934326171875, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.13226666666666667, "grad_norm": 0.6115875840187073, "learning_rate": 2.138888888888889e-06, "loss": 0.0567, "num_tokens": 24251574.0, "reward": 1.1981837749481201, "reward_std": 0.2457866370677948, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7401175498962402, "rewards/format_reward_step": 0.98828125, "step": 124 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7017071144655347, "aux_distill/mean_u": 0.15763443178974784, "aux_distill/n_active_tok": 116.5, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 8.8984375, "calib/ece": 0.3583236274653614, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7991967871485943, "calib/gap": 0.12134742182954572, "calib/mean_conf": 0.8181421013609436, "calib/mu_c": 0.8663886666666666, "calib/mu_w": 0.7450412448371209, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28702804513604413, "calib/std_conf": 0.3617559594279362, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23328895814781833, "calib/step_q_c_n": 1123.0, "calib/step_q_gap": 0.03644211832097849, "calib/step_q_w": 0.19684683982683984, "calib/step_q_w_n": 1155.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 444.05859375, "completions/mean_terminated_length": 456.5421447753906, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.13333333333333333, "grad_norm": 0.5504864454269409, "learning_rate": 2.1111111111111114e-06, "loss": 0.0249, "num_tokens": 24470061.0, "reward": 1.0911345481872559, "reward_std": 0.22019320726394653, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6236752271652222, "rewards/format_reward_step": 0.97265625, "step": 125 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7042123721912503, "aux_distill/mean_u": 0.19295040940696292, "aux_distill/n_active_tok": 128.5, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 8.70703125, "calib/ece": 0.3480152000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.788, "calib/gap": 0.18341957186544344, "calib/mean_conf": 0.8060624000000001, "calib/mu_c": 0.8860333333333333, "calib/mu_w": 0.7026137614678899, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2950388000000001, "calib/std_conf": 0.37368177331285507, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2567842105263158, "calib/step_q_c_n": 1045.0, "calib/step_q_gap": 0.005554987553342827, "calib/step_q_w": 0.25122922297297295, "calib/step_q_w_n": 1184.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 438.39453125, "completions/mean_terminated_length": 448.916015625, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.1344, "grad_norm": 0.5972703695297241, "learning_rate": 2.0833333333333334e-06, "loss": 0.0183, "num_tokens": 24687754.0, "reward": 1.0752307176589966, "reward_std": 0.20789799094200134, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.6270239353179932, "rewards/format_reward_step": 0.97265625, "step": 126 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.6871523316949606, "aux_distill/mean_u": 0.17274493110363137, "aux_distill/n_active_tok": 130.25, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 10.15625, "calib/ece": 0.3320694891576859, "calib/final_conf_rate": 0.9375, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.8166666666666667, "calib/gap": 0.21511819003880417, "calib/mean_conf": 0.8359305108423143, "calib/mu_c": 0.9309410447761194, "calib/mu_w": 0.7158228547373152, "calib/nonempty_final_conf_rate": 0.9375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.30483333333333346, "calib/std_conf": 0.3469884047970489, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25272319277108435, "calib/step_q_c_n": 996.0, "calib/step_q_gap": 0.0026969132996860767, "calib/step_q_w": 0.25002627947139827, "calib/step_q_w_n": 1604.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 419.29296875, "completions/mean_terminated_length": 447.245849609375, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.13546666666666668, "grad_norm": 0.6025396585464478, "learning_rate": 2.0555555555555555e-06, "loss": -0.0473, "num_tokens": 24898765.0, "reward": 1.0407984256744385, "reward_std": 0.25408264994621277, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6206594109535217, "rewards/format_reward_step": 0.9375, "step": 127 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8674097079783678, "aux_distill/mean_u": 0.26414057126039797, "aux_distill/n_active_tok": 113.75, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 8.3203125, "calib/ece": 0.26362549800796825, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8326693227091634, "calib/gap": 0.3066482431898936, "calib/mean_conf": 0.8496812749003984, "calib/mu_c": 0.974295302013423, "calib/mu_w": 0.6676470588235294, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2598406374501993, "calib/std_conf": 0.3359945367014103, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24466067864271454, "calib/step_q_c_n": 1002.0, "calib/step_q_gap": -0.0230578497260798, "calib/step_q_w": 0.26771852836879434, "calib/step_q_w_n": 1128.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 423.8828125, "completions/mean_terminated_length": 434.0560302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.13653333333333334, "grad_norm": 0.6722913980484009, "learning_rate": 2.027777777777778e-06, "loss": 0.0224, "num_tokens": 25113943.0, "reward": 1.1382665634155273, "reward_std": 0.22327424585819244, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7140331268310547, "rewards/format_reward_step": 0.98046875, "step": 128 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8107666317373514, "aux_distill/mean_u": 0.30531200243128254, "aux_distill/n_active_tok": 140.625, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 8.48828125, "calib/ece": 0.2943433070866142, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8818897637795275, "calib/gap": 0.16955083199141174, "calib/mean_conf": 0.897703937007874, "calib/mu_c": 0.9591160493827161, "calib/mu_w": 0.7895652173913044, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.27712598425196855, "calib/std_conf": 0.2804184666550919, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2683291239147592, "calib/step_q_c_n": 1267.0, "calib/step_q_gap": 0.026084090802176413, "calib/step_q_w": 0.2422450331125828, "calib/step_q_w_n": 906.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1864.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 430.71875, "completions/mean_terminated_length": 435.82611083984375, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.1376, "grad_norm": 0.5686946511268616, "learning_rate": 2.0000000000000003e-06, "loss": 0.0528, "num_tokens": 25326591.0, "reward": 1.160329818725586, "reward_std": 0.20995208621025085, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6956596374511719, "rewards/format_reward_step": 0.9921875, "step": 129 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8531761262565851, "aux_distill/mean_u": 0.21606510653578878, "aux_distill/n_active_tok": 113.375, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 8.3671875, "calib/ece": 0.25872795275590554, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8937007874015748, "calib/gap": 0.20760023250898318, "calib/mean_conf": 0.907074409448819, "calib/mu_c": 0.9749122807017543, "calib/mu_w": 0.7673120481927711, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24628700787401575, "calib/std_conf": 0.26971461735983954, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22322997746055598, "calib/step_q_c_n": 1331.0, "calib/step_q_gap": 0.031155748114070153, "calib/step_q_w": 0.19207422934648583, "calib/step_q_w_n": 811.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1929.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 432.1796875, "completions/mean_terminated_length": 437.3043518066406, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.13866666666666666, "grad_norm": 0.35849037766456604, "learning_rate": 1.9722222222222224e-06, "loss": 0.0914, "num_tokens": 25542517.0, "reward": 1.1991316080093384, "reward_std": 0.1761476993560791, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7381070256233215, "rewards/format_reward_step": 0.9921875, "step": 130 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7969897780567408, "aux_distill/mean_u": 0.21326160858040874, "aux_distill/n_active_tok": 132.375, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 8.53125, "calib/ece": 0.3938095238095238, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.1807711693548386, "calib/mean_conf": 0.8728571428571428, "calib/mu_c": 0.9646774193548386, "calib/mu_w": 0.78390625, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.38730158730158726, "calib/std_conf": 0.30949358826821977, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23247561761546723, "calib/step_q_c_n": 931.0, "calib/step_q_gap": 0.012266838684900583, "calib/step_q_w": 0.22020877893056665, "calib/step_q_w_n": 1253.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 432.69140625, "completions/mean_terminated_length": 437.8221435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.13973333333333332, "grad_norm": 0.6910375952720642, "learning_rate": 1.944444444444445e-06, "loss": 0.0493, "num_tokens": 25759494.0, "reward": 1.029508113861084, "reward_std": 0.18506643176078796, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.5902664065361023, "rewards/format_reward_step": 0.984375, "step": 131 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8498719027265906, "aux_distill/mean_u": 0.290067270262824, "aux_distill/n_active_tok": 120.125, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 8.43359375, "calib/ece": 0.2372379211918686, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8849206349206349, "calib/gap": 0.2252981316928686, "calib/mean_conf": 0.8945874756335284, "calib/mu_c": 0.9634285714285715, "calib/mu_w": 0.7381304397357029, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2186904761904762, "calib/std_conf": 0.2894467810542942, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2468620178041543, "calib/step_q_c_n": 1348.0, "calib/step_q_gap": 0.04331084641081273, "calib/step_q_w": 0.20355117139334156, "calib/step_q_w_n": 811.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 452.33984375, "completions/mean_terminated_length": 461.3506164550781, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.1408, "grad_norm": 0.5889887809753418, "learning_rate": 1.916666666666667e-06, "loss": 0.0537, "num_tokens": 25980885.0, "reward": 1.207842469215393, "reward_std": 0.29401665925979614, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7477160692214966, "rewards/format_reward_step": 0.984375, "step": 132 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.799846863374114, "aux_distill/mean_u": 0.26558087362637905, "aux_distill/n_active_tok": 160.625, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 9.68359375, "calib/ece": 0.36370354330708676, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8937007874015748, "calib/gap": 0.14455491085491112, "calib/mean_conf": 0.9048003937007874, "calib/mu_c": 0.9679720279720281, "calib/mu_w": 0.823417117117117, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3527559055118112, "calib/std_conf": 0.2743696667909248, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.24034103299856527, "calib/step_q_c_n": 1394.0, "calib/step_q_gap": -0.003924036125858704, "calib/step_q_w": 0.24426506912442397, "calib/step_q_w_n": 1085.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 534.34375, "completions/mean_terminated_length": 540.6798706054688, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.14186666666666667, "grad_norm": 0.40780898928642273, "learning_rate": 1.888888888888889e-06, "loss": 0.0611, "num_tokens": 26224021.0, "reward": 1.0855050086975098, "reward_std": 0.33582109212875366, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.6241352558135986, "rewards/format_reward_step": 0.98828125, "step": 133 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7864964846521616, "aux_distill/mean_u": 0.21162502770453695, "aux_distill/n_active_tok": 146.625, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 10.14453125, "calib/ece": 0.34138379446640316, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7707509881422925, "calib/gap": 0.25076517499999995, "calib/mean_conf": 0.7975885375494071, "calib/mu_c": 0.9214843749999999, "calib/mu_w": 0.6707192, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.31652173913043474, "calib/std_conf": 0.37247587807807786, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24591915444348578, "calib/step_q_c_n": 1159.0, "calib/step_q_gap": 0.02796393886629528, "calib/step_q_w": 0.2179552155771905, "calib/step_q_w_n": 1438.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2285.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 553.34765625, "completions/mean_terminated_length": 559.9091186523438, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.14293333333333333, "grad_norm": 0.34763002395629883, "learning_rate": 1.8611111111111113e-06, "loss": 0.0455, "num_tokens": 26474630.0, "reward": 1.0661208629608154, "reward_std": 0.26195263862609863, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.6439605355262756, "rewards/format_reward_step": 0.98828125, "step": 134 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7998809479176998, "aux_distill/mean_u": 0.23606169165538424, "aux_distill/n_active_tok": 139.875, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 10.41015625, "calib/ece": 0.3348012096774194, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8709677419354839, "calib/gap": 0.1498497541590893, "calib/mean_conf": 0.8940697580645163, "calib/mu_c": 0.9550972789115645, "calib/mu_w": 0.8052475247524752, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3180645161290323, "calib/std_conf": 0.2821037821153988, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24970825892857146, "calib/step_q_c_n": 1344.0, "calib/step_q_gap": -0.002882049343265969, "calib/step_q_w": 0.2525903082718374, "calib/step_q_w_n": 1321.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1928.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 491.953125, "completions/mean_terminated_length": 507.82257080078125, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.144, "grad_norm": 0.7895566821098328, "learning_rate": 1.8333333333333333e-06, "loss": 0.0048, "num_tokens": 26706450.0, "reward": 1.0914466381072998, "reward_std": 0.31719958782196045, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6399244666099548, "rewards/format_reward_step": 0.96875, "step": 135 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8619387056678534, "aux_distill/mean_u": 0.23585160573000902, "aux_distill/n_active_tok": 147.375, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 11.0859375, "calib/ece": 0.22354448979591832, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.636734693877551, "calib/gap": 0.40863818132757695, "calib/mean_conf": 0.6740065306122449, "calib/mu_c": 0.8558088235294118, "calib/mu_w": 0.44717064220183483, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1712244897959183, "calib/std_conf": 0.43286631545939436, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2619659426585577, "calib/step_q_c_n": 1151.0, "calib/step_q_gap": 0.0020058952370994865, "calib/step_q_w": 0.25996004742145823, "calib/step_q_w_n": 1687.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 472.9140625, "completions/mean_terminated_length": 494.14691162109375, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.14506666666666668, "grad_norm": 0.5085462927818298, "learning_rate": 1.8055555555555557e-06, "loss": 0.036, "num_tokens": 26936004.0, "reward": 1.1046361923217773, "reward_std": 0.20290738344192505, "rewards/accuracy_reward_step": 0.53125, "rewards/final_brier_reward_step": 0.720991313457489, "rewards/format_reward_step": 0.95703125, "step": 136 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7779526645317674, "aux_distill/mean_u": 0.24285273704166116, "aux_distill/n_active_tok": 143.25, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 9.35546875, "calib/ece": 0.23234280236066573, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5866141732283464, "calib/gap": 0.3843206332307751, "calib/mean_conf": 0.6412005047259485, "calib/mu_c": 0.8257954545454547, "calib/mu_w": 0.4414748213146796, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17692913385826775, "calib/std_conf": 0.4305953745895403, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25945538847117794, "calib/step_q_c_n": 1197.0, "calib/step_q_gap": 0.03860481105481495, "calib/step_q_w": 0.220850577416363, "calib/step_q_w_n": 1198.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2020.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 485.73046875, "completions/mean_terminated_length": 489.55511474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.14613333333333334, "grad_norm": 0.46715131402015686, "learning_rate": 1.777777777777778e-06, "loss": 0.0587, "num_tokens": 27167335.0, "reward": 1.1220433712005615, "reward_std": 0.19661134481430054, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.7362741827964783, "rewards/format_reward_step": 0.9921875, "step": 137 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8091667117550969, "aux_distill/mean_u": 0.2716275383094954, "aux_distill/n_active_tok": 127.875, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 10.17578125, "calib/ece": 0.2742096385542168, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5863453815261044, "calib/gap": 0.30169601889338743, "calib/mean_conf": 0.6436208835341366, "calib/mu_c": 0.7381280701754387, "calib/mu_w": 0.43643205128205126, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11554176706827307, "calib/std_conf": 0.42848028822246104, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2989956616052061, "calib/step_q_c_n": 1383.0, "calib/step_q_gap": 0.02238592347099988, "calib/step_q_w": 0.2766097381342062, "calib/step_q_w_n": 1222.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2664.0, "completions/max_terminated_length": 2664.0, "completions/mean_length": 497.33203125, "completions/mean_terminated_length": 511.313232421875, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.1472, "grad_norm": 0.4575296640396118, "learning_rate": 1.75e-06, "loss": 0.0189, "num_tokens": 27398988.0, "reward": 1.174954891204834, "reward_std": 0.2306070327758789, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7092845439910889, "rewards/format_reward_step": 0.97265625, "step": 138 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8192444164305925, "aux_distill/mean_u": 0.2686307010511263, "aux_distill/n_active_tok": 122.625, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 8.55078125, "calib/ece": 0.19777051792828676, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.5179282868525896, "calib/gap": 0.4742080321285141, "calib/mean_conf": 0.5787235059760957, "calib/mu_c": 0.7355333333333334, "calib/mu_w": 0.2613253012048193, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05358565737051783, "calib/std_conf": 0.43468715302071437, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2363246855345912, "calib/step_q_c_n": 1272.0, "calib/step_q_gap": -0.03599483463989081, "calib/step_q_w": 0.272319520174482, "calib/step_q_w_n": 917.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 423.83203125, "completions/mean_terminated_length": 432.27490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.14826666666666666, "grad_norm": 0.47275084257125854, "learning_rate": 1.7222222222222224e-06, "loss": 0.0267, "num_tokens": 27610585.0, "reward": 1.2063415050506592, "reward_std": 0.16486656665802002, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.7759643197059631, "rewards/format_reward_step": 0.98046875, "step": 139 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7892370466142893, "aux_distill/mean_u": 0.22641714346360392, "aux_distill/n_active_tok": 144.625, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 9.5234375, "calib/ece": 0.1941599999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.764, "calib/gap": 0.333031746031746, "calib/mean_conf": 0.79464, "calib/mu_c": 0.8878888888888888, "calib/mu_w": 0.5548571428571428, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1343999999999999, "calib/std_conf": 0.36501406876995857, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2524066025641026, "calib/step_q_c_n": 1560.0, "calib/step_q_gap": 0.0037251674843759297, "calib/step_q_w": 0.24868143507972665, "calib/step_q_w_n": 878.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 462.64453125, "completions/mean_terminated_length": 473.7480163574219, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.14933333333333335, "grad_norm": 0.3123050034046173, "learning_rate": 1.6944444444444446e-06, "loss": 0.0137, "num_tokens": 27834038.0, "reward": 1.2274765968322754, "reward_std": 0.21313458681106567, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7752655744552612, "rewards/format_reward_step": 0.9765625, "step": 140 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7888972610235214, "aux_distill/mean_u": 0.2564832817743102, "aux_distill/n_active_tok": 138.5, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 9.1015625, "calib/ece": 0.12324479999999999, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.74, "calib/gap": 0.5349787951345657, "calib/mean_conf": 0.7746048000000001, "calib/mu_c": 0.8880203045685279, "calib/mu_w": 0.35304150943396223, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05492480000000001, "calib/std_conf": 0.3798887015652875, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2544185230769231, "calib/step_q_c_n": 1625.0, "calib/step_q_gap": 0.0246826365521004, "calib/step_q_w": 0.22973588652482269, "calib/step_q_w_n": 705.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 497.48828125, "completions/mean_terminated_length": 509.4280090332031, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.1504, "grad_norm": 0.26303279399871826, "learning_rate": 1.6666666666666667e-06, "loss": 0.0068, "num_tokens": 28068491.0, "reward": 1.296480417251587, "reward_std": 0.19722458720207214, "rewards/accuracy_reward_step": 0.76953125, "rewards/final_brier_reward_step": 0.8468671441078186, "rewards/format_reward_step": 0.9765625, "step": 141 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7698502484709024, "aux_distill/mean_u": 0.20123263576425643, "aux_distill/n_active_tok": 141.75, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 9.453125, "calib/ece": 0.18946000000000002, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.664, "calib/gap": 0.45957737148913624, "calib/mean_conf": 0.70006, "calib/mu_c": 0.8875675675675676, "calib/mu_w": 0.42799019607843136, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14876000000000003, "calib/std_conf": 0.41952937489525094, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2402120717781403, "calib/step_q_c_n": 1226.0, "calib/step_q_gap": -0.004463807618844656, "calib/step_q_w": 0.24467587939698496, "calib/step_q_w_n": 1194.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 506.0, "completions/mean_terminated_length": 516.0797119140625, "completions/min_length": 0.0, "completions/min_terminated_length": 177.0, "epoch": 0.15146666666666667, "grad_norm": 0.32701781392097473, "learning_rate": 1.638888888888889e-06, "loss": 0.0514, "num_tokens": 28303187.0, "reward": 1.1644487380981445, "reward_std": 0.23320737481117249, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7742100954055786, "rewards/format_reward_step": 0.9765625, "step": 142 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8147965837270021, "aux_distill/mean_u": 0.22807440234532067, "aux_distill/n_active_tok": 126.375, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 9.73828125, "calib/ece": 0.2558064516129033, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.7782258064516129, "calib/gap": 0.344020680055677, "calib/mean_conf": 0.807741935483871, "calib/mu_c": 0.9561702127659574, "calib/mu_w": 0.6121495327102804, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.24750000000000003, "calib/std_conf": 0.3621215236889883, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.24809659090909092, "calib/step_q_c_n": 1056.0, "calib/step_q_gap": -0.049370354115265414, "calib/step_q_w": 0.29746694502435633, "calib/step_q_w_n": 1437.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 480.73828125, "completions/mean_terminated_length": 494.25299072265625, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.15253333333333333, "grad_norm": 0.4245584011077881, "learning_rate": 1.6111111111111113e-06, "loss": 0.0173, "num_tokens": 28533592.0, "reward": 1.112665057182312, "reward_std": 0.22011610865592957, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.709705114364624, "rewards/format_reward_step": 0.96484375, "step": 143 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8023251444101334, "aux_distill/mean_u": 0.24155563343629866, "aux_distill/n_active_tok": 126.375, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 9.10546875, "calib/ece": 0.20767667984189725, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7351778656126482, "calib/gap": 0.3498372549019608, "calib/mean_conf": 0.774299604743083, "calib/mu_c": 0.8655614973262032, "calib/mu_w": 0.5157242424242424, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12142292490118577, "calib/std_conf": 0.38275148920672053, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.20582251407129454, "calib/step_q_c_n": 1599.0, "calib/step_q_gap": -0.04698650232214807, "calib/step_q_w": 0.2528090163934426, "calib/step_q_w_n": 732.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 497.22265625, "completions/mean_terminated_length": 505.1151123046875, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.1536, "grad_norm": 0.3189343810081482, "learning_rate": 1.5833333333333333e-06, "loss": 0.0464, "num_tokens": 28765009.0, "reward": 1.251899003982544, "reward_std": 0.1643814742565155, "rewards/accuracy_reward_step": 0.73046875, "rewards/final_brier_reward_step": 0.7850477695465088, "rewards/format_reward_step": 0.98828125, "step": 144 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7487006606534123, "aux_distill/mean_u": 0.22828780529426004, "aux_distill/n_active_tok": 154.25, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 9.53515625, "calib/ece": 0.24680318725099604, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8406374501992032, "calib/gap": 0.24577970233876667, "calib/mean_conf": 0.8661051792828686, "calib/mu_c": 0.9493373493975902, "calib/mu_w": 0.7035576470588235, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22577689243027887, "calib/std_conf": 0.31133066281148913, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2027402123424021, "calib/step_q_c_n": 1507.0, "calib/step_q_gap": -0.055428952539824855, "calib/step_q_w": 0.25816916488222696, "calib/step_q_w_n": 934.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1423.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 486.19921875, "completions/mean_terminated_length": 495.8844909667969, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.15466666666666667, "grad_norm": 0.3531167805194855, "learning_rate": 1.5555555555555558e-06, "loss": 0.0375, "num_tokens": 28992180.0, "reward": 1.1807942390441895, "reward_std": 0.2349383682012558, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7326824069023132, "rewards/format_reward_step": 0.98046875, "step": 145 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8367437515407801, "aux_distill/mean_u": 0.26077006896482124, "aux_distill/n_active_tok": 185.875, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 10.51171875, "calib/ece": 0.31686507936507935, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.753968253968254, "calib/gap": 0.34171163202122956, "calib/mean_conf": 0.7851984126984127, "calib/mu_c": 0.965546218487395, "calib/mu_w": 0.6238345864661654, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.3149206349206349, "calib/std_conf": 0.3751057527041757, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.20598406593406593, "calib/step_q_c_n": 1092.0, "calib/step_q_gap": -0.025591919056553197, "calib/step_q_w": 0.23157598499061913, "calib/step_q_w_n": 1599.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 532.8828125, "completions/mean_terminated_length": 543.498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.15573333333333333, "grad_norm": 0.3752824366092682, "learning_rate": 1.527777777777778e-06, "loss": 0.04, "num_tokens": 29235814.0, "reward": 1.0565924644470215, "reward_std": 0.2701501250267029, "rewards/accuracy_reward_step": 0.46484375, "rewards/final_brier_reward_step": 0.6678722500801086, "rewards/format_reward_step": 0.98046875, "step": 146 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8423489462584257, "aux_distill/mean_u": 0.25418225832492053, "aux_distill/n_active_tok": 157.625, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 9.234375, "calib/ece": 0.33116553015758, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8152610441767069, "calib/gap": 0.16782206675464206, "calib/mean_conf": 0.8400649919307734, "calib/mu_c": 0.9108333333333333, "calib/mu_w": 0.7430112665786912, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2964586345381526, "calib/std_conf": 0.33575434298393025, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2360386038687973, "calib/step_q_c_n": 1189.0, "calib/step_q_gap": 0.021529858653178607, "calib/step_q_w": 0.2145087452156187, "calib/step_q_w_n": 1175.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 502.1875, "completions/mean_terminated_length": 514.2400512695312, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.1568, "grad_norm": 0.33426088094711304, "learning_rate": 1.5e-06, "loss": 0.0549, "num_tokens": 29468054.0, "reward": 1.0869696140289307, "reward_std": 0.1892041116952896, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.638782799243927, "rewards/format_reward_step": 0.97265625, "step": 147 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7472190484404564, "aux_distill/mean_u": 0.20466814080151272, "aux_distill/n_active_tok": 133.5, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 9.3359375, "calib/ece": 0.19012244897959188, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.8816326530612245, "calib/gap": 0.2648595848595847, "calib/mean_conf": 0.894530612244898, "calib/mu_c": 0.9626373626373625, "calib/mu_w": 0.6977777777777778, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1708979591836735, "calib/std_conf": 0.28752619061118634, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2247715395480226, "calib/step_q_c_n": 1416.0, "calib/step_q_gap": -0.08958164320351741, "calib/step_q_w": 0.31435318275154, "calib/step_q_w_n": 974.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 455.68359375, "completions/mean_terminated_length": 474.2073059082031, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.15786666666666666, "grad_norm": 0.41133859753608704, "learning_rate": 1.4722222222222225e-06, "loss": -0.002, "num_tokens": 29689821.0, "reward": 1.2150394916534424, "reward_std": 0.21160367131233215, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.76601642370224, "rewards/format_reward_step": 0.953125, "step": 148 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7386438930407166, "aux_distill/mean_u": 0.16422576102609746, "aux_distill/n_active_tok": 129.75, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 9.6015625, "calib/ece": 0.25140161752988055, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.8326693227091634, "calib/gap": 0.26694276221587043, "calib/mean_conf": 0.8634979840637451, "calib/mu_c": 0.9624050632911393, "calib/mu_w": 0.6954623010752689, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2427087649402391, "calib/std_conf": 0.3119481918486996, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.22707812499999996, "calib/step_q_c_n": 1280.0, "calib/step_q_gap": 0.023896622453310673, "calib/step_q_w": 0.2031815025466893, "calib/step_q_w_n": 1178.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2860.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 525.71484375, "completions/mean_terminated_length": 538.33203125, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.15893333333333334, "grad_norm": 0.2715623676776886, "learning_rate": 1.4444444444444445e-06, "loss": 0.0575, "num_tokens": 29928860.0, "reward": 1.157318115234375, "reward_std": 0.22797748446464539, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7208861112594604, "rewards/format_reward_step": 0.9765625, "step": 149 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8586932197213173, "aux_distill/mean_u": 0.2912806011103142, "aux_distill/n_active_tok": 152.125, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 9.04296875, "calib/ece": 0.2258000000000001, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.904, "calib/gap": 0.23218693284936498, "calib/mean_conf": 0.91476, "calib/mu_c": 0.985344827586207, "calib/mu_w": 0.753157894736842, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22228000000000012, "calib/std_conf": 0.2608864549952718, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24752561071710008, "calib/step_q_c_n": 1269.0, "calib/step_q_gap": -0.04520861490431485, "calib/step_q_w": 0.29273422562141493, "calib/step_q_w_n": 1046.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 428.546875, "completions/mean_terminated_length": 440.5943603515625, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.16, "grad_norm": 0.25994738936424255, "learning_rate": 1.4166666666666667e-06, "loss": 0.0239, "num_tokens": 30143528.0, "reward": 1.2044689655303955, "reward_std": 0.1882563829421997, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.752687931060791, "rewards/format_reward_step": 0.9765625, "step": 150 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7531673237681389, "aux_distill/mean_u": 0.2139044884626973, "aux_distill/n_active_tok": 176.0, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 10.06640625, "calib/ece": 0.30708063241106714, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8102766798418972, "calib/gap": 0.18073650793650786, "calib/mean_conf": 0.8372142292490119, "calib/mu_c": 0.902222222222222, "calib/mu_w": 0.7214857142857142, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2519893280632411, "calib/std_conf": 0.34190325867934074, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2547523510971787, "calib/step_q_c_n": 1595.0, "calib/step_q_gap": 0.004969968205121689, "calib/step_q_w": 0.249782382892057, "calib/step_q_w_n": 982.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2939.0, "completions/max_terminated_length": 2939.0, "completions/mean_length": 562.5859375, "completions/mean_terminated_length": 569.2569580078125, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.16106666666666666, "grad_norm": 0.2744375765323639, "learning_rate": 1.3888888888888892e-06, "loss": 0.0603, "num_tokens": 30394574.0, "reward": 1.151229977607727, "reward_std": 0.18375049531459808, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.6852724552154541, "rewards/format_reward_step": 0.984375, "step": 151 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8130042403936386, "aux_distill/mean_u": 0.24612967474911507, "aux_distill/n_active_tok": 140.0, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 9.87109375, "calib/ece": 0.3172618954450678, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8214285714285714, "calib/gap": 0.18971410155844615, "calib/mean_conf": 0.8573412605244328, "calib/mu_c": 0.9333774678950798, "calib/mu_w": 0.7436633663366337, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2876984033815757, "calib/std_conf": 0.32317201413168617, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.23970624673771612, "calib/step_q_c_n": 1354.0, "calib/step_q_gap": -0.08222896212843905, "calib/step_q_w": 0.3219352088661552, "calib/step_q_w_n": 1173.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2213.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 514.3984375, "completions/mean_terminated_length": 522.5635375976562, "completions/min_length": 0.0, "completions/min_terminated_length": 168.0, "epoch": 0.16213333333333332, "grad_norm": 0.5380045175552368, "learning_rate": 1.3611111111111112e-06, "loss": 0.061, "num_tokens": 30631652.0, "reward": 1.1183058023452759, "reward_std": 0.2768405079841614, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.6662992238998413, "rewards/format_reward_step": 0.98046875, "step": 152 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8390794601291418, "aux_distill/mean_u": 0.26110398384518824, "aux_distill/n_active_tok": 144.125, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 10.3125, "calib/ece": 0.28084677419354837, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8588709677419355, "calib/gap": 0.17451219512195115, "calib/mean_conf": 0.8722983870967742, "calib/mu_c": 0.9299999999999999, "calib/mu_w": 0.7554878048780488, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24189516129032254, "calib/std_conf": 0.31580535756754713, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2394113524877365, "calib/step_q_c_n": 1427.0, "calib/step_q_gap": 7.087433439767477e-05, "calib/step_q_w": 0.23934047815333884, "calib/step_q_w_n": 1213.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2672.0, "completions/max_terminated_length": 2672.0, "completions/mean_length": 527.36328125, "completions/mean_terminated_length": 540.02001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.1632, "grad_norm": 0.38069459795951843, "learning_rate": 1.3333333333333334e-06, "loss": 0.0659, "num_tokens": 30873977.0, "reward": 1.1549255847930908, "reward_std": 0.22423149645328522, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.6926636695861816, "rewards/format_reward_step": 0.96875, "step": 153 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7914398023858666, "aux_distill/mean_u": 0.26568597258707727, "aux_distill/n_active_tok": 134.875, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 8.640625, "calib/ece": 0.3906719367588934, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8616600790513834, "calib/gap": 0.17957687500000008, "calib/mean_conf": 0.8755731225296443, "calib/mu_c": 0.964296875, "calib/mu_w": 0.78472, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3801581027667985, "calib/std_conf": 0.31197725857903, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22771812080536916, "calib/step_q_c_n": 1043.0, "calib/step_q_gap": -0.026976233343476014, "calib/step_q_w": 0.25469435414884517, "calib/step_q_w_n": 1169.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 485.9296875, "completions/mean_terminated_length": 491.69171142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.16426666666666667, "grad_norm": 0.36173632740974426, "learning_rate": 1.3055555555555556e-06, "loss": 0.0564, "num_tokens": 31102815.0, "reward": 1.0435129404067993, "reward_std": 0.19690637290477753, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5987445116043091, "rewards/format_reward_step": 0.98828125, "step": 154 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7353885294869542, "aux_distill/mean_u": 0.17460190229666994, "aux_distill/n_active_tok": 140.0, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 9.6171875, "calib/ece": 0.37886639676113354, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8259109311740891, "calib/gap": 0.17066701680672258, "calib/mean_conf": 0.8434008097165993, "calib/mu_c": 0.9256249999999999, "calib/mu_w": 0.7549579831932773, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.35202429149797565, "calib/std_conf": 0.3417278276311164, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25213857998289135, "calib/step_q_c_n": 1169.0, "calib/step_q_gap": 0.007281658095807025, "calib/step_q_w": 0.24485692188708433, "calib/step_q_w_n": 1293.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 454.2578125, "completions/mean_terminated_length": 468.9112854003906, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.16533333333333333, "grad_norm": 0.3328990936279297, "learning_rate": 1.2777777777777779e-06, "loss": 0.0022, "num_tokens": 31326321.0, "reward": 1.0281612873077393, "reward_std": 0.30114853382110596, "rewards/accuracy_reward_step": 0.5, "rewards/final_brier_reward_step": 0.5914789438247681, "rewards/format_reward_step": 0.96484375, "step": 155 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7826054934412241, "aux_distill/mean_u": 0.23677276982612502, "aux_distill/n_active_tok": 145.375, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 9.84765625, "calib/ece": 0.32630081300813013, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.9609375, "calib/frac_conf_gt_0.9": 0.8252032520325203, "calib/gap": 0.042313883299798793, "calib/mean_conf": 0.8420731707317072, "calib/mu_c": 0.8542857142857143, "calib/mu_w": 0.8119718309859155, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2284959349593496, "calib/std_conf": 0.34261810427610406, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22662320262747246, "calib/step_q_c_n": 1454.0, "calib/step_q_gap": -0.03780041499202144, "calib/step_q_w": 0.2644236176194939, "calib/step_q_w_n": 1067.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 478.33984375, "completions/mean_terminated_length": 493.7701416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.1664, "grad_norm": 0.48554399609565735, "learning_rate": 1.25e-06, "loss": 0.0245, "num_tokens": 31553536.0, "reward": 1.1478266716003418, "reward_std": 0.24589796364307404, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.6511222720146179, "rewards/format_reward_step": 0.9609375, "step": 156 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7190311551094055, "aux_distill/mean_u": 0.16654506377717385, "aux_distill/n_active_tok": 156.0, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 10.51171875, "calib/ece": 0.2189558232931727, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7831325301204819, "calib/gap": 0.27367736486486494, "calib/mean_conf": 0.8105220883534137, "calib/mu_c": 0.8808648648648649, "calib/mu_w": 0.6071875, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.1432530120481928, "calib/std_conf": 0.36241067105153557, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2257866342648846, "calib/step_q_c_n": 1646.0, "calib/step_q_gap": -0.09673547099827334, "calib/step_q_w": 0.32252210526315794, "calib/step_q_w_n": 1045.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 491.25, "completions/mean_terminated_length": 505.0602111816406, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.16746666666666668, "grad_norm": 0.3559320271015167, "learning_rate": 1.2222222222222223e-06, "loss": 0.0533, "num_tokens": 31783024.0, "reward": 1.2219460010528564, "reward_std": 0.23536671698093414, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7524859309196472, "rewards/format_reward_step": 0.96875, "step": 157 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.6704925671219826, "aux_distill/mean_u": 0.20121584100111614, "aux_distill/n_active_tok": 163.375, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 9.32421875, "calib/ece": 0.2774206349206349, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.9166666666666666, "calib/gap": 0.1412540610786226, "calib/mean_conf": 0.9242460317460317, "calib/mu_c": 0.9696491228070175, "calib/mu_w": 0.8283950617283949, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26154761904761903, "calib/std_conf": 0.25085691825547474, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22334996299037752, "calib/step_q_c_n": 1351.0, "calib/step_q_gap": 0.04315691279732731, "calib/step_q_w": 0.1801930501930502, "calib/step_q_w_n": 1036.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 494.26171875, "completions/mean_terminated_length": 502.107177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.16853333333333334, "grad_norm": 0.5057961940765381, "learning_rate": 1.1944444444444446e-06, "loss": 0.0602, "num_tokens": 32014795.0, "reward": 1.1806557178497314, "reward_std": 0.22892944514751434, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7089675664901733, "rewards/format_reward_step": 0.984375, "step": 158 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8052110997959971, "aux_distill/mean_u": 0.23945831289057012, "aux_distill/n_active_tok": 123.875, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 9.0078125, "calib/ece": 0.29416000000000003, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.836, "calib/gap": 0.25487091675447815, "calib/mean_conf": 0.85096, "calib/mu_c": 0.9569863013698628, "calib/mu_w": 0.7021153846153847, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.28056000000000003, "calib/std_conf": 0.3366788950914506, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23610054844606945, "calib/step_q_c_n": 1094.0, "calib/step_q_gap": 0.00771771016224107, "calib/step_q_w": 0.22838283828382838, "calib/step_q_w_n": 1212.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 451.89453125, "completions/mean_terminated_length": 462.7400207519531, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.1696, "grad_norm": 0.3434606194496155, "learning_rate": 1.1666666666666668e-06, "loss": 0.0371, "num_tokens": 32235264.0, "reward": 1.113415241241455, "reward_std": 0.22357870638370514, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6799554824829102, "rewards/format_reward_step": 0.9765625, "step": 159 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.825240034610033, "aux_distill/mean_u": 0.2686492410873999, "aux_distill/n_active_tok": 147.75, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 9.66015625, "calib/ece": 0.3138866396761135, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8421052631578947, "calib/gap": 0.15572612921669515, "calib/mean_conf": 0.8561538461538462, "calib/mu_c": 0.9116352201257861, "calib/mu_w": 0.755909090909091, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26315789473684226, "calib/std_conf": 0.3342975362280166, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.1681236203090508, "calib/step_q_c_n": 1359.0, "calib/step_q_gap": -0.03851013193511438, "calib/step_q_w": 0.20663375224416516, "calib/step_q_w_n": 1114.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2083.0, "completions/max_terminated_length": 2083.0, "completions/mean_length": 469.82421875, "completions/mean_terminated_length": 486.9433288574219, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.17066666666666666, "grad_norm": 0.406261146068573, "learning_rate": 1.138888888888889e-06, "loss": 0.0152, "num_tokens": 32460379.0, "reward": 1.1235268115997314, "reward_std": 0.24834099411964417, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.6611160039901733, "rewards/format_reward_step": 0.96484375, "step": 160 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7200377890840173, "aux_distill/mean_u": 0.20223440044865573, "aux_distill/n_active_tok": 121.75, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.734375, "calib/ece": 0.1636862745098039, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9098039215686274, "calib/gap": 0.2220439024390245, "calib/mean_conf": 0.9207058823529413, "calib/mu_c": 0.9642439024390245, "calib/mu_w": 0.7422, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.140235294117647, "calib/std_conf": 0.25349689978516904, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.17341063274624918, "calib/step_q_c_n": 1533.0, "calib/step_q_gap": -0.04077281244390743, "calib/step_q_w": 0.2141834451901566, "calib/step_q_w_n": 447.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 425.61328125, "completions/mean_terminated_length": 428.9645690917969, "completions/min_length": 0.0, "completions/min_terminated_length": 166.0, "epoch": 0.17173333333333332, "grad_norm": 0.3874889314174652, "learning_rate": 1.111111111111111e-06, "loss": 0.0667, "num_tokens": 32673256.0, "reward": 1.314043402671814, "reward_std": 0.12282635271549225, "rewards/accuracy_reward_step": 0.80078125, "rewards/final_brier_reward_step": 0.8312117457389832, "rewards/format_reward_step": 0.99609375, "step": 161 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7323440834879875, "aux_distill/mean_u": 0.17847615366741984, "aux_distill/n_active_tok": 126.0, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 8.1796875, "calib/ece": 0.1663529411764706, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8980392156862745, "calib/gap": 0.3037251984126985, "calib/mean_conf": 0.9102745098039215, "calib/mu_c": 0.9853125, "calib/mu_w": 0.6815873015873015, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16184313725490196, "calib/std_conf": 0.26846317003264225, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.1819727705922396, "calib/step_q_c_n": 1469.0, "calib/step_q_gap": -0.0333872294077604, "calib/step_q_w": 0.21536, "calib/step_q_w_n": 625.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 420.1328125, "completions/mean_terminated_length": 423.4409484863281, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.1728, "grad_norm": 0.27647775411605835, "learning_rate": 1.0833333333333335e-06, "loss": 0.0436, "num_tokens": 32884954.0, "reward": 1.286501169204712, "reward_std": 0.16905483603477478, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.8269085884094238, "rewards/format_reward_step": 0.99609375, "step": 162 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8068936876952648, "aux_distill/mean_u": 0.2444127955766234, "aux_distill/n_active_tok": 155.125, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 10.10546875, "calib/ece": 0.20146808055212456, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7142857142857143, "calib/gap": 0.4506941576834671, "calib/mean_conf": 0.7395633186473626, "calib/mu_c": 0.9201986754966889, "calib/mu_w": 0.4695045178132218, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.170912524996569, "calib/std_conf": 0.41283393997796536, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.20412260536398466, "calib/step_q_c_n": 1305.0, "calib/step_q_gap": -0.02532357248312922, "calib/step_q_w": 0.22944617784711388, "calib/step_q_w_n": 1282.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 533.078125, "completions/mean_terminated_length": 541.5397338867188, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.17386666666666667, "grad_norm": 0.5397083759307861, "learning_rate": 1.0555555555555557e-06, "loss": 0.0504, "num_tokens": 33126254.0, "reward": 1.1740601062774658, "reward_std": 0.18381524085998535, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7739015221595764, "rewards/format_reward_step": 0.984375, "step": 163 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8425343632698059, "aux_distill/mean_u": 0.253130894448872, "aux_distill/n_active_tok": 147.5, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 11.15625, "calib/ece": 0.16938775510204088, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.6489795918367347, "calib/gap": 0.5163950770077008, "calib/mean_conf": 0.6816326530612244, "calib/mu_c": 0.8945138888888889, "calib/mu_w": 0.3781188118811881, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13163265306122457, "calib/std_conf": 0.43294127866318344, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.20181670721816705, "calib/step_q_c_n": 1233.0, "calib/step_q_gap": -0.04759795698392785, "calib/step_q_w": 0.2494146642020949, "calib/step_q_w_n": 1623.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2876.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 541.52734375, "completions/mean_terminated_length": 565.8408203125, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.17493333333333333, "grad_norm": 0.738795280456543, "learning_rate": 1.0277777777777777e-06, "loss": -0.0139, "num_tokens": 33371021.0, "reward": 1.1481738090515137, "reward_std": 0.20172443985939026, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7768163681030273, "rewards/format_reward_step": 0.95703125, "step": 164 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.782092509791255, "aux_distill/mean_u": 0.2121803635043166, "aux_distill/n_active_tok": 142.375, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 9.75, "calib/ece": 0.2654612840032003, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.678714859437751, "calib/gap": 0.3788403057693124, "calib/mean_conf": 0.7032926093044052, "calib/mu_c": 0.88282335661677, "calib/mu_w": 0.5039830508474576, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22132473781846138, "calib/std_conf": 0.43055869360944665, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.15882145077672744, "calib/step_q_c_n": 1092.0, "calib/step_q_gap": -0.046880757200480516, "calib/step_q_w": 0.20570220797720795, "calib/step_q_w_n": 1404.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3063.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 509.64453125, "completions/mean_terminated_length": 523.9718627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.176, "grad_norm": 0.41305992007255554, "learning_rate": 1.0000000000000002e-06, "loss": 0.0166, "num_tokens": 33607066.0, "reward": 1.0937097072601318, "reward_std": 0.20582950115203857, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.7030445337295532, "rewards/format_reward_step": 0.97265625, "step": 165 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7836181567981839, "aux_distill/mean_u": 0.2615975476655479, "aux_distill/n_active_tok": 181.375, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.03125, "calib/ece": 0.15141732283464576, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7007874015748031, "calib/gap": 0.5479756401790299, "calib/mean_conf": 0.728740157480315, "calib/mu_c": 0.8948587570621468, "calib/mu_w": 0.3468831168831169, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0916535433070867, "calib/std_conf": 0.4158020276317233, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.20698069046225864, "calib/step_q_c_n": 1709.0, "calib/step_q_gap": -0.004050699672270508, "calib/step_q_w": 0.21103139013452915, "calib/step_q_w_n": 1115.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 558.41796875, "completions/mean_terminated_length": 565.03955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.17706666666666668, "grad_norm": 0.3993739187717438, "learning_rate": 9.722222222222224e-07, "loss": 0.0384, "num_tokens": 33856205.0, "reward": 1.261671543121338, "reward_std": 0.1854332834482193, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.8397492170333862, "rewards/format_reward_step": 0.9921875, "step": 166 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7592298779636621, "aux_distill/mean_u": 0.22087851047994195, "aux_distill/n_active_tok": 140.0, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 9.703125, "calib/ece": 0.20209486166007906, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8221343873517787, "calib/gap": 0.3174265767934522, "calib/mean_conf": 0.8354545454545454, "calib/mu_c": 0.9195161290322581, "calib/mu_w": 0.6020895522388059, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15118577075098819, "calib/std_conf": 0.34964732500972096, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2433490011750881, "calib/step_q_c_n": 1702.0, "calib/step_q_gap": -0.06839012925969448, "calib/step_q_w": 0.3117391304347826, "calib/step_q_w_n": 782.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1761.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 495.65234375, "completions/mean_terminated_length": 501.5296630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.17813333333333334, "grad_norm": 0.36381569504737854, "learning_rate": 9.444444444444445e-07, "loss": 0.0563, "num_tokens": 34088700.0, "reward": 1.2510545253753662, "reward_std": 0.18535427749156952, "rewards/accuracy_reward_step": 0.7265625, "rewards/final_brier_reward_step": 0.7872651815414429, "rewards/format_reward_step": 0.98828125, "step": 167 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8132620574906468, "aux_distill/mean_u": 0.31038632137509864, "aux_distill/n_active_tok": 166.875, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 10.69140625, "calib/ece": 0.1765869047619048, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6825396825396826, "calib/gap": 0.4661973958333333, "calib/mean_conf": 0.7226980158730157, "calib/mu_c": 0.8336973958333332, "calib/mu_w": 0.36749999999999994, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06869007936507943, "calib/std_conf": 0.4104957785922956, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22837146783947224, "calib/step_q_c_n": 1819.0, "calib/step_q_gap": -0.004222214077739123, "calib/step_q_w": 0.23259368191721136, "calib/step_q_w_n": 918.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1883.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 547.50390625, "completions/mean_terminated_length": 556.1944580078125, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.1792, "grad_norm": 0.35889482498168945, "learning_rate": 9.166666666666666e-07, "loss": 0.0273, "num_tokens": 34333533.0, "reward": 1.2696452140808105, "reward_std": 0.17705899477005005, "rewards/accuracy_reward_step": 0.75, "rewards/final_brier_reward_step": 0.8049156069755554, "rewards/format_reward_step": 0.984375, "step": 168 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8193460293114185, "aux_distill/mean_u": 0.2598738432131758, "aux_distill/n_active_tok": 144.0, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 10.30859375, "calib/ece": 0.2248809523809523, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.7222222222222222, "calib/gap": 0.3845061728395063, "calib/mean_conf": 0.7449603174603175, "calib/mu_c": 0.882283950617284, "calib/mu_w": 0.49777777777777776, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1634920634920634, "calib/std_conf": 0.40791718154145534, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25499312500000004, "calib/step_q_c_n": 1600.0, "calib/step_q_gap": 0.03606502105389803, "calib/step_q_w": 0.21892810394610202, "calib/step_q_w_n": 1039.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2248.0, "completions/max_terminated_length": 2248.0, "completions/mean_length": 554.4375, "completions/mean_terminated_length": 558.8031616210938, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.18026666666666666, "grad_norm": 0.267215371131897, "learning_rate": 8.88888888888889e-07, "loss": 0.0669, "num_tokens": 34579653.0, "reward": 1.1876498460769653, "reward_std": 0.14423486590385437, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.7581120729446411, "rewards/format_reward_step": 0.984375, "step": 169 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7837082333862782, "aux_distill/mean_u": 0.2228750338044867, "aux_distill/n_active_tok": 160.75, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 10.40234375, "calib/ece": 0.1924701195219125, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.6812749003984063, "calib/gap": 0.45554193093727974, "calib/mean_conf": 0.7129482071713147, "calib/mu_c": 0.8690303030303029, "calib/mu_w": 0.4134883720930232, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.12402390438247027, "calib/std_conf": 0.41963472223631293, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.23673456397199238, "calib/step_q_c_n": 1571.0, "calib/step_q_gap": -0.018082285844857438, "calib/step_q_w": 0.2548168498168498, "calib/step_q_w_n": 1092.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1662.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 513.48828125, "completions/mean_terminated_length": 523.7171630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.18133333333333335, "grad_norm": 0.4967786371707916, "learning_rate": 8.611111111111112e-07, "loss": 0.0372, "num_tokens": 34815258.0, "reward": 1.2031220197677612, "reward_std": 0.16210168600082397, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.785150408744812, "rewards/format_reward_step": 0.9765625, "step": 170 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7938511082902551, "aux_distill/mean_u": 0.29352478921058617, "aux_distill/n_active_tok": 132.375, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 9.83203125, "calib/ece": 0.20181102362204736, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6141732283464567, "calib/gap": 0.4881854636591479, "calib/mean_conf": 0.6434645669291339, "calib/mu_c": 0.8625714285714287, "calib/mu_w": 0.37438596491228077, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1470472440944883, "calib/std_conf": 0.4485635480269359, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2466503667481663, "calib/step_q_c_n": 1227.0, "calib/step_q_gap": 0.004727886128011255, "calib/step_q_w": 0.24192248062015503, "calib/step_q_w_n": 1290.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1692.0, "completions/max_terminated_length": 1692.0, "completions/mean_length": 515.5390625, "completions/mean_terminated_length": 521.6522216796875, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.1824, "grad_norm": 0.3575134873390198, "learning_rate": 8.333333333333333e-07, "loss": 0.0805, "num_tokens": 35054132.0, "reward": 1.1586816310882568, "reward_std": 0.12559093534946442, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7783007621765137, "rewards/format_reward_step": 0.9921875, "step": 171 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7738157333806157, "aux_distill/mean_u": 0.1982485849683548, "aux_distill/n_active_tok": 145.75, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 9.375, "calib/ece": 0.16667999999999994, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.752, "calib/gap": 0.4780542529399034, "calib/mean_conf": 0.77428, "calib/mu_c": 0.9329940119760479, "calib/mu_w": 0.45493975903614453, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13647999999999996, "calib/std_conf": 0.39090009158351446, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24634110787172012, "calib/step_q_c_n": 1372.0, "calib/step_q_gap": -0.009428541933727358, "calib/step_q_w": 0.2557696498054475, "calib/step_q_w_n": 1028.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1691.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 472.53515625, "completions/mean_terminated_length": 483.87603759765625, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.18346666666666667, "grad_norm": 0.2834770977497101, "learning_rate": 8.055555555555557e-07, "loss": 0.0369, "num_tokens": 35278453.0, "reward": 1.2178552150726318, "reward_std": 0.18590107560157776, "rewards/accuracy_reward_step": 0.65234375, "rewards/final_brier_reward_step": 0.8068042993545532, "rewards/format_reward_step": 0.9765625, "step": 172 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.6961722830310464, "aux_distill/mean_u": 0.18221597900992778, "aux_distill/n_active_tok": 160.875, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 9.8515625, "calib/ece": 0.19749003984063757, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7609561752988048, "calib/gap": 0.40618126574773983, "calib/mean_conf": 0.7747011952191236, "calib/mu_c": 0.9009248554913296, "calib/mu_w": 0.4947435897435898, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14147410358565748, "calib/std_conf": 0.3952963112197103, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25296927822331894, "calib/step_q_c_n": 1621.0, "calib/step_q_gap": 0.04764186423885722, "calib/step_q_w": 0.20532741398446172, "calib/step_q_w_n": 901.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 514.015625, "completions/mean_terminated_length": 524.2550048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.18453333333333333, "grad_norm": 0.4571469724178314, "learning_rate": 7.777777777777779e-07, "loss": -0.0008, "num_tokens": 35513201.0, "reward": 1.218473196029663, "reward_std": 0.18532350659370422, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7806965112686157, "rewards/format_reward_step": 0.98046875, "step": 173 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8278934434056282, "aux_distill/mean_u": 0.24226153047754964, "aux_distill/n_active_tok": 142.0, "calib/answer_extract_rate": 0.94921875, "calib/avg_num_step_conf": 11.37890625, "calib/ece": 0.280493827160494, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.49382716049382713, "calib/gap": 0.37458674609084147, "calib/mean_conf": 0.5306172839506172, "calib/mu_c": 0.6616455696202532, "calib/mu_w": 0.28705882352941176, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.08045267489711945, "calib/std_conf": 0.4577571345912221, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.27785419532324623, "calib/step_q_c_n": 1454.0, "calib/step_q_gap": -0.038314413312805884, "calib/step_q_w": 0.3161686086360521, "calib/step_q_w_n": 1459.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 527.046875, "completions/mean_terminated_length": 552.9671630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.1856, "grad_norm": 0.4958760142326355, "learning_rate": 7.5e-07, "loss": 0.0298, "num_tokens": 35752357.0, "reward": 1.1205930709838867, "reward_std": 0.2511502206325531, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.6786863207817078, "rewards/format_reward_step": 0.9453125, "step": 174 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7686019539833069, "aux_distill/mean_u": 0.2133538263066181, "aux_distill/n_active_tok": 129.125, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 11.33203125, "calib/ece": 0.3505306122448979, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.5061224489795918, "calib/gap": 0.2137369349803177, "calib/mean_conf": 0.536734693877551, "calib/mu_c": 0.6292086330935253, "calib/mu_w": 0.41547169811320755, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1599591836734693, "calib/std_conf": 0.4636016400241371, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2988099808061421, "calib/step_q_c_n": 1042.0, "calib/step_q_gap": -0.01760206868283054, "calib/step_q_w": 0.3164120494889726, "calib/step_q_w_n": 1859.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2092.0, "completions/max_terminated_length": 2092.0, "completions/mean_length": 487.95703125, "completions/mean_terminated_length": 509.86529541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.18666666666666668, "grad_norm": 0.434774786233902, "learning_rate": 7.222222222222222e-07, "loss": 0.0155, "num_tokens": 35983098.0, "reward": 1.0579733848571777, "reward_std": 0.18678444623947144, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.615946888923645, "rewards/format_reward_step": 0.95703125, "step": 175 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8209396656602621, "aux_distill/mean_u": 0.3008576483306273, "aux_distill/n_active_tok": 128.875, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 10.97265625, "calib/ece": 0.23026572580645144, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.5403225806451613, "calib/gap": 0.45330585785485106, "calib/mean_conf": 0.5659439516129033, "calib/mu_c": 0.7432450331125829, "calib/mu_w": 0.2899391752577319, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09366935483870951, "calib/std_conf": 0.4650180486912076, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2576894223555889, "calib/step_q_c_n": 1333.0, "calib/step_q_gap": -0.02182954783411306, "calib/step_q_w": 0.27951897018970195, "calib/step_q_w_n": 1476.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2911.0, "completions/max_terminated_length": 2911.0, "completions/mean_length": 507.81640625, "completions/mean_terminated_length": 524.1975708007812, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.18773333333333334, "grad_norm": 0.24123993515968323, "learning_rate": 6.944444444444446e-07, "loss": 0.0473, "num_tokens": 36217163.0, "reward": 1.1472645998001099, "reward_std": 0.14933854341506958, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7359354496002197, "rewards/format_reward_step": 0.96875, "step": 176 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8030837085098028, "aux_distill/mean_u": 0.2057992942512779, "aux_distill/n_active_tok": 130.625, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 9.81640625, "calib/ece": 0.11093669354838712, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6008064516129032, "calib/gap": 0.6747169407894738, "calib/mean_conf": 0.6255955645161291, "calib/mu_c": 0.8867763157894738, "calib/mu_w": 0.21205937500000002, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.061814516129032264, "calib/std_conf": 0.4567104782993171, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26664710813076276, "calib/step_q_c_n": 1193.0, "calib/step_q_gap": 0.03225316873682332, "calib/step_q_w": 0.23439393939393943, "calib/step_q_w_n": 1320.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 506.3359375, "completions/mean_terminated_length": 522.6693115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.1888, "grad_norm": 0.4691639840602875, "learning_rate": 6.666666666666667e-07, "loss": 0.0359, "num_tokens": 36450617.0, "reward": 1.2046705484390259, "reward_std": 0.1261509507894516, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.8468411564826965, "rewards/format_reward_step": 0.96875, "step": 177 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8158742636442184, "aux_distill/mean_u": 0.2410105040041707, "aux_distill/n_active_tok": 172.5, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 10.5390625, "calib/ece": 0.21916996047430828, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5928853754940712, "calib/gap": 0.48188432267884324, "calib/mean_conf": 0.6205138339920949, "calib/mu_c": 0.7595555555555555, "calib/mu_w": 0.2776712328767123, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06411067193675889, "calib/std_conf": 0.4542858409665569, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23063775510204082, "calib/step_q_c_n": 1568.0, "calib/step_q_gap": -0.01617020949972911, "calib/step_q_w": 0.24680796460176993, "calib/step_q_w_n": 1130.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2971.0, "completions/max_terminated_length": 2971.0, "completions/mean_length": 541.33984375, "completions/mean_terminated_length": 549.9325561523438, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.18986666666666666, "grad_norm": 0.27010011672973633, "learning_rate": 6.388888888888889e-07, "loss": 0.0427, "num_tokens": 36695272.0, "reward": 1.2301025390625, "reward_std": 0.16309009492397308, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.768798828125, "rewards/format_reward_step": 0.98828125, "step": 178 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7807596288621426, "aux_distill/mean_u": 0.25980170657934115, "aux_distill/n_active_tok": 185.625, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 10.578125, "calib/ece": 0.16274193548387095, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6895161290322581, "calib/gap": 0.5231907514450866, "calib/mean_conf": 0.7109677419354838, "calib/mu_c": 0.8691907514450867, "calib/mu_w": 0.34600000000000003, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08806451612903225, "calib/std_conf": 0.4258197395287359, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24101198402130491, "calib/step_q_c_n": 1502.0, "calib/step_q_gap": 0.015945649029596776, "calib/step_q_w": 0.22506633499170814, "calib/step_q_w_n": 1206.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2596.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 501.41796875, "completions/mean_terminated_length": 517.5927124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 225.0, "epoch": 0.19093333333333334, "grad_norm": 0.5760951638221741, "learning_rate": 6.111111111111112e-07, "loss": 0.0235, "num_tokens": 36929899.0, "reward": 1.2234652042388916, "reward_std": 0.22339525818824768, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.802399218082428, "rewards/format_reward_step": 0.96875, "step": 179 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7465346790850163, "aux_distill/mean_u": 0.195027078707378, "aux_distill/n_active_tok": 164.125, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 11.4609375, "calib/ece": 0.20440000000000014, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.748, "calib/gap": 0.3948253968253968, "calib/mean_conf": 0.76456, "calib/mu_c": 0.8751111111111111, "calib/mu_w": 0.48028571428571426, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12448000000000012, "calib/std_conf": 0.4017452008425241, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2172933549432739, "calib/step_q_c_n": 1851.0, "calib/step_q_gap": 0.001965561776145569, "calib/step_q_w": 0.21532779316712833, "calib/step_q_w_n": 1083.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2143.0, "completions/max_terminated_length": 2143.0, "completions/mean_length": 615.8203125, "completions/mean_terminated_length": 630.6000366210938, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.192, "grad_norm": 0.5065380334854126, "learning_rate": 5.833333333333334e-07, "loss": 0.0231, "num_tokens": 37191405.0, "reward": 1.2276408672332764, "reward_std": 0.21545855700969696, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.7755945324897766, "rewards/format_reward_step": 0.9765625, "step": 180 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7801644625142217, "aux_distill/mean_u": 0.21185583020148446, "aux_distill/n_active_tok": 139.75, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 8.74609375, "calib/ece": 0.2775062992125985, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7598425196850394, "calib/gap": 0.25334444140197143, "calib/mean_conf": 0.7777692913385827, "calib/mu_c": 0.8655421686746988, "calib/mu_w": 0.6121977272727274, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20086614173228357, "calib/std_conf": 0.39219924853706833, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25771151178918167, "calib/step_q_c_n": 1442.0, "calib/step_q_gap": 0.04990724579169106, "calib/step_q_w": 0.2078042659974906, "calib/step_q_w_n": 797.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1499.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 473.95703125, "completions/mean_terminated_length": 479.57708740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.19306666666666666, "grad_norm": 0.44195133447647095, "learning_rate": 5.555555555555555e-07, "loss": 0.0639, "num_tokens": 37419002.0, "reward": 1.1770286560058594, "reward_std": 0.19044727087020874, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7134323716163635, "rewards/format_reward_step": 0.9921875, "step": 181 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8013272704556584, "aux_distill/mean_u": 0.23888848999273496, "aux_distill/n_active_tok": 161.75, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 9.640625, "calib/ece": 0.2185039370078741, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8110236220472441, "calib/gap": 0.3231657925407927, "calib/mean_conf": 0.8245669291338582, "calib/mu_c": 0.9238068181818183, "calib/mu_w": 0.6006410256410256, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1750787401574804, "calib/std_conf": 0.3598145977209505, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22807674463866073, "calib/step_q_c_n": 1515.0, "calib/step_q_gap": -0.10879628789019552, "calib/step_q_w": 0.33687303252885625, "calib/step_q_w_n": 953.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2919.0, "completions/max_terminated_length": 2919.0, "completions/mean_length": 528.34375, "completions/mean_terminated_length": 532.50390625, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.19413333333333332, "grad_norm": 0.9773353338241577, "learning_rate": 5.277777777777779e-07, "loss": 0.0592, "num_tokens": 37660418.0, "reward": 1.2257777452468872, "reward_std": 0.17774598300457, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7718679904937744, "rewards/format_reward_step": 0.9921875, "step": 182 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7841676194220781, "aux_distill/mean_u": 0.2744013052323904, "aux_distill/n_active_tok": 180.5, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 11.27734375, "calib/ece": 0.22580645161290328, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.7862903225806451, "calib/gap": 0.2790196078431372, "calib/mean_conf": 0.8076612903225807, "calib/mu_c": 0.8841666666666667, "calib/mu_w": 0.6051470588235295, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15383064516129039, "calib/std_conf": 0.36774839941970433, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27389355742296917, "calib/step_q_c_n": 1785.0, "calib/step_q_gap": 0.039664882286853026, "calib/step_q_w": 0.23422867513611614, "calib/step_q_w_n": 1102.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2245.0, "completions/max_terminated_length": 2245.0, "completions/mean_length": 559.9375, "completions/mean_terminated_length": 578.0, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.1952, "grad_norm": 0.32870110869407654, "learning_rate": 5.000000000000001e-07, "loss": 0.0065, "num_tokens": 37910442.0, "reward": 1.2089574337005615, "reward_std": 0.265907347202301, "rewards/accuracy_reward_step": 0.703125, "rewards/final_brier_reward_step": 0.746039867401123, "rewards/format_reward_step": 0.96875, "step": 183 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8048588903620839, "aux_distill/mean_u": 0.26784059578838393, "aux_distill/n_active_tok": 139.375, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 9.76953125, "calib/ece": 0.2058730158730159, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8650793650793651, "calib/gap": 0.26646551028640586, "calib/mean_conf": 0.8741269841269841, "calib/mu_c": 0.9449729729729729, "calib/mu_w": 0.678507462686567, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17293650793650797, "calib/std_conf": 0.31427485150379103, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2232556750298686, "calib/step_q_c_n": 1674.0, "calib/step_q_gap": -0.07265726330144945, "calib/step_q_w": 0.29591293833131804, "calib/step_q_w_n": 827.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2177.0, "completions/max_terminated_length": 2177.0, "completions/mean_length": 515.34375, "completions/mean_terminated_length": 521.45458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.19626666666666667, "grad_norm": 0.32190728187561035, "learning_rate": 4.7222222222222226e-07, "loss": 0.1, "num_tokens": 38147650.0, "reward": 1.2386672496795654, "reward_std": 0.24186360836029053, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7742093801498413, "rewards/format_reward_step": 0.98046875, "step": 184 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.740824181586504, "aux_distill/mean_u": 0.2537910775115197, "aux_distill/n_active_tok": 157.875, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 10.3359375, "calib/ece": 0.20019841269841274, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.75, "calib/gap": 0.4772727272727273, "calib/mean_conf": 0.7669444444444443, "calib/mu_c": 0.9544444444444444, "calib/mu_w": 0.47717171717171714, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18000000000000005, "calib/std_conf": 0.4024061914848307, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22706200432588317, "calib/step_q_c_n": 1387.0, "calib/step_q_gap": -0.0007854936884139074, "calib/step_q_w": 0.22784749801429707, "calib/step_q_w_n": 1259.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 515.328125, "completions/mean_terminated_length": 525.5936279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.19733333333333333, "grad_norm": 0.3035050928592682, "learning_rate": 4.444444444444445e-07, "loss": 0.0102, "num_tokens": 38386494.0, "reward": 1.1855978965759277, "reward_std": 0.2117016315460205, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7891644239425659, "rewards/format_reward_step": 0.984375, "step": 185 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7983589041978121, "aux_distill/mean_u": 0.25068229766670735, "aux_distill/n_active_tok": 128.375, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 9.6875, "calib/ece": 0.18947791164658628, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8192771084337349, "calib/gap": 0.3847062820900029, "calib/mean_conf": 0.8313253012048193, "calib/mu_c": 0.9502906976744185, "calib/mu_w": 0.5655844155844156, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1650200803212851, "calib/std_conf": 0.3565624738998539, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2070189701897019, "calib/step_q_c_n": 1476.0, "calib/step_q_gap": -0.035988997937788175, "calib/step_q_w": 0.24300796812749006, "calib/step_q_w_n": 1004.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 513.640625, "completions/mean_terminated_length": 528.080322265625, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.1984, "grad_norm": 0.29591473937034607, "learning_rate": 4.1666666666666667e-07, "loss": 0.0232, "num_tokens": 38623026.0, "reward": 1.213200330734253, "reward_std": 0.22806373238563538, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7818695306777954, "rewards/format_reward_step": 0.97265625, "step": 186 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7855865238234401, "aux_distill/mean_u": 0.27453464204647304, "aux_distill/n_active_tok": 147.875, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 10.73046875, "calib/ece": 0.29519718875502016, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8594377510040161, "calib/gap": 0.2540338741853969, "calib/mean_conf": 0.871904016064257, "calib/mu_c": 0.976986301369863, "calib/mu_w": 0.7229524271844661, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2903779116465864, "calib/std_conf": 0.3149587983252822, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.1995216400911162, "calib/step_q_c_n": 1317.0, "calib/step_q_gap": 0.00855919253866863, "calib/step_q_w": 0.19096244755244757, "calib/step_q_w_n": 1430.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 530.7421875, "completions/mean_terminated_length": 545.6626586914062, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.19946666666666665, "grad_norm": 0.5334581732749939, "learning_rate": 3.8888888888888895e-07, "loss": 0.0159, "num_tokens": 38860440.0, "reward": 1.1118857860565186, "reward_std": 0.29268577694892883, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6808027029037476, "rewards/format_reward_step": 0.97265625, "step": 187 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7995928060263395, "aux_distill/mean_u": 0.2580252115995233, "aux_distill/n_active_tok": 169.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.96875, "calib/ece": 0.20089843750000014, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.921875, "calib/gap": 0.1752726970402394, "calib/mean_conf": 0.9362109375000001, "calib/mu_c": 0.978659793814433, "calib/mu_w": 0.8033870967741936, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18964843750000013, "calib/std_conf": 0.22592020688812034, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.22686349115801482, "calib/step_q_c_n": 1753.0, "calib/step_q_gap": -0.017692203460258016, "calib/step_q_w": 0.24455569461827284, "calib/step_q_w_n": 799.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2698.0, "completions/max_terminated_length": 2698.0, "completions/mean_length": 581.9609375, "completions/mean_terminated_length": 586.5433349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.20053333333333334, "grad_norm": 0.24379847943782806, "learning_rate": 3.611111111111111e-07, "loss": 0.0696, "num_tokens": 39113494.0, "reward": 1.2739689350128174, "reward_std": 0.14255055785179138, "rewards/accuracy_reward_step": 0.7578125, "rewards/final_brier_reward_step": 0.7940316200256348, "rewards/format_reward_step": 0.99609375, "step": 188 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7341479770839214, "aux_distill/mean_u": 0.23114255566897401, "aux_distill/n_active_tok": 134.125, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 8.6171875, "calib/ece": 0.24019685039370073, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8346456692913385, "calib/gap": 0.24245139664804471, "calib/mean_conf": 0.8536614173228346, "calib/mu_c": 0.9252513966480447, "calib/mu_w": 0.6828, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1945669291338582, "calib/std_conf": 0.33396859016985175, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.21389307745030842, "calib/step_q_c_n": 1459.0, "calib/step_q_gap": -0.06229166150551488, "calib/step_q_w": 0.2761847389558233, "calib/step_q_w_n": 747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 481.73046875, "completions/mean_terminated_length": 487.4427185058594, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.2016, "grad_norm": 0.43488699197769165, "learning_rate": 3.3333333333333335e-07, "loss": 0.0585, "num_tokens": 39344585.0, "reward": 1.222286581993103, "reward_std": 0.1747114658355713, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7531667947769165, "rewards/format_reward_step": 0.9921875, "step": 189 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7943196892738342, "aux_distill/mean_u": 0.28323400437481766, "aux_distill/n_active_tok": 155.0, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 10.48828125, "calib/ece": 0.24609561752988054, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8645418326693227, "calib/gap": 0.3083033891384239, "calib/mean_conf": 0.8734262948207171, "calib/mu_c": 0.9876582278481013, "calib/mu_w": 0.6793548387096774, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24501992031872516, "calib/std_conf": 0.3173474059160562, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25361627260083447, "calib/step_q_c_n": 1438.0, "calib/step_q_gap": -0.031147159636535193, "calib/step_q_w": 0.28476343223736966, "calib/step_q_w_n": 1247.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 550.19921875, "completions/mean_terminated_length": 561.1593627929688, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.20266666666666666, "grad_norm": 0.3425717055797577, "learning_rate": 3.055555555555556e-07, "loss": 0.0328, "num_tokens": 39591044.0, "reward": 1.1666810512542725, "reward_std": 0.2564879357814789, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7357058525085449, "rewards/format_reward_step": 0.98046875, "step": 190 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7623467231169343, "aux_distill/mean_u": 0.20448913400170793, "aux_distill/n_active_tok": 195.125, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 11.90625, "calib/ece": 0.298991935483871, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.8951612903225806, "calib/gap": 0.20007046145344032, "calib/mean_conf": 0.8995564516129033, "calib/mu_c": 0.9753896103896105, "calib/mu_w": 0.7753191489361702, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2887903225806452, "calib/std_conf": 0.2888328236870281, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22404383077160353, "calib/step_q_c_n": 1430.0, "calib/step_q_gap": -0.04238225081059674, "calib/step_q_w": 0.26642608158220027, "calib/step_q_w_n": 1618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 548.07421875, "completions/mean_terminated_length": 565.7540283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.20373333333333332, "grad_norm": 0.23078249394893646, "learning_rate": 2.7777777777777776e-07, "loss": 0.011, "num_tokens": 39835519.0, "reward": 1.1231420040130615, "reward_std": 0.21966737508773804, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.675971508026123, "rewards/format_reward_step": 0.96875, "step": 191 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7831371380016208, "aux_distill/mean_u": 0.20444626811200128, "aux_distill/n_active_tok": 146.5, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 9.640625, "calib/ece": 0.20777777777777776, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8690476190476191, "calib/gap": 0.3182753246753247, "calib/mean_conf": 0.8820634920634921, "calib/mu_c": 0.9793142857142857, "calib/mu_w": 0.661038961038961, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19769841269841268, "calib/std_conf": 0.30560457244438705, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22681944672962867, "calib/step_q_c_n": 1373.0, "calib/step_q_gap": -0.038139457379960384, "calib/step_q_w": 0.26495890410958906, "calib/step_q_w_n": 1095.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 551.92578125, "completions/mean_terminated_length": 560.6865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.2048, "grad_norm": 0.24515196681022644, "learning_rate": 2.5000000000000004e-07, "loss": 0.0516, "num_tokens": 40081788.0, "reward": 1.2249211072921753, "reward_std": 0.18614095449447632, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7818734049797058, "rewards/format_reward_step": 0.984375, "step": 192 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7868331456556916, "aux_distill/mean_u": 0.223379436485949, "aux_distill/n_active_tok": 125.25, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 10.15234375, "calib/ece": 0.33192771084337347, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9397590361445783, "calib/gap": 0.07807935393258403, "calib/mean_conf": 0.9436546184738956, "calib/mu_c": 0.9715624999999999, "calib/mu_w": 0.8934831460674159, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3165060240963855, "calib/std_conf": 0.21971480507931, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2257960056061668, "calib/step_q_c_n": 1427.0, "calib/step_q_gap": 0.017220920281934726, "calib/step_q_w": 0.20857508532423208, "calib/step_q_w_n": 1172.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 492.33984375, "completions/mean_terminated_length": 506.1806945800781, "completions/min_length": 0.0, "completions/min_terminated_length": 204.0, "epoch": 0.20586666666666667, "grad_norm": 0.3053726553916931, "learning_rate": 2.2222222222222224e-07, "loss": 0.0685, "num_tokens": 40313539.0, "reward": 1.123337984085083, "reward_std": 0.25543320178985596, "rewards/accuracy_reward_step": 0.625, "rewards/final_brier_reward_step": 0.6490199565887451, "rewards/format_reward_step": 0.97265625, "step": 193 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8021860085427761, "aux_distill/mean_u": 0.2652982515224455, "aux_distill/n_active_tok": 133.625, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 9.234375, "calib/ece": 0.23719367588932805, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8814229249011858, "calib/gap": 0.2746909992912827, "calib/mean_conf": 0.8877075098814229, "calib/mu_c": 0.9778235294117648, "calib/mu_w": 0.703132530120482, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22648221343873517, "calib/std_conf": 0.30340668096512574, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.20807055435565155, "calib/step_q_c_n": 1389.0, "calib/step_q_gap": -0.01781662513152793, "calib/step_q_w": 0.22588717948717948, "calib/step_q_w_n": 975.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 479.71484375, "completions/mean_terminated_length": 485.4031677246094, "completions/min_length": 0.0, "completions/min_terminated_length": 210.0, "epoch": 0.20693333333333333, "grad_norm": 0.29109108448028564, "learning_rate": 1.9444444444444447e-07, "loss": 0.0248, "num_tokens": 40542290.0, "reward": 1.2027337551116943, "reward_std": 0.17572781443595886, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7531238198280334, "rewards/format_reward_step": 0.98828125, "step": 194 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7851871335878968, "aux_distill/mean_u": 0.24789486043876469, "aux_distill/n_active_tok": 156.625, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 11.4375, "calib/ece": 0.33289795918367343, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.8979591836734694, "calib/gap": 0.16312925170068038, "calib/mean_conf": 0.9019591836734694, "calib/mu_c": 0.9672108843537416, "calib/mu_w": 0.8040816326530612, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3174285714285714, "calib/std_conf": 0.2854159842200734, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25532035842293904, "calib/step_q_c_n": 1395.0, "calib/step_q_gap": 0.05504638582019933, "calib/step_q_w": 0.2002739726027397, "calib/step_q_w_n": 1533.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2072.0, "completions/max_terminated_length": 2072.0, "completions/mean_length": 528.6640625, "completions/mean_terminated_length": 552.3999633789062, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.208, "grad_norm": 0.37227320671081543, "learning_rate": 1.6666666666666668e-07, "loss": -0.0018, "num_tokens": 40783612.0, "reward": 1.0841538906097412, "reward_std": 0.26760923862457275, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6370578408241272, "rewards/format_reward_step": 0.95703125, "step": 195 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.8097772235050797, "aux_distill/mean_u": 0.250261366145977, "aux_distill/n_active_tok": 138.125, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 8.828125, "calib/ece": 0.273461921647051, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9490196078431372, "calib/gap": 0.1354934395147117, "calib/mean_conf": 0.9515011373333255, "calib/mu_c": 0.9945402298850575, "calib/mu_w": 0.8590467903703458, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.271305058901953, "calib/std_conf": 0.20554215236891232, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.20119723183391006, "calib/step_q_c_n": 1445.0, "calib/step_q_gap": -0.04124816693909611, "calib/step_q_w": 0.24244539877300617, "calib/step_q_w_n": 815.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1477.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 439.47265625, "completions/mean_terminated_length": 442.9330749511719, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.20906666666666668, "grad_norm": 0.32618144154548645, "learning_rate": 1.3888888888888888e-07, "loss": 0.0864, "num_tokens": 40998661.0, "reward": 1.2001200914382935, "reward_std": 0.16445036232471466, "rewards/accuracy_reward_step": 0.6796875, "rewards/final_brier_reward_step": 0.7244589328765869, "rewards/format_reward_step": 0.99609375, "step": 196 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.806681577116251, "aux_distill/mean_u": 0.28128467985306665, "aux_distill/n_active_tok": 136.0, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 10.25, "calib/ece": 0.35570281124497993, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.9156626506024096, "calib/gap": 0.10875353535353527, "calib/mean_conf": 0.9234939759036144, "calib/mu_c": 0.9667333333333332, "calib/mu_w": 0.857979797979798, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.33839357429718875, "calib/std_conf": 0.25452420562463374, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2311550632911392, "calib/step_q_c_n": 1264.0, "calib/step_q_gap": -0.009418466120625552, "calib/step_q_w": 0.24057352941176474, "calib/step_q_w_n": 1360.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 501.43359375, "completions/mean_terminated_length": 515.5300903320312, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.21013333333333334, "grad_norm": 0.31283727288246155, "learning_rate": 1.1111111111111112e-07, "loss": 0.004, "num_tokens": 41232084.0, "reward": 1.0928354263305664, "reward_std": 0.29931554198265076, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6270769238471985, "rewards/format_reward_step": 0.97265625, "step": 197 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7634249338880181, "aux_distill/mean_u": 0.20665586842475142, "aux_distill/n_active_tok": 138.625, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 9.84765625, "calib/ece": 0.26354581673306765, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.900398406374502, "calib/gap": 0.1527189393939392, "calib/mean_conf": 0.9052191235059761, "calib/mu_c": 0.9508522727272727, "calib/mu_w": 0.7981333333333335, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23378486055776887, "calib/std_conf": 0.2810514049492244, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2543802749600839, "calib/step_q_c_n": 1548.0, "calib/step_q_gap": -0.016051379716175096, "calib/step_q_w": 0.270431654676259, "calib/step_q_w_n": 973.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 476.20703125, "completions/mean_terminated_length": 487.6360168457031, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.2112, "grad_norm": 0.27833330631256104, "learning_rate": 8.333333333333334e-08, "loss": 0.0462, "num_tokens": 41459377.0, "reward": 1.1917555332183838, "reward_std": 0.2348858118057251, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.719448447227478, "rewards/format_reward_step": 0.9765625, "step": 198 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7831273702904582, "aux_distill/mean_u": 0.23099978917405076, "aux_distill/n_active_tok": 133.875, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 9.484375, "calib/ece": 0.19971999999999998, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.888, "calib/gap": 0.2780963154492566, "calib/mean_conf": 0.8971600000000001, "calib/mu_c": 0.9728021978021979, "calib/mu_w": 0.6947058823529413, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18443999999999997, "calib/std_conf": 0.2912763883324565, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22950873846153844, "calib/step_q_c_n": 1625.0, "calib/step_q_gap": -0.006829991301848831, "calib/step_q_w": 0.23633872976338727, "calib/step_q_w_n": 803.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1574.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 529.06640625, "completions/mean_terminated_length": 541.7640380859375, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.21226666666666666, "grad_norm": 0.24881617724895477, "learning_rate": 5.555555555555556e-08, "loss": 0.0162, "num_tokens": 41699018.0, "reward": 1.2316912412643433, "reward_std": 0.23456567525863647, "rewards/accuracy_reward_step": 0.7109375, "rewards/final_brier_reward_step": 0.7797886729240417, "rewards/format_reward_step": 0.97265625, "step": 199 }, { "aux_distill/lambda": 0.10000000000000005, "aux_distill/loss": 0.7529831556603312, "aux_distill/mean_u": 0.220185384755039, "aux_distill/n_active_tok": 161.875, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 10.34375, "calib/ece": 0.23032128514056222, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.8875502008032129, "calib/gap": 0.22098116790631428, "calib/mean_conf": 0.8986746987951807, "calib/mu_c": 0.9616853932584271, "calib/mu_w": 0.7407042253521128, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20706827309236944, "calib/std_conf": 0.28799865190030877, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22154745529573588, "calib/step_q_c_n": 1454.0, "calib/step_q_gap": -0.06527540902587214, "calib/step_q_w": 0.28682286432160803, "calib/step_q_w_n": 1194.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2556.0, "completions/max_terminated_length": 2556.0, "completions/mean_length": 553.09375, "completions/mean_terminated_length": 568.642578125, "completions/min_length": 0.0, "completions/min_terminated_length": 130.0, "epoch": 0.21333333333333335, "grad_norm": 0.2786003649234772, "learning_rate": 2.777777777777778e-08, "loss": 0.0135, "num_tokens": 41948658.0, "reward": 1.2082239389419556, "reward_std": 0.16592733561992645, "rewards/accuracy_reward_step": 0.6953125, "rewards/final_brier_reward_step": 0.7484792470932007, "rewards/format_reward_step": 0.97265625, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 0.04889526396640576, "train_runtime": 15994.4075, "train_samples_per_second": 3.201, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 41948658, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }