{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21333333333333335, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.884798833302089, "aux_distill/mean_u": 0.2680448395277542, "aux_distill/n_active_tok": 22.285714285714285, "calib/answer_extract_rate": 0.0546875, "calib/avg_num_step_conf": 0.44921875, "calib/ece": 0.16135714285714264, "calib/final_conf_rate": 0.0546875, "calib/format_rate": 0.05078125, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.12215151515151501, "calib/mean_conf": 0.9256428571428571, "calib/mu_c": 0.9518181818181817, "calib/mu_w": 0.8296666666666667, "calib/nonempty_final_conf_rate": 0.0546875, "calib/nonempty_reasoning_rate": 0.08203125, "calib/nonempty_step_conf_rate": 0.078125, "calib/pce": 0.15064285714285694, "calib/std_conf": 0.1237495207164497, "calib/step_conf_rate": 0.078125, "calib/step_q_c": 0.8832727272727273, "calib/step_q_c_n": 55.0, "calib/step_q_gap": 0.042872727272727285, "calib/step_q_w": 0.8404, "calib/step_q_w_n": 60.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 2909.0, "completions/max_terminated_length": 2909.0, "completions/mean_length": 579.68359375, "completions/mean_terminated_length": 648.0305786132812, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.0010666666666666667, "grad_norm": 2.8604674339294434, "learning_rate": 0.0, "loss": 0.1208, "num_tokens": 255983.0, "reward": 0.06787636131048203, "reward_std": 0.1474149525165558, "rewards/accuracy_reward_step": 0.04296875, "rewards/final_brier_reward_step": 0.04200273007154465, "rewards/format_reward_step": 0.05078125, "step": 1 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.3874275883038838, "aux_distill/mean_u": 0.329474540462036, "aux_distill/n_active_tok": 27.333333333333332, "calib/answer_extract_rate": 0.0546875, "calib/avg_num_step_conf": 0.359375, "calib/ece": 0.6484615384615385, "calib/final_conf_rate": 0.05078125, "calib/format_rate": 0.046875, "calib/frac_conf_gt_0.9": 0.9230769230769231, "calib/gap": 0.016388888888888786, "calib/mean_conf": 0.9561538461538462, "calib/mu_c": 0.9674999999999999, "calib/mu_w": 0.9511111111111111, "calib/nonempty_final_conf_rate": 0.05078125, "calib/nonempty_reasoning_rate": 0.07421875, "calib/nonempty_step_conf_rate": 0.06640625, "calib/pce": 0.6484615384615385, "calib/std_conf": 0.024663414679817527, "calib/step_conf_rate": 0.06640625, "calib/step_q_c": 0.8756521739130435, "calib/step_q_c_n": 23.0, "calib/step_q_gap": 0.015072463768115885, "calib/step_q_w": 0.8605797101449276, "calib/step_q_w_n": 69.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2949.0, "completions/max_terminated_length": 2949.0, "completions/mean_length": 785.5390625, "completions/mean_terminated_length": 827.5637817382812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0021333333333333334, "grad_norm": 1.514235496520996, "learning_rate": 2.5000000000000004e-07, "loss": 0.0819, "num_tokens": 560369.0, "reward": 0.040345899760723114, "reward_std": 0.1000000610947609, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.01819179579615593, "rewards/format_reward_step": 0.046875, "step": 2 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.5618272125720978, "aux_distill/mean_u": 0.10840428648371338, "aux_distill/n_active_tok": 31.0, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.16015625, "calib/ece": 0.6399999999999999, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": -0.012500000000000067, "calib/mean_conf": 0.9733333333333333, "calib/mu_c": 0.965, "calib/mu_w": 0.9775, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.0234375, "calib/pce": 0.6399999999999999, "calib/std_conf": 0.019720265943665403, "calib/step_conf_rate": 0.0234375, "calib/step_q_c": 0.8381818181818183, "calib/step_q_c_n": 11.0, "calib/step_q_gap": -0.06198484848484842, "calib/step_q_w": 0.9001666666666667, "calib/step_q_w_n": 30.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 649.97265625, "completions/mean_terminated_length": 696.2050170898438, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0032, "grad_norm": 0.8159453868865967, "learning_rate": 5.000000000000001e-07, "loss": 0.0354, "num_tokens": 832018.0, "reward": 0.015773242339491844, "reward_std": 0.038874901831150055, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.008108984678983688, "rewards/format_reward_step": 0.015625, "step": 3 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.4118732213974, "aux_distill/mean_u": 0.3019396197880862, "aux_distill/n_active_tok": 24.0, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.109375, "calib/ece": 0.7300000000000001, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.01953125, "calib/frac_conf_gt_0.9": 0.8888888888888888, "calib/gap": -0.04785714285714282, "calib/mean_conf": 0.9522222222222223, "calib/mu_c": 0.915, "calib/mu_w": 0.9628571428571429, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.03125, "calib/nonempty_step_conf_rate": 0.01953125, "calib/pce": 0.7300000000000001, "calib/std_conf": 0.02657391276244754, "calib/step_conf_rate": 0.01953125, "calib/step_q_c": 0.892, "calib/step_q_c_n": 10.0, "calib/step_q_gap": 0.015888888888888952, "calib/step_q_w": 0.8761111111111111, "calib/step_q_w_n": 18.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 2858.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 685.22265625, "completions/mean_terminated_length": 752.8626708984375, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.004266666666666667, "grad_norm": 0.6611509919166565, "learning_rate": 7.5e-07, "loss": 0.04, "num_tokens": 1113603.0, "reward": 0.018045702949166298, "reward_std": 0.04522097483277321, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.008747655898332596, "rewards/format_reward_step": 0.01953125, "step": 4 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.2684955596923828, "aux_distill/mean_u": 0.2648181885977936, "aux_distill/n_active_tok": 31.333333333333332, "calib/answer_extract_rate": 0.03125, "calib/avg_num_step_conf": 0.203125, "calib/ece": 0.6962499999999999, "calib/final_conf_rate": 0.03125, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 0.875, "calib/gap": 0.04499999999999993, "calib/mean_conf": 0.94625, "calib/mu_c": 0.98, "calib/mu_w": 0.935, "calib/nonempty_final_conf_rate": 0.03125, "calib/nonempty_reasoning_rate": 0.03515625, "calib/nonempty_step_conf_rate": 0.03125, "calib/pce": 0.6962499999999999, "calib/std_conf": 0.07532222447591415, "calib/step_conf_rate": 0.03125, "calib/step_q_c": 0.9345454545454545, "calib/step_q_c_n": 22.0, "calib/step_q_gap": 0.07754545454545458, "calib/step_q_w": 0.8569999999999999, "calib/step_q_w_n": 30.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 2978.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 621.35546875, "completions/mean_terminated_length": 694.61572265625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.005333333333333333, "grad_norm": 1.1881181001663208, "learning_rate": 1.0000000000000002e-06, "loss": 0.0332, "num_tokens": 1379358.0, "reward": 0.022720899432897568, "reward_std": 0.06022542715072632, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.010285547003149986, "rewards/format_reward_step": 0.02734375, "step": 5 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 2.126640498638153, "aux_distill/mean_u": 0.2676678498777883, "aux_distill/n_active_tok": 14.0, "calib/answer_extract_rate": 0.04296875, "calib/avg_num_step_conf": 0.21484375, "calib/ece": 0.4745454545454545, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.03515625, "calib/frac_conf_gt_0.9": 0.7272727272727273, "calib/gap": 0.04933333333333334, "calib/mean_conf": 0.929090909090909, "calib/mu_c": 0.9560000000000001, "calib/mu_w": 0.9066666666666667, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.04296875, "calib/pce": 0.4745454545454545, "calib/std_conf": 0.07774720637536474, "calib/step_conf_rate": 0.04296875, "calib/step_q_c": 0.915, "calib/step_q_c_n": 24.0, "calib/step_q_gap": 0.08887096774193559, "calib/step_q_w": 0.8261290322580644, "calib/step_q_w_n": 31.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2847.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 622.8046875, "completions/mean_terminated_length": 667.1046142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0064, "grad_norm": 1.6805046796798706, "learning_rate": 1.25e-06, "loss": 0.075, "num_tokens": 1644748.0, "reward": 0.03894374892115593, "reward_std": 0.08292990177869797, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.01929374970495701, "rewards/format_reward_step": 0.03515625, "step": 6 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.2389460972377233, "aux_distill/mean_u": 0.28400473774278345, "aux_distill/n_active_tok": 34.285714285714285, "calib/answer_extract_rate": 0.05078125, "calib/avg_num_step_conf": 0.32421875, "calib/ece": 0.7879166666666666, "calib/final_conf_rate": 0.046875, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.9166666666666666, "calib/gap": 0.036499999999999866, "calib/mean_conf": 0.9545833333333333, "calib/mu_c": 0.985, "calib/mu_w": 0.9485000000000001, "calib/nonempty_final_conf_rate": 0.046875, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.046875, "calib/pce": 0.7879166666666666, "calib/std_conf": 0.048796274334101455, "calib/step_conf_rate": 0.046875, "calib/step_q_c": 0.869, "calib/step_q_c_n": 20.0, "calib/step_q_gap": -0.005761904761904746, "calib/step_q_w": 0.8747619047619047, "calib/step_q_w_n": 63.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 722.31640625, "completions/mean_terminated_length": 807.4803466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.007466666666666667, "grad_norm": 1.6011173725128174, "learning_rate": 1.5e-06, "loss": 0.0944, "num_tokens": 1937085.0, "reward": 0.031165577471256256, "reward_std": 0.08389748632907867, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.011549903079867363, "rewards/format_reward_step": 0.0390625, "step": 7 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.5897511541843414, "aux_distill/mean_u": 0.29507590785209553, "aux_distill/n_active_tok": 23.0, "calib/answer_extract_rate": 0.04296875, "calib/avg_num_step_conf": 0.1875, "calib/ece": 0.5155555555555555, "calib/final_conf_rate": 0.03515625, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.6666666666666666, "calib/gap": -0.012000000000000122, "calib/mean_conf": 0.9266666666666666, "calib/mu_c": 0.9199999999999999, "calib/mu_w": 0.932, "calib/nonempty_final_conf_rate": 0.03515625, "calib/nonempty_reasoning_rate": 0.046875, "calib/nonempty_step_conf_rate": 0.03515625, "calib/pce": 0.4988888888888889, "calib/std_conf": 0.03399346342395188, "calib/step_conf_rate": 0.03515625, "calib/step_q_c": 0.7973913043478259, "calib/step_q_c_n": 23.0, "calib/step_q_gap": -0.043008695652174134, "calib/step_q_w": 0.8404, "calib/step_q_w_n": 25.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2924.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 670.5859375, "completions/mean_terminated_length": 706.4608764648438, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.008533333333333334, "grad_norm": 1.4276400804519653, "learning_rate": 1.75e-06, "loss": 0.0649, "num_tokens": 2215267.0, "reward": 0.024509180337190628, "reward_std": 0.05961586534976959, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.009955858811736107, "rewards/format_reward_step": 0.0234375, "step": 8 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.7846717834472656, "aux_distill/mean_u": 0.2995953277024473, "aux_distill/n_active_tok": 16.0, "calib/answer_extract_rate": 0.0390625, "calib/avg_num_step_conf": 0.109375, "calib/ece": 0.9800000000000001, "calib/final_conf_rate": 0.01953125, "calib/format_rate": 0.015625, "calib/frac_conf_gt_0.9": 1.0, "calib/mean_conf": 0.9800000000000001, "calib/mu_c": NaN, "calib/mu_w": 0.9800000000000001, "calib/nonempty_final_conf_rate": 0.01953125, "calib/nonempty_reasoning_rate": 0.05078125, "calib/nonempty_step_conf_rate": 0.02734375, "calib/pce": 0.9800000000000001, "calib/std_conf": 0.016733200530681527, "calib/step_conf_rate": 0.02734375, "calib/step_q_w": 0.8967857142857143, "calib/step_q_w_n": 28.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 3060.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 653.74609375, "completions/mean_terminated_length": 700.246826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0096, "grad_norm": 2.4460411071777344, "learning_rate": 2.0000000000000003e-06, "loss": 0.0439, "num_tokens": 2490162.0, "reward": 0.008043359033763409, "reward_std": 0.02275005541741848, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.0004617187660187483, "rewards/format_reward_step": 0.015625, "step": 9 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.5699462890625, "aux_distill/mean_u": 0.14541823796785602, "aux_distill/n_active_tok": 4.0, "calib/answer_extract_rate": 0.03515625, "calib/avg_num_step_conf": 0.11328125, "calib/ece": 0.40714285714285725, "calib/final_conf_rate": 0.02734375, "calib/format_rate": 0.01171875, "calib/frac_conf_gt_0.9": 1.0, "calib/gap": 0.014999999999999902, "calib/mean_conf": 0.9785714285714286, "calib/mu_c": 0.985, "calib/mu_w": 0.9700000000000001, "calib/nonempty_final_conf_rate": 0.02734375, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.03125, "calib/pce": 0.40714285714285725, "calib/std_conf": 0.01551825784457175, "calib/step_conf_rate": 0.03125, "calib/step_q_c": 0.985, "calib/step_q_c_n": 2.0, "calib/step_q_gap": 0.33018518518518514, "calib/step_q_w": 0.6548148148148148, "calib/step_q_w_n": 27.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 766.7890625, "completions/mean_terminated_length": 835.3106079101562, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.010666666666666666, "grad_norm": 4.1238274574279785, "learning_rate": 2.25e-06, "loss": 0.0581, "num_tokens": 2793260.0, "reward": 0.01581699214875698, "reward_std": 0.04473721235990524, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.004290234297513962, "rewards/format_reward_step": 0.01171875, "step": 10 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.314156413078308, "aux_distill/mean_u": 0.16583724328311117, "aux_distill/n_active_tok": 17.6, "calib/answer_extract_rate": 0.05859375, "calib/avg_num_step_conf": 0.3203125, "calib/ece": 0.6253333333333335, "calib/final_conf_rate": 0.05859375, "calib/format_rate": 0.05078125, "calib/frac_conf_gt_0.9": 0.7333333333333333, "calib/gap": 0.10636363636363633, "calib/mean_conf": 0.8919999999999999, "calib/mu_c": 0.97, "calib/mu_w": 0.8636363636363636, "calib/nonempty_final_conf_rate": 0.05859375, "calib/nonempty_reasoning_rate": 0.078125, "calib/nonempty_step_conf_rate": 0.0703125, "calib/pce": 0.6253333333333335, "calib/std_conf": 0.14043741191956888, "calib/step_conf_rate": 0.0703125, "calib/step_q_c": 0.663125, "calib/step_q_c_n": 16.0, "calib/step_q_gap": -0.10748409090909095, "calib/step_q_w": 0.7706090909090909, "calib/step_q_w_n": 66.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 641.875, "completions/mean_terminated_length": 693.3333129882812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.011733333333333333, "grad_norm": 1.7187901735305786, "learning_rate": 2.5e-06, "loss": 0.0682, "num_tokens": 3062060.0, "reward": 0.042952343821525574, "reward_std": 0.08737139403820038, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.019498437643051147, "rewards/format_reward_step": 0.05078125, "step": 11 }, { "aux_distill/lambda": 0.29999999999999993, "aux_distill/loss": 1.278182077407837, "aux_distill/mean_u": 0.24745318100648223, "aux_distill/n_active_tok": 21.6, "calib/answer_extract_rate": 0.0546875, "calib/avg_num_step_conf": 0.3515625, "calib/ece": 0.37499999999999994, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.03125, "calib/frac_conf_gt_0.9": 0.7, "calib/gap": -0.012499999999999956, "calib/mean_conf": 0.945, "calib/mu_c": 0.9400000000000001, "calib/mu_w": 0.9525, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.078125, "calib/nonempty_step_conf_rate": 0.0625, "calib/pce": 0.35999999999999993, "calib/std_conf": 0.04341658669218481, "calib/step_conf_rate": 0.0625, "calib/step_q_c": 0.8353846153846152, "calib/step_q_c_n": 39.0, "calib/step_q_gap": 0.016953242835595628, "calib/step_q_w": 0.8184313725490195, "calib/step_q_w_n": 51.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 605.703125, "completions/mean_terminated_length": 659.8297729492188, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0128, "grad_norm": 1.3734772205352783, "learning_rate": 2.7500000000000004e-06, "loss": 0.0688, "num_tokens": 3321296.0, "reward": 0.03764042630791664, "reward_std": 0.10646320879459381, "rewards/accuracy_reward_step": 0.0234375, "rewards/final_brier_reward_step": 0.02059336006641388, "rewards/format_reward_step": 0.03125, "step": 12 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.0337275192141533, "aux_distill/mean_u": 0.1903723552664163, "aux_distill/n_active_tok": 25.0, "calib/answer_extract_rate": 0.03515625, "calib/avg_num_step_conf": 0.30078125, "calib/ece": 0.43499999999999994, "calib/final_conf_rate": 0.0390625, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 0.4, "calib/gap": 0.3252380952380952, "calib/mean_conf": 0.679, "calib/mu_c": 0.9066666666666667, "calib/mu_w": 0.5814285714285715, "calib/nonempty_final_conf_rate": 0.0390625, "calib/nonempty_reasoning_rate": 0.0546875, "calib/nonempty_step_conf_rate": 0.05078125, "calib/pce": 0.407, "calib/std_conf": 0.37479194228264834, "calib/step_conf_rate": 0.05078125, "calib/step_q_c": 0.7618181818181817, "calib/step_q_c_n": 11.0, "calib/step_q_gap": 0.15924242424242419, "calib/step_q_w": 0.6025757575757575, "calib/step_q_w_n": 66.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 3068.0, "completions/max_terminated_length": 3068.0, "completions/mean_length": 673.48828125, "completions/mean_terminated_length": 739.969970703125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.013866666666666666, "grad_norm": 1.3304049968719482, "learning_rate": 3e-06, "loss": 0.0784, "num_tokens": 3598301.0, "reward": 0.028053514659404755, "reward_std": 0.07934731990098953, "rewards/accuracy_reward_step": 0.01171875, "rewards/final_brier_reward_step": 0.01704453118145466, "rewards/format_reward_step": 0.02734375, "step": 13 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.1433793703715007, "aux_distill/mean_u": 0.4785981771689191, "aux_distill/n_active_tok": 9.333333333333334, "calib/answer_extract_rate": 0.02734375, "calib/avg_num_step_conf": 0.265625, "calib/ece": 0.6233333333333333, "calib/final_conf_rate": 0.01171875, "calib/format_rate": 0.00390625, "calib/frac_conf_gt_0.9": 0.3333333333333333, "calib/gap": -0.31499999999999995, "calib/mean_conf": 0.44999999999999996, "calib/mu_c": 0.24, "calib/mu_w": 0.5549999999999999, "calib/nonempty_final_conf_rate": 0.01171875, "calib/nonempty_reasoning_rate": 0.0703125, "calib/nonempty_step_conf_rate": 0.05078125, "calib/pce": 0.37, "calib/std_conf": 0.34765883660086455, "calib/step_conf_rate": 0.05078125, "calib/step_q_c": 0.262, "calib/step_q_c_n": 5.0, "calib/step_q_gap": -0.07657142857142857, "calib/step_q_w": 0.3385714285714286, "calib/step_q_w_n": 63.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 624.95703125, "completions/mean_terminated_length": 675.0590209960938, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.014933333333333333, "grad_norm": 1.457983374595642, "learning_rate": 3.2500000000000002e-06, "loss": 0.0601, "num_tokens": 3863690.0, "reward": 0.004731249995529652, "reward_std": 0.01338199619203806, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.0016499999910593033, "rewards/format_reward_step": 0.00390625, "step": 14 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.2922158042589824, "aux_distill/mean_u": 0.3207821715853844, "aux_distill/n_active_tok": 10.0, "calib/answer_extract_rate": 0.04296875, "calib/avg_num_step_conf": 0.3125, "calib/ece": 0.4483333333333333, "calib/final_conf_rate": 0.0234375, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.16666666666666666, "calib/gap": -0.14600000000000002, "calib/mean_conf": 0.36166666666666664, "calib/mu_c": 0.24, "calib/mu_w": 0.386, "calib/nonempty_final_conf_rate": 0.0234375, "calib/nonempty_reasoning_rate": 0.07421875, "calib/nonempty_step_conf_rate": 0.0625, "calib/pce": 0.32166666666666666, "calib/std_conf": 0.33012203130896245, "calib/step_conf_rate": 0.0625, "calib/step_q_c": 0.42400000000000004, "calib/step_q_c_n": 5.0, "calib/step_q_gap": -0.0019999999999999463, "calib/step_q_w": 0.426, "calib/step_q_w_n": 75.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 3057.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 497.453125, "completions/mean_terminated_length": 528.4149780273438, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.016, "grad_norm": 1.143528938293457, "learning_rate": 3.5e-06, "loss": 0.0974, "num_tokens": 4098918.0, "reward": 0.021565038710832596, "reward_std": 0.06099514290690422, "rewards/accuracy_reward_step": 0.00390625, "rewards/final_brier_reward_step": 0.01578632742166519, "rewards/format_reward_step": 0.0234375, "step": 15 }, { "aux_distill/lambda": 0.29999999999999993, "aux_distill/loss": 0.8635341763496399, "aux_distill/mean_u": 0.31292070845885867, "aux_distill/n_active_tok": 51.2, "calib/answer_extract_rate": 0.0546875, "calib/avg_num_step_conf": 0.7578125, "calib/ece": 0.37249999999999994, "calib/final_conf_rate": 0.046875, "calib/format_rate": 0.0390625, "calib/frac_conf_gt_0.9": 0.08333333333333333, "calib/gap": 0.10875000000000001, "calib/mean_conf": 0.3475, "calib/mu_c": 0.42, "calib/mu_w": 0.31124999999999997, "calib/nonempty_final_conf_rate": 0.046875, "calib/nonempty_reasoning_rate": 0.11328125, "calib/nonempty_step_conf_rate": 0.09765625, "calib/pce": 0.1933333333333333, "calib/std_conf": 0.34285869295284516, "calib/step_conf_rate": 0.09765625, "calib/step_q_c": 0.42625, "calib/step_q_c_n": 16.0, "calib/step_q_gap": 0.17636797752808991, "calib/step_q_w": 0.2498820224719101, "calib/step_q_w_n": 178.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3002.0, "completions/max_terminated_length": 3002.0, "completions/mean_length": 624.296875, "completions/mean_terminated_length": 677.203369140625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.017066666666666667, "grad_norm": 1.7086925506591797, "learning_rate": 3.7500000000000005e-06, "loss": 0.0955, "num_tokens": 4367586.0, "reward": 0.04019277170300484, "reward_std": 0.0873384177684784, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.025698047131299973, "rewards/format_reward_step": 0.0390625, "step": 16 }, { "aux_distill/lambda": 0.3, "aux_distill/loss": 1.1117850542068481, "aux_distill/mean_u": 0.3679425643558005, "aux_distill/n_active_tok": 38.0, "calib/answer_extract_rate": 0.0546875, "calib/avg_num_step_conf": 0.54296875, "calib/ece": 0.4572727272727272, "calib/final_conf_rate": 0.04296875, "calib/format_rate": 0.0234375, "calib/frac_conf_gt_0.9": 0.2727272727272727, "calib/mean_conf": 0.4572727272727272, "calib/mu_c": NaN, "calib/mu_w": 0.4572727272727272, "calib/nonempty_final_conf_rate": 0.04296875, "calib/nonempty_reasoning_rate": 0.1015625, "calib/nonempty_step_conf_rate": 0.07421875, "calib/pce": 0.4572727272727272, "calib/std_conf": 0.4130395294551866, "calib/step_conf_rate": 0.07421875, "calib/step_q_w": 0.3162045755422472, "calib/step_q_w_n": 139.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 555.11328125, "completions/mean_terminated_length": 597.0966796875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.018133333333333335, "grad_norm": 0.8777731657028198, "learning_rate": 4.000000000000001e-06, "loss": 0.0304, "num_tokens": 4613223.0, "reward": 0.02120097726583481, "reward_std": 0.05278696119785309, "rewards/accuracy_reward_step": 0.0, "rewards/final_brier_reward_step": 0.018964452669024467, "rewards/format_reward_step": 0.0234375, "step": 17 }, { "aux_distill/lambda": 0.29999999999999993, "aux_distill/loss": 0.9870762129624685, "aux_distill/mean_u": 0.3191481519811337, "aux_distill/n_active_tok": 22.0, "calib/answer_extract_rate": 0.05078125, "calib/avg_num_step_conf": 0.3203125, "calib/ece": 0.3189693576383376, "calib/final_conf_rate": 0.0546875, "calib/format_rate": 0.02734375, "calib/frac_conf_gt_0.9": 0.14285714285714285, "calib/gap": -0.1487975839113939, "calib/mean_conf": 0.34754078620976614, "calib/mu_c": 0.22, "calib/mu_w": 0.3687975839113939, "calib/nonempty_final_conf_rate": 0.0546875, "calib/nonempty_reasoning_rate": 0.08984375, "calib/nonempty_step_conf_rate": 0.06640625, "calib/pce": 0.26182650049548045, "calib/std_conf": 0.3387916578440022, "calib/step_conf_rate": 0.06640625, "calib/step_q_c": 0.10200000000000001, "calib/step_q_c_n": 5.0, "calib/step_q_gap": -0.19884221498891771, "calib/step_q_w": 0.3008422149889177, "calib/step_q_w_n": 77.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 3008.0, "completions/max_terminated_length": 3008.0, "completions/mean_length": 602.00390625, "completions/mean_terminated_length": 636.83056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0192, "grad_norm": 2.490189790725708, "learning_rate": 4.25e-06, "loss": 0.0671, "num_tokens": 4878056.0, "reward": 0.02656581997871399, "reward_std": 0.06706564873456955, "rewards/accuracy_reward_step": 0.0078125, "rewards/final_brier_reward_step": 0.01797538995742798, "rewards/format_reward_step": 0.02734375, "step": 18 }, { "aux_distill/lambda": 0.29999999999999993, "aux_distill/loss": 0.9927313059568406, "aux_distill/mean_u": 0.30662449710636475, "aux_distill/n_active_tok": 31.8, "calib/answer_extract_rate": 0.09765625, "calib/avg_num_step_conf": 1.18359375, "calib/ece": 0.276476284445, "calib/final_conf_rate": 0.078125, "calib/format_rate": 0.0625, "calib/frac_conf_gt_0.9": 0.05, "calib/gap": 0.02218476339803918, "calib/mean_conf": 0.22447628444499998, "calib/mu_c": 0.24333333333333332, "calib/mu_w": 0.22114856993529414, "calib/nonempty_final_conf_rate": 0.078125, "calib/nonempty_reasoning_rate": 0.21875, "calib/nonempty_step_conf_rate": 0.19921875, "calib/pce": 0.17547628444500002, "calib/std_conf": 0.246312235256632, "calib/step_conf_rate": 0.19921875, "calib/step_q_c": 0.23055555555555557, "calib/step_q_c_n": 18.0, "calib/step_q_gap": 0.02889905699921752, "calib/step_q_w": 0.20165649855633805, "calib/step_q_w_n": 284.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2978.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 639.859375, "completions/mean_terminated_length": 685.3723754882812, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.020266666666666665, "grad_norm": 2.305959939956665, "learning_rate": 4.5e-06, "loss": 0.1893, "num_tokens": 5146620.0, "reward": 0.06604452431201935, "reward_std": 0.1502785086631775, "rewards/accuracy_reward_step": 0.015625, "rewards/final_brier_reward_step": 0.05396406352519989, "rewards/format_reward_step": 0.0625, "step": 19 }, { "aux_distill/lambda": 0.2999999999999999, "aux_distill/loss": 1.0178489529568215, "aux_distill/mean_u": 0.44129956033095913, "aux_distill/n_active_tok": 34.08695652173913, "calib/answer_extract_rate": 0.2421875, "calib/avg_num_step_conf": 1.69921875, "calib/ece": 0.1661569117133794, "calib/final_conf_rate": 0.23046875, "calib/format_rate": 0.1484375, "calib/frac_conf_gt_0.9": 0.01694915254237288, "calib/gap": -0.05294987261431322, "calib/mean_conf": 0.21577022886999955, "calib/mu_c": 0.17, "calib/mu_w": 0.22294987261431323, "calib/nonempty_final_conf_rate": 0.23046875, "calib/nonempty_reasoning_rate": 0.375, "calib/nonempty_step_conf_rate": 0.29296875, "calib/pce": 0.12316696012219798, "calib/std_conf": 0.20314595471244742, "calib/step_conf_rate": 0.29296875, "calib/step_q_c": 0.21828571428571428, "calib/step_q_c_n": 35.0, "calib/step_q_gap": -0.005639482913952498, "calib/step_q_w": 0.22392519719966678, "calib/step_q_w_n": 362.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2923.0, "completions/max_terminated_length": 2923.0, "completions/mean_length": 573.02734375, "completions/mean_terminated_length": 596.3211059570312, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.021333333333333333, "grad_norm": 2.455202579498291, "learning_rate": 4.75e-06, "loss": 0.2093, "num_tokens": 5398187.0, "reward": 0.15179908275604248, "reward_std": 0.2831573188304901, "rewards/accuracy_reward_step": 0.03515625, "rewards/final_brier_reward_step": 0.12000440806150436, "rewards/format_reward_step": 0.1484375, "step": 20 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9270459283143282, "aux_distill/mean_u": 0.35117479873859514, "aux_distill/n_active_tok": 41.25, "calib/answer_extract_rate": 0.4296875, "calib/avg_num_step_conf": 2.91015625, "calib/ece": 0.1688007413558772, "calib/final_conf_rate": 0.37109375, "calib/format_rate": 0.24609375, "calib/frac_conf_gt_0.9": 0.010526315789473684, "calib/gap": 0.02934422203383541, "calib/mean_conf": 0.2818624165388597, "calib/mu_c": 0.30750000000000005, "calib/mu_w": 0.27815577796616464, "calib/nonempty_final_conf_rate": 0.37109375, "calib/nonempty_reasoning_rate": 0.62109375, "calib/nonempty_step_conf_rate": 0.48828125, "calib/pce": 0.1621736842105263, "calib/std_conf": 0.2205194295691477, "calib/step_conf_rate": 0.48828125, "calib/step_q_c": 0.21250000000000002, "calib/step_q_c_n": 64.0, "calib/step_q_gap": -0.010935235866823712, "calib/step_q_w": 0.22343523586682373, "calib/step_q_w_n": 681.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 480.07421875, "completions/mean_terminated_length": 493.57025146484375, "completions/min_length": 0.0, "completions/min_terminated_length": 4.0, "epoch": 0.0224, "grad_norm": 2.3599891662597656, "learning_rate": 5e-06, "loss": 0.35, "num_tokens": 5624046.0, "reward": 0.2571793496608734, "reward_std": 0.3987439274787903, "rewards/accuracy_reward_step": 0.05859375, "rewards/final_brier_reward_step": 0.20967116951942444, "rewards/format_reward_step": 0.24609375, "step": 21 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9610700905323029, "aux_distill/mean_u": 0.4553320713262558, "aux_distill/n_active_tok": 53.25, "calib/answer_extract_rate": 0.58984375, "calib/avg_num_step_conf": 3.44921875, "calib/ece": 0.17043563629820027, "calib/final_conf_rate": 0.56640625, "calib/format_rate": 0.4375, "calib/frac_conf_gt_0.9": 0.013793103448275862, "calib/gap": 0.04415089183342563, "calib/mean_conf": 0.2646309511539849, "calib/mu_c": 0.30391002044027393, "calib/mu_w": 0.2597591286068483, "calib/nonempty_final_conf_rate": 0.56640625, "calib/nonempty_reasoning_rate": 0.78515625, "calib/nonempty_step_conf_rate": 0.66796875, "calib/pce": 0.16236087993298912, "calib/std_conf": 0.21311021832354388, "calib/step_conf_rate": 0.66796875, "calib/step_q_c": 0.31052220008859993, "calib/step_q_c_n": 61.0, "calib/step_q_gap": 0.0386094263659722, "calib/step_q_w": 0.27191277372262773, "calib/step_q_w_n": 822.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 3012.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 442.33203125, "completions/mean_terminated_length": 458.44940185546875, "completions/min_length": 0.0, "completions/min_terminated_length": 4.0, "epoch": 0.023466666666666667, "grad_norm": 3.0896050930023193, "learning_rate": 4.9722222222222224e-06, "loss": 0.3149, "num_tokens": 5839099.0, "reward": 0.4424505829811096, "reward_std": 0.483396053314209, "rewards/accuracy_reward_step": 0.078125, "rewards/final_brier_reward_step": 0.36927616596221924, "rewards/format_reward_step": 0.4375, "step": 22 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9353764075785875, "aux_distill/mean_u": 0.35294600129183773, "aux_distill/n_active_tok": 72.625, "calib/answer_extract_rate": 0.625, "calib/avg_num_step_conf": 3.7578125, "calib/ece": 0.27355118955660707, "calib/final_conf_rate": 0.63671875, "calib/format_rate": 0.47265625, "calib/frac_conf_gt_0.9": 0.03680981595092025, "calib/gap": 0.028277245810025498, "calib/mean_conf": 0.35243953311488924, "calib/mu_c": 0.37846153846153846, "calib/mu_w": 0.35018429265151296, "calib/nonempty_final_conf_rate": 0.63671875, "calib/nonempty_reasoning_rate": 0.84765625, "calib/nonempty_step_conf_rate": 0.71484375, "calib/pce": 0.2731180607222512, "calib/std_conf": 0.2499565020687635, "calib/step_conf_rate": 0.71484375, "calib/step_q_c": 0.33299999999999996, "calib/step_q_c_n": 50.0, "calib/step_q_gap": -0.0760557458927632, "calib/step_q_w": 0.40905574589276317, "calib/step_q_w_n": 912.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 468.2890625, "completions/mean_terminated_length": 483.3951416015625, "completions/min_length": 0.0, "completions/min_terminated_length": 6.0, "epoch": 0.024533333333333334, "grad_norm": 3.023226737976074, "learning_rate": 4.944444444444445e-06, "loss": 0.2974, "num_tokens": 6062917.0, "reward": 0.45471012592315674, "reward_std": 0.4595857262611389, "rewards/accuracy_reward_step": 0.05078125, "rewards/final_brier_reward_step": 0.3859827518463135, "rewards/format_reward_step": 0.47265625, "step": 23 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.918319652788341, "aux_distill/mean_u": 0.3763038000716115, "aux_distill/n_active_tok": 54.5, "calib/answer_extract_rate": 0.7421875, "calib/avg_num_step_conf": 3.53125, "calib/ece": 0.25175623019718174, "calib/final_conf_rate": 0.71875, "calib/format_rate": 0.57421875, "calib/frac_conf_gt_0.9": 0.021739130434782608, "calib/gap": -0.017374254595942218, "calib/mean_conf": 0.35396275193631216, "calib/mu_c": 0.3385714285714286, "calib/mu_w": 0.3559456831673708, "calib/nonempty_final_conf_rate": 0.71875, "calib/nonempty_reasoning_rate": 0.87890625, "calib/nonempty_step_conf_rate": 0.75390625, "calib/pce": 0.2457942736754426, "calib/std_conf": 0.21846927693240498, "calib/step_conf_rate": 0.75390625, "calib/step_q_c": 0.4223333333333334, "calib/step_q_c_n": 90.0, "calib/step_q_gap": 0.05739322645372652, "calib/step_q_w": 0.3649401068796069, "calib/step_q_w_n": 814.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 432.51171875, "completions/mean_terminated_length": 448.2712707519531, "completions/min_length": 0.0, "completions/min_terminated_length": 15.0, "epoch": 0.0256, "grad_norm": 1.662244439125061, "learning_rate": 4.9166666666666665e-06, "loss": 0.3306, "num_tokens": 6278152.0, "reward": 0.5555780529975891, "reward_std": 0.4755654036998749, "rewards/accuracy_reward_step": 0.08203125, "rewards/final_brier_reward_step": 0.45490607619285583, "rewards/format_reward_step": 0.57421875, "step": 24 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.940506137907505, "aux_distill/mean_u": 0.37342990083807837, "aux_distill/n_active_tok": 66.0, "calib/answer_extract_rate": 0.78125, "calib/avg_num_step_conf": 3.96875, "calib/ece": 0.2819637535040166, "calib/final_conf_rate": 0.80078125, "calib/format_rate": 0.703125, "calib/frac_conf_gt_0.9": 0.03414634146341464, "calib/gap": -0.0317800534933525, "calib/mean_conf": 0.3949393632601142, "calib/mu_c": 0.36750000000000005, "calib/mu_w": 0.39928005349335255, "calib/nonempty_final_conf_rate": 0.80078125, "calib/nonempty_reasoning_rate": 0.91796875, "calib/nonempty_step_conf_rate": 0.84375, "calib/pce": 0.2701588754552361, "calib/std_conf": 0.23788335954319495, "calib/step_conf_rate": 0.84375, "calib/step_q_c": 0.34856725146198836, "calib/step_q_c_n": 171.0, "calib/step_q_gap": -0.08577948073599456, "calib/step_q_w": 0.4343467321979829, "calib/step_q_w_n": 845.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 364.23046875, "completions/mean_terminated_length": 374.4698791503906, "completions/min_length": 0.0, "completions/min_terminated_length": 24.0, "epoch": 0.02666666666666667, "grad_norm": 1.7440599203109741, "learning_rate": 4.888888888888889e-06, "loss": 0.3463, "num_tokens": 6474619.0, "reward": 0.6718124151229858, "reward_std": 0.4372907280921936, "rewards/accuracy_reward_step": 0.11328125, "rewards/final_brier_reward_step": 0.5272184610366821, "rewards/format_reward_step": 0.703125, "step": 25 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9505161102861166, "aux_distill/mean_u": 0.43465847075130665, "aux_distill/n_active_tok": 70.75, "calib/answer_extract_rate": 0.87890625, "calib/avg_num_step_conf": 4.234375, "calib/ece": 0.35119691629955946, "calib/final_conf_rate": 0.88671875, "calib/format_rate": 0.80859375, "calib/frac_conf_gt_0.9": 0.048458149779735685, "calib/gap": -0.11122185534591206, "calib/mean_conf": 0.4052057268722467, "calib/mu_c": 0.3013333333333333, "calib/mu_w": 0.41255518867924534, "calib/nonempty_final_conf_rate": 0.88671875, "calib/nonempty_reasoning_rate": 0.9765625, "calib/nonempty_step_conf_rate": 0.9296875, "calib/pce": 0.34516167400881054, "calib/std_conf": 0.23691412078640622, "calib/step_conf_rate": 0.9296875, "calib/step_q_c": 0.39131578947368423, "calib/step_q_c_n": 76.0, "calib/step_q_gap": -0.013101670843776114, "calib/step_q_w": 0.40441746031746034, "calib/step_q_w_n": 1008.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2894.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 351.65625, "completions/mean_terminated_length": 368.9508056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 25.0, "epoch": 0.027733333333333332, "grad_norm": 1.5558923482894897, "learning_rate": 4.861111111111111e-06, "loss": 0.2922, "num_tokens": 6669883.0, "reward": 0.7352690696716309, "reward_std": 0.3406623303890228, "rewards/accuracy_reward_step": 0.05859375, "rewards/final_brier_reward_step": 0.6033506393432617, "rewards/format_reward_step": 0.80859375, "step": 26 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8715469744056463, "aux_distill/mean_u": 0.4131292154875933, "aux_distill/n_active_tok": 68.875, "calib/answer_extract_rate": 0.9296875, "calib/avg_num_step_conf": 4.6640625, "calib/ece": 0.32931550512155855, "calib/final_conf_rate": 0.94140625, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.04564315352697095, "calib/gap": 0.014844166011933668, "calib/mean_conf": 0.42060181217550047, "calib/mu_c": 0.434090909090909, "calib/mu_w": 0.41924674307897536, "calib/nonempty_final_conf_rate": 0.94140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.32931550512155855, "calib/std_conf": 0.23038737837096063, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.423968253968254, "calib/step_q_c_n": 126.0, "calib/step_q_gap": -0.006461334046727274, "calib/step_q_w": 0.43042958801498127, "calib/step_q_w_n": 1068.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2791.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 354.55859375, "completions/mean_terminated_length": 358.7628479003906, "completions/min_length": 0.0, "completions/min_terminated_length": 24.0, "epoch": 0.0288, "grad_norm": 1.6147425174713135, "learning_rate": 4.833333333333333e-06, "loss": 0.2992, "num_tokens": 6865866.0, "reward": 0.8195099830627441, "reward_std": 0.28160202503204346, "rewards/accuracy_reward_step": 0.08984375, "rewards/final_brier_reward_step": 0.6702699661254883, "rewards/format_reward_step": 0.87890625, "step": 27 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8409523535519838, "aux_distill/mean_u": 0.39175565836695353, "aux_distill/n_active_tok": 96.0, "calib/answer_extract_rate": 0.9296875, "calib/avg_num_step_conf": 5.765625, "calib/ece": 0.3076613089487777, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.91015625, "calib/frac_conf_gt_0.9": 0.024691358024691357, "calib/gap": -0.04317698626550687, "calib/mean_conf": 0.4128464941339629, "calib/mu_c": 0.37500000000000006, "calib/mu_w": 0.4181769862655069, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.2985255064796419, "calib/std_conf": 0.22269757326949044, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3717714285714286, "calib/step_q_c_n": 175.0, "calib/step_q_gap": -0.047476346667450486, "calib/step_q_w": 0.4192477752388791, "calib/step_q_w_n": 1301.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2873.0, "completions/max_terminated_length": 2873.0, "completions/mean_length": 360.7265625, "completions/mean_terminated_length": 366.452392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 50.0, "epoch": 0.029866666666666666, "grad_norm": 1.035765290260315, "learning_rate": 4.805555555555556e-06, "loss": 0.2905, "num_tokens": 7065156.0, "reward": 0.8525996804237366, "reward_std": 0.2693730294704437, "rewards/accuracy_reward_step": 0.1171875, "rewards/final_brier_reward_step": 0.6778556108474731, "rewards/format_reward_step": 0.91015625, "step": 28 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8522766791284084, "aux_distill/mean_u": 0.37731298414794345, "aux_distill/n_active_tok": 85.25, "calib/answer_extract_rate": 0.94140625, "calib/avg_num_step_conf": 5.11328125, "calib/ece": 0.3199395161290322, "calib/final_conf_rate": 0.96875, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.012096774193548387, "calib/gap": 0.011023706896551744, "calib/mean_conf": 0.384375, "calib/mu_c": 0.39468749999999997, "calib/mu_w": 0.3836637931034482, "calib/nonempty_final_conf_rate": 0.96875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9765625, "calib/pce": 0.3198991935483871, "calib/std_conf": 0.21358195266462066, "calib/step_conf_rate": 0.9765625, "calib/step_q_c": 0.4183333333333333, "calib/step_q_c_n": 84.0, "calib/step_q_gap": 0.05758231292517002, "calib/step_q_w": 0.36075102040816326, "calib/step_q_w_n": 1225.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2043.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 308.078125, "completions/mean_terminated_length": 314.21514892578125, "completions/min_length": 0.0, "completions/min_terminated_length": 27.0, "epoch": 0.030933333333333334, "grad_norm": 1.1891764402389526, "learning_rate": 4.777777777777778e-06, "loss": 0.2311, "num_tokens": 7251152.0, "reward": 0.8489661812782288, "reward_std": 0.25030654668807983, "rewards/accuracy_reward_step": 0.0625, "rewards/final_brier_reward_step": 0.7213698029518127, "rewards/format_reward_step": 0.9140625, "step": 29 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9309793021529913, "aux_distill/mean_u": 0.3487368012615765, "aux_distill/n_active_tok": 76.625, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 5.078125, "calib/ece": 0.28369047619047616, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.007936507936507936, "calib/gap": -0.023942428035043872, "calib/mean_conf": 0.3511507936507937, "calib/mu_c": 0.3288235294117647, "calib/mu_w": 0.35276595744680855, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.28369047619047616, "calib/std_conf": 0.195815006942304, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.35017094017094014, "calib/step_q_c_n": 117.0, "calib/step_q_gap": -0.006929651545036164, "calib/step_q_w": 0.3571005917159763, "calib/step_q_w_n": 1183.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2379.0, "completions/max_terminated_length": 2379.0, "completions/mean_length": 272.4453125, "completions/mean_terminated_length": 275.6759033203125, "completions/min_length": 0.0, "completions/min_terminated_length": 59.0, "epoch": 0.032, "grad_norm": 1.1468839645385742, "learning_rate": 4.75e-06, "loss": 0.206, "num_tokens": 7427882.0, "reward": 0.9133496284484863, "reward_std": 0.16822901368141174, "rewards/accuracy_reward_step": 0.06640625, "rewards/final_brier_reward_step": 0.7915430068969727, "rewards/format_reward_step": 0.96875, "step": 30 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9184452090412378, "aux_distill/mean_u": 0.3828412534240243, "aux_distill/n_active_tok": 76.625, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 4.96484375, "calib/ece": 0.25460396083361886, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.01568627450980392, "calib/gap": -0.0025642798847438364, "calib/mean_conf": 0.3598588627944031, "calib/mu_c": 0.3575862068965517, "calib/mu_w": 0.3601504867812955, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2503686667159718, "calib/std_conf": 0.18381201605366868, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3186231884057971, "calib/step_q_c_n": 138.0, "calib/step_q_gap": -0.037029460320658014, "calib/step_q_w": 0.35565264872645513, "calib/step_q_w_n": 1133.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1761.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 247.328125, "completions/mean_terminated_length": 249.2755889892578, "completions/min_length": 0.0, "completions/min_terminated_length": 51.0, "epoch": 0.03306666666666667, "grad_norm": 1.1739736795425415, "learning_rate": 4.722222222222222e-06, "loss": 0.2572, "num_tokens": 7597110.0, "reward": 0.9391323328018188, "reward_std": 0.15756180882453918, "rewards/accuracy_reward_step": 0.1171875, "rewards/final_brier_reward_step": 0.7845146656036377, "rewards/format_reward_step": 0.9765625, "step": 31 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8915242683142424, "aux_distill/mean_u": 0.3488786209027339, "aux_distill/n_active_tok": 91.375, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 5.765625, "calib/ece": 0.22366141732283462, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.007874015748031496, "calib/gap": -0.025626875407697258, "calib/mean_conf": 0.33295275590551177, "calib/mu_c": 0.3108571428571429, "calib/mu_w": 0.33648401826484015, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.20940944881889761, "calib/std_conf": 0.19328373978999924, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.310981308411215, "calib/step_q_c_n": 214.0, "calib/step_q_gap": 0.002391767999170624, "calib/step_q_w": 0.3085895404120444, "calib/step_q_w_n": 1262.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 256.578125, "completions/mean_terminated_length": 258.5984191894531, "completions/min_length": 0.0, "completions/min_terminated_length": 61.0, "epoch": 0.034133333333333335, "grad_norm": 1.5438235998153687, "learning_rate": 4.694444444444445e-06, "loss": 0.2457, "num_tokens": 7769498.0, "reward": 0.9572702646255493, "reward_std": 0.1368747055530548, "rewards/accuracy_reward_step": 0.140625, "rewards/final_brier_reward_step": 0.7895406484603882, "rewards/format_reward_step": 0.984375, "step": 32 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.917936485260725, "aux_distill/mean_u": 0.3906123472827693, "aux_distill/n_active_tok": 113.0, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 7.078125, "calib/ece": 0.20196850393700788, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.007874015748031496, "calib/gap": -0.00590214067278283, "calib/mean_conf": 0.32173228346456695, "calib/mu_c": 0.3166666666666667, "calib/mu_w": 0.32256880733944954, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.19098425196850397, "calib/std_conf": 0.1856955195088237, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.33299107142857143, "calib/step_q_c_n": 224.0, "calib/step_q_gap": 0.03755026538323136, "calib/step_q_w": 0.29544080604534007, "calib/step_q_w_n": 1588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1418.0, "completions/max_terminated_length": 1418.0, "completions/mean_length": 311.609375, "completions/mean_terminated_length": 314.06298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.0352, "grad_norm": 4.340875625610352, "learning_rate": 4.666666666666667e-06, "loss": 0.2648, "num_tokens": 7956142.0, "reward": 0.9643667936325073, "reward_std": 0.1367204785346985, "rewards/accuracy_reward_step": 0.140625, "rewards/final_brier_reward_step": 0.7998273372650146, "rewards/format_reward_step": 0.98828125, "step": 33 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.851872069761157, "aux_distill/mean_u": 0.32349913069650194, "aux_distill/n_active_tok": 118.375, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 7.109375, "calib/ece": 0.20184476940382454, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.015748031496062992, "calib/gap": 0.00206108374384234, "calib/mean_conf": 0.30886389201349834, "calib/mu_c": 0.31068965517241376, "calib/mu_w": 0.3086285714285714, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1982677165354331, "calib/std_conf": 0.1947032285675119, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3370466321243524, "calib/step_q_c_n": 193.0, "calib/step_q_gap": 0.028676819146917942, "calib/step_q_w": 0.30836981297743443, "calib/step_q_w_n": 1627.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 294.09375, "completions/mean_terminated_length": 296.4094543457031, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.03626666666666667, "grad_norm": 1.6347882747650146, "learning_rate": 4.638888888888889e-06, "loss": 0.2868, "num_tokens": 8136542.0, "reward": 0.9612504243850708, "reward_std": 0.13310450315475464, "rewards/accuracy_reward_step": 0.11328125, "rewards/final_brier_reward_step": 0.8170320987701416, "rewards/format_reward_step": 0.9921875, "step": 34 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8687707763165236, "aux_distill/mean_u": 0.3417219541422614, "aux_distill/n_active_tok": 135.125, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 8.8671875, "calib/ece": 0.18003937007874016, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": -0.012003913894324814, "calib/mean_conf": 0.2749212598425197, "calib/mu_c": 0.2645714285714286, "calib/mu_w": 0.27657534246575344, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.15858267716535435, "calib/std_conf": 0.1757632223629866, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2934931506849316, "calib/step_q_c_n": 292.0, "calib/step_q_gap": -0.002841834148233191, "calib/step_q_w": 0.2963349848331648, "calib/step_q_w_n": 1978.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 339.9765625, "completions/mean_terminated_length": 342.6535339355469, "completions/min_length": 0.0, "completions/min_terminated_length": 128.0, "epoch": 0.037333333333333336, "grad_norm": 1.1217002868652344, "learning_rate": 4.611111111111112e-06, "loss": 0.2413, "num_tokens": 8332832.0, "reward": 0.9755380749702454, "reward_std": 0.11975622177124023, "rewards/accuracy_reward_step": 0.13671875, "rewards/final_brier_reward_step": 0.8221698999404907, "rewards/format_reward_step": 0.9921875, "step": 35 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9129628445953131, "aux_distill/mean_u": 0.421250388575979, "aux_distill/n_active_tok": 130.125, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.28515625, "calib/ece": 0.131640625, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.01171875, "calib/gap": 0.005971291866028627, "calib/mean_conf": 0.28375, "calib/mu_c": 0.28818181818181815, "calib/mu_w": 0.2822105263157895, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0787890625, "calib/std_conf": 0.1903717580157309, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32663507109004736, "calib/step_q_c_n": 422.0, "calib/step_q_gap": 0.001790927476156834, "calib/step_q_w": 0.3248441436138905, "calib/step_q_w_n": 1699.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 312.66796875, "completions/mean_terminated_length": 315.1299133300781, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.0384, "grad_norm": 1.9868746995925903, "learning_rate": 4.583333333333333e-06, "loss": 0.2626, "num_tokens": 8515587.0, "reward": 1.0159192085266113, "reward_std": 0.12932753562927246, "rewards/accuracy_reward_step": 0.2578125, "rewards/final_brier_reward_step": 0.7740257978439331, "rewards/format_reward_step": 1.0, "step": 36 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8150140605866909, "aux_distill/mean_u": 0.2819595940367653, "aux_distill/n_active_tok": 142.125, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 9.19140625, "calib/ece": 0.18827624015779124, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0078125, "calib/gap": -0.02756114637071816, "calib/mean_conf": 0.31054186515779125, "calib/mu_c": 0.28717948717948716, "calib/mu_w": 0.3147406335502053, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17323717765779123, "calib/std_conf": 0.19026432766581608, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28545090909090914, "calib/step_q_c_n": 275.0, "calib/step_q_gap": -0.08267099806038813, "calib/step_q_w": 0.3681219071512973, "calib/step_q_w_n": 2078.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 338.0078125, "completions/mean_terminated_length": 340.6692810058594, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.039466666666666664, "grad_norm": 0.9691076874732971, "learning_rate": 4.555555555555556e-06, "loss": 0.2546, "num_tokens": 8709213.0, "reward": 0.9745661616325378, "reward_std": 0.12055601924657822, "rewards/accuracy_reward_step": 0.15234375, "rewards/final_brier_reward_step": 0.8006948828697205, "rewards/format_reward_step": 0.99609375, "step": 37 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9269427638500929, "aux_distill/mean_u": 0.39672384793919174, "aux_distill/n_active_tok": 149.75, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 9.75, "calib/ece": 0.13189723320158103, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.003952569169960474, "calib/gap": 0.017112944162436555, "calib/mean_conf": 0.25292490118577077, "calib/mu_c": 0.26625, "calib/mu_w": 0.24913705583756343, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.08173913043478262, "calib/std_conf": 0.17077636687404305, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3071501272264631, "calib/step_q_c_n": 393.0, "calib/step_q_gap": 0.03841698409759958, "calib/step_q_w": 0.26873314312886354, "calib/step_q_w_n": 2103.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 348.8671875, "completions/mean_terminated_length": 353.00396728515625, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.04053333333333333, "grad_norm": 1.204365611076355, "learning_rate": 4.527777777777778e-06, "loss": 0.2647, "num_tokens": 8905411.0, "reward": 0.9932834506034851, "reward_std": 0.14956696331501007, "rewards/accuracy_reward_step": 0.21875, "rewards/final_brier_reward_step": 0.7873480916023254, "rewards/format_reward_step": 0.98046875, "step": 38 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8835952132940292, "aux_distill/mean_u": 0.28569774736326614, "aux_distill/n_active_tok": 114.0, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.1328125, "calib/ece": 0.1694797277269374, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": -0.0029947900175748554, "calib/mean_conf": 0.2791672277269374, "calib/mu_c": 0.27679245283018866, "calib/mu_w": 0.2797872428477635, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12080785272693742, "calib/std_conf": 0.19193509897197172, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.30876010781671165, "calib/step_q_c_n": 371.0, "calib/step_q_gap": 0.0038731106546076033, "calib/step_q_w": 0.30488699716210405, "calib/step_q_w_n": 1711.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 332.22265625, "completions/mean_terminated_length": 334.8385925292969, "completions/min_length": 0.0, "completions/min_terminated_length": 138.0, "epoch": 0.0416, "grad_norm": 1.4351422786712646, "learning_rate": 4.5e-06, "loss": 0.2705, "num_tokens": 9096548.0, "reward": 0.9999179840087891, "reward_std": 0.11247202754020691, "rewards/accuracy_reward_step": 0.20703125, "rewards/final_brier_reward_step": 0.7928047180175781, "rewards/format_reward_step": 1.0, "step": 39 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.900733919814229, "aux_distill/mean_u": 0.333621742383105, "aux_distill/n_active_tok": 122.625, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 8.61328125, "calib/ece": 0.16886274509803922, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": 0.016845509893455124, "calib/mean_conf": 0.251921568627451, "calib/mu_c": 0.26638888888888895, "calib/mu_w": 0.24954337899543383, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13980392156862748, "calib/std_conf": 0.19011408784605996, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2860714285714286, "calib/step_q_c_n": 252.0, "calib/step_q_gap": -0.0015418842805939303, "calib/step_q_w": 0.2876133128520225, "calib/step_q_w_n": 1953.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 333.73828125, "completions/mean_terminated_length": 336.36614990234375, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.042666666666666665, "grad_norm": 1.1747071743011475, "learning_rate": 4.472222222222223e-06, "loss": 0.2479, "num_tokens": 9288745.0, "reward": 0.9839453101158142, "reward_std": 0.10935330390930176, "rewards/accuracy_reward_step": 0.140625, "rewards/final_brier_reward_step": 0.8311718702316284, "rewards/format_reward_step": 0.99609375, "step": 40 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.885868264362216, "aux_distill/mean_u": 0.34837704053660135, "aux_distill/n_active_tok": 120.5, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.34375, "calib/ece": 0.18067968750000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0078125, "calib/gap": -0.016945614035087653, "calib/mean_conf": 0.2678359375, "calib/mu_c": 0.255921052631579, "calib/mu_w": 0.27286666666666665, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07582031250000001, "calib/std_conf": 0.2042344423903473, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.33281376518218625, "calib/step_q_c_n": 494.0, "calib/step_q_gap": 0.044120475138896165, "calib/step_q_w": 0.2886932900432901, "calib/step_q_w_n": 1386.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 297.26171875, "completions/mean_terminated_length": 299.60235595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.04373333333333333, "grad_norm": 2.003614664077759, "learning_rate": 4.444444444444444e-06, "loss": 0.2752, "num_tokens": 9472092.0, "reward": 1.0167245864868164, "reward_std": 0.14265429973602295, "rewards/accuracy_reward_step": 0.296875, "rewards/final_brier_reward_step": 0.7404803037643433, "rewards/format_reward_step": 0.99609375, "step": 41 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9000848159193993, "aux_distill/mean_u": 0.3127236620497125, "aux_distill/n_active_tok": 118.5, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 8.03125, "calib/ece": 0.15384313725490198, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.01568627450980392, "calib/gap": 0.031094871346052766, "calib/mean_conf": 0.2975294117647058, "calib/mu_c": 0.3215517241379309, "calib/mu_w": 0.29045685279187816, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1119607843137255, "calib/std_conf": 0.2243818832417291, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3662303664921467, "calib/step_q_c_n": 382.0, "calib/step_q_gap": 0.07238765442524109, "calib/step_q_w": 0.2938427120669056, "calib/step_q_w_n": 1674.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 299.609375, "completions/mean_terminated_length": 301.968505859375, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.0448, "grad_norm": 3.954033613204956, "learning_rate": 4.416666666666667e-06, "loss": 0.2571, "num_tokens": 9653160.0, "reward": 0.9958779215812683, "reward_std": 0.16326522827148438, "rewards/accuracy_reward_step": 0.2265625, "rewards/final_brier_reward_step": 0.7730058431625366, "rewards/format_reward_step": 0.9921875, "step": 42 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8822179082781076, "aux_distill/mean_u": 0.2652459974117178, "aux_distill/n_active_tok": 110.625, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.265625, "calib/ece": 0.10752070312499999, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.032956915113871665, "calib/mean_conf": 0.253182421875, "calib/mu_c": 0.28021739130434786, "calib/mu_w": 0.2472604761904762, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09050781249999999, "calib/std_conf": 0.20561714812989704, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3243166666666667, "calib/step_q_c_n": 336.0, "calib/step_q_gap": 0.03327598425196859, "calib/step_q_w": 0.2910406824146981, "calib/step_q_w_n": 1524.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 288.6299133300781, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.04586666666666667, "grad_norm": 4.156473636627197, "learning_rate": 4.388888888888889e-06, "loss": 0.2712, "num_tokens": 9831696.0, "reward": 0.9935085773468018, "reward_std": 0.12804695963859558, "rewards/accuracy_reward_step": 0.1796875, "rewards/final_brier_reward_step": 0.8112359046936035, "rewards/format_reward_step": 0.99609375, "step": 43 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8823073115199804, "aux_distill/mean_u": 0.2861128941515574, "aux_distill/n_active_tok": 130.25, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 7.52734375, "calib/ece": 0.18015810276679844, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.019762845849802372, "calib/gap": 0.010590909090909095, "calib/mean_conf": 0.271699604743083, "calib/mu_c": 0.2809090909090909, "calib/mu_w": 0.2703181818181818, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16071146245059292, "calib/std_conf": 0.2293293984725821, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32342342342342345, "calib/step_q_c_n": 222.0, "calib/step_q_gap": 0.01449292488969911, "calib/step_q_w": 0.30893049853372434, "calib/step_q_w_n": 1705.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 303.41796875, "completions/mean_terminated_length": 304.60784912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.046933333333333334, "grad_norm": 3.0850069522857666, "learning_rate": 4.361111111111112e-06, "loss": 0.2627, "num_tokens": 10015691.0, "reward": 0.9543378353118896, "reward_std": 0.15263614058494568, "rewards/accuracy_reward_step": 0.12890625, "rewards/final_brier_reward_step": 0.7993007898330688, "rewards/format_reward_step": 0.98046875, "step": 44 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8355578556656837, "aux_distill/mean_u": 0.26338739376699555, "aux_distill/n_active_tok": 115.125, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.21484375, "calib/ece": 0.21674509803921563, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.01568627450980392, "calib/gap": -0.055726495726495684, "calib/mean_conf": 0.265921568627451, "calib/mu_c": 0.2187179487179487, "calib/mu_w": 0.2744444444444444, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1648627450980392, "calib/std_conf": 0.21819184247590376, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25475471698113206, "calib/step_q_c_n": 265.0, "calib/step_q_gap": -0.03257777353719915, "calib/step_q_w": 0.2873324905183312, "calib/step_q_w_n": 1582.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 296.41796875, "completions/mean_terminated_length": 298.751953125, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.048, "grad_norm": 3.737006902694702, "learning_rate": 4.333333333333334e-06, "loss": 0.2522, "num_tokens": 10196622.0, "reward": 0.9704841375350952, "reward_std": 0.11979597806930542, "rewards/accuracy_reward_step": 0.15234375, "rewards/final_brier_reward_step": 0.7925308346748352, "rewards/format_reward_step": 0.99609375, "step": 45 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9158951379358768, "aux_distill/mean_u": 0.3747160606423709, "aux_distill/n_active_tok": 109.875, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.03515625, "calib/ece": 0.1684313725490196, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.011764705882352941, "calib/gap": -0.008765096618357532, "calib/mean_conf": 0.2248235294117647, "calib/mu_c": 0.2177083333333333, "calib/mu_w": 0.22647342995169084, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10250980392156862, "calib/std_conf": 0.2021353367136176, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.31444210526315797, "calib/step_q_c_n": 285.0, "calib/step_q_gap": 0.040425746424107845, "calib/step_q_w": 0.2740163588390501, "calib/step_q_w_n": 1516.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 285.61328125, "completions/mean_terminated_length": 287.8622131347656, "completions/min_length": 0.0, "completions/min_terminated_length": 92.0, "epoch": 0.04906666666666667, "grad_norm": 1.399847149848938, "learning_rate": 4.305555555555556e-06, "loss": 0.2632, "num_tokens": 10374507.0, "reward": 0.9875702857971191, "reward_std": 0.10994257032871246, "rewards/accuracy_reward_step": 0.1875, "rewards/final_brier_reward_step": 0.7954530715942383, "rewards/format_reward_step": 0.9921875, "step": 46 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8687598332762718, "aux_distill/mean_u": 0.30231973079135227, "aux_distill/n_active_tok": 139.625, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 9.08203125, "calib/ece": 0.15363636363636363, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.007905138339920948, "calib/gap": -0.013971698113207559, "calib/mean_conf": 0.2055731225296443, "calib/mu_c": 0.19452830188679246, "calib/mu_w": 0.20850000000000002, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07486166007905139, "calib/std_conf": 0.2029918697843591, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26025, "calib/step_q_c_n": 360.0, "calib/step_q_gap": 0.023171119592875294, "calib/step_q_w": 0.2370788804071247, "calib/step_q_w_n": 1965.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 315.52734375, "completions/mean_terminated_length": 319.268798828125, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.050133333333333335, "grad_norm": 3.7428154945373535, "learning_rate": 4.277777777777778e-06, "loss": 0.2504, "num_tokens": 10561258.0, "reward": 0.9834374785423279, "reward_std": 0.13472025096416473, "rewards/accuracy_reward_step": 0.20703125, "rewards/final_brier_reward_step": 0.7754687070846558, "rewards/format_reward_step": 0.984375, "step": 47 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8738913396373391, "aux_distill/mean_u": 0.2939748861844822, "aux_distill/n_active_tok": 111.125, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.14453125, "calib/ece": 0.19413944223107568, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.02494813725490197, "calib/mean_conf": 0.2220358565737052, "calib/mu_c": 0.202156862745098, "calib/mu_w": 0.22710499999999997, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.10649402390438249, "calib/std_conf": 0.196638085229977, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.21553977272727273, "calib/step_q_c_n": 352.0, "calib/step_q_gap": -0.07732170323752077, "calib/step_q_w": 0.2928614759647935, "calib/step_q_w_n": 1477.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2449.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 298.18359375, "completions/mean_terminated_length": 300.531494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.0512, "grad_norm": 4.377185344696045, "learning_rate": 4.25e-06, "loss": 0.3166, "num_tokens": 10741281.0, "reward": 0.9776182174682617, "reward_std": 0.1417025923728943, "rewards/accuracy_reward_step": 0.20703125, "rewards/final_brier_reward_step": 0.7716426253318787, "rewards/format_reward_step": 0.9765625, "step": 48 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.877561567351222, "aux_distill/mean_u": 0.30501746020629955, "aux_distill/n_active_tok": 108.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.66796875, "calib/ece": 0.16074803149606298, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.007874015748031496, "calib/gap": 0.01150063251106892, "calib/mean_conf": 0.19996062992125985, "calib/mu_c": 0.20838235294117644, "calib/mu_w": 0.19688172043010752, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.04649606299212598, "calib/std_conf": 0.19759478545092854, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.26375000000000004, "calib/step_q_c_n": 488.0, "calib/step_q_gap": 0.028097796610169534, "calib/step_q_w": 0.2356522033898305, "calib/step_q_w_n": 1475.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1379.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 301.2578125, "completions/mean_terminated_length": 302.4392395019531, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.05226666666666667, "grad_norm": 2.8171491622924805, "learning_rate": 4.222222222222223e-06, "loss": 0.2598, "num_tokens": 10922939.0, "reward": 1.0083338022232056, "reward_std": 0.1236845999956131, "rewards/accuracy_reward_step": 0.265625, "rewards/final_brier_reward_step": 0.7588551044464111, "rewards/format_reward_step": 0.9921875, "step": 49 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9258559513837099, "aux_distill/mean_u": 0.32496536010658356, "aux_distill/n_active_tok": 125.5, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.70703125, "calib/ece": 0.1795275590551181, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.01968503937007874, "calib/gap": -0.0027361005331302257, "calib/mean_conf": 0.2192913385826772, "calib/mu_c": 0.2171153846153846, "calib/mu_w": 0.21985148514851482, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.09704724409448819, "calib/std_conf": 0.21140021045424237, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2707202216066482, "calib/step_q_c_n": 361.0, "calib/step_q_gap": 0.0019423059738938875, "calib/step_q_w": 0.2687779156327543, "calib/step_q_w_n": 1612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 307.43359375, "completions/mean_terminated_length": 309.8543395996094, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.05333333333333334, "grad_norm": 0.9597369432449341, "learning_rate": 4.194444444444445e-06, "loss": 0.2235, "num_tokens": 11107002.0, "reward": 0.9902620911598206, "reward_std": 0.11881484091281891, "rewards/accuracy_reward_step": 0.203125, "rewards/final_brier_reward_step": 0.7852116823196411, "rewards/format_reward_step": 0.9921875, "step": 50 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9441087357699871, "aux_distill/mean_u": 0.3196395898677443, "aux_distill/n_active_tok": 126.625, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.15234375, "calib/ece": 0.229809918419611, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.007874015748031496, "calib/gap": -0.039765872397339, "calib/mean_conf": 0.21152866425755446, "calib/mu_c": 0.18397435897435896, "calib/mu_w": 0.22374023137169796, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.0671259842519685, "calib/std_conf": 0.19343077812634754, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.23406779661016952, "calib/step_q_c_n": 590.0, "calib/step_q_gap": -0.005498214038365784, "calib/step_q_w": 0.2395660106485353, "calib/step_q_w_n": 1497.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 313.10546875, "completions/mean_terminated_length": 316.8182067871094, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.0544, "grad_norm": 2.159965753555298, "learning_rate": 4.166666666666667e-06, "loss": 0.2578, "num_tokens": 11296453.0, "reward": 1.0037527084350586, "reward_std": 0.12112631648778915, "rewards/accuracy_reward_step": 0.3046875, "rewards/final_brier_reward_step": 0.7145366668701172, "rewards/format_reward_step": 0.98828125, "step": 51 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9033014588057995, "aux_distill/mean_u": 0.3339367217015873, "aux_distill/n_active_tok": 109.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.2890625, "calib/ece": 0.17832031250000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.015625, "calib/gap": 0.015150966183574893, "calib/mean_conf": 0.2298046875, "calib/mu_c": 0.24069444444444446, "calib/mu_w": 0.22554347826086957, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06343750000000001, "calib/std_conf": 0.214132426848498, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.29648148148148146, "calib/step_q_c_n": 486.0, "calib/step_q_gap": 0.017575684380032153, "calib/step_q_w": 0.2789057971014493, "calib/step_q_w_n": 1380.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 280.90234375, "completions/mean_terminated_length": 283.1141662597656, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.055466666666666664, "grad_norm": 1.1030815839767456, "learning_rate": 4.138888888888889e-06, "loss": 0.2568, "num_tokens": 11476316.0, "reward": 1.0183639526367188, "reward_std": 0.1275119185447693, "rewards/accuracy_reward_step": 0.28125, "rewards/final_brier_reward_step": 0.7554777264595032, "rewards/format_reward_step": 1.0, "step": 52 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8972780760377645, "aux_distill/mean_u": 0.33077345545734277, "aux_distill/n_active_tok": 127.5, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.09765625, "calib/ece": 0.21405078125000002, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.015625, "calib/gap": 0.008478112175102581, "calib/mean_conf": 0.21173046875, "calib/mu_c": 0.21736046511627904, "calib/mu_w": 0.20888235294117646, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04492187499999999, "calib/std_conf": 0.19493206350971426, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25831831932773114, "calib/step_q_c_n": 595.0, "calib/step_q_gap": -0.013571058209481346, "calib/step_q_w": 0.2718893775372125, "calib/step_q_w_n": 1478.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 320.73046875, "completions/mean_terminated_length": 323.2558898925781, "completions/min_length": 0.0, "completions/min_terminated_length": 122.0, "epoch": 0.05653333333333333, "grad_norm": 0.7430399656295776, "learning_rate": 4.111111111111111e-06, "loss": 0.299, "num_tokens": 11664247.0, "reward": 1.0316053628921509, "reward_std": 0.11501534283161163, "rewards/accuracy_reward_step": 0.3359375, "rewards/final_brier_reward_step": 0.7272732257843018, "rewards/format_reward_step": 1.0, "step": 53 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8624539021402597, "aux_distill/mean_u": 0.2733415144863499, "aux_distill/n_active_tok": 115.0, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.9453125, "calib/ece": 0.28133359375, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.015625, "calib/gap": -0.01467832984326417, "calib/mean_conf": 0.20022890625, "calib/mu_c": 0.19145631067961166, "calib/mu_w": 0.20613464052287583, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.039609375, "calib/std_conf": 0.19904292178146832, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2401891891891892, "calib/step_q_c_n": 740.0, "calib/step_q_gap": -0.017947982371861793, "calib/step_q_w": 0.258137171561051, "calib/step_q_w_n": 1294.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 304.75, "completions/mean_terminated_length": 307.14959716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.0576, "grad_norm": 2.56850004196167, "learning_rate": 4.083333333333334e-06, "loss": 0.2742, "num_tokens": 11848495.0, "reward": 1.0347485542297363, "reward_std": 0.11770911514759064, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.6710594296455383, "rewards/format_reward_step": 0.99609375, "step": 54 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8497700281441212, "aux_distill/mean_u": 0.2503387007848704, "aux_distill/n_active_tok": 123.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.36328125, "calib/ece": 0.15098549019607843, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.019738239538239555, "calib/mean_conf": 0.18736745098039215, "calib/mu_c": 0.2019969696969697, "calib/mu_w": 0.18225873015873015, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03976470588235294, "calib/std_conf": 0.16331167867467933, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27153822894168467, "calib/step_q_c_n": 463.0, "calib/step_q_gap": -0.015342581546992307, "calib/step_q_w": 0.286880810488677, "calib/step_q_w_n": 1678.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 311.5, "completions/mean_terminated_length": 313.9527587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 105.0, "epoch": 0.058666666666666666, "grad_norm": 1.1233389377593994, "learning_rate": 4.055555555555556e-06, "loss": 0.2892, "num_tokens": 12036063.0, "reward": 1.0174031257629395, "reward_std": 0.09957180917263031, "rewards/accuracy_reward_step": 0.2578125, "rewards/final_brier_reward_step": 0.7809000015258789, "rewards/format_reward_step": 0.99609375, "step": 55 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8872428350150585, "aux_distill/mean_u": 0.32575370195465403, "aux_distill/n_active_tok": 117.125, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 8.10546875, "calib/ece": 0.19675787401574807, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.007874015748031496, "calib/gap": -0.010327287296037324, "calib/mean_conf": 0.1839507874015748, "calib/mu_c": 0.17679487179487177, "calib/mu_w": 0.1871221590909091, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.036811023622047245, "calib/std_conf": 0.18785905835616928, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.29798865478119935, "calib/step_q_c_n": 617.0, "calib/step_q_gap": -0.007017655232518072, "calib/step_q_w": 0.3050063100137174, "calib/step_q_w_n": 1458.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 310.671875, "completions/mean_terminated_length": 313.11810302734375, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.05973333333333333, "grad_norm": 0.5889172554016113, "learning_rate": 4.027777777777779e-06, "loss": 0.2527, "num_tokens": 12222435.0, "reward": 1.0117602348327637, "reward_std": 0.1229933425784111, "rewards/accuracy_reward_step": 0.3046875, "rewards/final_brier_reward_step": 0.7266455292701721, "rewards/format_reward_step": 0.9921875, "step": 56 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.935738505795598, "aux_distill/mean_u": 0.31780845640126715, "aux_distill/n_active_tok": 125.125, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 8.62890625, "calib/ece": 0.21763294117647058, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.011764705882352941, "calib/gap": -0.006428235294117651, "calib/mean_conf": 0.18785725490196079, "calib/mu_c": 0.18357176470588232, "calib/mu_w": 0.18999999999999997, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03607843137254902, "calib/std_conf": 0.19378627973123452, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.290451226993865, "calib/step_q_c_n": 652.0, "calib/step_q_gap": 0.011611727957256124, "calib/step_q_w": 0.2788394990366089, "calib/step_q_w_n": 1557.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 307.55859375, "completions/mean_terminated_length": 309.9803161621094, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.0608, "grad_norm": 0.4905681610107422, "learning_rate": 4.000000000000001e-06, "loss": 0.2566, "num_tokens": 12407962.0, "reward": 1.0207659006118774, "reward_std": 0.1187235563993454, "rewards/accuracy_reward_step": 0.33203125, "rewards/final_brier_reward_step": 0.7134066820144653, "rewards/format_reward_step": 0.99609375, "step": 57 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8835944719612598, "aux_distill/mean_u": 0.31737898162086964, "aux_distill/n_active_tok": 131.5, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 7.71875, "calib/ece": 0.12244212598425194, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.03459161626344087, "calib/mean_conf": 0.1974003937007874, "calib/mu_c": 0.2235483870967742, "calib/mu_w": 0.18895677083333332, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.037874015748031495, "calib/std_conf": 0.18803480296431632, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2734507042253521, "calib/step_q_c_n": 426.0, "calib/step_q_gap": -0.01306226351658335, "calib/step_q_w": 0.28651296774193546, "calib/step_q_w_n": 1550.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 296.234375, "completions/mean_terminated_length": 298.5669250488281, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.06186666666666667, "grad_norm": 0.6383081078529358, "learning_rate": 3.972222222222223e-06, "loss": 0.237, "num_tokens": 12590118.0, "reward": 1.0094563961029053, "reward_std": 0.11579343676567078, "rewards/accuracy_reward_step": 0.2421875, "rewards/final_brier_reward_step": 0.7845379114151001, "rewards/format_reward_step": 0.9921875, "step": 58 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8786263391375542, "aux_distill/mean_u": 0.2848541231171143, "aux_distill/n_active_tok": 115.0, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 8.18359375, "calib/ece": 0.19727568627450984, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03612934017146355, "calib/mean_conf": 0.18790078431372548, "calib/mu_c": 0.16183098591549297, "calib/mu_w": 0.19796032608695652, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05337254901960784, "calib/std_conf": 0.17178196828199355, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.263248031496063, "calib/step_q_c_n": 508.0, "calib/step_q_gap": -0.022587885328133606, "calib/step_q_w": 0.2858359168241966, "calib/step_q_w_n": 1587.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 303.1171875, "completions/mean_terminated_length": 305.5039367675781, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.06293333333333333, "grad_norm": 1.045523762702942, "learning_rate": 3.944444444444445e-06, "loss": 0.2482, "num_tokens": 12773964.0, "reward": 1.0048048496246338, "reward_std": 0.1106722503900528, "rewards/accuracy_reward_step": 0.27734375, "rewards/final_brier_reward_step": 0.7400784492492676, "rewards/format_reward_step": 0.9921875, "step": 59 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9114331156015396, "aux_distill/mean_u": 0.3258913867277279, "aux_distill/n_active_tok": 117.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.6015625, "calib/ece": 0.15221093749999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": -0.0014270833333333288, "calib/mean_conf": 0.1487265625, "calib/mu_c": 0.14765625000000002, "calib/mu_w": 0.14908333333333335, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.02546875, "calib/std_conf": 0.15708154803137636, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.23213580246913576, "calib/step_q_c_n": 405.0, "calib/step_q_gap": 0.006057866064203887, "calib/step_q_w": 0.22607793640493187, "calib/step_q_w_n": 1541.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 281.43359375, "completions/mean_terminated_length": 283.64959716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 72.0, "epoch": 0.064, "grad_norm": 1.4852534532546997, "learning_rate": 3.916666666666667e-06, "loss": 0.2697, "num_tokens": 12954867.0, "reward": 1.009638786315918, "reward_std": 0.09001928567886353, "rewards/accuracy_reward_step": 0.25, "rewards/final_brier_reward_step": 0.7731839418411255, "rewards/format_reward_step": 0.99609375, "step": 60 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8951994087547064, "aux_distill/mean_u": 0.2844116796439261, "aux_distill/n_active_tok": 131.875, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 9.3125, "calib/ece": 0.3061264822134387, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.007905138339920948, "calib/gap": -0.01790540540540539, "calib/mean_conf": 0.17047430830039526, "calib/mu_c": 0.16, "calib/mu_w": 0.1779054054054054, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.030790513833992093, "calib/std_conf": 0.17423512217240988, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2416358839050132, "calib/step_q_c_n": 758.0, "calib/step_q_gap": -0.061912701580841695, "calib/step_q_w": 0.3035485854858549, "calib/step_q_w_n": 1626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 301.35546875, "completions/mean_terminated_length": 304.9288635253906, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.06506666666666666, "grad_norm": 0.9578151702880859, "learning_rate": 3.88888888888889e-06, "loss": 0.2356, "num_tokens": 13136078.0, "reward": 1.0245447158813477, "reward_std": 0.1300700306892395, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.6506519317626953, "rewards/format_reward_step": 0.98828125, "step": 61 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9100477192550898, "aux_distill/mean_u": 0.28706747487226597, "aux_distill/n_active_tok": 125.0, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 8.265625, "calib/ece": 0.18208267716535437, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.011811023622047244, "calib/gap": 0.001003481419813823, "calib/mean_conf": 0.14453149606299212, "calib/mu_c": 0.14524657534246574, "calib/mu_w": 0.14424309392265192, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.019606299212598426, "calib/std_conf": 0.14875648927857302, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2428534136546185, "calib/step_q_c_n": 498.0, "calib/step_q_gap": -0.006505053588892001, "calib/step_q_w": 0.2493584672435105, "calib/step_q_w_n": 1618.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 317.453125, "completions/mean_terminated_length": 319.9527587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.06613333333333334, "grad_norm": 0.9756151437759399, "learning_rate": 3.861111111111112e-06, "loss": 0.286, "num_tokens": 13324426.0, "reward": 1.012264609336853, "reward_std": 0.09289586544036865, "rewards/accuracy_reward_step": 0.28515625, "rewards/final_brier_reward_step": 0.7471854090690613, "rewards/format_reward_step": 0.9921875, "step": 62 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.921884486451745, "aux_distill/mean_u": 0.37658791087950677, "aux_distill/n_active_tok": 159.625, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 10.546875, "calib/ece": 0.2384722222222222, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.015873015873015872, "calib/gap": -0.018688953488372073, "calib/mean_conf": 0.1461309523809524, "calib/mu_c": 0.13337500000000002, "calib/mu_w": 0.1520639534883721, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03357142857142857, "calib/std_conf": 0.16492485877498914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2725210084033613, "calib/step_q_c_n": 595.0, "calib/step_q_gap": 0.01504974949599791, "calib/step_q_w": 0.2574712589073634, "calib/step_q_w_n": 2105.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1623.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 342.53125, "completions/mean_terminated_length": 347.96826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.0672, "grad_norm": 0.6014112830162048, "learning_rate": 3.833333333333334e-06, "loss": 0.2515, "num_tokens": 13520754.0, "reward": 1.0021567344665527, "reward_std": 0.11314384639263153, "rewards/accuracy_reward_step": 0.3125, "rewards/final_brier_reward_step": 0.707438588142395, "rewards/format_reward_step": 0.984375, "step": 63 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8175699971616268, "aux_distill/mean_u": 0.24582131315860167, "aux_distill/n_active_tok": 137.5, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 10.3125, "calib/ece": 0.24190476190476193, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.003968253968253968, "calib/gap": -0.01925925925925928, "calib/mean_conf": 0.14269841269841274, "calib/mu_c": 0.12962962962962962, "calib/mu_w": 0.1488888888888889, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03158730158730158, "calib/std_conf": 0.16538753287121283, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25657575757575757, "calib/step_q_c_n": 660.0, "calib/step_q_gap": 0.033606060606060584, "calib/step_q_w": 0.22296969696969698, "calib/step_q_w_n": 1980.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2310.0, "completions/max_terminated_length": 2310.0, "completions/mean_length": 331.27734375, "completions/mean_terminated_length": 336.5357360839844, "completions/min_length": 0.0, "completions/min_terminated_length": 69.0, "epoch": 0.06826666666666667, "grad_norm": 0.8397740721702576, "learning_rate": 3.8055555555555556e-06, "loss": 0.2273, "num_tokens": 13709337.0, "reward": 1.0019054412841797, "reward_std": 0.11516478657722473, "rewards/accuracy_reward_step": 0.31640625, "rewards/final_brier_reward_step": 0.7030296325683594, "rewards/format_reward_step": 0.984375, "step": 64 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8168295957148075, "aux_distill/mean_u": 0.2424435817139507, "aux_distill/n_active_tok": 123.0, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 8.08203125, "calib/ece": 0.2042647058823529, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": 0.010540230317627547, "calib/mean_conf": 0.14083333333333334, "calib/mu_c": 0.14835616438356164, "calib/mu_w": 0.1378159340659341, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.029411764705882353, "calib/std_conf": 0.1520072130211508, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27874517374517377, "calib/step_q_c_n": 518.0, "calib/step_q_gap": 0.01918327819391652, "calib/step_q_w": 0.25956189555125725, "calib/step_q_w_n": 1551.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 292.21875, "completions/mean_terminated_length": 294.5196838378906, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.06933333333333333, "grad_norm": 1.7245993614196777, "learning_rate": 3.777777777777778e-06, "loss": 0.2277, "num_tokens": 13889169.0, "reward": 1.0131059885025024, "reward_std": 0.09085900336503983, "rewards/accuracy_reward_step": 0.28515625, "rewards/final_brier_reward_step": 0.7488681674003601, "rewards/format_reward_step": 0.9921875, "step": 65 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8621831871569157, "aux_distill/mean_u": 0.2643594781882684, "aux_distill/n_active_tok": 153.5, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.0234375, "calib/ece": 0.2113147410358566, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.025648954038784533, "calib/mean_conf": 0.141195219123506, "calib/mu_c": 0.1231081081081081, "calib/mu_w": 0.14875706214689263, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02884462151394422, "calib/std_conf": 0.14315235015754815, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24553872053872058, "calib/step_q_c_n": 594.0, "calib/step_q_gap": -0.024567036305079698, "calib/step_q_w": 0.2701057568438003, "calib/step_q_w_n": 2484.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2387.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 373.921875, "completions/mean_terminated_length": 381.37054443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 78.0, "epoch": 0.0704, "grad_norm": 5.546261310577393, "learning_rate": 3.7500000000000005e-06, "loss": 0.1916, "num_tokens": 14091245.0, "reward": 0.9962350726127625, "reward_std": 0.11140823364257812, "rewards/accuracy_reward_step": 0.2890625, "rewards/final_brier_reward_step": 0.722939133644104, "rewards/format_reward_step": 0.98046875, "step": 66 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8063938282430172, "aux_distill/mean_u": 0.24485005814775876, "aux_distill/n_active_tok": 176.875, "calib/answer_extract_rate": 0.96875, "calib/avg_num_step_conf": 14.796875, "calib/ece": 0.2592798353909465, "calib/final_conf_rate": 0.94921875, "calib/format_rate": 0.94921875, "calib/frac_conf_gt_0.9": 0.00823045267489712, "calib/gap": -0.030675828313253023, "calib/mean_conf": 0.15405349794238685, "calib/mu_c": 0.133855421686747, "calib/mu_w": 0.16453125000000002, "calib/nonempty_final_conf_rate": 0.94921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.035884773662551446, "calib/std_conf": 0.166563329748725, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2717168674698795, "calib/step_q_c_n": 996.0, "calib/step_q_gap": 0.04492449640970758, "calib/step_q_w": 0.22679237106017192, "calib/step_q_w_n": 2792.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 383.7421875, "completions/mean_terminated_length": 404.2716064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.07146666666666666, "grad_norm": 0.7894091010093689, "learning_rate": 3.7222222222222225e-06, "loss": 0.1327, "num_tokens": 14294491.0, "reward": 0.9720925092697144, "reward_std": 0.17863138020038605, "rewards/accuracy_reward_step": 0.33203125, "rewards/final_brier_reward_step": 0.6629350781440735, "rewards/format_reward_step": 0.94921875, "step": 67 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7980727730318904, "aux_distill/mean_u": 0.2929847676459035, "aux_distill/n_active_tok": 199.875, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 17.66015625, "calib/ece": 0.21976569037656904, "calib/final_conf_rate": 0.93359375, "calib/format_rate": 0.93359375, "calib/frac_conf_gt_0.9": 0.02092050209205021, "calib/gap": -0.004586732186732195, "calib/mean_conf": 0.1737071129707113, "calib/mu_c": 0.17054054054054055, "calib/mu_w": 0.17512727272727274, "calib/nonempty_final_conf_rate": 0.93359375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04192468619246862, "calib/std_conf": 0.18586754065481934, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2742739726027397, "calib/step_q_c_n": 730.0, "calib/step_q_gap": -0.031045995743343102, "calib/step_q_w": 0.3053199683460828, "calib/step_q_w_n": 3791.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 1614.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 403.64453125, "completions/mean_terminated_length": 432.35565185546875, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.07253333333333334, "grad_norm": 1.416663408279419, "learning_rate": 3.694444444444445e-06, "loss": 0.0853, "num_tokens": 14501912.0, "reward": 0.9546322226524353, "reward_std": 0.20958805084228516, "rewards/accuracy_reward_step": 0.29296875, "rewards/final_brier_reward_step": 0.6827020049095154, "rewards/format_reward_step": 0.93359375, "step": 68 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7827791813760996, "aux_distill/mean_u": 0.2817591572783052, "aux_distill/n_active_tok": 236.25, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 23.88671875, "calib/ece": 0.27657333333333334, "calib/final_conf_rate": 0.87890625, "calib/format_rate": 0.87890625, "calib/frac_conf_gt_0.9": 0.017777777777777778, "calib/gap": -0.0463560344827586, "calib/mean_conf": 0.15924888888888888, "calib/mu_c": 0.12937500000000002, "calib/mu_w": 0.17573103448275862, "calib/nonempty_final_conf_rate": 0.87890625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04013333333333333, "calib/std_conf": 0.1781372013330327, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.29911104868913857, "calib/step_q_c_n": 1068.0, "calib/step_q_gap": 0.05638503323441299, "calib/step_q_w": 0.24272601545472558, "calib/step_q_w_n": 5047.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 432.390625, "completions/mean_terminated_length": 491.9644470214844, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.0736, "grad_norm": 1.3896609544754028, "learning_rate": 3.6666666666666666e-06, "loss": -0.0331, "num_tokens": 14717100.0, "reward": 0.9001055955886841, "reward_std": 0.2826094627380371, "rewards/accuracy_reward_step": 0.32421875, "rewards/final_brier_reward_step": 0.5970861911773682, "rewards/format_reward_step": 0.87890625, "step": 69 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8019177615642548, "aux_distill/mean_u": 0.3130857905734622, "aux_distill/n_active_tok": 260.625, "calib/answer_extract_rate": 0.84765625, "calib/avg_num_step_conf": 29.87890625, "calib/ece": 0.17838095238095236, "calib/final_conf_rate": 0.8203125, "calib/format_rate": 0.8125, "calib/frac_conf_gt_0.9": 0.004761904761904762, "calib/gap": -0.02826281389748886, "calib/mean_conf": 0.16304761904761902, "calib/mu_c": 0.14245614035087717, "calib/mu_w": 0.17071895424836603, "calib/nonempty_final_conf_rate": 0.8203125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.035, "calib/std_conf": 0.16869879503633695, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2659308203991131, "calib/step_q_c_n": 902.0, "calib/step_q_gap": 0.02679896920599023, "calib/step_q_w": 0.23913185119312286, "calib/step_q_w_n": 6747.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 443.58984375, "completions/mean_terminated_length": 540.7571411132812, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.07466666666666667, "grad_norm": 2.984570264816284, "learning_rate": 3.638888888888889e-06, "loss": -0.1281, "num_tokens": 14937651.0, "reward": 0.8255972862243652, "reward_std": 0.34680598974227905, "rewards/accuracy_reward_step": 0.23046875, "rewards/final_brier_reward_step": 0.6082257628440857, "rewards/format_reward_step": 0.8125, "step": 70 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8115239245817065, "aux_distill/mean_u": 0.36577361168431416, "aux_distill/n_active_tok": 248.875, "calib/answer_extract_rate": 0.89453125, "calib/avg_num_step_conf": 26.7578125, "calib/ece": 0.21968468468468466, "calib/final_conf_rate": 0.8671875, "calib/format_rate": 0.86328125, "calib/frac_conf_gt_0.9": 0.0045045045045045045, "calib/gap": 0.0011015763454901295, "calib/mean_conf": 0.159954954954955, "calib/mu_c": 0.16070422535211265, "calib/mu_w": 0.15960264900662252, "calib/nonempty_final_conf_rate": 0.8671875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.029909909909909906, "calib/std_conf": 0.155692286453228, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2715031982942431, "calib/step_q_c_n": 938.0, "calib/step_q_gap": 0.0020297544512119514, "calib/step_q_w": 0.2694734438430311, "calib/step_q_w_n": 5912.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 2533.0, "completions/max_terminated_length": 2533.0, "completions/mean_length": 462.32421875, "completions/mean_terminated_length": 533.130615234375, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.07573333333333333, "grad_norm": 1.4650132656097412, "learning_rate": 3.6111111111111115e-06, "loss": 0.0103, "num_tokens": 15160414.0, "reward": 0.888353705406189, "reward_std": 0.30983930826187134, "rewards/accuracy_reward_step": 0.28125, "rewards/final_brier_reward_step": 0.6321761608123779, "rewards/format_reward_step": 0.86328125, "step": 71 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7690277192741632, "aux_distill/mean_u": 0.3222443399292448, "aux_distill/n_active_tok": 245.125, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 22.91015625, "calib/ece": 0.16442553191489365, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.014400000000000024, "calib/mean_conf": 0.17872340425531916, "calib/mu_c": 0.16799999999999998, "calib/mu_w": 0.1824, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.04391489361702128, "calib/std_conf": 0.15519948756874982, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.25215622732769044, "calib/step_q_c_n": 827.0, "calib/step_q_gap": -0.0312737051852115, "calib/step_q_w": 0.28342993251290194, "calib/step_q_w_n": 5038.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2636.0, "completions/max_terminated_length": 2636.0, "completions/mean_length": 511.328125, "completions/mean_terminated_length": 557.021240234375, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.0768, "grad_norm": 0.9580546021461487, "learning_rate": 3.5833333333333335e-06, "loss": 0.0729, "num_tokens": 15395722.0, "reward": 0.9316273331642151, "reward_std": 0.2351001799106598, "rewards/accuracy_reward_step": 0.234375, "rewards/final_brier_reward_step": 0.7109109163284302, "rewards/format_reward_step": 0.91796875, "step": 72 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7873094780370593, "aux_distill/mean_u": 0.2826349133400705, "aux_distill/n_active_tok": 212.0, "calib/answer_extract_rate": 0.921875, "calib/avg_num_step_conf": 19.82421875, "calib/ece": 0.2566312352930599, "calib/final_conf_rate": 0.91796875, "calib/format_rate": 0.91796875, "calib/frac_conf_gt_0.9": 0.03829787234042553, "calib/gap": -0.031687061508404446, "calib/mean_conf": 0.2211559987494933, "calib/mu_c": 0.2003908605695176, "calib/mu_w": 0.23207792207792205, "calib/nonempty_final_conf_rate": 0.91796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0665531914893617, "calib/std_conf": 0.2149404265793188, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3638418079096045, "calib/step_q_c_n": 885.0, "calib/step_q_gap": 0.03793607998597681, "calib/step_q_w": 0.3259057279236277, "calib/step_q_w_n": 4190.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 2226.0, "completions/max_terminated_length": 2226.0, "completions/mean_length": 423.15234375, "completions/mean_terminated_length": 460.9659423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 107.0, "epoch": 0.07786666666666667, "grad_norm": 0.8632502555847168, "learning_rate": 3.555555555555556e-06, "loss": 0.0653, "num_tokens": 15611081.0, "reward": 0.937720000743866, "reward_std": 0.2524094581604004, "rewards/accuracy_reward_step": 0.31640625, "rewards/final_brier_reward_step": 0.6410648822784424, "rewards/format_reward_step": 0.91796875, "step": 73 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8297929093241692, "aux_distill/mean_u": 0.3083530604700027, "aux_distill/n_active_tok": 203.375, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 15.5703125, "calib/ece": 0.12463332343236579, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.012195121951219513, "calib/gap": 0.07146305939212541, "calib/mean_conf": 0.19877966489578042, "calib/mu_c": 0.2536842105263158, "calib/mu_w": 0.18222115113419038, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.04585283562748774, "calib/std_conf": 0.18025502827472875, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3164716006884682, "calib/step_q_c_n": 581.0, "calib/step_q_gap": -0.012599678947922632, "calib/step_q_w": 0.3290712796363908, "calib/step_q_w_n": 3405.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2231.0, "completions/max_terminated_length": 2231.0, "completions/mean_length": 400.96875, "completions/mean_terminated_length": 417.2682800292969, "completions/min_length": 0.0, "completions/min_terminated_length": 96.0, "epoch": 0.07893333333333333, "grad_norm": 0.44695550203323364, "learning_rate": 3.5277777777777784e-06, "loss": 0.1973, "num_tokens": 15817657.0, "reward": 0.9808335900306702, "reward_std": 0.17156293988227844, "rewards/accuracy_reward_step": 0.22265625, "rewards/final_brier_reward_step": 0.7819797992706299, "rewards/format_reward_step": 0.95703125, "step": 74 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8280173083767295, "aux_distill/mean_u": 0.3092613757817121, "aux_distill/n_active_tok": 160.375, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 12.41796875, "calib/ece": 0.29354824224200393, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.008032128514056224, "calib/gap": -0.035276279308424696, "calib/mean_conf": 0.17400195856120884, "calib/mu_c": 0.15303453150238613, "calib/mu_w": 0.18831081081081083, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03096385542168674, "calib/std_conf": 0.1677046066909969, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2633288827668051, "calib/step_q_c_n": 867.0, "calib/step_q_gap": 0.0049077322477739105, "calib/step_q_w": 0.2584211505190312, "calib/step_q_w_n": 2312.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 3068.0, "completions/max_terminated_length": 3068.0, "completions/mean_length": 390.31640625, "completions/mean_terminated_length": 401.2891540527344, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.08, "grad_norm": 0.5844130516052246, "learning_rate": 3.5e-06, "loss": 0.2072, "num_tokens": 16022330.0, "reward": 1.0046308040618896, "reward_std": 0.15581795573234558, "rewards/accuracy_reward_step": 0.39453125, "rewards/final_brier_reward_step": 0.6420742273330688, "rewards/format_reward_step": 0.97265625, "step": 75 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8842153791338205, "aux_distill/mean_u": 0.3634484035747906, "aux_distill/n_active_tok": 141.5, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 10.1015625, "calib/ece": 0.23534384920634918, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.011904761904761904, "calib/gap": -0.017342894461859937, "calib/mean_conf": 0.19783075396825398, "calib/mu_c": 0.18647528735632185, "calib/mu_w": 0.2038181818181818, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04396825396825396, "calib/std_conf": 0.19580172393485226, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2866044303797468, "calib/step_q_c_n": 711.0, "calib/step_q_gap": 0.00703109704641347, "calib/step_q_w": 0.27957333333333334, "calib/step_q_w_n": 1875.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 348.10546875, "completions/mean_terminated_length": 353.6309814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 75.0, "epoch": 0.08106666666666666, "grad_norm": 0.5333477854728699, "learning_rate": 3.4722222222222224e-06, "loss": 0.2624, "num_tokens": 16214501.0, "reward": 1.0057470798492432, "reward_std": 0.15070591866970062, "rewards/accuracy_reward_step": 0.33984375, "rewards/final_brier_reward_step": 0.6911816596984863, "rewards/format_reward_step": 0.98046875, "step": 76 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8543671872466803, "aux_distill/mean_u": 0.33429203086375125, "aux_distill/n_active_tok": 122.875, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 8.23828125, "calib/ece": 0.17608695652173917, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.011857707509881422, "calib/gap": 0.04061777965624119, "calib/mean_conf": 0.19727272727272727, "calib/mu_c": 0.2244047619047619, "calib/mu_w": 0.18378698224852072, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.020671936758893283, "calib/std_conf": 0.18031174345926798, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3276148409893993, "calib/step_q_c_n": 566.0, "calib/step_q_gap": 0.04176941001078621, "calib/step_q_w": 0.2858454309786131, "calib/step_q_w_n": 1543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 292.85546875, "completions/mean_terminated_length": 296.32806396484375, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.08213333333333334, "grad_norm": 0.6401911973953247, "learning_rate": 3.444444444444445e-06, "loss": 0.2291, "num_tokens": 16394136.0, "reward": 1.019188404083252, "reward_std": 0.14684052765369415, "rewards/accuracy_reward_step": 0.328125, "rewards/final_brier_reward_step": 0.7297831773757935, "rewards/format_reward_step": 0.98046875, "step": 77 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.856964997947216, "aux_distill/mean_u": 0.3375044024240291, "aux_distill/n_active_tok": 111.125, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 8.05078125, "calib/ece": 0.2119047619047619, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.015873015873015872, "calib/gap": 0.0001949317738791645, "calib/mean_conf": 0.20912698412698413, "calib/mu_c": 0.20925925925925928, "calib/mu_w": 0.20906432748538012, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.049801587301587305, "calib/std_conf": 0.202887958138405, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.3684361842105263, "calib/step_q_c_n": 608.0, "calib/step_q_gap": 0.04266309405223312, "calib/step_q_w": 0.3257730901582932, "calib/step_q_w_n": 1453.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 286.5078125, "completions/mean_terminated_length": 291.0555725097656, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.0832, "grad_norm": 0.9656935930252075, "learning_rate": 3.416666666666667e-06, "loss": 0.2081, "num_tokens": 16575506.0, "reward": 1.0049442052841187, "reward_std": 0.146946519613266, "rewards/accuracy_reward_step": 0.31640625, "rewards/final_brier_reward_step": 0.7130132913589478, "rewards/format_reward_step": 0.98046875, "step": 78 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8454724475741386, "aux_distill/mean_u": 0.36820621793473934, "aux_distill/n_active_tok": 148.0, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 8.87109375, "calib/ece": 0.16129370078740157, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.023622047244094488, "calib/gap": 0.057363827133318585, "calib/mean_conf": 0.21405196850393704, "calib/mu_c": 0.254025974025974, "calib/mu_w": 0.1966621468926554, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.036098031496062985, "calib/std_conf": 0.2157638236920991, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3824413145539906, "calib/step_q_c_n": 639.0, "calib/step_q_gap": 0.06627826308340234, "calib/step_q_w": 0.31616305147058826, "calib/step_q_w_n": 1632.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1231.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 341.9609375, "completions/mean_terminated_length": 344.6535339355469, "completions/min_length": 0.0, "completions/min_terminated_length": 80.0, "epoch": 0.08426666666666667, "grad_norm": 0.8393629193305969, "learning_rate": 3.3888888888888893e-06, "loss": 0.2185, "num_tokens": 16769424.0, "reward": 1.022768497467041, "reward_std": 0.1480945348739624, "rewards/accuracy_reward_step": 0.30078125, "rewards/final_brier_reward_step": 0.7525681257247925, "rewards/format_reward_step": 0.9921875, "step": 79 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.831517806276679, "aux_distill/mean_u": 0.2914098852630286, "aux_distill/n_active_tok": 171.125, "calib/answer_extract_rate": 0.953125, "calib/avg_num_step_conf": 12.109375, "calib/ece": 0.20660023795359905, "calib/final_conf_rate": 0.9609375, "calib/format_rate": 0.953125, "calib/frac_conf_gt_0.9": 0.024390243902439025, "calib/gap": 0.009875955039670925, "calib/mean_conf": 0.22096073765615706, "calib/mu_c": 0.22762500000000002, "calib/mu_w": 0.2177490449603291, "calib/nonempty_final_conf_rate": 0.9609375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.05117886178861789, "calib/std_conf": 0.20067466003042428, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3260610263522885, "calib/step_q_c_n": 721.0, "calib/step_q_gap": 0.028705904320342257, "calib/step_q_w": 0.29735512203194625, "calib/step_q_w_n": 2379.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2370.0, "completions/max_terminated_length": 2370.0, "completions/mean_length": 429.87890625, "completions/mean_terminated_length": 445.5425109863281, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.08533333333333333, "grad_norm": 2.0035974979400635, "learning_rate": 3.3611111111111117e-06, "loss": 0.2209, "num_tokens": 16981633.0, "reward": 0.9816856980323792, "reward_std": 0.21914240717887878, "rewards/accuracy_reward_step": 0.3125, "rewards/final_brier_reward_step": 0.6977464556694031, "rewards/format_reward_step": 0.953125, "step": 80 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8161282241344452, "aux_distill/mean_u": 0.2957661482389787, "aux_distill/n_active_tok": 191.625, "calib/answer_extract_rate": 0.69140625, "calib/avg_num_step_conf": 26.234375, "calib/ece": 0.20500000000000002, "calib/final_conf_rate": 0.6953125, "calib/format_rate": 0.6796875, "calib/frac_conf_gt_0.9": 0.02247191011235955, "calib/gap": 0.010498504486540439, "calib/mean_conf": 0.22230337078651682, "calib/mu_c": 0.22932203389830508, "calib/mu_w": 0.21882352941176464, "calib/nonempty_final_conf_rate": 0.6953125, "calib/nonempty_reasoning_rate": 0.9921875, "calib/nonempty_step_conf_rate": 0.9921875, "calib/pce": 0.047921348314606746, "calib/std_conf": 0.21159845927172033, "calib/step_conf_rate": 0.9921875, "calib/step_q_c": 0.34729551451187335, "calib/step_q_c_n": 758.0, "calib/step_q_gap": -0.0039771776667100744, "calib/step_q_w": 0.3512726921785834, "calib/step_q_w_n": 5958.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.27734375, "completions/max_length": 3030.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 394.12890625, "completions/mean_terminated_length": 545.3892211914062, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.0864, "grad_norm": 3.689892053604126, "learning_rate": 3.3333333333333333e-06, "loss": -0.1709, "num_tokens": 17188778.0, "reward": 0.701918363571167, "reward_std": 0.4659685492515564, "rewards/accuracy_reward_step": 0.234375, "rewards/final_brier_reward_step": 0.489774227142334, "rewards/format_reward_step": 0.6796875, "step": 81 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7570554517930553, "aux_distill/mean_u": 0.2875513874466876, "aux_distill/n_active_tok": 217.5483870967742, "calib/answer_extract_rate": 0.51953125, "calib/avg_num_step_conf": 29.359375, "calib/ece": 0.21593984962406013, "calib/final_conf_rate": 0.51953125, "calib/format_rate": 0.51171875, "calib/frac_conf_gt_0.9": 0.015037593984962405, "calib/gap": -0.006004497751124438, "calib/mean_conf": 0.242406015037594, "calib/mu_c": 0.23847826086956522, "calib/mu_w": 0.24448275862068966, "calib/nonempty_final_conf_rate": 0.51953125, "calib/nonempty_reasoning_rate": 0.96484375, "calib/nonempty_step_conf_rate": 0.96484375, "calib/pce": 0.0562406015037594, "calib/std_conf": 0.20432964070120244, "calib/step_conf_rate": 0.96484375, "calib/step_q_c": 0.3298311444652908, "calib/step_q_c_n": 533.0, "calib/step_q_gap": -0.07547177691520468, "calib/step_q_w": 0.4053029213804955, "calib/step_q_w_n": 6983.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 385.625, "completions/mean_terminated_length": 667.0270385742188, "completions/min_length": 0.0, "completions/min_terminated_length": 8.0, "epoch": 0.08746666666666666, "grad_norm": 8.525459289550781, "learning_rate": 3.3055555555555558e-06, "loss": -0.366, "num_tokens": 17393050.0, "reward": 0.5286312699317932, "reward_std": 0.5281660556793213, "rewards/accuracy_reward_step": 0.1796875, "rewards/final_brier_reward_step": 0.36585623025894165, "rewards/format_reward_step": 0.51171875, "step": 82 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7571941297501326, "aux_distill/mean_u": 0.2952067848573532, "aux_distill/n_active_tok": 255.875, "calib/answer_extract_rate": 0.62109375, "calib/avg_num_step_conf": 27.77734375, "calib/ece": 0.2588588957055215, "calib/final_conf_rate": 0.63671875, "calib/format_rate": 0.61328125, "calib/frac_conf_gt_0.9": 0.012269938650306749, "calib/gap": -0.07034216785592931, "calib/mean_conf": 0.23574233128834357, "calib/mu_c": 0.18870370370370373, "calib/mu_w": 0.25904587155963305, "calib/nonempty_final_conf_rate": 0.63671875, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.0816564417177914, "calib/std_conf": 0.20480982017762447, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.30772997032640953, "calib/step_q_c_n": 674.0, "calib/step_q_gap": -0.06138778639255893, "calib/step_q_w": 0.36911775671896846, "calib/step_q_w_n": 6437.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.30078125, "completions/max_length": 2857.0, "completions/max_terminated_length": 2857.0, "completions/mean_length": 454.484375, "completions/mean_terminated_length": 649.9888305664062, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.08853333333333334, "grad_norm": 3.0912773609161377, "learning_rate": 3.277777777777778e-06, "loss": -0.194, "num_tokens": 17616662.0, "reward": 0.6224585771560669, "reward_std": 0.4793688654899597, "rewards/accuracy_reward_step": 0.2109375, "rewards/final_brier_reward_step": 0.420698344707489, "rewards/format_reward_step": 0.61328125, "step": 83 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7821133900433779, "aux_distill/mean_u": 0.31811979441848154, "aux_distill/n_active_tok": 229.125, "calib/answer_extract_rate": 0.72265625, "calib/avg_num_step_conf": 24.9453125, "calib/ece": 0.19843243243243244, "calib/final_conf_rate": 0.72265625, "calib/format_rate": 0.71484375, "calib/frac_conf_gt_0.9": 0.02702702702702703, "calib/gap": -0.043775894538606475, "calib/mean_conf": 0.2565945945945946, "calib/mu_c": 0.22677966101694913, "calib/mu_w": 0.2705555555555556, "calib/nonempty_final_conf_rate": 0.72265625, "calib/nonempty_reasoning_rate": 0.98828125, "calib/nonempty_step_conf_rate": 0.98828125, "calib/pce": 0.06805405405405407, "calib/std_conf": 0.21356123996180776, "calib/step_conf_rate": 0.98828125, "calib/step_q_c": 0.3681395348837209, "calib/step_q_c_n": 774.0, "calib/step_q_gap": -0.03123756775348513, "calib/step_q_w": 0.39937710263720605, "calib/step_q_w_n": 5612.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19921875, "completions/max_length": 2697.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 455.98046875, "completions/mean_terminated_length": 569.4194946289062, "completions/min_length": 0.0, "completions/min_terminated_length": 41.0, "epoch": 0.0896, "grad_norm": 1.6149286031723022, "learning_rate": 3.2500000000000002e-06, "loss": -0.0904, "num_tokens": 17839313.0, "reward": 0.7286218404769897, "reward_std": 0.47204819321632385, "rewards/accuracy_reward_step": 0.23046875, "rewards/final_brier_reward_step": 0.5119312405586243, "rewards/format_reward_step": 0.71484375, "step": 84 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8418309669941664, "aux_distill/mean_u": 0.30520798490763335, "aux_distill/n_active_tok": 183.625, "calib/answer_extract_rate": 0.9375, "calib/avg_num_step_conf": 15.20703125, "calib/ece": 0.21260330578512396, "calib/final_conf_rate": 0.9453125, "calib/format_rate": 0.9375, "calib/frac_conf_gt_0.9": 0.049586776859504134, "calib/gap": 0.029399350649350675, "calib/mean_conf": 0.27004132231404954, "calib/mu_c": 0.28875, "calib/mu_w": 0.25935064935064933, "calib/nonempty_final_conf_rate": 0.9453125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.05950413223140496, "calib/std_conf": 0.2362106111882379, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3968901734104046, "calib/step_q_c_n": 865.0, "calib/step_q_gap": 0.025058931666679385, "calib/step_q_w": 0.3718312417437252, "calib/step_q_w_n": 3028.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2467.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 410.4453125, "completions/mean_terminated_length": 430.6311340332031, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.09066666666666667, "grad_norm": 0.6718313097953796, "learning_rate": 3.2222222222222227e-06, "loss": 0.1399, "num_tokens": 18052211.0, "reward": 0.9761437177658081, "reward_std": 0.251767098903656, "rewards/accuracy_reward_step": 0.34375, "rewards/final_brier_reward_step": 0.671037495136261, "rewards/format_reward_step": 0.9375, "step": 85 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8239959748461843, "aux_distill/mean_u": 0.319219554660902, "aux_distill/n_active_tok": 175.25, "calib/answer_extract_rate": 0.95703125, "calib/avg_num_step_conf": 12.54296875, "calib/ece": 0.2043265306122449, "calib/final_conf_rate": 0.95703125, "calib/format_rate": 0.9453125, "calib/frac_conf_gt_0.9": 0.02857142857142857, "calib/gap": -0.009132947976878636, "calib/mean_conf": 0.20644897959183672, "calib/mu_c": 0.2, "calib/mu_w": 0.20913294797687865, "calib/nonempty_final_conf_rate": 0.95703125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.058448979591836744, "calib/std_conf": 0.2032512330182736, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3402790697674418, "calib/step_q_c_n": 645.0, "calib/step_q_gap": -0.013185466475738206, "calib/step_q_w": 0.35346453624318, "calib/step_q_w_n": 2566.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 374.6640625, "completions/mean_terminated_length": 386.75, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.09173333333333333, "grad_norm": 0.5359158515930176, "learning_rate": 3.1944444444444443e-06, "loss": 0.1646, "num_tokens": 18253637.0, "reward": 0.9615132808685303, "reward_std": 0.20205360651016235, "rewards/accuracy_reward_step": 0.28125, "rewards/final_brier_reward_step": 0.6964640617370605, "rewards/format_reward_step": 0.9453125, "step": 86 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.803918688558042, "aux_distill/mean_u": 0.2932792815270562, "aux_distill/n_active_tok": 159.0, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 10.21484375, "calib/ece": 0.30135999999999996, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.02, "calib/gap": -0.022649350649350697, "calib/mean_conf": 0.20632000000000003, "calib/mu_c": 0.1936363636363636, "calib/mu_w": 0.2162857142857143, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.03384000000000001, "calib/std_conf": 0.18657560826646125, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3034076433121019, "calib/step_q_c_n": 942.0, "calib/step_q_gap": -0.05141004945538169, "calib/step_q_w": 0.3548176927674836, "calib/step_q_w_n": 1673.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 347.71875, "completions/mean_terminated_length": 354.6454162597656, "completions/min_length": 0.0, "completions/min_terminated_length": 92.0, "epoch": 0.0928, "grad_norm": 0.5403491854667664, "learning_rate": 3.1666666666666667e-06, "loss": 0.1776, "num_tokens": 18448149.0, "reward": 1.0203272104263306, "reward_std": 0.17660395801067352, "rewards/accuracy_reward_step": 0.43359375, "rewards/final_brier_reward_step": 0.6344043016433716, "rewards/format_reward_step": 0.97265625, "step": 87 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.876606872305274, "aux_distill/mean_u": 0.2893717978872386, "aux_distill/n_active_tok": 136.125, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 9.7890625, "calib/ece": 0.2384285714285714, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.003968253968253968, "calib/gap": -0.003660256410256385, "calib/mean_conf": 0.18934920634920635, "calib/mu_c": 0.18708333333333335, "calib/mu_w": 0.19074358974358974, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.023412698412698413, "calib/std_conf": 0.1634352120294649, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2629210836277974, "calib/step_q_c_n": 849.0, "calib/step_q_gap": -0.02230522898535886, "calib/step_q_w": 0.2852263126131563, "calib/step_q_w_n": 1657.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 345.0546875, "completions/mean_terminated_length": 350.5317687988281, "completions/min_length": 0.0, "completions/min_terminated_length": 36.0, "epoch": 0.09386666666666667, "grad_norm": 0.5013801455497742, "learning_rate": 3.138888888888889e-06, "loss": 0.233, "num_tokens": 18646331.0, "reward": 1.019582748413086, "reward_std": 0.14026647806167603, "rewards/accuracy_reward_step": 0.37890625, "rewards/final_brier_reward_step": 0.6836965084075928, "rewards/format_reward_step": 0.9765625, "step": 88 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8640087898820639, "aux_distill/mean_u": 0.3608727952693905, "aux_distill/n_active_tok": 140.125, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 8.90234375, "calib/ece": 0.18127058823529407, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.0196078431372549, "calib/gap": 0.037335528159148146, "calib/mean_conf": 0.19276862745098042, "calib/mu_c": 0.2179518072289156, "calib/mu_w": 0.18061627906976746, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.024274509803921575, "calib/std_conf": 0.18715120602797072, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3426186830015313, "calib/step_q_c_n": 653.0, "calib/step_q_gap": 0.04391204093511064, "calib/step_q_w": 0.29870664206642067, "calib/step_q_w_n": 1626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1065.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 346.13671875, "completions/mean_terminated_length": 348.8622131347656, "completions/min_length": 0.0, "completions/min_terminated_length": 92.0, "epoch": 0.09493333333333333, "grad_norm": 0.5498507618904114, "learning_rate": 3.1111111111111116e-06, "loss": 0.266, "num_tokens": 18843830.0, "reward": 1.0269007682800293, "reward_std": 0.13228027522563934, "rewards/accuracy_reward_step": 0.32421875, "rewards/final_brier_reward_step": 0.7373951077461243, "rewards/format_reward_step": 0.9921875, "step": 89 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8565371185541153, "aux_distill/mean_u": 0.307992225866492, "aux_distill/n_active_tok": 141.0, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 9.86328125, "calib/ece": 0.23880478087649398, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.01195219123505976, "calib/gap": 0.007009454796411352, "calib/mean_conf": 0.2145816733067729, "calib/mu_c": 0.21907777777777782, "calib/mu_w": 0.21206832298136646, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04741035856573704, "calib/std_conf": 0.198886632863703, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.33993487698986974, "calib/step_q_c_n": 691.0, "calib/step_q_gap": 0.002947417884090031, "calib/step_q_w": 0.3369874591057797, "calib/step_q_w_n": 1834.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2610.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 351.59765625, "completions/mean_terminated_length": 358.6015930175781, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.096, "grad_norm": 0.4637305438518524, "learning_rate": 3.0833333333333336e-06, "loss": 0.2443, "num_tokens": 19037159.0, "reward": 1.011687994003296, "reward_std": 0.16643886268138885, "rewards/accuracy_reward_step": 0.3515625, "rewards/final_brier_reward_step": 0.6952507495880127, "rewards/format_reward_step": 0.9765625, "step": 90 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8760084416717291, "aux_distill/mean_u": 0.28022362415155183, "aux_distill/n_active_tok": 129.5, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 8.640625, "calib/ece": 0.258492125984252, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.015748031496062992, "calib/gap": -0.027924675324675324, "calib/mean_conf": 0.20683070866141734, "calib/mu_c": 0.1899, "calib/mu_w": 0.21782467532467534, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.03581102362204725, "calib/std_conf": 0.17850015682377415, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2720631313131313, "calib/step_q_c_n": 792.0, "calib/step_q_gap": -0.08380024896855881, "calib/step_q_w": 0.3558633802816901, "calib/step_q_w_n": 1420.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 339.00390625, "completions/mean_terminated_length": 341.6732177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.09706666666666666, "grad_norm": 0.5233395099639893, "learning_rate": 3.055555555555556e-06, "loss": 0.2388, "num_tokens": 19231656.0, "reward": 1.0215727090835571, "reward_std": 0.14010092616081238, "rewards/accuracy_reward_step": 0.390625, "rewards/final_brier_reward_step": 0.6681454181671143, "rewards/format_reward_step": 0.984375, "step": 91 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8196988068521023, "aux_distill/mean_u": 0.24410908331532602, "aux_distill/n_active_tok": 123.625, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 9.5, "calib/ece": 0.21699604743083004, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.019762845849802372, "calib/gap": 0.008241670012043234, "calib/mean_conf": 0.21928853754940714, "calib/mu_c": 0.22446808510638291, "calib/mu_w": 0.21622641509433968, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.032371541501976284, "calib/std_conf": 0.20788290958580805, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.29255714285714285, "calib/step_q_c_n": 700.0, "calib/step_q_gap": -0.015381365803365243, "calib/step_q_w": 0.3079385086605081, "calib/step_q_w_n": 1732.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 337.625, "completions/mean_terminated_length": 342.9841613769531, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.09813333333333334, "grad_norm": 0.5248092412948608, "learning_rate": 3.0277777777777776e-06, "loss": 0.2386, "num_tokens": 19424808.0, "reward": 1.0217368602752686, "reward_std": 0.15499769151210785, "rewards/accuracy_reward_step": 0.3671875, "rewards/final_brier_reward_step": 0.6919113397598267, "rewards/format_reward_step": 0.984375, "step": 92 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8715116884559393, "aux_distill/mean_u": 0.292919875969081, "aux_distill/n_active_tok": 134.0, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 9.38671875, "calib/ece": 0.2876086956521739, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.015810276679841896, "calib/gap": -0.03291698595146875, "calib/mean_conf": 0.2075691699604743, "calib/mu_c": 0.1887037037037037, "calib/mu_w": 0.22162068965517245, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0341501976284585, "calib/std_conf": 0.18972447298727482, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28146012269938653, "calib/step_q_c_n": 815.0, "calib/step_q_gap": -0.07955750954242713, "calib/step_q_w": 0.36101763224181366, "calib/step_q_w_n": 1588.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 341.03125, "completions/mean_terminated_length": 345.0751037597656, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.0992, "grad_norm": 0.4833468794822693, "learning_rate": 3e-06, "loss": 0.263, "num_tokens": 19617888.0, "reward": 1.0288138389587402, "reward_std": 0.14168192446231842, "rewards/accuracy_reward_step": 0.421875, "rewards/final_brier_reward_step": 0.6474713683128357, "rewards/format_reward_step": 0.98828125, "step": 93 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.828835241496563, "aux_distill/mean_u": 0.27052133899454756, "aux_distill/n_active_tok": 121.125, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.53515625, "calib/ece": 0.34593800392156865, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.011764705882352941, "calib/gap": -0.030389300000000008, "calib/mean_conf": 0.20253258431372548, "calib/mu_c": 0.18704, "calib/mu_w": 0.21742930000000002, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.029137254901960792, "calib/std_conf": 0.2013929581370898, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2530616996507567, "calib/step_q_c_n": 859.0, "calib/step_q_gap": -0.032922238667000336, "calib/step_q_w": 0.28598393831775704, "calib/step_q_w_n": 1070.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 299.28125, "completions/mean_terminated_length": 301.6377868652344, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.10026666666666667, "grad_norm": 0.5058930516242981, "learning_rate": 2.9722222222222225e-06, "loss": 0.2347, "num_tokens": 19803184.0, "reward": 1.046791911125183, "reward_std": 0.12935060262680054, "rewards/accuracy_reward_step": 0.48828125, "rewards/final_brier_reward_step": 0.6092088222503662, "rewards/format_reward_step": 0.99609375, "step": 94 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9062896221876144, "aux_distill/mean_u": 0.3493818523568767, "aux_distill/n_active_tok": 113.75, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.93359375, "calib/ece": 0.28477968749999993, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.0078125, "calib/gap": 0.006114682539682503, "calib/mean_conf": 0.18225156250000002, "calib/mu_c": 0.1856910714285714, "calib/mu_w": 0.1795763888888889, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.014765625000000001, "calib/std_conf": 0.1670195296329402, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2656130909090909, "calib/step_q_c_n": 825.0, "calib/step_q_gap": 0.026708447459671325, "calib/step_q_w": 0.23890464344941958, "calib/step_q_w_n": 1206.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 314.15234375, "completions/mean_terminated_length": 316.6259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 89.0, "epoch": 0.10133333333333333, "grad_norm": 0.5349423885345459, "learning_rate": 2.944444444444445e-06, "loss": 0.2295, "num_tokens": 19989735.0, "reward": 1.0411256551742554, "reward_std": 0.11611491441726685, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.6564700603485107, "rewards/format_reward_step": 0.98828125, "step": 95 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8619091520085931, "aux_distill/mean_u": 0.3177986059032521, "aux_distill/n_active_tok": 143.75, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 9.99609375, "calib/ece": 0.3648896, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.004, "calib/gap": 0.0050328205128205294, "calib/mean_conf": 0.16543040000000003, "calib/mu_c": 0.16784615384615384, "calib/mu_w": 0.1628133333333333, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0051600000000000005, "calib/std_conf": 0.1503400339092685, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2535573122529644, "calib/step_q_c_n": 1012.0, "calib/step_q_gap": 0.02521956176815507, "calib/step_q_w": 0.22833775048480934, "calib/step_q_w_n": 1547.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 348.24609375, "completions/mean_terminated_length": 355.18328857421875, "completions/min_length": 0.0, "completions/min_terminated_length": 74.0, "epoch": 0.1024, "grad_norm": 0.5326002240180969, "learning_rate": 2.916666666666667e-06, "loss": 0.2346, "num_tokens": 20184702.0, "reward": 1.0335478782653809, "reward_std": 0.15459981560707092, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.5866272449493408, "rewards/format_reward_step": 0.97265625, "step": 96 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8531998693943024, "aux_distill/mean_u": 0.2890356585287972, "aux_distill/n_active_tok": 144.625, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 10.1796875, "calib/ece": 0.20959055118110237, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": 0.013189355742296932, "calib/mean_conf": 0.16426771653543307, "calib/mu_c": 0.1730952380952381, "calib/mu_w": 0.15990588235294118, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0215748031496063, "calib/std_conf": 0.15347099286409566, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25576441441441444, "calib/step_q_c_n": 666.0, "calib/step_q_gap": 0.04668194018761032, "calib/step_q_w": 0.20908247422680412, "calib/step_q_w_n": 1940.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 362.73046875, "completions/mean_terminated_length": 365.58660888671875, "completions/min_length": 0.0, "completions/min_terminated_length": 88.0, "epoch": 0.10346666666666667, "grad_norm": 0.5830482840538025, "learning_rate": 2.888888888888889e-06, "loss": 0.277, "num_tokens": 20382633.0, "reward": 1.0239131450653076, "reward_std": 0.10469962656497955, "rewards/accuracy_reward_step": 0.328125, "rewards/final_brier_reward_step": 0.7275139093399048, "rewards/format_reward_step": 0.9921875, "step": 97 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8644020799547434, "aux_distill/mean_u": 0.296143036334745, "aux_distill/n_active_tok": 152.25, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 10.77734375, "calib/ece": 0.32207928286852594, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.00796812749003984, "calib/gap": 0.006815569053708392, "calib/mean_conf": 0.163498406374502, "calib/mu_c": 0.16719130434782606, "calib/mu_w": 0.16037573529411767, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013705179282868525, "calib/std_conf": 0.15263408655975402, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28418463073852296, "calib/step_q_c_n": 1002.0, "calib/step_q_gap": 0.02861587718132319, "calib/step_q_w": 0.25556875355719977, "calib/step_q_w_n": 1757.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 362.5625, "completions/mean_terminated_length": 368.3174743652344, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.10453333333333334, "grad_norm": 0.49543944001197815, "learning_rate": 2.861111111111111e-06, "loss": 0.2354, "num_tokens": 20581633.0, "reward": 1.03104829788208, "reward_std": 0.1321575939655304, "rewards/accuracy_reward_step": 0.44921875, "rewards/final_brier_reward_step": 0.6324091553688049, "rewards/format_reward_step": 0.98046875, "step": 98 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8613427989184856, "aux_distill/mean_u": 0.36263934351551574, "aux_distill/n_active_tok": 190.0, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 13.87890625, "calib/ece": 0.16048192771084335, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.008032128514056224, "calib/gap": 0.010117528019925298, "calib/mean_conf": 0.18120481927710846, "calib/mu_c": 0.18835616438356168, "calib/mu_w": 0.17823863636363638, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.024257028112449796, "calib/std_conf": 0.1701622017856304, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2812059392265193, "calib/step_q_c_n": 724.0, "calib/step_q_gap": -0.06427320534753506, "calib/step_q_w": 0.3454791445740544, "calib/step_q_w_n": 2829.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2363.0, "completions/max_terminated_length": 2363.0, "completions/mean_length": 428.9375, "completions/mean_terminated_length": 440.9959716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.1056, "grad_norm": 0.39491331577301025, "learning_rate": 2.8333333333333335e-06, "loss": 0.187, "num_tokens": 20797241.0, "reward": 0.9963167905807495, "reward_std": 0.16188617050647736, "rewards/accuracy_reward_step": 0.28515625, "rewards/final_brier_reward_step": 0.734821081161499, "rewards/format_reward_step": 0.97265625, "step": 99 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.820311427116394, "aux_distill/mean_u": 0.3183899844738844, "aux_distill/n_active_tok": 192.75, "calib/answer_extract_rate": 0.9296875, "calib/avg_num_step_conf": 18.1953125, "calib/ece": 0.3286554621848739, "calib/final_conf_rate": 0.9296875, "calib/format_rate": 0.9296875, "calib/frac_conf_gt_0.9": 0.012605042016806723, "calib/gap": -0.01797679683078668, "calib/mean_conf": 0.19445378151260503, "calib/mu_c": 0.1850877192982456, "calib/mu_w": 0.20306451612903229, "calib/nonempty_final_conf_rate": 0.9296875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02205882352941177, "calib/std_conf": 0.17215245090107312, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3088545150501672, "calib/step_q_c_n": 1196.0, "calib/step_q_gap": -0.022706836769590122, "calib/step_q_w": 0.3315613518197573, "calib/step_q_w_n": 3462.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 402.67578125, "completions/mean_terminated_length": 433.1302795410156, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.10666666666666667, "grad_norm": 0.4126844108104706, "learning_rate": 2.805555555555556e-06, "loss": 0.0821, "num_tokens": 21007734.0, "reward": 0.9807562232017517, "reward_std": 0.2439180612564087, "rewards/accuracy_reward_step": 0.4453125, "rewards/final_brier_reward_step": 0.5865125060081482, "rewards/format_reward_step": 0.9296875, "step": 100 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8029415346682072, "aux_distill/mean_u": 0.2606578303927161, "aux_distill/n_active_tok": 277.625, "calib/answer_extract_rate": 0.8125, "calib/avg_num_step_conf": 31.69921875, "calib/ece": 0.14891980676328506, "calib/final_conf_rate": 0.80859375, "calib/format_rate": 0.8046875, "calib/frac_conf_gt_0.9": 0.00966183574879227, "calib/gap": 0.07122306390181968, "calib/mean_conf": 0.21310917874396135, "calib/mu_c": 0.26093529411764704, "calib/mu_w": 0.18971223021582737, "calib/nonempty_final_conf_rate": 0.80859375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.01676328502415459, "calib/std_conf": 0.17152507024488853, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3626553168635876, "calib/step_q_c_n": 931.0, "calib/step_q_gap": -0.057379482690978145, "calib/step_q_w": 0.42003479955456574, "calib/step_q_w_n": 7184.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19140625, "completions/max_length": 2237.0, "completions/max_terminated_length": 2237.0, "completions/mean_length": 435.09375, "completions/mean_terminated_length": 538.0869750976562, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.10773333333333333, "grad_norm": 0.3700943887233734, "learning_rate": 2.7777777777777783e-06, "loss": -0.1194, "num_tokens": 21226110.0, "reward": 0.8445423245429993, "reward_std": 0.388075590133667, "rewards/accuracy_reward_step": 0.265625, "rewards/final_brier_reward_step": 0.6187721490859985, "rewards/format_reward_step": 0.8046875, "step": 101 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.753003110177815, "aux_distill/mean_u": 0.29103300838633506, "aux_distill/n_active_tok": 254.5, "calib/answer_extract_rate": 0.796875, "calib/avg_num_step_conf": 32.359375, "calib/ece": 0.26816666666666666, "calib/final_conf_rate": 0.796875, "calib/format_rate": 0.79296875, "calib/frac_conf_gt_0.9": 0.00980392156862745, "calib/gap": -0.007465838509316758, "calib/mean_conf": 0.24399019607843137, "calib/mu_c": 0.2398913043478261, "calib/mu_w": 0.24735714285714286, "calib/nonempty_final_conf_rate": 0.796875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03058823529411765, "calib/std_conf": 0.2015103264564598, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.42965867992766726, "calib/step_q_c_n": 1106.0, "calib/step_q_gap": 0.023690713950568654, "calib/step_q_w": 0.4059679659770986, "calib/step_q_w_n": 7178.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19921875, "completions/max_length": 2892.0, "completions/max_terminated_length": 2892.0, "completions/mean_length": 419.28125, "completions/mean_terminated_length": 523.5902709960938, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.1088, "grad_norm": 0.4960228204727173, "learning_rate": 2.7500000000000004e-06, "loss": -0.0723, "num_tokens": 21440142.0, "reward": 0.8393516540527344, "reward_std": 0.39622962474823, "rewards/accuracy_reward_step": 0.359375, "rewards/final_brier_reward_step": 0.5263594388961792, "rewards/format_reward_step": 0.79296875, "step": 102 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7848681882023811, "aux_distill/mean_u": 0.29963380616280844, "aux_distill/n_active_tok": 223.0, "calib/answer_extract_rate": 0.9140625, "calib/avg_num_step_conf": 22.140625, "calib/ece": 0.2553653846153846, "calib/final_conf_rate": 0.9140625, "calib/format_rate": 0.9140625, "calib/frac_conf_gt_0.9": 0.01282051282051282, "calib/gap": 0.0185940487660268, "calib/mean_conf": 0.21856623931623936, "calib/mu_c": 0.22897572815533979, "calib/mu_w": 0.210381679389313, "calib/nonempty_final_conf_rate": 0.9140625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.016880341880341883, "calib/std_conf": 0.18379481805063905, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3806787834902245, "calib/step_q_c_n": 1381.0, "calib/step_q_gap": -0.018342770043715317, "calib/step_q_w": 0.3990215535339398, "calib/step_q_w_n": 4287.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 1995.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 493.2265625, "completions/mean_terminated_length": 537.3021240234375, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.10986666666666667, "grad_norm": 0.3963656425476074, "learning_rate": 2.7222222222222224e-06, "loss": 0.0785, "num_tokens": 21670960.0, "reward": 0.968917727470398, "reward_std": 0.26493752002716064, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.6214292645454407, "rewards/format_reward_step": 0.9140625, "step": 103 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.75134018342942, "aux_distill/mean_u": 0.30651122907575223, "aux_distill/n_active_tok": 190.125, "calib/answer_extract_rate": 0.9609375, "calib/avg_num_step_conf": 14.66796875, "calib/ece": 0.2352226720647774, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.95703125, "calib/frac_conf_gt_0.9": 0.024291497975708502, "calib/gap": -0.015760073260073365, "calib/mean_conf": 0.22720647773279354, "calib/mu_c": 0.2172527472527472, "calib/mu_w": 0.23301282051282057, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.04700404858299595, "calib/std_conf": 0.20290573856710922, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2972996746203905, "calib/step_q_c_n": 922.0, "calib/step_q_gap": -0.10208966530195324, "calib/step_q_w": 0.3993893399223437, "calib/step_q_w_n": 2833.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1175.0, "completions/max_terminated_length": 1175.0, "completions/mean_length": 418.80859375, "completions/mean_terminated_length": 434.06884765625, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.11093333333333333, "grad_norm": 0.4534111022949219, "learning_rate": 2.6944444444444444e-06, "loss": 0.167, "num_tokens": 21884855.0, "reward": 0.9896644353866577, "reward_std": 0.20764201879501343, "rewards/accuracy_reward_step": 0.35546875, "rewards/final_brier_reward_step": 0.6668288707733154, "rewards/format_reward_step": 0.95703125, "step": 104 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.829353941604495, "aux_distill/mean_u": 0.2884675067906213, "aux_distill/n_active_tok": 160.625, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 10.84375, "calib/ece": 0.33407086614173226, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": -0.006714950660956981, "calib/mean_conf": 0.19175590551181104, "calib/mu_c": 0.18829268292682927, "calib/mu_w": 0.19500763358778625, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.02078740157480315, "calib/std_conf": 0.1637721297920406, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3003345388788427, "calib/step_q_c_n": 1106.0, "calib/step_q_gap": -0.05909935333672617, "calib/step_q_w": 0.3594338922155689, "calib/step_q_w_n": 1670.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 382.421875, "completions/mean_terminated_length": 386.95654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.112, "grad_norm": 0.4169822931289673, "learning_rate": 2.666666666666667e-06, "loss": 0.2266, "num_tokens": 22088515.0, "reward": 1.0472151041030884, "reward_std": 0.13726508617401123, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.625680148601532, "rewards/format_reward_step": 0.98828125, "step": 105 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8308898760005832, "aux_distill/mean_u": 0.2872108335961295, "aux_distill/n_active_tok": 125.625, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.94140625, "calib/ece": 0.26890625, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.01171875, "calib/gap": -0.022763157894736846, "calib/mean_conf": 0.208515625, "calib/mu_c": 0.195, "calib/mu_w": 0.21776315789473685, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0355859375, "calib/std_conf": 0.17521901403917148, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27391780821917805, "calib/step_q_c_n": 730.0, "calib/step_q_gap": -0.046343128081666185, "calib/step_q_w": 0.32026093630084423, "calib/step_q_w_n": 1303.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 315.41015625, "completions/mean_terminated_length": 317.8937072753906, "completions/min_length": 0.0, "completions/min_terminated_length": 121.0, "epoch": 0.11306666666666666, "grad_norm": 0.4877546429634094, "learning_rate": 2.6388888888888893e-06, "loss": 0.2546, "num_tokens": 22273844.0, "reward": 1.0382380485534668, "reward_std": 0.11429660767316818, "rewards/accuracy_reward_step": 0.40625, "rewards/final_brier_reward_step": 0.6741324663162231, "rewards/format_reward_step": 0.99609375, "step": 106 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9066454619169235, "aux_distill/mean_u": 0.3440224827633614, "aux_distill/n_active_tok": 122.25, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 7.82421875, "calib/ece": 0.27308593750000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.02734375, "calib/gap": 0.010836730853666238, "calib/mean_conf": 0.2028515625, "calib/mu_c": 0.20915887850467296, "calib/mu_w": 0.19832214765100672, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.028984375, "calib/std_conf": 0.1833160091653443, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.32453738910012675, "calib/step_q_c_n": 789.0, "calib/step_q_gap": 0.001094226002927412, "calib/step_q_w": 0.32344316309719934, "calib/step_q_w_n": 1214.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 307.30859375, "completions/mean_terminated_length": 309.72833251953125, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.11413333333333334, "grad_norm": 0.46674540638923645, "learning_rate": 2.6111111111111113e-06, "loss": 0.2812, "num_tokens": 22457131.0, "reward": 1.046167016029358, "reward_std": 0.123149573802948, "rewards/accuracy_reward_step": 0.41796875, "rewards/final_brier_reward_step": 0.678271472454071, "rewards/format_reward_step": 0.99609375, "step": 107 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8467370923608541, "aux_distill/mean_u": 0.29868711003865983, "aux_distill/n_active_tok": 125.125, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.4140625, "calib/ece": 0.4441796875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": -0.03470141700404858, "calib/mean_conf": 0.18699218750000002, "calib/mu_c": 0.17289473684210527, "calib/mu_w": 0.20759615384615385, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.018710937499999997, "calib/std_conf": 0.14887053919081789, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2749911190053286, "calib/step_q_c_n": 1126.0, "calib/step_q_gap": -0.021511471668246573, "calib/step_q_w": 0.29650259067357515, "calib/step_q_w_n": 772.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 294.11328125, "completions/mean_terminated_length": 296.42913818359375, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.1152, "grad_norm": 0.4774184226989746, "learning_rate": 2.5833333333333337e-06, "loss": 0.2502, "num_tokens": 22635656.0, "reward": 1.0740920305252075, "reward_std": 0.10364996641874313, "rewards/accuracy_reward_step": 0.59375, "rewards/final_brier_reward_step": 0.5544339418411255, "rewards/format_reward_step": 1.0, "step": 108 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.9085110891610384, "aux_distill/mean_u": 0.30277771671408027, "aux_distill/n_active_tok": 127.75, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.1953125, "calib/ece": 0.22762500000000005, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.01171875, "calib/gap": 0.0026041626142774, "calib/mean_conf": 0.202609375, "calib/mu_c": 0.20422680412371136, "calib/mu_w": 0.20162264150943396, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0256640625, "calib/std_conf": 0.17991144672340717, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2743208092485549, "calib/step_q_c_n": 692.0, "calib/step_q_gap": -0.007309347223706797, "calib/step_q_w": 0.2816301564722617, "calib/step_q_w_n": 1406.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 318.0546875, "completions/mean_terminated_length": 320.5590515136719, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.11626666666666667, "grad_norm": 0.45826196670532227, "learning_rate": 2.5555555555555557e-06, "loss": 0.2573, "num_tokens": 22821678.0, "reward": 1.0406734943389893, "reward_std": 0.10282213985919952, "rewards/accuracy_reward_step": 0.37890625, "rewards/final_brier_reward_step": 0.7024407386779785, "rewards/format_reward_step": 1.0, "step": 109 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8161964118480682, "aux_distill/mean_u": 0.20091813851221327, "aux_distill/n_active_tok": 124.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 7.921875, "calib/ece": 0.250546875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.015625, "calib/gap": 0.015232793522267224, "calib/mean_conf": 0.19210937499999997, "calib/mu_c": 0.20115384615384618, "calib/mu_w": 0.18592105263157896, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.018203125, "calib/std_conf": 0.16482543959325385, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.339047619047619, "calib/step_q_c_n": 756.0, "calib/step_q_gap": 0.07994699011680134, "calib/step_q_w": 0.25910062893081764, "calib/step_q_w_n": 1272.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 309.60546875, "completions/mean_terminated_length": 312.0433044433594, "completions/min_length": 0.0, "completions/min_terminated_length": 55.0, "epoch": 0.11733333333333333, "grad_norm": 0.524115800857544, "learning_rate": 2.5277777777777778e-06, "loss": 0.219, "num_tokens": 23005857.0, "reward": 1.0496820211410522, "reward_std": 0.09998592734336853, "rewards/accuracy_reward_step": 0.40625, "rewards/final_brier_reward_step": 0.6931140422821045, "rewards/format_reward_step": 1.0, "step": 110 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8629253953695297, "aux_distill/mean_u": 0.27772488076746643, "aux_distill/n_active_tok": 129.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.34765625, "calib/ece": 0.29621093750000005, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.03939433068929471, "calib/mean_conf": 0.1716015625, "calib/mu_c": 0.19299145299145298, "calib/mu_w": 0.15359712230215827, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.005390625, "calib/std_conf": 0.11243128733834988, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2669621621621622, "calib/step_q_c_n": 925.0, "calib/step_q_gap": -0.011140148068860933, "calib/step_q_w": 0.2781023102310231, "calib/step_q_w_n": 1212.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 321.51953125, "completions/mean_terminated_length": 324.0511779785156, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.1184, "grad_norm": 0.45644694566726685, "learning_rate": 2.5e-06, "loss": 0.2514, "num_tokens": 23195574.0, "reward": 1.0671591758728027, "reward_std": 0.07571589201688766, "rewards/accuracy_reward_step": 0.45703125, "rewards/final_brier_reward_step": 0.6772871017456055, "rewards/format_reward_step": 1.0, "step": 111 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8283669967204332, "aux_distill/mean_u": 0.2970689657933608, "aux_distill/n_active_tok": 144.75, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.5234375, "calib/ece": 0.25892578125000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": 0.007412488174077636, "calib/mean_conf": 0.17919921875, "calib/mu_c": 0.18357142857142864, "calib/mu_w": 0.176158940397351, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013984374999999999, "calib/std_conf": 0.13212152372206296, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24368415051311287, "calib/step_q_c_n": 877.0, "calib/step_q_gap": -0.0294882632799906, "calib/step_q_w": 0.2731724137931035, "calib/step_q_w_n": 1305.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 334.32421875, "completions/mean_terminated_length": 336.9566955566406, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.11946666666666667, "grad_norm": 0.42700129747390747, "learning_rate": 2.4722222222222226e-06, "loss": 0.25, "num_tokens": 23389081.0, "reward": 1.0481845140457153, "reward_std": 0.0933864563703537, "rewards/accuracy_reward_step": 0.41015625, "rewards/final_brier_reward_step": 0.6901190280914307, "rewards/format_reward_step": 0.99609375, "step": 112 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.825203481130302, "aux_distill/mean_u": 0.23171280180171644, "aux_distill/n_active_tok": 133.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.5, "calib/ece": 0.33029296875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": 0.04800671550671551, "calib/mean_conf": 0.18291015625, "calib/mu_c": 0.20653846153846156, "calib/mu_w": 0.15853174603174605, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0026953125, "calib/std_conf": 0.12331135854758307, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2763565891472869, "calib/step_q_c_n": 1032.0, "calib/step_q_gap": -0.012678375887678173, "calib/step_q_w": 0.28903496503496506, "calib/step_q_w_n": 1144.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 313.15625, "completions/mean_terminated_length": 315.6220397949219, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.12053333333333334, "grad_norm": 0.474639892578125, "learning_rate": 2.4444444444444447e-06, "loss": 0.2397, "num_tokens": 23574449.0, "reward": 1.0805518627166748, "reward_std": 0.09097933769226074, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.6532913446426392, "rewards/format_reward_step": 1.0, "step": 113 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8264994230121374, "aux_distill/mean_u": 0.260318213715089, "aux_distill/n_active_tok": 130.0, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.26953125, "calib/ece": 0.3955468749999999, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.02640873015873016, "calib/mean_conf": 0.17601562499999998, "calib/mu_c": 0.18756944444444446, "calib/mu_w": 0.1611607142857143, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.00453125, "calib/std_conf": 0.12373636189843054, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2543778801843318, "calib/step_q_c_n": 1085.0, "calib/step_q_gap": 0.0020522987889829425, "calib/step_q_w": 0.25232558139534883, "calib/step_q_w_n": 1032.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 321.21484375, "completions/mean_terminated_length": 323.74407958984375, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.1216, "grad_norm": 0.40753239393234253, "learning_rate": 2.4166666666666667e-06, "loss": 0.2685, "num_tokens": 23761704.0, "reward": 1.0823616981506348, "reward_std": 0.08417695015668869, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.6022234559059143, "rewards/format_reward_step": 1.0, "step": 114 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8344746474176645, "aux_distill/mean_u": 0.26471749218578394, "aux_distill/n_active_tok": 153.5, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 9.45703125, "calib/ece": 0.29492156862745095, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": 0.03482805374090964, "calib/mean_conf": 0.1933921568627451, "calib/mu_c": 0.21155737704918032, "calib/mu_w": 0.17672932330827068, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.004941176470588235, "calib/std_conf": 0.14304244241008351, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24150134048257374, "calib/step_q_c_n": 1119.0, "calib/step_q_gap": -0.033216785477487715, "calib/step_q_w": 0.27471812596006145, "calib/step_q_w_n": 1302.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 359.45703125, "completions/mean_terminated_length": 362.28741455078125, "completions/min_length": 0.0, "completions/min_terminated_length": 80.0, "epoch": 0.12266666666666666, "grad_norm": 0.4399702250957489, "learning_rate": 2.388888888888889e-06, "loss": 0.2428, "num_tokens": 23958989.0, "reward": 1.0680962800979614, "reward_std": 0.11737754195928574, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.6635362505912781, "rewards/format_reward_step": 0.99609375, "step": 115 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8296884559094906, "aux_distill/mean_u": 0.2383707263061892, "aux_distill/n_active_tok": 157.25, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 10.125, "calib/ece": 0.2138117647058823, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00392156862745098, "calib/gap": 0.019737956487956454, "calib/mean_conf": 0.20187450980392158, "calib/mu_c": 0.21394949494949492, "calib/mu_w": 0.19421153846153846, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.013725490196078431, "calib/std_conf": 0.14172868286387316, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28197294250281846, "calib/step_q_c_n": 887.0, "calib/step_q_gap": -0.003939667467856012, "calib/step_q_w": 0.28591260997067447, "calib/step_q_w_n": 1705.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 408.76953125, "completions/mean_terminated_length": 410.37255859375, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.12373333333333333, "grad_norm": 0.4059218168258667, "learning_rate": 2.361111111111111e-06, "loss": 0.2438, "num_tokens": 24168154.0, "reward": 1.0485306978225708, "reward_std": 0.09994710981845856, "rewards/accuracy_reward_step": 0.38671875, "rewards/final_brier_reward_step": 0.7142488956451416, "rewards/format_reward_step": 0.99609375, "step": 116 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8448361586779356, "aux_distill/mean_u": 0.25940913176972474, "aux_distill/n_active_tok": 156.75, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 10.4375, "calib/ece": 0.22378515625, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.015625, "calib/gap": -0.007329716352560495, "calib/mean_conf": 0.23301171875, "calib/mu_c": 0.22863106796116503, "calib/mu_w": 0.23596078431372552, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.027226562499999996, "calib/std_conf": 0.15947716045384336, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2669742574257426, "calib/step_q_c_n": 1010.0, "calib/step_q_gap": -0.025353059060418626, "calib/step_q_w": 0.2923273164861612, "calib/step_q_w_n": 1662.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 397.28515625, "completions/mean_terminated_length": 400.41339111328125, "completions/min_length": 0.0, "completions/min_terminated_length": 105.0, "epoch": 0.1248, "grad_norm": 0.3755255937576294, "learning_rate": 2.3333333333333336e-06, "loss": 0.2472, "num_tokens": 24376459.0, "reward": 1.0521246194839478, "reward_std": 0.1031055748462677, "rewards/accuracy_reward_step": 0.40234375, "rewards/final_brier_reward_step": 0.7019054293632507, "rewards/format_reward_step": 1.0, "step": 117 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8845678055658937, "aux_distill/mean_u": 0.3274318211216818, "aux_distill/n_active_tok": 165.0, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 10.44140625, "calib/ece": 0.25635039370078744, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.003937007874015748, "calib/gap": 0.015536764705882306, "calib/mean_conf": 0.23168110236220477, "calib/mu_c": 0.23999999999999996, "calib/mu_w": 0.22446323529411766, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.011732283464566926, "calib/std_conf": 0.14912145830804718, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2450046425255339, "calib/step_q_c_n": 1077.0, "calib/step_q_gap": -0.023299868752661657, "calib/step_q_w": 0.26830451127819555, "calib/step_q_w_n": 1596.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1194.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 393.8046875, "completions/mean_terminated_length": 396.905517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.12586666666666665, "grad_norm": 0.35954874753952026, "learning_rate": 2.305555555555556e-06, "loss": 0.2548, "num_tokens": 24581281.0, "reward": 1.0651524066925049, "reward_std": 0.12032070755958557, "rewards/accuracy_reward_step": 0.4609375, "rewards/final_brier_reward_step": 0.6771796941757202, "rewards/format_reward_step": 0.9921875, "step": 118 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8137836419045925, "aux_distill/mean_u": 0.26442520070357983, "aux_distill/n_active_tok": 150.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 10.61328125, "calib/ece": 0.2552968750000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0, "calib/gap": 0.043520966855887216, "calib/mean_conf": 0.252828125, "calib/mu_c": 0.2744186046511628, "calib/mu_w": 0.23089763779527558, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0021093750000000006, "calib/std_conf": 0.14796788130024832, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27225104602510464, "calib/step_q_c_n": 1195.0, "calib/step_q_gap": -0.018978500624041217, "calib/step_q_w": 0.29122954664914585, "calib/step_q_w_n": 1522.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 415.8203125, "completions/mean_terminated_length": 419.094482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.12693333333333334, "grad_norm": 0.3934442698955536, "learning_rate": 2.277777777777778e-06, "loss": 0.2799, "num_tokens": 24792795.0, "reward": 1.0953730344772339, "reward_std": 0.10872112214565277, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.6868396997451782, "rewards/format_reward_step": 1.0, "step": 119 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8701824210584164, "aux_distill/mean_u": 0.2888762191205573, "aux_distill/n_active_tok": 162.5, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.875, "calib/ece": 0.31291015625, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.01171875, "calib/gap": 0.026631944444444444, "calib/mean_conf": 0.27279296875000003, "calib/mu_c": 0.2844444444444445, "calib/mu_w": 0.25781250000000006, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.011601562499999997, "calib/std_conf": 0.1702758335225863, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23827830188679247, "calib/step_q_c_n": 1272.0, "calib/step_q_gap": -0.05679892741257056, "calib/step_q_w": 0.29507722929936303, "calib/step_q_w_n": 1256.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 387.85546875, "completions/mean_terminated_length": 390.9094543457031, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.128, "grad_norm": 0.7766105532646179, "learning_rate": 2.25e-06, "loss": 0.2532, "num_tokens": 24998774.0, "reward": 1.10829496383667, "reward_std": 0.12710832059383392, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.654090166091919, "rewards/format_reward_step": 1.0, "step": 120 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8371859528124332, "aux_distill/mean_u": 0.2782328386251819, "aux_distill/n_active_tok": 159.5, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 10.3515625, "calib/ece": 0.3069925490196078, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.00784313725490196, "calib/gap": 0.020572110739502047, "calib/mean_conf": 0.2653211764705883, "calib/mu_c": 0.27476014492753625, "calib/mu_w": 0.2541880341880342, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.015568627450980392, "calib/std_conf": 0.1585131689419768, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.277452566096423, "calib/step_q_c_n": 1286.0, "calib/step_q_gap": -0.02520139284785855, "calib/step_q_w": 0.30265395894428154, "calib/step_q_w_n": 1364.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 413.54296875, "completions/mean_terminated_length": 416.7992248535156, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.12906666666666666, "grad_norm": 0.36996495723724365, "learning_rate": 2.222222222222222e-06, "loss": 0.2268, "num_tokens": 25209697.0, "reward": 1.0985853672027588, "reward_std": 0.12847691774368286, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6581084132194519, "rewards/format_reward_step": 0.99609375, "step": 121 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8512360025197268, "aux_distill/mean_u": 0.2924762686230811, "aux_distill/n_active_tok": 160.0, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 10.0546875, "calib/ece": 0.2778117647058823, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.011764705882352941, "calib/gap": 0.07618210380796786, "calib/mean_conf": 0.30673725490196074, "calib/mu_c": 0.3393013698630137, "calib/mu_w": 0.2631192660550458, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.006, "calib/std_conf": 0.171524986987587, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23481716475095785, "calib/step_q_c_n": 1305.0, "calib/step_q_gap": -0.06490702752642588, "calib/step_q_w": 0.2997241922773837, "calib/step_q_w_n": 1269.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 394.52734375, "completions/mean_terminated_length": 397.63385009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.13013333333333332, "grad_norm": 0.39588937163352966, "learning_rate": 2.1944444444444445e-06, "loss": 0.2451, "num_tokens": 25418040.0, "reward": 1.1280884742736816, "reward_std": 0.12784995138645172, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6897708177566528, "rewards/format_reward_step": 0.99609375, "step": 122 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8117791870608926, "aux_distill/mean_u": 0.22482812837851487, "aux_distill/n_active_tok": 182.125, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 10.44140625, "calib/ece": 0.22187109374999997, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.01171875, "calib/gap": 0.04839890109890105, "calib/mean_conf": 0.30586328125, "calib/mu_c": 0.3296846153846154, "calib/mu_w": 0.28128571428571436, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0099609375, "calib/std_conf": 0.1724087645371702, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2834858528698464, "calib/step_q_c_n": 1237.0, "calib/step_q_gap": -0.042104676378064476, "calib/step_q_w": 0.3255905292479109, "calib/step_q_w_n": 1436.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 436.1875, "completions/mean_terminated_length": 439.6220397949219, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.1312, "grad_norm": 0.3385443389415741, "learning_rate": 2.166666666666667e-06, "loss": 0.2469, "num_tokens": 25634992.0, "reward": 1.1057794094085693, "reward_std": 0.12787926197052002, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7037463188171387, "rewards/format_reward_step": 1.0, "step": 123 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.879656570032239, "aux_distill/mean_u": 0.2719786785542341, "aux_distill/n_active_tok": 141.5, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.8359375, "calib/ece": 0.28291796875, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.00390625, "calib/gap": 0.026609239164921583, "calib/mean_conf": 0.34137890625, "calib/mu_c": 0.35208496732026145, "calib/mu_w": 0.32547572815533987, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0133203125, "calib/std_conf": 0.16329816295293623, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2698769968051118, "calib/step_q_c_n": 1252.0, "calib/step_q_gap": -0.0455032012146902, "calib/step_q_w": 0.315380198019802, "calib/step_q_w_n": 1010.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 377.21875, "completions/mean_terminated_length": 380.18896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.13226666666666667, "grad_norm": 0.40829819440841675, "learning_rate": 2.138888888888889e-06, "loss": 0.2655, "num_tokens": 25838376.0, "reward": 1.1388227939605713, "reward_std": 0.13053259253501892, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.6799894571304321, "rewards/format_reward_step": 1.0, "step": 124 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8070436110720038, "aux_distill/mean_u": 0.218702439653177, "aux_distill/n_active_tok": 135.0, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 9.32421875, "calib/ece": 0.2130560342555995, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.019762845849802372, "calib/gap": -0.008654143619905685, "calib/mean_conf": 0.40035635046113305, "calib/mu_c": 0.39618320610687024, "calib/mu_w": 0.40483734972677593, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.04781291172595521, "calib/std_conf": 0.1918678271576223, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26378531073446326, "calib/step_q_c_n": 1062.0, "calib/step_q_gap": -0.01954426159258077, "calib/step_q_w": 0.28332957232704403, "calib/step_q_w_n": 1325.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 378.16796875, "completions/mean_terminated_length": 382.6521911621094, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.13333333333333333, "grad_norm": 0.5166134238243103, "learning_rate": 2.1111111111111114e-06, "loss": 0.1924, "num_tokens": 26039995.0, "reward": 1.0936212539672852, "reward_std": 0.16634529829025269, "rewards/accuracy_reward_step": 0.51171875, "rewards/final_brier_reward_step": 0.6872425079345703, "rewards/format_reward_step": 0.98828125, "step": 125 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7905949633568525, "aux_distill/mean_u": 0.22697852024378237, "aux_distill/n_active_tok": 147.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.375, "calib/ece": 0.10710690234375, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0078125, "calib/gap": 0.06187749438339435, "calib/mean_conf": 0.43086184765625, "calib/mu_c": 0.4613171769230769, "calib/mu_w": 0.39943968253968254, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.015078125000000012, "calib/std_conf": 0.18609255587625964, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.25488984684684685, "calib/step_q_c_n": 1110.0, "calib/step_q_gap": -0.0629840291221454, "calib/step_q_w": 0.31787387596899225, "calib/step_q_w_n": 1290.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 413.5078125, "completions/mean_terminated_length": 416.7637634277344, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.1344, "grad_norm": 0.4073919951915741, "learning_rate": 2.0833333333333334e-06, "loss": 0.2482, "num_tokens": 26251317.0, "reward": 1.1208109855651855, "reward_std": 0.1425214558839798, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7377157807350159, "rewards/format_reward_step": 0.99609375, "step": 126 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7611914118751884, "aux_distill/mean_u": 0.21427045188004712, "aux_distill/n_active_tok": 145.5, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 9.41015625, "calib/ece": 0.13895176470588233, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0196078431372549, "calib/gap": 0.08786426725729052, "calib/mean_conf": 0.4333854901960784, "calib/mu_c": 0.47680077519379843, "calib/mu_w": 0.3889365079365079, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.03322745098039215, "calib/std_conf": 0.18705611191605545, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2536829690346084, "calib/step_q_c_n": 1098.0, "calib/step_q_gap": -0.017251432185834026, "calib/step_q_w": 0.27093440122044243, "calib/step_q_w_n": 1311.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2870.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 388.625, "completions/mean_terminated_length": 391.6850280761719, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.13546666666666668, "grad_norm": 0.4389866292476654, "learning_rate": 2.0555555555555555e-06, "loss": 0.2181, "num_tokens": 26454477.0, "reward": 1.1253852844238281, "reward_std": 0.15292197465896606, "rewards/accuracy_reward_step": 0.50390625, "rewards/final_brier_reward_step": 0.7507706880569458, "rewards/format_reward_step": 0.99609375, "step": 127 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8607869111001492, "aux_distill/mean_u": 0.2650464180056679, "aux_distill/n_active_tok": 145.375, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 9.3203125, "calib/ece": 0.08300393700787398, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.027559055118110236, "calib/gap": 0.09334362766420179, "calib/mean_conf": 0.5029015748031496, "calib/mu_c": 0.5473684210526315, "calib/mu_w": 0.4540247933884297, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.031141732283464563, "calib/std_conf": 0.20164805759386392, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25387522935779816, "calib/step_q_c_n": 1090.0, "calib/step_q_gap": -0.05599745582738702, "calib/step_q_w": 0.3098726851851852, "calib/step_q_w_n": 1296.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 385.42578125, "completions/mean_terminated_length": 389.9960632324219, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.13653333333333334, "grad_norm": 0.43388524651527405, "learning_rate": 2.027777777777778e-06, "loss": 0.2457, "num_tokens": 26659810.0, "reward": 1.1309232711791992, "reward_std": 0.20366542041301727, "rewards/accuracy_reward_step": 0.51953125, "rewards/final_brier_reward_step": 0.7501278519630432, "rewards/format_reward_step": 0.9921875, "step": 128 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8683709055185318, "aux_distill/mean_u": 0.31790983151919006, "aux_distill/n_active_tok": 144.75, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.734375, "calib/ece": 0.10494927843137251, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.0196078431372549, "calib/gap": 0.052860777350533295, "calib/mean_conf": 0.5346585647058825, "calib/mu_c": 0.558705035971223, "calib/mu_w": 0.5058442586206897, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0472549019607843, "calib/std_conf": 0.18598623563819291, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23410098302055407, "calib/step_q_c_n": 1119.0, "calib/step_q_gap": -0.02256104115133492, "calib/step_q_w": 0.256662024171889, "calib/step_q_w_n": 1117.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 368.296875, "completions/mean_terminated_length": 371.19683837890625, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.1376, "grad_norm": 0.5401434898376465, "learning_rate": 2.0000000000000003e-06, "loss": 0.2675, "num_tokens": 26856478.0, "reward": 1.1418068408966064, "reward_std": 0.1939820498228073, "rewards/accuracy_reward_step": 0.546875, "rewards/final_brier_reward_step": 0.7406448721885681, "rewards/format_reward_step": 0.99609375, "step": 129 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8692066632211208, "aux_distill/mean_u": 0.25088071420496283, "aux_distill/n_active_tok": 125.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.48828125, "calib/ece": 0.04881510416666664, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.05859375, "calib/gap": 0.1445388415851992, "calib/mean_conf": 0.5760807291666666, "calib/mu_c": 0.635364238410596, "calib/mu_w": 0.4908253968253968, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.017526041666666662, "calib/std_conf": 0.20678414523099387, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27217905405405407, "calib/step_q_c_n": 1184.0, "calib/step_q_gap": 0.030604062480073635, "calib/step_q_w": 0.24157499157398044, "calib/step_q_w_n": 989.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1276.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 352.51171875, "completions/mean_terminated_length": 355.28741455078125, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.13866666666666666, "grad_norm": 0.40447065234184265, "learning_rate": 1.9722222222222224e-06, "loss": 0.2612, "num_tokens": 27052009.0, "reward": 1.1874513626098633, "reward_std": 0.17112727463245392, "rewards/accuracy_reward_step": 0.58984375, "rewards/final_brier_reward_step": 0.7850587964057922, "rewards/format_reward_step": 1.0, "step": 130 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8476680275052786, "aux_distill/mean_u": 0.23583052530939752, "aux_distill/n_active_tok": 144.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 8.87890625, "calib/ece": 0.17316406249999994, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.0703125, "calib/gap": 0.10731275303643739, "calib/mean_conf": 0.5771484375, "calib/mu_c": 0.6408653846153847, "calib/mu_w": 0.5335526315789473, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1720312499999999, "calib/std_conf": 0.20614598096084383, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23984172661870506, "calib/step_q_c_n": 834.0, "calib/step_q_gap": -0.055043610420905775, "calib/step_q_w": 0.29488533703961084, "calib/step_q_w_n": 1439.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 371.578125, "completions/mean_terminated_length": 374.5039367675781, "completions/min_length": 0.0, "completions/min_terminated_length": 150.0, "epoch": 0.13973333333333332, "grad_norm": 0.6352536678314209, "learning_rate": 1.944444444444445e-06, "loss": 0.239, "num_tokens": 27253341.0, "reward": 1.0725533962249756, "reward_std": 0.1738070249557495, "rewards/accuracy_reward_step": 0.40625, "rewards/final_brier_reward_step": 0.7388566732406616, "rewards/format_reward_step": 1.0, "step": 131 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8875786550343037, "aux_distill/mean_u": 0.2981896139011602, "aux_distill/n_active_tok": 133.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.28515625, "calib/ece": 0.10589843750000008, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.078125, "calib/gap": 0.09960147695441823, "calib/mean_conf": 0.6212890624999999, "calib/mu_c": 0.660974025974026, "calib/mu_w": 0.5613725490196078, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.06281250000000008, "calib/std_conf": 0.21616756565884507, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2856017328519856, "calib/step_q_c_n": 1385.0, "calib/step_q_gap": -0.010592319567369224, "calib/step_q_w": 0.2961940524193548, "calib/step_q_w_n": 992.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1142.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 395.390625, "completions/mean_terminated_length": 398.5039367675781, "completions/min_length": 0.0, "completions/min_terminated_length": 116.0, "epoch": 0.1408, "grad_norm": 0.4326506555080414, "learning_rate": 1.916666666666667e-06, "loss": 0.2447, "num_tokens": 27460153.0, "reward": 1.1812529563903809, "reward_std": 0.21404148638248444, "rewards/accuracy_reward_step": 0.6015625, "rewards/final_brier_reward_step": 0.7609433531761169, "rewards/format_reward_step": 1.0, "step": 132 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8566825911402702, "aux_distill/mean_u": 0.30097786155730477, "aux_distill/n_active_tok": 153.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.39453125, "calib/ece": 0.25664062500000007, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.10546875, "calib/gap": 0.048932330827067605, "calib/mean_conf": 0.601015625, "calib/mu_c": 0.6317894736842105, "calib/mu_w": 0.5828571428571429, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24328125000000003, "calib/std_conf": 0.22125649652803278, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25251977401129944, "calib/step_q_c_n": 885.0, "calib/step_q_gap": -0.027565752304490043, "calib/step_q_w": 0.2800855263157895, "calib/step_q_w_n": 1520.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 416.375, "completions/mean_terminated_length": 419.6535339355469, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.14186666666666667, "grad_norm": 0.3714953660964966, "learning_rate": 1.888888888888889e-06, "loss": 0.2421, "num_tokens": 27673089.0, "reward": 1.0293660163879395, "reward_std": 0.2450329065322876, "rewards/accuracy_reward_step": 0.37109375, "rewards/final_brier_reward_step": 0.6876382827758789, "rewards/format_reward_step": 1.0, "step": 133 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8316728174686432, "aux_distill/mean_u": 0.24845326043078172, "aux_distill/n_active_tok": 168.0, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 10.4765625, "calib/ece": 0.13919140625000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.11328125, "calib/gap": 0.18020545265603027, "calib/mean_conf": 0.56841015625, "calib/mu_c": 0.6620325203252032, "calib/mu_w": 0.48182706766917294, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11356640625000003, "calib/std_conf": 0.2657971643059808, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2906495589414595, "calib/step_q_c_n": 1247.0, "calib/step_q_gap": -0.010627792974916783, "calib/step_q_w": 0.3012773519163763, "calib/step_q_w_n": 1435.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 450.24609375, "completions/mean_terminated_length": 455.5849914550781, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.14293333333333333, "grad_norm": 0.38375750184059143, "learning_rate": 1.8611111111111113e-06, "loss": 0.2399, "num_tokens": 27897304.0, "reward": 1.1212167739868164, "reward_std": 0.20416364073753357, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.7619649171829224, "rewards/format_reward_step": 1.0, "step": 134 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8126184120774269, "aux_distill/mean_u": 0.2249086348603118, "aux_distill/n_active_tok": 148.0, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.84375, "calib/ece": 0.167433881854416, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.12109375, "calib/gap": 0.12185181360830022, "calib/mean_conf": 0.6109254931455841, "calib/mu_c": 0.6708994326559193, "calib/mu_w": 0.5490476190476191, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1352734375000001, "calib/std_conf": 0.24969082446086727, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2639072319720659, "calib/step_q_c_n": 1275.0, "calib/step_q_gap": -0.01553051903195013, "calib/step_q_w": 0.279437751004016, "calib/step_q_w_n": 1245.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1691.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 426.47265625, "completions/mean_terminated_length": 429.8307189941406, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.144, "grad_norm": 0.3621184527873993, "learning_rate": 1.8333333333333333e-06, "loss": 0.2554, "num_tokens": 28112361.0, "reward": 1.1229034662246704, "reward_std": 0.20501597225666046, "rewards/accuracy_reward_step": 0.5078125, "rewards/final_brier_reward_step": 0.7379943132400513, "rewards/format_reward_step": 1.0, "step": 135 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8527163006365299, "aux_distill/mean_u": 0.24802126757150933, "aux_distill/n_active_tok": 156.625, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 10.765625, "calib/ece": 0.08708661417322835, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.09448818897637795, "calib/gap": 0.2711115248557066, "calib/mean_conf": 0.5421259842519685, "calib/mu_c": 0.6819512195121952, "calib/mu_w": 0.41083969465648856, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.07248031496062994, "calib/std_conf": 0.2681876470854467, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2807493061979648, "calib/step_q_c_n": 1081.0, "calib/step_q_gap": -0.02145964902591585, "calib/step_q_w": 0.30220895522388064, "calib/step_q_w_n": 1675.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1400.0, "completions/max_terminated_length": 1400.0, "completions/mean_length": 441.3359375, "completions/mean_terminated_length": 446.5691833496094, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.14506666666666668, "grad_norm": 0.44931545853614807, "learning_rate": 1.8055555555555557e-06, "loss": 0.207, "num_tokens": 28333831.0, "reward": 1.1383601427078247, "reward_std": 0.1638520359992981, "rewards/accuracy_reward_step": 0.48046875, "rewards/final_brier_reward_step": 0.8040640354156494, "rewards/format_reward_step": 0.9921875, "step": 136 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8319171629846096, "aux_distill/mean_u": 0.2752806769768428, "aux_distill/n_active_tok": 145.25, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 9.7421875, "calib/ece": 0.09984313725490195, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.11764705882352941, "calib/gap": 0.2407757135826773, "calib/mean_conf": 0.5584313725490195, "calib/mu_c": 0.6792913385826773, "calib/mu_w": 0.438515625, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08011764705882352, "calib/std_conf": 0.28072296405028024, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2751356080489939, "calib/step_q_c_n": 1143.0, "calib/step_q_gap": 0.03524504032138473, "calib/step_q_w": 0.23989056772760917, "calib/step_q_w_n": 1351.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2805.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 425.0390625, "completions/mean_terminated_length": 426.7059020996094, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.14613333333333334, "grad_norm": 0.35590553283691406, "learning_rate": 1.777777777777778e-06, "loss": 0.2827, "num_tokens": 28549625.0, "reward": 1.1385234594345093, "reward_std": 0.17667999863624573, "rewards/accuracy_reward_step": 0.49609375, "rewards/final_brier_reward_step": 0.7848593592643738, "rewards/format_reward_step": 0.99609375, "step": 137 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8382508996874094, "aux_distill/mean_u": 0.2673697069633504, "aux_distill/n_active_tok": 147.75, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 10.53515625, "calib/ece": 0.12353333333333334, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.11372549019607843, "calib/gap": 0.20276334453154166, "calib/mean_conf": 0.5598705882352941, "calib/mu_c": 0.6505177304964539, "calib/mu_w": 0.44775438596491224, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.0652313725490196, "calib/std_conf": 0.26972782744769586, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.21309735849056602, "calib/step_q_c_n": 1325.0, "calib/step_q_gap": -0.04360307882721823, "calib/step_q_w": 0.25670043731778425, "calib/step_q_w_n": 1372.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 444.98828125, "completions/mean_terminated_length": 448.49212646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.1472, "grad_norm": 0.394085168838501, "learning_rate": 1.75e-06, "loss": 0.2411, "num_tokens": 28767878.0, "reward": 1.1620368957519531, "reward_std": 0.20082634687423706, "rewards/accuracy_reward_step": 0.55078125, "rewards/final_brier_reward_step": 0.7771989107131958, "rewards/format_reward_step": 0.99609375, "step": 138 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8436331208795309, "aux_distill/mean_u": 0.2814290154348513, "aux_distill/n_active_tok": 138.875, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.296875, "calib/ece": 0.11386718749999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.12109375, "calib/gap": 0.23992398594698178, "calib/mean_conf": 0.5486328125, "calib/mu_c": 0.6432903225806451, "calib/mu_w": 0.40336633663366334, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.028515624999999992, "calib/std_conf": 0.2902290413162333, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25776119402985076, "calib/step_q_c_n": 1407.0, "calib/step_q_gap": -0.02217714101639795, "calib/step_q_w": 0.2799383350462487, "calib/step_q_w_n": 973.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 397.0625, "completions/mean_terminated_length": 400.18896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.14826666666666666, "grad_norm": 0.4404184818267822, "learning_rate": 1.7222222222222224e-06, "loss": 0.2486, "num_tokens": 28972622.0, "reward": 1.1968767642974854, "reward_std": 0.18953311443328857, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.7882847785949707, "rewards/format_reward_step": 1.0, "step": 139 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8466813545674086, "aux_distill/mean_u": 0.23530338947091445, "aux_distill/n_active_tok": 169.0, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 10.015625, "calib/ece": 0.17171874999999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.171875, "calib/gap": 0.21049717744677948, "calib/mean_conf": 0.569765625, "calib/mu_c": 0.641301775147929, "calib/mu_w": 0.4308045977011495, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.040664062499999945, "calib/std_conf": 0.309708695903682, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2294391934467549, "calib/step_q_c_n": 1587.0, "calib/step_q_gap": -0.00437861617453475, "calib/step_q_w": 0.23381780962128965, "calib/step_q_w_n": 977.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 428.6171875, "completions/mean_terminated_length": 431.99212646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 135.0, "epoch": 0.14933333333333335, "grad_norm": 0.6040653586387634, "learning_rate": 1.6944444444444446e-06, "loss": 0.2489, "num_tokens": 29187364.0, "reward": 1.2130831480026245, "reward_std": 0.1731255054473877, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7660101652145386, "rewards/format_reward_step": 1.0, "step": 140 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8504192354157567, "aux_distill/mean_u": 0.27125072911758774, "aux_distill/n_active_tok": 163.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.58203125, "calib/ece": 0.17248046874999998, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.296875, "calib/gap": 0.1967826293953614, "calib/mean_conf": 0.65916015625, "calib/mu_c": 0.7260355029585798, "calib/mu_w": 0.5292528735632184, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08574218749999997, "calib/std_conf": 0.3202229718395068, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24454718176060797, "calib/step_q_c_n": 1579.0, "calib/step_q_gap": 0.021320637138182302, "calib/step_q_w": 0.22322654462242567, "calib/step_q_w_n": 874.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 444.90234375, "completions/mean_terminated_length": 448.405517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.1504, "grad_norm": 0.4739407002925873, "learning_rate": 1.6666666666666667e-06, "loss": 0.2614, "num_tokens": 29408355.0, "reward": 1.2107794284820557, "reward_std": 0.21473020315170288, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7614026069641113, "rewards/format_reward_step": 1.0, "step": 141 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8319859988987446, "aux_distill/mean_u": 0.23112263651612833, "aux_distill/n_active_tok": 160.875, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 9.91015625, "calib/ece": 0.13419607843137257, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.27450980392156865, "calib/gap": 0.26719765684051405, "calib/mean_conf": 0.6069019607843138, "calib/mu_c": 0.7200680272108844, "calib/mu_w": 0.4528703703703703, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08231372549019614, "calib/std_conf": 0.3279362237675192, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2621822358346095, "calib/step_q_c_n": 1306.0, "calib/step_q_gap": -0.07119713053419635, "calib/step_q_w": 0.33337936636880583, "calib/step_q_w_n": 1231.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 446.37109375, "completions/mean_terminated_length": 449.8858337402344, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.15146666666666667, "grad_norm": 0.3541559875011444, "learning_rate": 1.638888888888889e-06, "loss": 0.1921, "num_tokens": 29627786.0, "reward": 1.1725636720657349, "reward_std": 0.19649362564086914, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7748148441314697, "rewards/format_reward_step": 0.99609375, "step": 142 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.855469910427928, "aux_distill/mean_u": 0.2705275509708212, "aux_distill/n_active_tok": 142.75, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.390625, "calib/ece": 0.15089843749999995, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.3515625, "calib/gap": 0.3437423538047467, "calib/mean_conf": 0.6123046875, "calib/mu_c": 0.7761194029850746, "calib/mu_w": 0.43237704918032793, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11988281249999996, "calib/std_conf": 0.36266989679394035, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2441018998272884, "calib/step_q_c_n": 1158.0, "calib/step_q_gap": -0.03865893484365859, "calib/step_q_w": 0.282760834670947, "calib/step_q_w_n": 1246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 422.3984375, "completions/mean_terminated_length": 425.7243957519531, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.15253333333333333, "grad_norm": 0.3848201632499695, "learning_rate": 1.6111111111111113e-06, "loss": 0.2682, "num_tokens": 29843256.0, "reward": 1.153026819229126, "reward_std": 0.15764334797859192, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7826160192489624, "rewards/format_reward_step": 1.0, "step": 143 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8302166722714901, "aux_distill/mean_u": 0.24965398126664595, "aux_distill/n_active_tok": 149.5, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.83203125, "calib/ece": 0.19162499999999993, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.484375, "calib/gap": 0.31800591675266615, "calib/mean_conf": 0.66540625, "calib/mu_c": 0.7709941520467838, "calib/mu_w": 0.45298823529411764, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.09453124999999996, "calib/std_conf": 0.36710423923040914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2574477244772448, "calib/step_q_c_n": 1626.0, "calib/step_q_gap": 0.05942303311922009, "calib/step_q_w": 0.1980246913580247, "calib/step_q_w_n": 891.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 444.6015625, "completions/mean_terminated_length": 448.10235595703125, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 0.1536, "grad_norm": 0.4171602725982666, "learning_rate": 1.5833333333333333e-06, "loss": 0.2536, "num_tokens": 30061202.0, "reward": 1.2262345552444458, "reward_std": 0.14458578824996948, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.784500241279602, "rewards/format_reward_step": 1.0, "step": 144 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7345545515418053, "aux_distill/mean_u": 0.25175217095796415, "aux_distill/n_active_tok": 182.375, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.49609375, "calib/ece": 0.18062055335968383, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5335968379446641, "calib/gap": 0.322297159979156, "calib/mean_conf": 0.6942964426877472, "calib/mu_c": 0.8229605263157896, "calib/mu_w": 0.5006633663366337, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.13706324110671939, "calib/std_conf": 0.365968198616236, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24804450438300743, "calib/step_q_c_n": 1483.0, "calib/step_q_gap": -0.055667824384115855, "calib/step_q_w": 0.3037123287671233, "calib/step_q_w_n": 1460.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 460.94140625, "completions/mean_terminated_length": 466.4071350097656, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.15466666666666667, "grad_norm": 0.983404278755188, "learning_rate": 1.5555555555555558e-06, "loss": 0.2223, "num_tokens": 30281907.0, "reward": 1.1744862794876099, "reward_std": 0.23245766758918762, "rewards/accuracy_reward_step": 0.59765625, "rewards/final_brier_reward_step": 0.7630351185798645, "rewards/format_reward_step": 0.98828125, "step": 145 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8586379196494818, "aux_distill/mean_u": 0.2799588618828064, "aux_distill/n_active_tok": 202.375, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.796875, "calib/ece": 0.25566929133858274, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.5118110236220472, "calib/gap": 0.30277665995975855, "calib/mean_conf": 0.6478740157480315, "calib/mu_c": 0.8171428571428573, "calib/mu_w": 0.5143661971830987, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23129921259842523, "calib/std_conf": 0.39229915854773334, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27074539363484085, "calib/step_q_c_n": 1194.0, "calib/step_q_gap": -0.0038603018744691187, "calib/step_q_w": 0.27460569550930997, "calib/step_q_w_n": 1826.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 506.44921875, "completions/mean_terminated_length": 508.4353332519531, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.15573333333333333, "grad_norm": 0.4449126720428467, "learning_rate": 1.527777777777778e-06, "loss": 0.2461, "num_tokens": 30518774.0, "reward": 1.0651085376739502, "reward_std": 0.24594148993492126, "rewards/accuracy_reward_step": 0.4375, "rewards/final_brier_reward_step": 0.7005296945571899, "rewards/format_reward_step": 0.9921875, "step": 146 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8714845888316631, "aux_distill/mean_u": 0.3000520868029459, "aux_distill/n_active_tok": 202.375, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 10.421875, "calib/ece": 0.26945098039215687, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.596078431372549, "calib/gap": 0.2942093252744541, "calib/mean_conf": 0.7012156862745098, "calib/mu_c": 0.840820895522388, "calib/mu_w": 0.5466115702479339, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.22258823529411764, "calib/std_conf": 0.39100936821059745, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2534572784810127, "calib/step_q_c_n": 1264.0, "calib/step_q_gap": -0.14323787821414397, "calib/step_q_w": 0.39669515669515665, "calib/step_q_w_n": 1404.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2762.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 458.91015625, "completions/mean_terminated_length": 460.7098388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 161.0, "epoch": 0.1568, "grad_norm": 0.3406878113746643, "learning_rate": 1.5e-06, "loss": 0.2536, "num_tokens": 30739935.0, "reward": 1.1151740550994873, "reward_std": 0.1806105673313141, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.7108168005943298, "rewards/format_reward_step": 0.99609375, "step": 147 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8154330048710108, "aux_distill/mean_u": 0.2554237223511033, "aux_distill/n_active_tok": 159.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 10.0546875, "calib/ece": 0.17080468749999994, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.69140625, "calib/gap": 0.3563581395348837, "calib/mean_conf": 0.7840859375, "calib/mu_c": 0.9038, "calib/mu_w": 0.5474418604651163, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14541406249999994, "calib/std_conf": 0.34397198576300664, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2468400263331139, "calib/step_q_c_n": 1519.0, "calib/step_q_gap": -0.04597513954366342, "calib/step_q_w": 0.2928151658767773, "calib/step_q_w_n": 1055.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 434.2109375, "completions/mean_terminated_length": 437.6299133300781, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.15786666666666666, "grad_norm": 0.3257892429828644, "learning_rate": 1.4722222222222225e-06, "loss": 0.2332, "num_tokens": 30956205.0, "reward": 1.233625888824463, "reward_std": 0.23119288682937622, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.8031893968582153, "rewards/format_reward_step": 1.0, "step": 148 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7958731511607766, "aux_distill/mean_u": 0.2204305141091541, "aux_distill/n_active_tok": 161.25, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 10.69921875, "calib/ece": 0.22457031249999992, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.64453125, "calib/gap": 0.3587686450727079, "calib/mean_conf": 0.7226171875, "calib/mu_c": 0.875374149659864, "calib/mu_w": 0.5166055045871562, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18648437499999992, "calib/std_conf": 0.3883586579498258, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.30053857350800584, "calib/step_q_c_n": 1374.0, "calib/step_q_gap": -0.06163725066781833, "calib/step_q_w": 0.36217582417582417, "calib/step_q_w_n": 1365.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 504.1796875, "completions/mean_terminated_length": 508.14959716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.15893333333333334, "grad_norm": 0.587183952331543, "learning_rate": 1.4444444444444445e-06, "loss": 0.243, "num_tokens": 31189731.0, "reward": 1.1661572456359863, "reward_std": 0.2592434883117676, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.7580957412719727, "rewards/format_reward_step": 1.0, "step": 149 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8890952225774527, "aux_distill/mean_u": 0.311144694626446, "aux_distill/n_active_tok": 176.75, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 9.375, "calib/ece": 0.21964705882352953, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.788235294117647, "calib/gap": 0.34338901213623896, "calib/mean_conf": 0.8215294117647061, "calib/mu_c": 0.9521518987341772, "calib/mu_w": 0.6087628865979382, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2107843137254903, "calib/std_conf": 0.3387757863966573, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23309803921568625, "calib/step_q_c_n": 1275.0, "calib/step_q_gap": -0.10549751633986934, "calib/step_q_w": 0.3385955555555556, "calib/step_q_w_n": 1125.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2533.0, "completions/max_terminated_length": 2533.0, "completions/mean_length": 424.16015625, "completions/mean_terminated_length": 427.5, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.16, "grad_norm": 0.4133373498916626, "learning_rate": 1.4166666666666667e-06, "loss": 0.2409, "num_tokens": 31403276.0, "reward": 1.1904525756835938, "reward_std": 0.18449309468269348, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.767623782157898, "rewards/format_reward_step": 0.99609375, "step": 150 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8351477207615972, "aux_distill/mean_u": 0.2853782974883022, "aux_distill/n_active_tok": 207.125, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 11.62890625, "calib/ece": 0.2806274509803922, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6705882352941176, "calib/gap": 0.26302917403384696, "calib/mean_conf": 0.7217254901960785, "calib/mu_c": 0.8320945945945948, "calib/mu_w": 0.5690654205607478, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21098039215686276, "calib/std_conf": 0.4011681021878422, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2626766040181465, "calib/step_q_c_n": 1543.0, "calib/step_q_gap": -0.049359658185479716, "calib/step_q_w": 0.31203626220362624, "calib/step_q_w_n": 1434.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 494.9296875, "completions/mean_terminated_length": 498.8267822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 127.0, "epoch": 0.16106666666666666, "grad_norm": 0.3437005579471588, "learning_rate": 1.3888888888888892e-06, "loss": 0.2287, "num_tokens": 31637002.0, "reward": 1.1375683546066284, "reward_std": 0.19518016278743744, "rewards/accuracy_reward_step": 0.578125, "rewards/final_brier_reward_step": 0.7009179592132568, "rewards/format_reward_step": 0.99609375, "step": 151 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8661780599504709, "aux_distill/mean_u": 0.29071673239059204, "aux_distill/n_active_tok": 143.25, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 10.77734375, "calib/ece": 0.25152941176470583, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.6823529411764706, "calib/gap": 0.3766925351071694, "calib/mean_conf": 0.7257254901960786, "calib/mu_c": 0.9074242424242426, "calib/mu_w": 0.5307317073170732, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2298039215686274, "calib/std_conf": 0.4019698592949756, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2951077943615257, "calib/step_q_c_n": 1206.0, "calib/step_q_gap": -0.012941143178718972, "calib/step_q_w": 0.30804893754024465, "calib/step_q_w_n": 1553.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2099.0, "completions/max_terminated_length": 2099.0, "completions/mean_length": 473.08203125, "completions/mean_terminated_length": 476.8070983886719, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.16213333333333332, "grad_norm": 0.31601613759994507, "learning_rate": 1.3611111111111112e-06, "loss": 0.1938, "num_tokens": 31863503.0, "reward": 1.1211999654769897, "reward_std": 0.2283134013414383, "rewards/accuracy_reward_step": 0.515625, "rewards/final_brier_reward_step": 0.730681300163269, "rewards/format_reward_step": 0.99609375, "step": 152 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8924239929765463, "aux_distill/mean_u": 0.3081032026315535, "aux_distill/n_active_tok": 148.875, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 10.80859375, "calib/ece": 0.27490234375, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.6640625, "calib/gap": 0.27321370680153023, "calib/mean_conf": 0.70943359375, "calib/mu_c": 0.8129559748427673, "calib/mu_w": 0.539742268041237, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18162109375000002, "calib/std_conf": 0.40331460110589845, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3348767288033674, "calib/step_q_c_n": 1663.0, "calib/step_q_gap": 0.051987236049744145, "calib/step_q_w": 0.28288949275362324, "calib/step_q_w_n": 1104.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1936.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 495.67578125, "completions/mean_terminated_length": 499.5787353515625, "completions/min_length": 0.0, "completions/min_terminated_length": 156.0, "epoch": 0.1632, "grad_norm": 0.3540187180042267, "learning_rate": 1.3333333333333334e-06, "loss": 0.2643, "num_tokens": 32097716.0, "reward": 1.1719425916671753, "reward_std": 0.20725688338279724, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7227912545204163, "rewards/format_reward_step": 1.0, "step": 153 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8481863886117935, "aux_distill/mean_u": 0.2958641144448045, "aux_distill/n_active_tok": 167.125, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 11.3515625, "calib/ece": 0.25767068273092375, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.96875, "calib/frac_conf_gt_0.9": 0.6746987951807228, "calib/gap": 0.42878292871900814, "calib/mean_conf": 0.7087550200803213, "calib/mu_c": 0.9291735537190082, "calib/mu_w": 0.5003906250000001, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2402409638554217, "calib/std_conf": 0.41155261930734266, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28757575757575754, "calib/step_q_c_n": 1089.0, "calib/step_q_gap": -0.09181097880288852, "calib/step_q_w": 0.37938673637864606, "calib/step_q_w_n": 1817.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 480.69921875, "completions/mean_terminated_length": 488.32940673828125, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.16426666666666667, "grad_norm": 0.3609575629234314, "learning_rate": 1.3055555555555556e-06, "loss": 0.2392, "num_tokens": 32325215.0, "reward": 1.083282232284546, "reward_std": 0.2492285668849945, "rewards/accuracy_reward_step": 0.4765625, "rewards/final_brier_reward_step": 0.7212519645690918, "rewards/format_reward_step": 0.96875, "step": 154 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8189228344708681, "aux_distill/mean_u": 0.20074483546096195, "aux_distill/n_active_tok": 151.0, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.73828125, "calib/ece": 0.33277343750000005, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.7265625, "calib/gap": 0.23265598238316632, "calib/mean_conf": 0.7462890625, "calib/mu_c": 0.8571641791044777, "calib/mu_w": 0.6245081967213114, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2778125000000001, "calib/std_conf": 0.39972536768120065, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.302288595810706, "calib/step_q_c_n": 1289.0, "calib/step_q_gap": -0.01769479289361292, "calib/step_q_w": 0.3199833887043189, "calib/step_q_w_n": 1204.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 423.73828125, "completions/mean_terminated_length": 427.0747985839844, "completions/min_length": 0.0, "completions/min_terminated_length": 162.0, "epoch": 0.16533333333333333, "grad_norm": 0.4874245822429657, "learning_rate": 1.2777777777777779e-06, "loss": 0.2194, "num_tokens": 32540908.0, "reward": 1.0903079509735107, "reward_std": 0.25045233964920044, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.657178521156311, "rewards/format_reward_step": 1.0, "step": 155 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8304382590577006, "aux_distill/mean_u": 0.3006649706126327, "aux_distill/n_active_tok": 165.375, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 10.59375, "calib/ece": 0.23736220472440941, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.594488188976378, "calib/gap": 0.3713015557762331, "calib/mean_conf": 0.632007874015748, "calib/mu_c": 0.7708805031446541, "calib/mu_w": 0.399578947368421, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12169291338582677, "calib/std_conf": 0.43995022975605924, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3129776021080369, "calib/step_q_c_n": 1518.0, "calib/step_q_gap": 0.0053394111532630295, "calib/step_q_w": 0.30763819095477385, "calib/step_q_w_n": 1194.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2464.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 504.10546875, "completions/mean_terminated_length": 506.0823974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.1664, "grad_norm": 0.4381815195083618, "learning_rate": 1.25e-06, "loss": 0.2404, "num_tokens": 32774719.0, "reward": 1.176797866821289, "reward_std": 0.20549890398979187, "rewards/accuracy_reward_step": 0.62109375, "rewards/final_brier_reward_step": 0.7403144836425781, "rewards/format_reward_step": 0.9921875, "step": 156 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8249999564141035, "aux_distill/mean_u": 0.2358457818520905, "aux_distill/n_active_tok": 198.0, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.9296875, "calib/ece": 0.2089330708661416, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.6259842519685039, "calib/gap": 0.4298190476190476, "calib/mean_conf": 0.6625551181102363, "calib/mu_c": 0.8046999999999999, "calib/mu_w": 0.37488095238095226, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10109842519685026, "calib/std_conf": 0.43245846702562246, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.258047619047619, "calib/step_q_c_n": 1680.0, "calib/step_q_gap": -0.06967836348513212, "calib/step_q_w": 0.32772598253275115, "calib/step_q_w_n": 1374.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 506.83203125, "completions/mean_terminated_length": 512.8419189453125, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.16746666666666668, "grad_norm": 0.37992429733276367, "learning_rate": 1.2222222222222223e-06, "loss": 0.2279, "num_tokens": 33008196.0, "reward": 1.2160041332244873, "reward_std": 0.17098858952522278, "rewards/accuracy_reward_step": 0.6640625, "rewards/final_brier_reward_step": 0.7757582068443298, "rewards/format_reward_step": 0.9921875, "step": 157 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.746043186634779, "aux_distill/mean_u": 0.24562522883955865, "aux_distill/n_active_tok": 183.0, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 11.19921875, "calib/ece": 0.18522529644268768, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.6245059288537549, "calib/gap": 0.4845342349957736, "calib/mean_conf": 0.6513754940711463, "calib/mu_c": 0.8122485207100593, "calib/mu_w": 0.3277142857142857, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.08430830039525684, "calib/std_conf": 0.43960899500620837, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26825465838509316, "calib/step_q_c_n": 1610.0, "calib/step_q_gap": -0.0423897330230214, "calib/step_q_w": 0.31064439140811456, "calib/step_q_w_n": 1257.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 484.640625, "completions/mean_terminated_length": 492.3333740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.16853333333333334, "grad_norm": 0.5074396133422852, "learning_rate": 1.1944444444444446e-06, "loss": 0.2143, "num_tokens": 33237504.0, "reward": 1.2193374633789062, "reward_std": 0.18944784998893738, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7902376651763916, "rewards/format_reward_step": 0.98828125, "step": 158 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8241626480594277, "aux_distill/mean_u": 0.26721943700812645, "aux_distill/n_active_tok": 166.125, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 11.44921875, "calib/ece": 0.2765357142857143, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.6031746031746031, "calib/gap": 0.3270807702248644, "calib/mean_conf": 0.6278293650793652, "calib/mu_c": 0.7654109589041097, "calib/mu_w": 0.4383301886792453, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1625, "calib/std_conf": 0.44762274038426675, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.20295438596491228, "calib/step_q_c_n": 1425.0, "calib/step_q_gap": -0.07211666317187393, "calib/step_q_w": 0.2750710491367862, "calib/step_q_w_n": 1506.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 473.73828125, "completions/mean_terminated_length": 481.2579650878906, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.1696, "grad_norm": 0.4542931914329529, "learning_rate": 1.1666666666666668e-06, "loss": 0.1962, "num_tokens": 33463565.0, "reward": 1.1282752752304077, "reward_std": 0.21358579397201538, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.701863169670105, "rewards/format_reward_step": 0.984375, "step": 159 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.879276018589735, "aux_distill/mean_u": 0.3238322491555653, "aux_distill/n_active_tok": 163.25, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 11.703125, "calib/ece": 0.30257028112449796, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.5502008032128514, "calib/gap": 0.311876102292769, "calib/mean_conf": 0.5762248995983936, "calib/mu_c": 0.6776785714285715, "calib/mu_w": 0.36580246913580244, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1020481927710843, "calib/std_conf": 0.45937294175971427, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2581012658227848, "calib/step_q_c_n": 1659.0, "calib/step_q_gap": 0.027802088560256732, "calib/step_q_w": 0.23029917726252808, "calib/step_q_w_n": 1337.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2121.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 468.46484375, "completions/mean_terminated_length": 481.634521484375, "completions/min_length": 0.0, "completions/min_terminated_length": 141.0, "epoch": 0.17066666666666666, "grad_norm": 0.559697151184082, "learning_rate": 1.138888888888889e-06, "loss": 0.1897, "num_tokens": 33688332.0, "reward": 1.153278112411499, "reward_std": 0.2089913785457611, "rewards/accuracy_reward_step": 0.65625, "rewards/final_brier_reward_step": 0.677649974822998, "rewards/format_reward_step": 0.97265625, "step": 160 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8093566577881575, "aux_distill/mean_u": 0.26456430254127333, "aux_distill/n_active_tok": 152.375, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.7578125, "calib/ece": 0.24238281250000004, "calib/final_conf_rate": 1.0, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.59375, "calib/gap": 0.4298599162542825, "calib/mean_conf": 0.6154296875, "calib/mu_c": 0.7346486486486487, "calib/mu_w": 0.30478873239436616, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.06757812500000006, "calib/std_conf": 0.45531390585908127, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.21960448642266825, "calib/step_q_c_n": 1694.0, "calib/step_q_gap": 0.014156725228638395, "calib/step_q_w": 0.20544776119402985, "calib/step_q_w_n": 804.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 451.3671875, "completions/mean_terminated_length": 454.9212646484375, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.17173333333333332, "grad_norm": 0.5177509784698486, "learning_rate": 1.111111111111111e-06, "loss": 0.2397, "num_tokens": 33907802.0, "reward": 1.2339601516723633, "reward_std": 0.19392341375350952, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.7491703033447266, "rewards/format_reward_step": 0.99609375, "step": 161 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8228137148544192, "aux_distill/mean_u": 0.23076060475157925, "aux_distill/n_active_tok": 158.5, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 9.84375, "calib/ece": 0.20257812500000003, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.66796875, "calib/gap": 0.4412045454545454, "calib/mean_conf": 0.690703125, "calib/mu_c": 0.8285795454545454, "calib/mu_w": 0.38737499999999997, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.10289062500000003, "calib/std_conf": 0.42986132137613214, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.19520210896309315, "calib/step_q_c_n": 1707.0, "calib/step_q_gap": -0.018709330151297998, "calib/step_q_w": 0.21391143911439114, "calib/step_q_w_n": 813.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 449.9765625, "completions/mean_terminated_length": 453.5196838378906, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.1728, "grad_norm": 0.39314666390419006, "learning_rate": 1.0833333333333335e-06, "loss": 0.2273, "num_tokens": 34127140.0, "reward": 1.2387226819992065, "reward_std": 0.182912677526474, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.7899453043937683, "rewards/format_reward_step": 1.0, "step": 162 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8334611281752586, "aux_distill/mean_u": 0.29235313919252837, "aux_distill/n_active_tok": 174.625, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 11.72265625, "calib/ece": 0.19699604743082988, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.5928853754940712, "calib/gap": 0.5048886731391586, "calib/mean_conf": 0.621185770750988, "calib/mu_c": 0.8267333333333334, "calib/mu_w": 0.32184466019417474, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.11264822134387333, "calib/std_conf": 0.453629387954209, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.23267145656433702, "calib/step_q_c_n": 1531.0, "calib/step_q_gap": 0.02569186472760232, "calib/step_q_w": 0.2069795918367347, "calib/step_q_w_n": 1470.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 539.78125, "completions/mean_terminated_length": 546.1818237304688, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.17386666666666667, "grad_norm": 0.5653132200241089, "learning_rate": 1.0555555555555557e-06, "loss": 0.2166, "num_tokens": 34370156.0, "reward": 1.1803362369537354, "reward_std": 0.18647001683712006, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.786453902721405, "rewards/format_reward_step": 0.98828125, "step": 163 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8464186768978834, "aux_distill/mean_u": 0.26922085797948403, "aux_distill/n_active_tok": 195.375, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 13.1640625, "calib/ece": 0.18371999999999988, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.652, "calib/gap": 0.5143531559728742, "calib/mean_conf": 0.67956, "calib/mu_c": 0.9017605633802815, "calib/mu_w": 0.38740740740740737, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14763999999999985, "calib/std_conf": 0.4355210745761909, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2321144781144781, "calib/step_q_c_n": 1485.0, "calib/step_q_gap": 0.023690074931454225, "calib/step_q_w": 0.20842440318302388, "calib/step_q_w_n": 1885.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2886.0, "completions/max_terminated_length": 2886.0, "completions/mean_length": 601.07421875, "completions/mean_terminated_length": 608.2015991210938, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.17493333333333333, "grad_norm": 0.3060588836669922, "learning_rate": 1.0277777777777777e-06, "loss": 0.2373, "num_tokens": 34630167.0, "reward": 1.1586520671844482, "reward_std": 0.1987255960702896, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.786054253578186, "rewards/format_reward_step": 0.9765625, "step": 164 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8290663491934538, "aux_distill/mean_u": 0.21056666403523405, "aux_distill/n_active_tok": 169.375, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.0703125, "calib/ece": 0.2945849802371541, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7747035573122529, "calib/gap": 0.2901091758172408, "calib/mean_conf": 0.7915810276679842, "calib/mu_c": 0.9223021582733812, "calib/mu_w": 0.6321929824561404, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2683794466403162, "calib/std_conf": 0.37803197257410914, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2831420765027322, "calib/step_q_c_n": 1464.0, "calib/step_q_gap": -0.04385054342346706, "calib/step_q_w": 0.32699261992619927, "calib/step_q_w_n": 1626.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 513.33203125, "completions/mean_terminated_length": 521.4801635742188, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.176, "grad_norm": 0.3841126263141632, "learning_rate": 1.0000000000000002e-06, "loss": 0.2575, "num_tokens": 34867156.0, "reward": 1.1088169813156128, "reward_std": 0.19576653838157654, "rewards/accuracy_reward_step": 0.54296875, "rewards/final_brier_reward_step": 0.6863839626312256, "rewards/format_reward_step": 0.98828125, "step": 165 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8489821460098028, "aux_distill/mean_u": 0.29878565550623887, "aux_distill/n_active_tok": 201.375, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.6171875, "calib/ece": 0.2149802371541502, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.83399209486166, "calib/gap": 0.29267097829319033, "calib/mean_conf": 0.8430434782608697, "calib/mu_c": 0.930960451977401, "calib/mu_w": 0.6382894736842106, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 0.99609375, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.17920948616600793, "calib/std_conf": 0.33328023865394385, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.2746310504634397, "calib/step_q_c_n": 1942.0, "calib/step_q_gap": 0.02766676474915397, "calib/step_q_w": 0.24696428571428575, "calib/step_q_w_n": 1288.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1995.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 547.95703125, "completions/mean_terminated_length": 554.45458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 40.0, "epoch": 0.17706666666666668, "grad_norm": 0.31519395112991333, "learning_rate": 9.722222222222224e-07, "loss": 0.2368, "num_tokens": 35113617.0, "reward": 1.2258692979812622, "reward_std": 0.23709669709205627, "rewards/accuracy_reward_step": 0.69140625, "rewards/final_brier_reward_step": 0.7720511555671692, "rewards/format_reward_step": 0.98828125, "step": 166 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8104597348719835, "aux_distill/mean_u": 0.2658721026737349, "aux_distill/n_active_tok": 182.875, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 11.82421875, "calib/ece": 0.17719367588932802, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8814229249011858, "calib/gap": 0.26956763925729454, "calib/mean_conf": 0.8803557312252965, "calib/mu_c": 0.9421538461538463, "calib/mu_w": 0.6725862068965518, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.14339920948616597, "calib/std_conf": 0.29882290804470024, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.275175079581628, "calib/step_q_c_n": 2199.0, "calib/step_q_gap": -0.04958337452465217, "calib/step_q_w": 0.3247584541062802, "calib/step_q_w_n": 828.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 530.625, "completions/mean_terminated_length": 534.8031616210938, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.17813333333333334, "grad_norm": 0.3957613408565521, "learning_rate": 9.444444444444445e-07, "loss": 0.2567, "num_tokens": 35355065.0, "reward": 1.2788411378860474, "reward_std": 0.20663800835609436, "rewards/accuracy_reward_step": 0.76171875, "rewards/final_brier_reward_step": 0.807682454586029, "rewards/format_reward_step": 0.98828125, "step": 167 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8572558909654617, "aux_distill/mean_u": 0.32651245974212556, "aux_distill/n_active_tok": 207.875, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.33984375, "calib/ece": 0.21231075697211158, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.896414342629482, "calib/gap": 0.26695864661654123, "calib/mean_conf": 0.8923107569721117, "calib/mu_c": 0.9731428571428571, "calib/mu_w": 0.7061842105263159, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.20370517928286855, "calib/std_conf": 0.2860310890588503, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.31775862068965516, "calib/step_q_c_n": 2030.0, "calib/step_q_gap": 0.00037233910120748615, "calib/step_q_w": 0.31738628158844767, "calib/step_q_w_n": 1385.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 595.70703125, "completions/mean_terminated_length": 602.770751953125, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.1792, "grad_norm": 0.2700546383857727, "learning_rate": 9.166666666666666e-07, "loss": 0.2194, "num_tokens": 35612238.0, "reward": 1.215261459350586, "reward_std": 0.27505505084991455, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7664605379104614, "rewards/format_reward_step": 0.98046875, "step": 168 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8547635991126299, "aux_distill/mean_u": 0.2882160263843028, "aux_distill/n_active_tok": 185.875, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 12.34765625, "calib/ece": 0.28507936507936515, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8412698412698413, "calib/gap": 0.2736870748299318, "calib/mean_conf": 0.8474603174603176, "calib/mu_c": 0.9614965986394556, "calib/mu_w": 0.6878095238095238, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2746031746031747, "calib/std_conf": 0.3311450468844275, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.3273376623376623, "calib/step_q_c_n": 1694.0, "calib/step_q_gap": 0.03185027310794181, "calib/step_q_w": 0.2954873892297205, "calib/step_q_w_n": 1467.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2432.0, "completions/max_terminated_length": 2432.0, "completions/mean_length": 571.69921875, "completions/mean_terminated_length": 578.478271484375, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.18026666666666666, "grad_norm": 0.2821076214313507, "learning_rate": 8.88888888888889e-07, "loss": 0.2208, "num_tokens": 35862777.0, "reward": 1.1290287971496582, "reward_std": 0.2493588924407959, "rewards/accuracy_reward_step": 0.57421875, "rewards/final_brier_reward_step": 0.6994640231132507, "rewards/format_reward_step": 0.984375, "step": 169 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.829416710883379, "aux_distill/mean_u": 0.25372244946181366, "aux_distill/n_active_tok": 203.0, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.0859375, "calib/ece": 0.22000000000000017, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.8605577689243028, "calib/gap": 0.24231843575418976, "calib/mean_conf": 0.8678087649402392, "calib/mu_c": 0.9373184357541898, "calib/mu_w": 0.6950000000000001, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.18733067729083683, "calib/std_conf": 0.31213628919231023, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3354214647627822, "calib/step_q_c_n": 2171.0, "calib/step_q_gap": 0.040764976213163884, "calib/step_q_w": 0.2946564885496183, "calib/step_q_w_n": 1179.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1930.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 547.20703125, "completions/mean_terminated_length": 558.1076049804688, "completions/min_length": 0.0, "completions/min_terminated_length": 146.0, "epoch": 0.18133333333333335, "grad_norm": 0.3414306044578552, "learning_rate": 8.611111111111112e-07, "loss": 0.2157, "num_tokens": 36107014.0, "reward": 1.2150764465332031, "reward_std": 0.2328571081161499, "rewards/accuracy_reward_step": 0.69921875, "rewards/final_brier_reward_step": 0.7543718814849854, "rewards/format_reward_step": 0.9765625, "step": 170 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8333851397037506, "aux_distill/mean_u": 0.31075516679071946, "aux_distill/n_active_tok": 157.875, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 11.734375, "calib/ece": 0.37089791514463505, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8549019607843137, "calib/gap": 0.18891153961358642, "calib/mean_conf": 0.8657998759289488, "calib/mu_c": 0.9554402535495133, "calib/mu_w": 0.7665287139359269, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3556037974975762, "calib/std_conf": 0.3119447808665385, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2976370106761566, "calib/step_q_c_n": 1405.0, "calib/step_q_gap": -0.040511832350735244, "calib/step_q_w": 0.33814884302689185, "calib/step_q_w_n": 1599.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1384.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 539.51171875, "completions/mean_terminated_length": 543.7598266601562, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.1824, "grad_norm": 0.4528205394744873, "learning_rate": 8.333333333333333e-07, "loss": 0.2122, "num_tokens": 36352025.0, "reward": 1.074401617050171, "reward_std": 0.23831433057785034, "rewards/accuracy_reward_step": 0.5234375, "rewards/final_brier_reward_step": 0.6292720437049866, "rewards/format_reward_step": 0.99609375, "step": 171 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8144433349370956, "aux_distill/mean_u": 0.22704867808540513, "aux_distill/n_active_tok": 169.875, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 10.875, "calib/ece": 0.26695219123505975, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.9203187250996016, "calib/gap": 0.15199018617405124, "calib/mean_conf": 0.9280677290836654, "calib/mu_c": 0.9777218934911244, "calib/mu_w": 0.8257317073170731, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26085657370517923, "calib/std_conf": 0.22707904312620714, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.29474042027194064, "calib/step_q_c_n": 1618.0, "calib/step_q_gap": -0.04203487989958593, "calib/step_q_w": 0.33677530017152657, "calib/step_q_w_n": 1166.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2308.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 483.25390625, "completions/mean_terminated_length": 492.8804931640625, "completions/min_length": 0.0, "completions/min_terminated_length": 132.0, "epoch": 0.18346666666666667, "grad_norm": 0.39163994789123535, "learning_rate": 8.055555555555557e-07, "loss": 0.2099, "num_tokens": 36579090.0, "reward": 1.1783955097198486, "reward_std": 0.21855786442756653, "rewards/accuracy_reward_step": 0.66015625, "rewards/final_brier_reward_step": 0.7161659002304077, "rewards/format_reward_step": 0.98046875, "step": 172 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7717863172292709, "aux_distill/mean_u": 0.23909250423805128, "aux_distill/n_active_tok": 183.125, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 11.98046875, "calib/ece": 0.3257193675889329, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.83399209486166, "calib/gap": 0.1694787360105332, "calib/mean_conf": 0.8410553359683796, "calib/mu_c": 0.9067032258064516, "calib/mu_w": 0.7372244897959184, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2770632411067194, "calib/std_conf": 0.34101605461781503, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27821873308067135, "calib/step_q_c_n": 1847.0, "calib/step_q_gap": -0.02184274232916472, "calib/step_q_w": 0.3000614754098361, "calib/step_q_w_n": 1220.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2850.0, "completions/max_terminated_length": 2850.0, "completions/mean_length": 575.21484375, "completions/mean_terminated_length": 582.0355834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.18453333333333333, "grad_norm": 0.43757736682891846, "learning_rate": 7.777777777777779e-07, "loss": 0.2275, "num_tokens": 36829505.0, "reward": 1.130254864692688, "reward_std": 0.20538073778152466, "rewards/accuracy_reward_step": 0.60546875, "rewards/final_brier_reward_step": 0.666759729385376, "rewards/format_reward_step": 0.98828125, "step": 173 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8352727349847555, "aux_distill/mean_u": 0.27053089226250693, "aux_distill/n_active_tok": 186.125, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 12.53515625, "calib/ece": 0.2823904382470118, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.7808764940239044, "calib/gap": 0.30275407697325507, "calib/mean_conf": 0.7901992031872511, "calib/mu_c": 0.9168493150684932, "calib/mu_w": 0.6140952380952381, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24545816733067716, "calib/std_conf": 0.3834145709626288, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.31455794504181595, "calib/step_q_c_n": 1674.0, "calib/step_q_gap": -0.002034888834405557, "calib/step_q_w": 0.3165928338762215, "calib/step_q_w_n": 1535.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2609.0, "completions/max_terminated_length": 2609.0, "completions/mean_length": 592.8515625, "completions/mean_terminated_length": 602.261962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 144.0, "epoch": 0.1856, "grad_norm": 0.30176031589508057, "learning_rate": 7.5e-07, "loss": 0.2294, "num_tokens": 37085507.0, "reward": 1.1251819133758545, "reward_std": 0.304109126329422, "rewards/accuracy_reward_step": 0.5703125, "rewards/final_brier_reward_step": 0.6995828151702881, "rewards/format_reward_step": 0.98046875, "step": 174 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8153031952679157, "aux_distill/mean_u": 0.2494548199217007, "aux_distill/n_active_tok": 179.625, "calib/answer_extract_rate": 0.96484375, "calib/avg_num_step_conf": 15.0703125, "calib/ece": 0.3794979757085021, "calib/final_conf_rate": 0.96484375, "calib/format_rate": 0.96484375, "calib/frac_conf_gt_0.9": 0.8097165991902834, "calib/gap": 0.1968481510621557, "calib/mean_conf": 0.8256518218623481, "calib/mu_c": 0.9236774193548386, "calib/mu_w": 0.7268292682926829, "calib/nonempty_final_conf_rate": 0.96484375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.3515627530364373, "calib/std_conf": 0.35116507103079797, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2560912547528517, "calib/step_q_c_n": 1315.0, "calib/step_q_gap": -0.01895396742567762, "calib/step_q_w": 0.27504522217852934, "calib/step_q_w_n": 2543.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 579.5859375, "completions/mean_terminated_length": 598.2822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.18666666666666668, "grad_norm": 0.2807024419307709, "learning_rate": 7.222222222222222e-07, "loss": 0.1579, "num_tokens": 37339705.0, "reward": 1.0238916873931885, "reward_std": 0.3080098628997803, "rewards/accuracy_reward_step": 0.484375, "rewards/final_brier_reward_step": 0.5985648036003113, "rewards/format_reward_step": 0.96484375, "step": 175 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8529551476240158, "aux_distill/mean_u": 0.3246562601008102, "aux_distill/n_active_tok": 154.875, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.21484375, "calib/ece": 0.2678740157480316, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8188976377952756, "calib/gap": 0.3169576923076922, "calib/mean_conf": 0.8336220472440946, "calib/mu_c": 0.9633999999999999, "calib/mu_w": 0.6464423076923077, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.255472440944882, "calib/std_conf": 0.34766072993072755, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.277580547112462, "calib/step_q_c_n": 1645.0, "calib/step_q_gap": 0.022081221876294688, "calib/step_q_w": 0.2554993252361673, "calib/step_q_w_n": 1482.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2024.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 560.88671875, "completions/mean_terminated_length": 567.53759765625, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.18773333333333334, "grad_norm": 0.3367729187011719, "learning_rate": 6.944444444444446e-07, "loss": 0.2451, "num_tokens": 37587356.0, "reward": 1.1519694328308105, "reward_std": 0.2565736174583435, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7258139848709106, "rewards/format_reward_step": 0.9921875, "step": 176 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.846255837008357, "aux_distill/mean_u": 0.2547409636449559, "aux_distill/n_active_tok": 172.75, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 12.03125, "calib/ece": 0.26491600000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.832, "calib/gap": 0.33505267295597463, "calib/mean_conf": 0.839396, "calib/mu_c": 0.9814583333333332, "calib/mu_w": 0.6464056603773586, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.26415600000000006, "calib/std_conf": 0.34353509163402796, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.24766943291839558, "calib/step_q_c_n": 1446.0, "calib/step_q_gap": -0.033187360227259294, "calib/step_q_w": 0.28085679314565487, "calib/step_q_w_n": 1634.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2398.0, "completions/max_terminated_length": 2398.0, "completions/mean_length": 557.25390625, "completions/mean_terminated_length": 568.3546142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.1888, "grad_norm": 0.3005785048007965, "learning_rate": 6.666666666666667e-07, "loss": 0.203, "num_tokens": 37833845.0, "reward": 1.1269714832305908, "reward_std": 0.2106054425239563, "rewards/accuracy_reward_step": 0.5625, "rewards/final_brier_reward_step": 0.7148808240890503, "rewards/format_reward_step": 0.9765625, "step": 177 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8563280925154686, "aux_distill/mean_u": 0.2752489762335201, "aux_distill/n_active_tok": 221.375, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.66796875, "calib/ece": 0.2809055118110236, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8464566929133859, "calib/gap": 0.2337249472573839, "calib/mean_conf": 0.857992125984252, "calib/mu_c": 0.9463291139240505, "calib/mu_w": 0.7126041666666666, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.25842519685039367, "calib/std_conf": 0.31873443135826335, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2823424054206663, "calib/step_q_c_n": 1771.0, "calib/step_q_gap": -0.0351100402315076, "calib/step_q_w": 0.3174524456521739, "calib/step_q_w_n": 1472.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1868.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 548.85546875, "completions/mean_terminated_length": 555.3636474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.18986666666666666, "grad_norm": 0.3231167197227478, "learning_rate": 6.388888888888889e-07, "loss": 0.218, "num_tokens": 38080424.0, "reward": 1.1606513261795044, "reward_std": 0.2605002820491791, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7119277119636536, "rewards/format_reward_step": 0.9921875, "step": 178 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8314931150525808, "aux_distill/mean_u": 0.2916405868492405, "aux_distill/n_active_tok": 216.75, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.07421875, "calib/ece": 0.19565177865612648, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8260869565217391, "calib/gap": 0.38206014211886286, "calib/mean_conf": 0.8252960474308301, "calib/mu_c": 0.9476156976744184, "calib/mu_w": 0.5655555555555556, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.17055296442687748, "calib/std_conf": 0.35870026592531407, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26839024390243904, "calib/step_q_c_n": 1845.0, "calib/step_q_gap": 0.007780292056532134, "calib/step_q_w": 0.2606099518459069, "calib/step_q_w_n": 1246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 575.703125, "completions/mean_terminated_length": 580.2362060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.19093333333333334, "grad_norm": 0.3155559301376343, "learning_rate": 6.111111111111112e-07, "loss": 0.2778, "num_tokens": 38334068.0, "reward": 1.2248156070709229, "reward_std": 0.2330620288848877, "rewards/accuracy_reward_step": 0.671875, "rewards/final_brier_reward_step": 0.7894749641418457, "rewards/format_reward_step": 0.98828125, "step": 179 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.7971970979124308, "aux_distill/mean_u": 0.24041332149219932, "aux_distill/n_active_tok": 194.125, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 13.73828125, "calib/ece": 0.25646666666666673, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.796078431372549, "calib/gap": 0.28167070707070696, "calib/mean_conf": 0.8167019607843138, "calib/mu_c": 0.9161151515151514, "calib/mu_w": 0.6344444444444445, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21305490196078436, "calib/std_conf": 0.35877278488289205, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2661774744027304, "calib/step_q_c_n": 2051.0, "calib/step_q_gap": -0.015460179076123681, "calib/step_q_w": 0.28163765347885406, "calib/step_q_w_n": 1466.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 682.99609375, "completions/mean_terminated_length": 688.3740234375, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.192, "grad_norm": 0.2555125951766968, "learning_rate": 5.833333333333334e-07, "loss": 0.2443, "num_tokens": 38612771.0, "reward": 1.190252661705017, "reward_std": 0.223234623670578, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7398804426193237, "rewards/format_reward_step": 0.99609375, "step": 180 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8345296196639538, "aux_distill/mean_u": 0.26767888136809903, "aux_distill/n_active_tok": 186.0, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 11.2890625, "calib/ece": 0.2867193675889328, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.984375, "calib/frac_conf_gt_0.9": 0.8458498023715415, "calib/gap": 0.23232245169886734, "calib/mean_conf": 0.8459288537549408, "calib/mu_c": 0.9331645569620252, "calib/mu_w": 0.7008421052631578, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2540711462450593, "calib/std_conf": 0.33912761084905496, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.24617365269461078, "calib/step_q_c_n": 1670.0, "calib/step_q_gap": -0.021949298125061356, "calib/step_q_w": 0.26812295081967213, "calib/step_q_w_n": 1220.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 505.734375, "completions/mean_terminated_length": 513.761962890625, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.19306666666666666, "grad_norm": 0.38826414942741394, "learning_rate": 5.555555555555555e-07, "loss": 0.2464, "num_tokens": 38848503.0, "reward": 1.1498777866363525, "reward_std": 0.21765771508216858, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.698193371295929, "rewards/format_reward_step": 0.984375, "step": 181 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8593632094562054, "aux_distill/mean_u": 0.26944253590385014, "aux_distill/n_active_tok": 210.625, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 11.94921875, "calib/ece": 0.294392156862745, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8745098039215686, "calib/gap": 0.16662650602409612, "calib/mean_conf": 0.8784705882352943, "calib/mu_c": 0.9366265060240961, "calib/mu_w": 0.77, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2609411764705881, "calib/std_conf": 0.3012034914349353, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25794397759103643, "calib/step_q_c_n": 1785.0, "calib/step_q_gap": -0.014685535752762657, "calib/step_q_w": 0.2726295133437991, "calib/step_q_w_n": 1274.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 572.48828125, "completions/mean_terminated_length": 576.9960327148438, "completions/min_length": 0.0, "completions/min_terminated_length": 225.0, "epoch": 0.19413333333333332, "grad_norm": 0.28626206517219543, "learning_rate": 5.277777777777779e-07, "loss": 0.2553, "num_tokens": 39101220.0, "reward": 1.1739048957824707, "reward_std": 0.24584300816059113, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.7032785415649414, "rewards/format_reward_step": 0.99609375, "step": 182 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8672371339052916, "aux_distill/mean_u": 0.30996374739638244, "aux_distill/n_active_tok": 220.875, "calib/answer_extract_rate": 0.9765625, "calib/avg_num_step_conf": 13.296875, "calib/ece": 0.25592000000000004, "calib/final_conf_rate": 0.9765625, "calib/format_rate": 0.9765625, "calib/frac_conf_gt_0.9": 0.804, "calib/gap": 0.2876544071463466, "calib/mean_conf": 0.8148000000000001, "calib/mu_c": 0.9172049689440995, "calib/mu_w": 0.6295505617977529, "calib/nonempty_final_conf_rate": 0.9765625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21336000000000005, "calib/std_conf": 0.3630352049044279, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28253883235136584, "calib/step_q_c_n": 1867.0, "calib/step_q_gap": 0.026566158311027543, "calib/step_q_w": 0.2559726740403383, "calib/step_q_w_n": 1537.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 625.296875, "completions/mean_terminated_length": 632.7114868164062, "completions/min_length": 0.0, "completions/min_terminated_length": 187.0, "epoch": 0.1952, "grad_norm": 0.29089388251304626, "learning_rate": 5.000000000000001e-07, "loss": 0.2574, "num_tokens": 39367976.0, "reward": 1.164876103401184, "reward_std": 0.2800893187522888, "rewards/accuracy_reward_step": 0.62890625, "rewards/final_brier_reward_step": 0.7242835760116577, "rewards/format_reward_step": 0.9765625, "step": 183 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8836637884378433, "aux_distill/mean_u": 0.3054510764293644, "aux_distill/n_active_tok": 187.375, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 11.67578125, "calib/ece": 0.21167098039215682, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.8549019607843137, "calib/gap": 0.31682057142857123, "calib/mean_conf": 0.8529258823529412, "calib/mu_c": 0.9523205714285713, "calib/mu_w": 0.6355000000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1891611764705882, "calib/std_conf": 0.33352521267059637, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.22977933801404213, "calib/step_q_c_n": 1994.0, "calib/step_q_gap": -0.10504478258897296, "calib/step_q_w": 0.3348241206030151, "calib/step_q_w_n": 995.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 542.71484375, "completions/mean_terminated_length": 546.9881591796875, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.19626666666666667, "grad_norm": 0.28528183698654175, "learning_rate": 4.7222222222222226e-07, "loss": 0.2563, "num_tokens": 39612191.0, "reward": 1.229371428489685, "reward_std": 0.21543928980827332, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7790553569793701, "rewards/format_reward_step": 0.99609375, "step": 184 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8230397878214717, "aux_distill/mean_u": 0.2919940917335714, "aux_distill/n_active_tok": 176.375, "calib/answer_extract_rate": 0.984375, "calib/avg_num_step_conf": 12.4375, "calib/ece": 0.25213095238095246, "calib/final_conf_rate": 0.984375, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8571428571428571, "calib/gap": 0.23040740740740717, "calib/mean_conf": 0.8614960317460318, "calib/mu_c": 0.9355555555555555, "calib/mu_w": 0.7051481481481483, "calib/nonempty_final_conf_rate": 0.984375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.21752777777777788, "calib/std_conf": 0.32164138190799013, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.25592944718117133, "calib/step_q_c_n": 1827.0, "calib/step_q_gap": -0.06152817993747273, "calib/step_q_w": 0.31745762711864406, "calib/step_q_w_n": 1357.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1982.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 553.1875, "completions/mean_terminated_length": 561.96826171875, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.19733333333333333, "grad_norm": 0.39035874605178833, "learning_rate": 4.444444444444445e-07, "loss": 0.1929, "num_tokens": 39860727.0, "reward": 1.189182996749878, "reward_std": 0.25200313329696655, "rewards/accuracy_reward_step": 0.66796875, "rewards/final_brier_reward_step": 0.7299284934997559, "rewards/format_reward_step": 0.98046875, "step": 185 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8578335829079151, "aux_distill/mean_u": 0.2921642115088832, "aux_distill/n_active_tok": 171.25, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.21484375, "calib/ece": 0.19909447449562465, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.7795275590551181, "calib/gap": 0.42372533501685417, "calib/mean_conf": 0.7847637658342075, "calib/mu_c": 0.9315662441077632, "calib/mu_w": 0.5078409090909091, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1651574666216089, "calib/std_conf": 0.39025296827853495, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2744632469592808, "calib/step_q_c_n": 1891.0, "calib/step_q_gap": 0.0016477129786983125, "calib/step_q_w": 0.2728155339805825, "calib/step_q_w_n": 1236.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1581.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 566.46875, "completions/mean_terminated_length": 570.9291381835938, "completions/min_length": 0.0, "completions/min_terminated_length": 157.0, "epoch": 0.1984, "grad_norm": 0.33431562781333923, "learning_rate": 4.1666666666666667e-07, "loss": 0.2537, "num_tokens": 40110783.0, "reward": 1.215174674987793, "reward_std": 0.23402857780456543, "rewards/accuracy_reward_step": 0.6484375, "rewards/final_brier_reward_step": 0.789724588394165, "rewards/format_reward_step": 0.9921875, "step": 186 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.824182590469718, "aux_distill/mean_u": 0.2650358004125993, "aux_distill/n_active_tok": 204.375, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.7734375, "calib/ece": 0.2811732283464566, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.8070866141732284, "calib/gap": 0.2934272930648768, "calib/mean_conf": 0.8147952755905513, "calib/mu_c": 0.9360939597315435, "calib/mu_w": 0.6426666666666667, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.2546771653543306, "calib/std_conf": 0.36384323955490216, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.28421627525942106, "calib/step_q_c_n": 1831.0, "calib/step_q_gap": -0.027907618545888635, "calib/step_q_w": 0.3121238938053097, "calib/step_q_w_n": 1695.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2513.0, "completions/max_terminated_length": 2513.0, "completions/mean_length": 604.75390625, "completions/mean_terminated_length": 611.9249267578125, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.19946666666666665, "grad_norm": 0.2580687403678894, "learning_rate": 3.8888888888888895e-07, "loss": 0.2251, "num_tokens": 40367144.0, "reward": 1.138094186782837, "reward_std": 0.2702537178993225, "rewards/accuracy_reward_step": 0.58203125, "rewards/final_brier_reward_step": 0.7058757543563843, "rewards/format_reward_step": 0.98828125, "step": 187 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8105640029534698, "aux_distill/mean_u": 0.29351264664163546, "aux_distill/n_active_tok": 219.875, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 12.7734375, "calib/ece": 0.1787007874015748, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.84251968503937, "calib/gap": 0.35303642773207977, "calib/mean_conf": 0.8453937007874016, "calib/mu_c": 0.9412972972972973, "calib/mu_w": 0.5882608695652175, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1478740157480315, "calib/std_conf": 0.33680007989694366, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2589574366331899, "calib/step_q_c_n": 2091.0, "calib/step_q_gap": -0.03526537931252682, "calib/step_q_w": 0.2942228159457167, "calib/step_q_w_n": 1179.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 630.16796875, "completions/mean_terminated_length": 635.1299438476562, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.20053333333333334, "grad_norm": 0.253006249666214, "learning_rate": 3.611111111111111e-07, "loss": 0.2441, "num_tokens": 40632539.0, "reward": 1.261594295501709, "reward_std": 0.16998478770256042, "rewards/accuracy_reward_step": 0.72265625, "rewards/final_brier_reward_step": 0.8083449006080627, "rewards/format_reward_step": 0.9921875, "step": 188 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8325030775740743, "aux_distill/mean_u": 0.2645952570453017, "aux_distill/n_active_tok": 164.625, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 10.9921875, "calib/ece": 0.23508235294117644, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7686274509803922, "calib/gap": 0.3517131313131311, "calib/mean_conf": 0.7800235294117649, "calib/mu_c": 0.9041575757575756, "calib/mu_w": 0.5524444444444445, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.18402352941176467, "calib/std_conf": 0.392159210659638, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2832378826530612, "calib/step_q_c_n": 1568.0, "calib/step_q_gap": -0.033977205629442786, "calib/step_q_w": 0.317215088282504, "calib/step_q_w_n": 1246.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2109.0, "completions/max_terminated_length": 2109.0, "completions/mean_length": 517.3984375, "completions/mean_terminated_length": 521.472412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.2016, "grad_norm": 0.33088845014572144, "learning_rate": 3.3333333333333335e-07, "loss": 0.2758, "num_tokens": 40872761.0, "reward": 1.1992274522781372, "reward_std": 0.21433404088020325, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.757830023765564, "rewards/format_reward_step": 0.99609375, "step": 189 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8428498767316341, "aux_distill/mean_u": 0.3225249213968002, "aux_distill/n_active_tok": 222.875, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 13.54296875, "calib/ece": 0.22940711462450605, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.7905138339920948, "calib/gap": 0.39989902912621333, "calib/mean_conf": 0.8003952569169961, "calib/mu_c": 0.9631999999999997, "calib/mu_w": 0.5633009708737864, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.21845849802371556, "calib/std_conf": 0.37837472524572235, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2910631040363843, "calib/step_q_c_n": 1759.0, "calib/step_q_gap": -0.0024757718418358388, "calib/step_q_w": 0.29353887587822014, "calib/step_q_w_n": 1708.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 627.0234375, "completions/mean_terminated_length": 636.9762573242188, "completions/min_length": 0.0, "completions/min_terminated_length": 179.0, "epoch": 0.20266666666666666, "grad_norm": 0.3278692960739136, "learning_rate": 3.055555555555556e-07, "loss": 0.2014, "num_tokens": 41138887.0, "reward": 1.165348768234253, "reward_std": 0.2440982311964035, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.7564789056777954, "rewards/format_reward_step": 0.98828125, "step": 190 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8193467408418655, "aux_distill/mean_u": 0.20845514856220498, "aux_distill/n_active_tok": 222.625, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 12.49609375, "calib/ece": 0.3277254901960783, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.7725490196078432, "calib/gap": 0.24052509990009974, "calib/mean_conf": 0.785686274509804, "calib/mu_c": 0.8913286713286712, "calib/mu_w": 0.6508035714285715, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2763137254901959, "calib/std_conf": 0.38520814287995353, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.28625617206982545, "calib/step_q_c_n": 1604.0, "calib/step_q_gap": -0.016665458024218427, "calib/step_q_w": 0.3029216300940439, "calib/step_q_w_n": 1595.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2570.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 581.796875, "completions/mean_terminated_length": 584.0784912109375, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.20373333333333332, "grad_norm": 0.2885873019695282, "learning_rate": 2.7777777777777776e-07, "loss": 0.2257, "num_tokens": 41391995.0, "reward": 1.112635612487793, "reward_std": 0.20652508735656738, "rewards/accuracy_reward_step": 0.55859375, "rewards/final_brier_reward_step": 0.670583963394165, "rewards/format_reward_step": 0.99609375, "step": 191 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8118951702490449, "aux_distill/mean_u": 0.24394687888891278, "aux_distill/n_active_tok": 194.5, "calib/answer_extract_rate": 0.98828125, "calib/avg_num_step_conf": 12.828125, "calib/ece": 0.15258933522084617, "calib/final_conf_rate": 0.98828125, "calib/format_rate": 0.98828125, "calib/frac_conf_gt_0.9": 0.782608695652174, "calib/gap": 0.4859580254252286, "calib/mean_conf": 0.796888925648719, "calib/mu_c": 0.9447891942563974, "calib/mu_w": 0.4588311688311688, "calib/nonempty_final_conf_rate": 0.98828125, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.12691304347826085, "calib/std_conf": 0.37765356366640646, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2691260053619303, "calib/step_q_c_n": 1865.0, "calib/step_q_gap": -0.053601267365342375, "calib/step_q_w": 0.3227272727272727, "calib/step_q_w_n": 1419.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2180.0, "completions/max_terminated_length": 2180.0, "completions/mean_length": 609.125, "completions/mean_terminated_length": 618.793701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.2048, "grad_norm": 0.44815996289253235, "learning_rate": 2.5000000000000004e-07, "loss": 0.2364, "num_tokens": 41652907.0, "reward": 1.2535533905029297, "reward_std": 0.2129342257976532, "rewards/accuracy_reward_step": 0.6875, "rewards/final_brier_reward_step": 0.831325352191925, "rewards/format_reward_step": 0.98828125, "step": 192 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8178987912833691, "aux_distill/mean_u": 0.23051178048996848, "aux_distill/n_active_tok": 182.125, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.16796875, "calib/ece": 0.23338645418326698, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.98046875, "calib/frac_conf_gt_0.9": 0.8446215139442231, "calib/gap": 0.3356832720838435, "calib/mean_conf": 0.8408764940239045, "calib/mu_c": 0.965253164556962, "calib/mu_w": 0.6295698924731185, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.222390438247012, "calib/std_conf": 0.3451739394729761, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2731526548672566, "calib/step_q_c_n": 1808.0, "calib/step_q_gap": 0.010663851284403114, "calib/step_q_w": 0.2624888035828535, "calib/step_q_w_n": 1563.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 580.14453125, "completions/mean_terminated_length": 589.3532104492188, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.20586666666666667, "grad_norm": 0.25259193778038025, "learning_rate": 2.2222222222222224e-07, "loss": 0.2709, "num_tokens": 41907136.0, "reward": 1.1711702346801758, "reward_std": 0.28969404101371765, "rewards/accuracy_reward_step": 0.6171875, "rewards/final_brier_reward_step": 0.7446843385696411, "rewards/format_reward_step": 0.98046875, "step": 193 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8706032708287239, "aux_distill/mean_u": 0.2951731910868477, "aux_distill/n_active_tok": 167.5, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 11.984375, "calib/ece": 0.22929133858267708, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8385826771653543, "calib/gap": 0.32819611848825314, "calib/mean_conf": 0.836456692913386, "calib/mu_c": 0.9514545454545453, "calib/mu_w": 0.6232584269662922, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2080708661417322, "calib/std_conf": 0.3495160378622438, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27585446527012125, "calib/step_q_c_n": 1814.0, "calib/step_q_gap": -0.013284290710739999, "calib/step_q_w": 0.28913875598086125, "calib/step_q_w_n": 1254.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2842.0, "completions/max_terminated_length": 2842.0, "completions/mean_length": 527.45703125, "completions/mean_terminated_length": 533.7114868164062, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.20693333333333333, "grad_norm": 0.358541339635849, "learning_rate": 1.9444444444444447e-07, "loss": 0.2455, "num_tokens": 42148109.0, "reward": 1.197729229927063, "reward_std": 0.22360165417194366, "rewards/accuracy_reward_step": 0.64453125, "rewards/final_brier_reward_step": 0.7587398290634155, "rewards/format_reward_step": 0.9921875, "step": 194 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8249854417517781, "aux_distill/mean_u": 0.2865456123927324, "aux_distill/n_active_tok": 215.625, "calib/answer_extract_rate": 0.97265625, "calib/avg_num_step_conf": 13.7578125, "calib/ece": 0.31261043666280663, "calib/final_conf_rate": 0.97265625, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7349397590361446, "calib/gap": 0.26040804686107377, "calib/mean_conf": 0.7554216816427264, "calib/mu_c": 0.8673239347115411, "calib/mu_w": 0.6069158878504674, "calib/nonempty_final_conf_rate": 0.97265625, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.24887549690377048, "calib/std_conf": 0.40102669492810117, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.25076524741081707, "calib/step_q_c_n": 1738.0, "calib/step_q_gap": -0.04811367635599911, "calib/step_q_w": 0.2988789237668162, "calib/step_q_w_n": 1784.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 605.25390625, "completions/mean_terminated_length": 617.310791015625, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.208, "grad_norm": 0.3838996887207031, "learning_rate": 1.6666666666666668e-07, "loss": 0.2191, "num_tokens": 42409038.0, "reward": 1.0980085134506226, "reward_std": 0.29674944281578064, "rewards/accuracy_reward_step": 0.5546875, "rewards/final_brier_reward_step": 0.6686733961105347, "rewards/format_reward_step": 0.97265625, "step": 195 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.87274937517941, "aux_distill/mean_u": 0.2900594268034635, "aux_distill/n_active_tok": 171.5, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 10.60546875, "calib/ece": 0.24312941166743263, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.9176470588235294, "calib/gap": 0.19994928557254454, "calib/mean_conf": 0.9103450979419426, "calib/mu_c": 0.9730742855725446, "calib/mu_w": 0.7731250000000001, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.23359999990272676, "calib/std_conf": 0.26470785692410465, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.27979405034324945, "calib/step_q_c_n": 1748.0, "calib/step_q_gap": -0.004900882438549969, "calib/step_q_w": 0.2846949327817994, "calib/step_q_w_n": 967.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 482.55859375, "completions/mean_terminated_length": 486.3582763671875, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.20906666666666668, "grad_norm": 0.2778487205505371, "learning_rate": 1.3888888888888888e-07, "loss": 0.2494, "num_tokens": 42635117.0, "reward": 1.213637351989746, "reward_std": 0.17422911524772644, "rewards/accuracy_reward_step": 0.68359375, "rewards/final_brier_reward_step": 0.7475874423980713, "rewards/format_reward_step": 0.99609375, "step": 196 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8694142308086157, "aux_distill/mean_u": 0.3005259901536683, "aux_distill/n_active_tok": 215.375, "calib/answer_extract_rate": 0.9921875, "calib/avg_num_step_conf": 13.5234375, "calib/ece": 0.3585039370078741, "calib/final_conf_rate": 0.9921875, "calib/format_rate": 0.9921875, "calib/frac_conf_gt_0.9": 0.8149606299212598, "calib/gap": 0.13569102564102553, "calib/mean_conf": 0.8215748031496063, "calib/mu_c": 0.8771333333333333, "calib/mu_w": 0.7414423076923078, "calib/nonempty_final_conf_rate": 0.9921875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.2947637795275591, "calib/std_conf": 0.36048507389207807, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.255830258302583, "calib/step_q_c_n": 1897.0, "calib/step_q_gap": -0.03370967779965345, "calib/step_q_w": 0.28953993610223644, "calib/step_q_w_n": 1565.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 597.7734375, "completions/mean_terminated_length": 604.8616943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.21013333333333334, "grad_norm": 0.3032035529613495, "learning_rate": 1.1111111111111112e-07, "loss": 0.2319, "num_tokens": 42893203.0, "reward": 1.1068097352981567, "reward_std": 0.26105594635009766, "rewards/accuracy_reward_step": 0.5859375, "rewards/final_brier_reward_step": 0.6354945302009583, "rewards/format_reward_step": 0.9921875, "step": 197 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8194463439285755, "aux_distill/mean_u": 0.2501906911000831, "aux_distill/n_active_tok": 180.875, "calib/answer_extract_rate": 0.99609375, "calib/avg_num_step_conf": 11.80859375, "calib/ece": 0.20453725490196056, "calib/final_conf_rate": 0.99609375, "calib/format_rate": 0.99609375, "calib/frac_conf_gt_0.9": 0.803921568627451, "calib/gap": 0.3732018186944873, "calib/mean_conf": 0.8054235294117648, "calib/mu_c": 0.9254335260115605, "calib/mu_w": 0.5522317073170732, "calib/nonempty_final_conf_rate": 0.99609375, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.16576470588235273, "calib/std_conf": 0.37370990247927954, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.2524092247301276, "calib/step_q_c_n": 2038.0, "calib/step_q_gap": 0.04279501153215809, "calib/step_q_w": 0.20961421319796952, "calib/step_q_w_n": 985.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2297.0, "completions/max_terminated_length": 2297.0, "completions/mean_length": 542.8359375, "completions/mean_terminated_length": 547.1102294921875, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.2112, "grad_norm": 0.3990510404109955, "learning_rate": 8.333333333333334e-08, "loss": 0.2275, "num_tokens": 43137553.0, "reward": 1.2288410663604736, "reward_std": 0.2174731194972992, "rewards/accuracy_reward_step": 0.67578125, "rewards/final_brier_reward_step": 0.7858071327209473, "rewards/format_reward_step": 0.99609375, "step": 198 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8574781194329262, "aux_distill/mean_u": 0.2731382857877819, "aux_distill/n_active_tok": 191.0, "calib/answer_extract_rate": 1.0, "calib/avg_num_step_conf": 12.76171875, "calib/ece": 0.2103906250000001, "calib/final_conf_rate": 1.0, "calib/format_rate": 1.0, "calib/frac_conf_gt_0.9": 0.8515625, "calib/gap": 0.23776595744680862, "calib/mean_conf": 0.8671093750000001, "calib/mu_c": 0.9302659574468086, "calib/mu_w": 0.6925, "calib/nonempty_final_conf_rate": 1.0, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 1.0, "calib/pce": 0.1715625000000001, "calib/std_conf": 0.30916382273336795, "calib/step_conf_rate": 1.0, "calib/step_q_c": 0.26340939008336994, "calib/step_q_c_n": 2279.0, "calib/step_q_gap": -0.03318980020003082, "calib/step_q_w": 0.29659919028340076, "calib/step_q_w_n": 988.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2269.0, "completions/max_terminated_length": 2269.0, "completions/mean_length": 618.19921875, "completions/mean_terminated_length": 623.0669555664062, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 0.21226666666666666, "grad_norm": 0.28748640418052673, "learning_rate": 5.555555555555556e-08, "loss": 0.2313, "num_tokens": 43400012.0, "reward": 1.2594335079193115, "reward_std": 0.19422554969787598, "rewards/accuracy_reward_step": 0.734375, "rewards/final_brier_reward_step": 0.7844921946525574, "rewards/format_reward_step": 1.0, "step": 199 }, { "aux_distill/lambda": 0.30000000000000004, "aux_distill/loss": 0.8403654135763645, "aux_distill/mean_u": 0.25869545011362055, "aux_distill/n_active_tok": 207.5, "calib/answer_extract_rate": 0.98046875, "calib/avg_num_step_conf": 13.1484375, "calib/ece": 0.23880478087649398, "calib/final_conf_rate": 0.98046875, "calib/format_rate": 0.97265625, "calib/frac_conf_gt_0.9": 0.7808764940239044, "calib/gap": 0.3254771813011512, "calib/mean_conf": 0.7949003984063744, "calib/mu_c": 0.9103086419753086, "calib/mu_w": 0.5848314606741574, "calib/nonempty_final_conf_rate": 0.98046875, "calib/nonempty_reasoning_rate": 1.0, "calib/nonempty_step_conf_rate": 0.99609375, "calib/pce": 0.19414342629482068, "calib/std_conf": 0.37575566630034124, "calib/step_conf_rate": 0.99609375, "calib/step_q_c": 0.3023505275498241, "calib/step_q_c_n": 1706.0, "calib/step_q_gap": 0.0009710094775349232, "calib/step_q_w": 0.3013795180722892, "calib/step_q_w_n": 1660.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 630.12109375, "completions/mean_terminated_length": 637.5928955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.21333333333333335, "grad_norm": 0.2310328185558319, "learning_rate": 2.777777777777778e-08, "loss": 0.236, "num_tokens": 43669371.0, "reward": 1.1727749109268188, "reward_std": 0.21215327084064484, "rewards/accuracy_reward_step": 0.6328125, "rewards/final_brier_reward_step": 0.740081250667572, "rewards/format_reward_step": 0.97265625, "step": 200 }, { "epoch": 0.21333333333333335, "step": 200, "total_flos": 0.0, "train_loss": 0.2056847586715594, "train_runtime": 17418.2127, "train_samples_per_second": 2.939, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 43669371, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }