{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.32, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.26171875, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 303.0, "completions/mean_terminated_length": 410.4126892089844, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0016, "grad_norm": 0.005749928764998913, "learning_rate": 5.000000000000001e-07, "loss": -0.0217, "num_tokens": 392512.0, "reward": 0.04026263207197189, "reward_std": 0.09501844644546509, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.018024610355496407, "rewards/confidence_one_or_zero": 0.0625, "rewards/format_reward": 0.0625, "rewards/mean_confidence_reward": 0.26346302032470703, "step": 1 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.30859375, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 341.23828125, "completions/mean_terminated_length": 493.5423889160156, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0032, "grad_norm": 0.006314713973551989, "learning_rate": 1.0000000000000002e-06, "loss": -0.0254, "num_tokens": 816933.0, "reward": 0.07134318351745605, "reward_std": 0.16403131186962128, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.029404297471046448, "rewards/confidence_one_or_zero": 0.07421875, "rewards/format_reward": 0.11328125, "rewards/mean_confidence_reward": 0.3325389623641968, "step": 2 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 312.1015625, "completions/mean_terminated_length": 459.18389892578125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0048, "grad_norm": 0.005154214799404144, "learning_rate": 1.5e-06, "loss": -0.0184, "num_tokens": 1227311.0, "reward": 0.044827938079833984, "reward_std": 0.10091917961835861, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.015436407178640366, "rewards/confidence_one_or_zero": 0.078125, "rewards/format_reward": 0.07421875, "rewards/mean_confidence_reward": 0.2802625000476837, "step": 3 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.26171875, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 313.24609375, "completions/mean_terminated_length": 424.2909851074219, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0064, "grad_norm": 0.005596967414021492, "learning_rate": 2.0000000000000003e-06, "loss": -0.0349, "num_tokens": 1650278.0, "reward": 0.044450514018535614, "reward_std": 0.1127510517835617, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.018587836995720863, "rewards/confidence_one_or_zero": 0.07421875, "rewards/format_reward": 0.0703125, "rewards/mean_confidence_reward": 0.2687794864177704, "step": 4 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 306.84765625, "completions/mean_terminated_length": 431.6098937988281, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.008, "grad_norm": 0.006082500796765089, "learning_rate": 2.5e-06, "loss": -0.0153, "num_tokens": 2080167.0, "reward": 0.05005643144249916, "reward_std": 0.11759157478809357, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.018080823123455048, "rewards/confidence_one_or_zero": 0.09375, "rewards/format_reward": 0.08203125, "rewards/mean_confidence_reward": 0.30190616846084595, "step": 5 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 323.73828125, "completions/mean_terminated_length": 499.2590026855469, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0096, "grad_norm": 0.006878445390611887, "learning_rate": 3e-06, "loss": -0.0257, "num_tokens": 2506484.0, "reward": 0.06703364849090576, "reward_std": 0.15876701474189758, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.036410294473171234, "rewards/confidence_one_or_zero": 0.07421875, "rewards/format_reward": 0.09765625, "rewards/mean_confidence_reward": 0.2985377311706543, "step": 6 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.29296875, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 326.8046875, "completions/mean_terminated_length": 462.22100830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0112, "grad_norm": 0.009645157493650913, "learning_rate": 3.5e-06, "loss": -0.0403, "num_tokens": 2925402.0, "reward": 0.07675451785326004, "reward_std": 0.17086508870124817, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.028508201241493225, "rewards/confidence_one_or_zero": 0.0859375, "rewards/format_reward": 0.125, "rewards/mean_confidence_reward": 0.3304399251937866, "step": 7 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 363.6953125, "completions/mean_terminated_length": 495.2446594238281, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0128, "grad_norm": 0.007638865616172552, "learning_rate": 4.000000000000001e-06, "loss": -0.0562, "num_tokens": 3338884.0, "reward": 0.10420513153076172, "reward_std": 0.2052772045135498, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.044346779584884644, "rewards/confidence_one_or_zero": 0.1015625, "rewards/format_reward": 0.1640625, "rewards/mean_confidence_reward": 0.3951185345649719, "step": 8 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.29296875, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 314.37890625, "completions/mean_terminated_length": 444.64642333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0144, "grad_norm": 0.011798583902418613, "learning_rate": 4.5e-06, "loss": -0.0687, "num_tokens": 3768333.0, "reward": 0.17055284976959229, "reward_std": 0.2825161814689636, "rewards/accuracy_reward": 0.00390625, "rewards/brier_reward": 0.06376074254512787, "rewards/confidence_one_or_zero": 0.11328125, "rewards/format_reward": 0.26953125, "rewards/mean_confidence_reward": 0.4795473515987396, "step": 9 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 357.54296875, "completions/mean_terminated_length": 492.1021423339844, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.016, "grad_norm": 0.008803842589259148, "learning_rate": 5e-06, "loss": -0.0374, "num_tokens": 4195512.0, "reward": 0.20917564630508423, "reward_std": 0.29108554124832153, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.09413106739521027, "rewards/confidence_one_or_zero": 0.17578125, "rewards/format_reward": 0.32421875, "rewards/mean_confidence_reward": 0.5659036636352539, "step": 10 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14453125, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 364.484375, "completions/mean_terminated_length": 426.06390380859375, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.0176, "grad_norm": 0.010400340892374516, "learning_rate": 5e-06, "loss": -0.0757, "num_tokens": 4636212.0, "reward": 0.4166101813316345, "reward_std": 0.3802958130836487, "rewards/accuracy_reward": 0.01171875, "rewards/brier_reward": 0.20821861922740936, "rewards/confidence_one_or_zero": 0.171875, "rewards/format_reward": 0.6015625, "rewards/mean_confidence_reward": 0.7054708003997803, "step": 11 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 374.7265625, "completions/mean_terminated_length": 398.0498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 3.0, "epoch": 0.0192, "grad_norm": 0.009152157232165337, "learning_rate": 5e-06, "loss": -0.0242, "num_tokens": 5070462.0, "reward": 0.5529123544692993, "reward_std": 0.3406127095222473, "rewards/accuracy_reward": 0.0078125, "rewards/brier_reward": 0.32066667079925537, "rewards/confidence_one_or_zero": 0.19921875, "rewards/format_reward": 0.76953125, "rewards/mean_confidence_reward": 0.6666077971458435, "step": 12 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 305.55859375, "completions/mean_terminated_length": 319.27752685546875, "completions/min_length": 0.0, "completions/min_terminated_length": 7.0, "epoch": 0.0208, "grad_norm": 0.008684230037033558, "learning_rate": 5e-06, "loss": -0.0189, "num_tokens": 5497389.0, "reward": 0.5994592905044556, "reward_std": 0.30771589279174805, "rewards/accuracy_reward": 0.00390625, "rewards/brier_reward": 0.3629792332649231, "rewards/confidence_one_or_zero": 0.25390625, "rewards/format_reward": 0.828125, "rewards/mean_confidence_reward": 0.6504890322685242, "step": 13 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 255.3515625, "completions/mean_terminated_length": 258.37945556640625, "completions/min_length": 0.0, "completions/min_terminated_length": 9.0, "epoch": 0.0224, "grad_norm": 0.010425634682178497, "learning_rate": 5e-06, "loss": 0.0136, "num_tokens": 5909999.0, "reward": 0.699076771736145, "reward_std": 0.3184015154838562, "rewards/accuracy_reward": 0.01953125, "rewards/brier_reward": 0.460651695728302, "rewards/confidence_one_or_zero": 0.33203125, "rewards/format_reward": 0.8984375, "rewards/mean_confidence_reward": 0.5886218547821045, "step": 14 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 228.60546875, "completions/mean_terminated_length": 228.60546875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.024, "grad_norm": 0.008485889993607998, "learning_rate": 5e-06, "loss": 0.0061, "num_tokens": 6312106.0, "reward": 0.8145014047622681, "reward_std": 0.23522086441516876, "rewards/accuracy_reward": 0.015625, "rewards/brier_reward": 0.6211885213851929, "rewards/confidence_one_or_zero": 0.41796875, "rewards/format_reward": 0.9765625, "rewards/mean_confidence_reward": 0.4547516405582428, "step": 15 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 169.3046875, "completions/mean_terminated_length": 169.96864318847656, "completions/min_length": 0.0, "completions/min_terminated_length": 29.0, "epoch": 0.0256, "grad_norm": 0.010844497941434383, "learning_rate": 5e-06, "loss": 0.0199, "num_tokens": 6714200.0, "reward": 0.8929893374443054, "reward_std": 0.23516574501991272, "rewards/accuracy_reward": 0.046875, "rewards/brier_reward": 0.7234768271446228, "rewards/confidence_one_or_zero": 0.71484375, "rewards/format_reward": 0.96875, "rewards/mean_confidence_reward": 0.2515939474105835, "step": 16 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 99.51171875, "completions/mean_terminated_length": 99.90196990966797, "completions/min_length": 0.0, "completions/min_terminated_length": 30.0, "epoch": 0.0272, "grad_norm": 0.010410776361823082, "learning_rate": 5e-06, "loss": 0.0293, "num_tokens": 7052771.0, "reward": 0.9686627388000488, "reward_std": 0.1116916686296463, "rewards/accuracy_reward": 0.01171875, "rewards/brier_reward": 0.9295110702514648, "rewards/confidence_one_or_zero": 0.91015625, "rewards/format_reward": 0.984375, "rewards/mean_confidence_reward": 0.07008671760559082, "step": 17 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1332.0, "completions/max_terminated_length": 1332.0, "completions/mean_length": 69.91015625, "completions/mean_terminated_length": 69.91015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.0288, "grad_norm": 0.007693614345043898, "learning_rate": 5e-06, "loss": 0.0155, "num_tokens": 7410788.0, "reward": 1.0056359767913818, "reward_std": 0.054224878549575806, "rewards/accuracy_reward": 0.03515625, "rewards/brier_reward": 0.9526762366294861, "rewards/confidence_one_or_zero": 0.98828125, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.001613281317986548, "step": 18 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 54.0859375, "completions/mean_terminated_length": 54.0859375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.0304, "grad_norm": 0.011272546835243702, "learning_rate": 5e-06, "loss": -0.0014, "num_tokens": 7768314.0, "reward": 1.0253913402557373, "reward_std": 0.06518790125846863, "rewards/accuracy_reward": 0.06640625, "rewards/brier_reward": 0.9257808923721313, "rewards/confidence_one_or_zero": 0.99609375, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 3.9062499126885086e-05, "step": 19 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 49.15625, "completions/mean_terminated_length": 49.15625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.032, "grad_norm": 0.00929289497435093, "learning_rate": 5e-06, "loss": 0.01, "num_tokens": 8124378.0, "reward": 1.0371103286743164, "reward_std": 0.04432518407702446, "rewards/accuracy_reward": 0.08984375, "rewards/brier_reward": 0.90234375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 20 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 49.47265625, "completions/mean_terminated_length": 49.47265625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.0336, "grad_norm": 0.007096223067492247, "learning_rate": 5e-06, "loss": -0.0009, "num_tokens": 8479995.0, "reward": 1.0410165786743164, "reward_std": 0.05655324086546898, "rewards/accuracy_reward": 0.08203125, "rewards/brier_reward": 0.91796875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 21 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 51.7890625, "completions/mean_terminated_length": 51.7890625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.0352, "grad_norm": 0.005252554547041655, "learning_rate": 5e-06, "loss": -0.0012, "num_tokens": 8832877.0, "reward": 1.0546884536743164, "reward_std": 0.0483323335647583, "rewards/accuracy_reward": 0.109375, "rewards/brier_reward": 0.890625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 22 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 54.7578125, "completions/mean_terminated_length": 54.7578125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0368, "grad_norm": 0.011090493761003017, "learning_rate": 5e-06, "loss": -0.0004, "num_tokens": 9192591.0, "reward": 1.0605478286743164, "reward_std": 0.0958084836602211, "rewards/accuracy_reward": 0.13671875, "rewards/brier_reward": 0.85546875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 23 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 55.78515625, "completions/mean_terminated_length": 55.78515625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.0384, "grad_norm": 0.008098751306533813, "learning_rate": 5e-06, "loss": -0.0023, "num_tokens": 9541448.0, "reward": 1.0644540786743164, "reward_std": 0.08022359758615494, "rewards/accuracy_reward": 0.13671875, "rewards/brier_reward": 0.859375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 24 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 62.41796875, "completions/mean_terminated_length": 62.41796875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.04, "grad_norm": 0.007593750022351742, "learning_rate": 5e-06, "loss": -0.0014, "num_tokens": 9905859.0, "reward": 1.0566415786743164, "reward_std": 0.08320242166519165, "rewards/accuracy_reward": 0.13671875, "rewards/brier_reward": 0.8515625, "rewards/confidence_one_or_zero": 0.99609375, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.0, "step": 25 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 61.703125, "completions/mean_terminated_length": 61.703125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.0416, "grad_norm": 0.006790656130760908, "learning_rate": 5e-06, "loss": 0.0022, "num_tokens": 10275231.0, "reward": 1.0703134536743164, "reward_std": 0.07213689386844635, "rewards/accuracy_reward": 0.1484375, "rewards/brier_reward": 0.84765625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 26 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 65.6328125, "completions/mean_terminated_length": 65.6328125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0432, "grad_norm": 0.0061500584706664085, "learning_rate": 5e-06, "loss": -0.0004, "num_tokens": 10624729.0, "reward": 1.0585947036743164, "reward_std": 0.06575888395309448, "rewards/accuracy_reward": 0.1328125, "rewards/brier_reward": 0.859375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 27 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 66.05078125, "completions/mean_terminated_length": 66.05078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0448, "grad_norm": 0.006998259574174881, "learning_rate": 5e-06, "loss": -0.0088, "num_tokens": 10976326.0, "reward": 1.0878915786743164, "reward_std": 0.07318098098039627, "rewards/accuracy_reward": 0.17578125, "rewards/brier_reward": 0.82421875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 28 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 64.43359375, "completions/mean_terminated_length": 64.43359375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0464, "grad_norm": 0.006655920296907425, "learning_rate": 5e-06, "loss": -0.0008, "num_tokens": 11341973.0, "reward": 1.0820322036743164, "reward_std": 0.08318543434143066, "rewards/accuracy_reward": 0.1640625, "rewards/brier_reward": 0.8359375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 29 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 64.484375, "completions/mean_terminated_length": 64.484375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.048, "grad_norm": 0.006319932173937559, "learning_rate": 5e-06, "loss": 0.0018, "num_tokens": 11688553.0, "reward": 1.0957040786743164, "reward_std": 0.08278603851795197, "rewards/accuracy_reward": 0.19140625, "rewards/brier_reward": 0.80859375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 30 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 62.36328125, "completions/mean_terminated_length": 62.36328125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.0496, "grad_norm": 0.007409350946545601, "learning_rate": 5e-06, "loss": -0.0039, "num_tokens": 12039830.0, "reward": 1.0976572036743164, "reward_std": 0.08574788272380829, "rewards/accuracy_reward": 0.203125, "rewards/brier_reward": 0.79296875, "rewards/confidence_one_or_zero": 0.99609375, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 31 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 68.796875, "completions/mean_terminated_length": 68.796875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0512, "grad_norm": 0.006146772764623165, "learning_rate": 5e-06, "loss": 0.0011, "num_tokens": 12383978.0, "reward": 1.1308603286743164, "reward_std": 0.0666126236319542, "rewards/accuracy_reward": 0.26171875, "rewards/brier_reward": 0.73828125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 32 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 69.16796875, "completions/mean_terminated_length": 69.16796875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0528, "grad_norm": 0.006563497707247734, "learning_rate": 5e-06, "loss": 0.0025, "num_tokens": 12742837.0, "reward": 1.0664072036743164, "reward_std": 0.06365078687667847, "rewards/accuracy_reward": 0.1328125, "rewards/brier_reward": 0.8671875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 33 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 71.05859375, "completions/mean_terminated_length": 71.05859375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0544, "grad_norm": 0.006254891399294138, "learning_rate": 5e-06, "loss": 0.0011, "num_tokens": 13102620.0, "reward": 1.0917978286743164, "reward_std": 0.06332746148109436, "rewards/accuracy_reward": 0.18359375, "rewards/brier_reward": 0.81640625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 34 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 70.12890625, "completions/mean_terminated_length": 70.12890625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.056, "grad_norm": 0.006259838584810495, "learning_rate": 5e-06, "loss": 0.0057, "num_tokens": 13465429.0, "reward": 1.1992197036743164, "reward_std": 0.06970866024494171, "rewards/accuracy_reward": 0.3984375, "rewards/brier_reward": 0.6015625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 35 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 63.85546875, "completions/mean_terminated_length": 63.85546875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0576, "grad_norm": 0.006907327566295862, "learning_rate": 5e-06, "loss": -0.0026, "num_tokens": 13815680.0, "reward": 1.1582040786743164, "reward_std": 0.08081457763910294, "rewards/accuracy_reward": 0.31640625, "rewards/brier_reward": 0.68359375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 36 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 65.3828125, "completions/mean_terminated_length": 65.3828125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0592, "grad_norm": 0.007342278026044369, "learning_rate": 5e-06, "loss": -0.0008, "num_tokens": 14174674.0, "reward": 1.1699228286743164, "reward_std": 0.09639701247215271, "rewards/accuracy_reward": 0.33984375, "rewards/brier_reward": 0.66015625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 37 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 63.69921875, "completions/mean_terminated_length": 63.69921875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.0608, "grad_norm": 0.005854626186192036, "learning_rate": 5e-06, "loss": -0.0005, "num_tokens": 14536365.0, "reward": 1.1015634536743164, "reward_std": 0.05129294842481613, "rewards/accuracy_reward": 0.203125, "rewards/brier_reward": 0.796875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 38 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 63.9453125, "completions/mean_terminated_length": 63.9453125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0624, "grad_norm": 0.007452231831848621, "learning_rate": 5e-06, "loss": -0.0017, "num_tokens": 14888983.0, "reward": 1.1113290786743164, "reward_std": 0.05937965214252472, "rewards/accuracy_reward": 0.22265625, "rewards/brier_reward": 0.77734375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 39 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 62.33984375, "completions/mean_terminated_length": 62.33984375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.064, "grad_norm": 0.005153173580765724, "learning_rate": 5e-06, "loss": -0.0016, "num_tokens": 15236582.0, "reward": 1.2089853286743164, "reward_std": 0.053135842084884644, "rewards/accuracy_reward": 0.41796875, "rewards/brier_reward": 0.58203125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 40 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 64.19140625, "completions/mean_terminated_length": 64.19140625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0656, "grad_norm": 0.008204660378396511, "learning_rate": 5e-06, "loss": -0.0013, "num_tokens": 15602439.0, "reward": 1.0898447036743164, "reward_std": 0.0676315501332283, "rewards/accuracy_reward": 0.1875, "rewards/brier_reward": 0.80859375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 41 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 65.29296875, "completions/mean_terminated_length": 65.29296875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0672, "grad_norm": 0.007392301224172115, "learning_rate": 5e-06, "loss": 0.0023, "num_tokens": 15951946.0, "reward": 1.2500009536743164, "reward_std": 0.07779218256473541, "rewards/accuracy_reward": 0.5, "rewards/brier_reward": 0.5, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 42 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 65.9296875, "completions/mean_terminated_length": 65.9296875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0688, "grad_norm": 0.025153739377856255, "learning_rate": 5e-06, "loss": -0.0024, "num_tokens": 16317912.0, "reward": 1.1640634536743164, "reward_std": 0.08529357612133026, "rewards/accuracy_reward": 0.3515625, "rewards/brier_reward": 0.63671875, "rewards/confidence_one_or_zero": 0.98828125, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.0, "step": 43 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 66.828125, "completions/mean_terminated_length": 66.828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.0704, "grad_norm": 0.006532193627208471, "learning_rate": 5e-06, "loss": -0.0012, "num_tokens": 16686652.0, "reward": 1.1914072036743164, "reward_std": 0.047477371990680695, "rewards/accuracy_reward": 0.3828125, "rewards/brier_reward": 0.6171875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 44 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 62.3515625, "completions/mean_terminated_length": 62.3515625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.072, "grad_norm": 0.00799117423593998, "learning_rate": 5e-06, "loss": -0.0028, "num_tokens": 17040462.0, "reward": 1.1582040786743164, "reward_std": 0.06818589568138123, "rewards/accuracy_reward": 0.31640625, "rewards/brier_reward": 0.68359375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 45 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 66.0234375, "completions/mean_terminated_length": 66.0234375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0736, "grad_norm": 0.00791685376316309, "learning_rate": 5e-06, "loss": -0.0003, "num_tokens": 17375468.0, "reward": 1.1464853286743164, "reward_std": 0.06194208562374115, "rewards/accuracy_reward": 0.29296875, "rewards/brier_reward": 0.70703125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 46 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 73.671875, "completions/mean_terminated_length": 73.671875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.0752, "grad_norm": 0.006725848186761141, "learning_rate": 5e-06, "loss": 0.0033, "num_tokens": 17720488.0, "reward": 1.1425790786743164, "reward_std": 0.054445136338472366, "rewards/accuracy_reward": 0.28515625, "rewards/brier_reward": 0.71484375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 47 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1332.0, "completions/max_terminated_length": 1332.0, "completions/mean_length": 131.52734375, "completions/mean_terminated_length": 132.56298828125, "completions/min_length": 0.0, "completions/min_terminated_length": 31.0, "epoch": 0.0768, "grad_norm": 0.015706874430179596, "learning_rate": 5e-06, "loss": 0.0124, "num_tokens": 18080463.0, "reward": 1.1503915786743164, "reward_std": 0.14131194353103638, "rewards/accuracy_reward": 0.33984375, "rewards/brier_reward": 0.640625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.98046875, "rewards/mean_confidence_reward": 0.0, "step": 48 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 60.30078125, "completions/mean_terminated_length": 60.30078125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0784, "grad_norm": 0.008152630180120468, "learning_rate": 5e-06, "loss": 0.001, "num_tokens": 18446988.0, "reward": 1.1640634536743164, "reward_std": 0.0668778270483017, "rewards/accuracy_reward": 0.328125, "rewards/brier_reward": 0.671875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 49 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 60.88671875, "completions/mean_terminated_length": 60.88671875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.08, "grad_norm": 0.007156162057071924, "learning_rate": 5e-06, "loss": 0.0012, "num_tokens": 18802703.0, "reward": 1.1269540786743164, "reward_std": 0.05102774128317833, "rewards/accuracy_reward": 0.25390625, "rewards/brier_reward": 0.74609375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 50 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 63.71484375, "completions/mean_terminated_length": 63.71484375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.0816, "grad_norm": 0.009628606028854847, "learning_rate": 5e-06, "loss": 0.0056, "num_tokens": 19158862.0, "reward": 1.1347665786743164, "reward_std": 0.08870971202850342, "rewards/accuracy_reward": 0.27734375, "rewards/brier_reward": 0.71875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 51 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 69.57421875, "completions/mean_terminated_length": 69.57421875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0832, "grad_norm": 0.006935023702681065, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 19524057.0, "reward": 1.1562509536743164, "reward_std": 0.06509427726268768, "rewards/accuracy_reward": 0.3125, "rewards/brier_reward": 0.6875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 52 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 69.5, "completions/mean_terminated_length": 69.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0848, "grad_norm": 0.008871861733496189, "learning_rate": 5e-06, "loss": -0.0007, "num_tokens": 19880473.0, "reward": 1.1132822036743164, "reward_std": 0.08120957016944885, "rewards/accuracy_reward": 0.2265625, "rewards/brier_reward": 0.7734375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 53 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 73.41796875, "completions/mean_terminated_length": 73.41796875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.0864, "grad_norm": 0.006425413303077221, "learning_rate": 5e-06, "loss": 0.0004, "num_tokens": 20234380.0, "reward": 1.1562509536743164, "reward_std": 0.07101795077323914, "rewards/accuracy_reward": 0.3125, "rewards/brier_reward": 0.6875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 54 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 73.94140625, "completions/mean_terminated_length": 73.94140625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.088, "grad_norm": 0.007403024472296238, "learning_rate": 5e-06, "loss": -0.0011, "num_tokens": 20605917.0, "reward": 1.1269540786743164, "reward_std": 0.06135355681180954, "rewards/accuracy_reward": 0.25390625, "rewards/brier_reward": 0.74609375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 55 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 83.75390625, "completions/mean_terminated_length": 83.75390625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0896, "grad_norm": 0.006885427515953779, "learning_rate": 5e-06, "loss": 0.0013, "num_tokens": 20974542.0, "reward": 1.2363290786743164, "reward_std": 0.056094493716955185, "rewards/accuracy_reward": 0.48046875, "rewards/brier_reward": 0.515625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 56 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 89.47265625, "completions/mean_terminated_length": 89.47265625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0912, "grad_norm": 0.004320678301155567, "learning_rate": 5e-06, "loss": -0.0057, "num_tokens": 21326775.0, "reward": 1.1132822036743164, "reward_std": 0.045503467321395874, "rewards/accuracy_reward": 0.2265625, "rewards/brier_reward": 0.7734375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 57 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 95.81640625, "completions/mean_terminated_length": 95.81640625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0928, "grad_norm": 0.008047458715736866, "learning_rate": 5e-06, "loss": 0.0063, "num_tokens": 21692672.0, "reward": 1.1503915786743164, "reward_std": 0.09199290722608566, "rewards/accuracy_reward": 0.30859375, "rewards/brier_reward": 0.6875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 58 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 95.40234375, "completions/mean_terminated_length": 95.40234375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0944, "grad_norm": 0.00607341481372714, "learning_rate": 5e-06, "loss": 0.0066, "num_tokens": 22062023.0, "reward": 1.1796884536743164, "reward_std": 0.0662132203578949, "rewards/accuracy_reward": 0.359375, "rewards/brier_reward": 0.640625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 59 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1083.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 108.17578125, "completions/mean_terminated_length": 108.17578125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.096, "grad_norm": 0.005692021455615759, "learning_rate": 5e-06, "loss": 0.0085, "num_tokens": 22423764.0, "reward": 1.1269540786743164, "reward_std": 0.05983643978834152, "rewards/accuracy_reward": 0.25390625, "rewards/brier_reward": 0.74609375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 60 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 93.9921875, "completions/mean_terminated_length": 93.9921875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0976, "grad_norm": 0.004186698235571384, "learning_rate": 5e-06, "loss": -0.002, "num_tokens": 22794730.0, "reward": 1.1777353286743164, "reward_std": 0.037416763603687286, "rewards/accuracy_reward": 0.35546875, "rewards/brier_reward": 0.64453125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 61 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 103.70703125, "completions/mean_terminated_length": 103.70703125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0992, "grad_norm": 0.004728452768176794, "learning_rate": 5e-06, "loss": -0.0016, "num_tokens": 23156191.0, "reward": 1.1816415786743164, "reward_std": 0.055698275566101074, "rewards/accuracy_reward": 0.36328125, "rewards/brier_reward": 0.63671875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 62 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 117.2421875, "completions/mean_terminated_length": 117.70196533203125, "completions/min_length": 0.0, "completions/min_terminated_length": 37.0, "epoch": 0.1008, "grad_norm": 0.006030679680407047, "learning_rate": 5e-06, "loss": 0.0127, "num_tokens": 23528725.0, "reward": 1.1171884536743164, "reward_std": 0.05655202269554138, "rewards/accuracy_reward": 0.25, "rewards/brier_reward": 0.7421875, "rewards/confidence_one_or_zero": 0.99609375, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 63 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 109.6171875, "completions/mean_terminated_length": 109.6171875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.1024, "grad_norm": 0.00417441688477993, "learning_rate": 5e-06, "loss": 0.0078, "num_tokens": 23910147.0, "reward": 1.1582040786743164, "reward_std": 0.03827172517776489, "rewards/accuracy_reward": 0.31640625, "rewards/brier_reward": 0.68359375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 64 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 113.59765625, "completions/mean_terminated_length": 113.59765625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.104, "grad_norm": 0.004523205570876598, "learning_rate": 5e-06, "loss": 0.0082, "num_tokens": 24283652.0, "reward": 1.1464853286743164, "reward_std": 0.034854330122470856, "rewards/accuracy_reward": 0.30078125, "rewards/brier_reward": 0.6953125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 65 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 109.6328125, "completions/mean_terminated_length": 110.06275177001953, "completions/min_length": 0.0, "completions/min_terminated_length": 33.0, "epoch": 0.1056, "grad_norm": 0.0046501909382641315, "learning_rate": 5e-06, "loss": -0.0153, "num_tokens": 24661446.0, "reward": 1.1210947036743164, "reward_std": 0.04635843634605408, "rewards/accuracy_reward": 0.25, "rewards/brier_reward": 0.74609375, "rewards/confidence_one_or_zero": 0.99609375, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 66 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 131.8515625, "completions/mean_terminated_length": 132.8897705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 36.0, "epoch": 0.1072, "grad_norm": 0.0051050446927547455, "learning_rate": 5e-06, "loss": -0.0071, "num_tokens": 25046312.0, "reward": 1.2031259536743164, "reward_std": 0.08831032365560532, "rewards/accuracy_reward": 0.421875, "rewards/brier_reward": 0.5703125, "rewards/confidence_one_or_zero": 0.9921875, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 67 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1495.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 175.33203125, "completions/mean_terminated_length": 176.7126007080078, "completions/min_length": 0.0, "completions/min_terminated_length": 35.0, "epoch": 0.1088, "grad_norm": 0.00740397721529007, "learning_rate": 5e-06, "loss": -0.0033, "num_tokens": 25432013.0, "reward": 1.1660165786743164, "reward_std": 0.1343783736228943, "rewards/accuracy_reward": 0.37109375, "rewards/brier_reward": 0.609375, "rewards/confidence_one_or_zero": 0.9921875, "rewards/format_reward": 0.98046875, "rewards/mean_confidence_reward": 0.0, "step": 68 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 185.4229278564453, "completions/min_length": 0.0, "completions/min_terminated_length": 33.0, "epoch": 0.1104, "grad_norm": 0.0071678152307868, "learning_rate": 5e-06, "loss": 0.0051, "num_tokens": 25798581.0, "reward": 1.2304697036743164, "reward_std": 0.14793682098388672, "rewards/accuracy_reward": 0.515625, "rewards/brier_reward": 0.45703125, "rewards/confidence_one_or_zero": 0.98828125, "rewards/format_reward": 0.97265625, "rewards/mean_confidence_reward": 0.0, "step": 69 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 185.08984375, "completions/mean_terminated_length": 185.81570434570312, "completions/min_length": 0.0, "completions/min_terminated_length": 37.0, "epoch": 0.112, "grad_norm": 0.006619483698159456, "learning_rate": 5e-06, "loss": 0.015, "num_tokens": 26189180.0, "reward": 1.1679697036743164, "reward_std": 0.09055261313915253, "rewards/accuracy_reward": 0.3515625, "rewards/brier_reward": 0.640625, "rewards/confidence_one_or_zero": 0.99609375, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 70 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 165.5703125, "completions/mean_terminated_length": 165.5703125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1136, "grad_norm": 0.005290019791573286, "learning_rate": 5e-06, "loss": 0.0114, "num_tokens": 26565902.0, "reward": 1.2070322036743164, "reward_std": 0.10731258988380432, "rewards/accuracy_reward": 0.4140625, "rewards/brier_reward": 0.5859375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 71 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 123.67578125, "completions/mean_terminated_length": 123.67578125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1152, "grad_norm": 0.006431462708860636, "learning_rate": 5e-06, "loss": -0.0074, "num_tokens": 26926955.0, "reward": 1.2207040786743164, "reward_std": 0.07384559512138367, "rewards/accuracy_reward": 0.44140625, "rewards/brier_reward": 0.55859375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 72 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 131.01171875, "completions/mean_terminated_length": 131.01171875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1168, "grad_norm": 0.009269109927117825, "learning_rate": 5e-06, "loss": -0.0066, "num_tokens": 27304238.0, "reward": 1.0742197036743164, "reward_std": 0.07411082088947296, "rewards/accuracy_reward": 0.1875, "rewards/brier_reward": 0.79296875, "rewards/confidence_one_or_zero": 0.984375, "rewards/format_reward": 0.98046875, "rewards/mean_confidence_reward": 0.0, "step": 73 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 122.14453125, "completions/mean_terminated_length": 122.14453125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.1184, "grad_norm": 0.006270912941545248, "learning_rate": 5e-06, "loss": 0.0044, "num_tokens": 27661051.0, "reward": 1.2031259536743164, "reward_std": 0.09482251107692719, "rewards/accuracy_reward": 0.40625, "rewards/brier_reward": 0.59375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 74 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 135.33984375, "completions/mean_terminated_length": 135.33984375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.12, "grad_norm": 0.004971709102392197, "learning_rate": 5e-06, "loss": 0.0012, "num_tokens": 28045506.0, "reward": 1.1523447036743164, "reward_std": 0.0780605673789978, "rewards/accuracy_reward": 0.3125, "rewards/brier_reward": 0.68359375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 75 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 112.8828125, "completions/mean_terminated_length": 112.8828125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1216, "grad_norm": 0.0035235807299613953, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 28410428.0, "reward": 1.1894540786743164, "reward_std": 0.034720130264759064, "rewards/accuracy_reward": 0.37890625, "rewards/brier_reward": 0.62109375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 76 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 125.0078125, "completions/mean_terminated_length": 125.0078125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1232, "grad_norm": 0.003428579308092594, "learning_rate": 5e-06, "loss": -0.0027, "num_tokens": 28783238.0, "reward": 1.2128915786743164, "reward_std": 0.01913524977862835, "rewards/accuracy_reward": 0.42578125, "rewards/brier_reward": 0.57421875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 77 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 123.5546875, "completions/mean_terminated_length": 123.5546875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1248, "grad_norm": 0.005758420564234257, "learning_rate": 5e-06, "loss": 0.0033, "num_tokens": 29151548.0, "reward": 1.2011728286743164, "reward_std": 0.08107856661081314, "rewards/accuracy_reward": 0.40234375, "rewards/brier_reward": 0.59765625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 78 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 131.78125, "completions/mean_terminated_length": 131.78125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1264, "grad_norm": 0.006008731666952372, "learning_rate": 5e-06, "loss": -0.0007, "num_tokens": 29525276.0, "reward": 1.0976572036743164, "reward_std": 0.0864686369895935, "rewards/accuracy_reward": 0.1953125, "rewards/brier_reward": 0.8046875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 79 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 134.1484375, "completions/mean_terminated_length": 134.1484375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.128, "grad_norm": 0.006112709175795317, "learning_rate": 5e-06, "loss": 0.0037, "num_tokens": 29901738.0, "reward": 1.1035165786743164, "reward_std": 0.07299062609672546, "rewards/accuracy_reward": 0.21484375, "rewards/brier_reward": 0.78125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 80 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 127.109375, "completions/mean_terminated_length": 127.109375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.1296, "grad_norm": 0.005416556261479855, "learning_rate": 5e-06, "loss": -0.002, "num_tokens": 30274126.0, "reward": 1.1484384536743164, "reward_std": 0.07536393404006958, "rewards/accuracy_reward": 0.296875, "rewards/brier_reward": 0.703125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 81 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 130.44140625, "completions/mean_terminated_length": 130.44140625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.1312, "grad_norm": 0.00545592000707984, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 30650839.0, "reward": 1.1855478286743164, "reward_std": 0.08252204954624176, "rewards/accuracy_reward": 0.37109375, "rewards/brier_reward": 0.62890625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 82 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 124.1796875, "completions/mean_terminated_length": 124.1796875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.1328, "grad_norm": 0.0041496846824884415, "learning_rate": 5e-06, "loss": 0.0035, "num_tokens": 31020485.0, "reward": 1.2207040786743164, "reward_std": 0.053135842084884644, "rewards/accuracy_reward": 0.44140625, "rewards/brier_reward": 0.55859375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 83 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 125.25390625, "completions/mean_terminated_length": 125.25390625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.1344, "grad_norm": 0.0039185709320008755, "learning_rate": 5e-06, "loss": 0.0012, "num_tokens": 31402606.0, "reward": 1.2519540786743164, "reward_std": 0.043794769793748856, "rewards/accuracy_reward": 0.50390625, "rewards/brier_reward": 0.49609375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 84 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 120.93359375, "completions/mean_terminated_length": 120.93359375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.136, "grad_norm": 0.007022375240921974, "learning_rate": 5e-06, "loss": 0.0042, "num_tokens": 31768261.0, "reward": 1.2324228286743164, "reward_std": 0.07595369219779968, "rewards/accuracy_reward": 0.47265625, "rewards/brier_reward": 0.5234375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 85 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 103.8671875, "completions/mean_terminated_length": 103.8671875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1376, "grad_norm": 0.0054367585107684135, "learning_rate": 5e-06, "loss": 0.0036, "num_tokens": 32125435.0, "reward": 1.2050790786743164, "reward_std": 0.048331111669540405, "rewards/accuracy_reward": 0.41015625, "rewards/brier_reward": 0.58984375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 86 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 95.42578125, "completions/mean_terminated_length": 95.42578125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1392, "grad_norm": 0.005786779802292585, "learning_rate": 5e-06, "loss": -0.007, "num_tokens": 32488520.0, "reward": 1.2714853286743164, "reward_std": 0.058260709047317505, "rewards/accuracy_reward": 0.54296875, "rewards/brier_reward": 0.45703125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 87 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 94.78515625, "completions/mean_terminated_length": 94.78515625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.1408, "grad_norm": 0.005196675658226013, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 32844833.0, "reward": 1.1835947036743164, "reward_std": 0.045503467321395874, "rewards/accuracy_reward": 0.3671875, "rewards/brier_reward": 0.6328125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 88 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 87.51953125, "completions/mean_terminated_length": 87.51953125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1424, "grad_norm": 0.0070787761360406876, "learning_rate": 5e-06, "loss": 0.0041, "num_tokens": 33212142.0, "reward": 1.1835947036743164, "reward_std": 0.04294103384017944, "rewards/accuracy_reward": 0.3671875, "rewards/brier_reward": 0.6328125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 89 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 77.0390625, "completions/mean_terminated_length": 77.0390625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.144, "grad_norm": 0.0065049161203205585, "learning_rate": 5e-06, "loss": -0.0026, "num_tokens": 33566784.0, "reward": 1.1757822036743164, "reward_std": 0.026633426547050476, "rewards/accuracy_reward": 0.3515625, "rewards/brier_reward": 0.6484375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 90 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 74.1328125, "completions/mean_terminated_length": 74.1328125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.1456, "grad_norm": 0.005245994310826063, "learning_rate": 5e-06, "loss": 0.0043, "num_tokens": 33911994.0, "reward": 1.2363290786743164, "reward_std": 0.03360118716955185, "rewards/accuracy_reward": 0.47265625, "rewards/brier_reward": 0.52734375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 91 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 67.546875, "completions/mean_terminated_length": 67.546875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.1472, "grad_norm": 0.007916582748293877, "learning_rate": 5e-06, "loss": -0.0011, "num_tokens": 34267734.0, "reward": 1.2011728286743164, "reward_std": 0.06917505711317062, "rewards/accuracy_reward": 0.40234375, "rewards/brier_reward": 0.59765625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 92 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 66.01953125, "completions/mean_terminated_length": 66.01953125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1488, "grad_norm": 0.0072298673912882805, "learning_rate": 5e-06, "loss": -0.0054, "num_tokens": 34623507.0, "reward": 1.1621103286743164, "reward_std": 0.051558155566453934, "rewards/accuracy_reward": 0.32421875, "rewards/brier_reward": 0.67578125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 93 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 63.97265625, "completions/mean_terminated_length": 63.97265625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.1504, "grad_norm": 0.007007594686001539, "learning_rate": 5e-06, "loss": -0.0011, "num_tokens": 34973988.0, "reward": 1.1953134536743164, "reward_std": 0.05214790999889374, "rewards/accuracy_reward": 0.390625, "rewards/brier_reward": 0.609375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 94 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 65.8203125, "completions/mean_terminated_length": 65.8203125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.152, "grad_norm": 0.007378693204373121, "learning_rate": 5e-06, "loss": -0.0021, "num_tokens": 35338446.0, "reward": 1.1894540786743164, "reward_std": 0.043396592140197754, "rewards/accuracy_reward": 0.37890625, "rewards/brier_reward": 0.62109375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 95 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 62.55859375, "completions/mean_terminated_length": 62.55859375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.1536, "grad_norm": 0.0066678086295723915, "learning_rate": 5e-06, "loss": -0.0008, "num_tokens": 35707853.0, "reward": 1.1699228286743164, "reward_std": 0.03360118716955185, "rewards/accuracy_reward": 0.33984375, "rewards/brier_reward": 0.66015625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 96 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 64.35546875, "completions/mean_terminated_length": 64.35546875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.1552, "grad_norm": 0.004400665406137705, "learning_rate": 5e-06, "loss": 0.0016, "num_tokens": 36057144.0, "reward": 1.2226572036743164, "reward_std": 0.03189248964190483, "rewards/accuracy_reward": 0.4453125, "rewards/brier_reward": 0.5546875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 97 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 66.6015625, "completions/mean_terminated_length": 66.6015625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.1568, "grad_norm": 0.005063364747911692, "learning_rate": 5e-06, "loss": 0.0011, "num_tokens": 36401298.0, "reward": 1.1718759536743164, "reward_std": 0.026633426547050476, "rewards/accuracy_reward": 0.34375, "rewards/brier_reward": 0.65625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 98 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 61.29296875, "completions/mean_terminated_length": 61.29296875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.1584, "grad_norm": 0.005123145878314972, "learning_rate": 5e-06, "loss": 0.0015, "num_tokens": 36764013.0, "reward": 1.1933603286743164, "reward_std": 0.024924729019403458, "rewards/accuracy_reward": 0.38671875, "rewards/brier_reward": 0.61328125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 99 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 57.15234375, "completions/mean_terminated_length": 57.15234375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.16, "grad_norm": 0.005492492578923702, "learning_rate": 5e-06, "loss": 0.0008, "num_tokens": 37137556.0, "reward": 1.1953134536743164, "reward_std": 0.024659521877765656, "rewards/accuracy_reward": 0.390625, "rewards/brier_reward": 0.609375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 100 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 56.99609375, "completions/mean_terminated_length": 56.99609375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.1616, "grad_norm": 0.005264037288725376, "learning_rate": 5e-06, "loss": -0.0012, "num_tokens": 37502395.0, "reward": 1.1660165786743164, "reward_std": 0.02380579523742199, "rewards/accuracy_reward": 0.33984375, "rewards/brier_reward": 0.65625, "rewards/confidence_one_or_zero": 0.99609375, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 101 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 53.05859375, "completions/mean_terminated_length": 53.05859375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.1632, "grad_norm": 0.0073048085905611515, "learning_rate": 5e-06, "loss": 0.0006, "num_tokens": 37854850.0, "reward": 1.1425790786743164, "reward_std": 0.02169768325984478, "rewards/accuracy_reward": 0.28515625, "rewards/brier_reward": 0.71484375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 102 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 55.15234375, "completions/mean_terminated_length": 55.15234375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.1648, "grad_norm": 0.0033208683598786592, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 38223889.0, "reward": 1.1660165786743164, "reward_std": 0.01657281443476677, "rewards/accuracy_reward": 0.33203125, "rewards/brier_reward": 0.66796875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 103 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 53.171875, "completions/mean_terminated_length": 53.171875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1664, "grad_norm": 0.005567228887230158, "learning_rate": 5e-06, "loss": -0.0006, "num_tokens": 38574245.0, "reward": 1.1699228286743164, "reward_std": 0.028930652886629105, "rewards/accuracy_reward": 0.33984375, "rewards/brier_reward": 0.66015625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 104 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 51.65234375, "completions/mean_terminated_length": 51.65234375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.168, "grad_norm": 0.010346847586333752, "learning_rate": 5e-06, "loss": 0.0025, "num_tokens": 38937908.0, "reward": 1.2265634536743164, "reward_std": 0.05017400532960892, "rewards/accuracy_reward": 0.453125, "rewards/brier_reward": 0.546875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 105 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 50.90234375, "completions/mean_terminated_length": 50.90234375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.1696, "grad_norm": 0.009891422465443611, "learning_rate": 5e-06, "loss": -0.0006, "num_tokens": 39307219.0, "reward": 1.1503915786743164, "reward_std": 0.04464973136782646, "rewards/accuracy_reward": 0.30078125, "rewards/brier_reward": 0.69921875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 106 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 52.31640625, "completions/mean_terminated_length": 52.31640625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.1712, "grad_norm": 0.006135431118309498, "learning_rate": 5e-06, "loss": -0.001, "num_tokens": 39671956.0, "reward": 1.1972665786743164, "reward_std": 0.03215769678354263, "rewards/accuracy_reward": 0.39453125, "rewards/brier_reward": 0.60546875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 107 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 48.3828125, "completions/mean_terminated_length": 48.3828125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.1728, "grad_norm": 0.006157853174954653, "learning_rate": 5e-06, "loss": -0.0006, "num_tokens": 40026910.0, "reward": 1.1972665786743164, "reward_std": 0.022552644833922386, "rewards/accuracy_reward": 0.39453125, "rewards/brier_reward": 0.60546875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 108 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 47.84375, "completions/mean_terminated_length": 47.84375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.1744, "grad_norm": 0.009094290435314178, "learning_rate": 5e-06, "loss": -0.0005, "num_tokens": 40372246.0, "reward": 1.1757822036743164, "reward_std": 0.03998042270541191, "rewards/accuracy_reward": 0.3515625, "rewards/brier_reward": 0.6484375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 109 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 46.38671875, "completions/mean_terminated_length": 46.38671875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.176, "grad_norm": 0.007714184932410717, "learning_rate": 5e-06, "loss": -0.0005, "num_tokens": 40728177.0, "reward": 1.1464853286743164, "reward_std": 0.04083415865898132, "rewards/accuracy_reward": 0.29296875, "rewards/brier_reward": 0.70703125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 110 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 46.9453125, "completions/mean_terminated_length": 46.9453125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.1776, "grad_norm": 0.007513863034546375, "learning_rate": 5e-06, "loss": 0.0019, "num_tokens": 41081603.0, "reward": 1.1660165786743164, "reward_std": 0.01913524977862835, "rewards/accuracy_reward": 0.33203125, "rewards/brier_reward": 0.66796875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 111 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 45.87890625, "completions/mean_terminated_length": 45.87890625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1792, "grad_norm": 0.005671597085893154, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 41441476.0, "reward": 1.1503915786743164, "reward_std": 0.022552644833922386, "rewards/accuracy_reward": 0.30078125, "rewards/brier_reward": 0.69921875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 112 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 46.2265625, "completions/mean_terminated_length": 46.2265625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1808, "grad_norm": 0.0054357945919036865, "learning_rate": 5e-06, "loss": -0.0007, "num_tokens": 41776358.0, "reward": 1.2968759536743164, "reward_std": 0.03189249336719513, "rewards/accuracy_reward": 0.59375, "rewards/brier_reward": 0.40625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 113 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 46.6484375, "completions/mean_terminated_length": 46.6484375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.1824, "grad_norm": 0.009192634373903275, "learning_rate": 5e-06, "loss": -0.0013, "num_tokens": 42121340.0, "reward": 1.1933603286743164, "reward_std": 0.06535948067903519, "rewards/accuracy_reward": 0.38671875, "rewards/brier_reward": 0.61328125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 114 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 46.59765625, "completions/mean_terminated_length": 46.59765625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.184, "grad_norm": 0.00554991140961647, "learning_rate": 5e-06, "loss": -0.0004, "num_tokens": 42475581.0, "reward": 1.1445322036743164, "reward_std": 0.03189248964190483, "rewards/accuracy_reward": 0.3515625, "rewards/brier_reward": 0.6171875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.96875, "rewards/mean_confidence_reward": 0.0, "step": 115 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 47.359375, "completions/mean_terminated_length": 47.359375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.1856, "grad_norm": 0.046929169446229935, "learning_rate": 5e-06, "loss": -0.0028, "num_tokens": 42829713.0, "reward": 1.1660165786743164, "reward_std": 0.12684404850006104, "rewards/accuracy_reward": 0.41796875, "rewards/brier_reward": 0.5390625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.95703125, "rewards/mean_confidence_reward": 0.0, "step": 116 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 49.31640625, "completions/mean_terminated_length": 49.31640625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1872, "grad_norm": 0.00579815125092864, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 43172258.0, "reward": 1.1621103286743164, "reward_std": 0.01275724172592163, "rewards/accuracy_reward": 0.32421875, "rewards/brier_reward": 0.67578125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 117 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 50.3984375, "completions/mean_terminated_length": 50.3984375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1888, "grad_norm": 0.005210595671087503, "learning_rate": 5e-06, "loss": 0.0009, "num_tokens": 43517144.0, "reward": 1.1601572036743164, "reward_std": 0.043661803007125854, "rewards/accuracy_reward": 0.3203125, "rewards/brier_reward": 0.6796875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 118 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 52.4453125, "completions/mean_terminated_length": 52.4453125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.1904, "grad_norm": 0.007191211450845003, "learning_rate": 5e-06, "loss": 0.0008, "num_tokens": 43867754.0, "reward": 1.1621103286743164, "reward_std": 0.037416763603687286, "rewards/accuracy_reward": 0.32421875, "rewards/brier_reward": 0.67578125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 119 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 51.90234375, "completions/mean_terminated_length": 51.90234375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.192, "grad_norm": 0.007046000100672245, "learning_rate": 5e-06, "loss": -0.0008, "num_tokens": 44204993.0, "reward": 1.2402353286743164, "reward_std": 0.05695141479372978, "rewards/accuracy_reward": 0.48046875, "rewards/brier_reward": 0.51953125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 120 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 53.69921875, "completions/mean_terminated_length": 53.69921875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.1936, "grad_norm": 0.009322714060544968, "learning_rate": 5e-06, "loss": 0.0004, "num_tokens": 44537516.0, "reward": 1.2539072036743164, "reward_std": 0.0672023743391037, "rewards/accuracy_reward": 0.5078125, "rewards/brier_reward": 0.4921875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 121 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 54.3984375, "completions/mean_terminated_length": 54.3984375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1952, "grad_norm": 0.007987559773027897, "learning_rate": 5e-06, "loss": -0.0028, "num_tokens": 44894378.0, "reward": 1.1679697036743164, "reward_std": 0.0674663633108139, "rewards/accuracy_reward": 0.34375, "rewards/brier_reward": 0.65234375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 122 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 53.44921875, "completions/mean_terminated_length": 53.44921875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1968, "grad_norm": 0.005494001787155867, "learning_rate": 5e-06, "loss": -0.0012, "num_tokens": 45252013.0, "reward": 1.2617197036743164, "reward_std": 0.044914938509464264, "rewards/accuracy_reward": 0.5234375, "rewards/brier_reward": 0.4765625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 123 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 55.171875, "completions/mean_terminated_length": 55.171875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.1984, "grad_norm": 0.007676423992961645, "learning_rate": 5e-06, "loss": 0.0046, "num_tokens": 45605305.0, "reward": 1.1679697036743164, "reward_std": 0.039125457406044006, "rewards/accuracy_reward": 0.3359375, "rewards/brier_reward": 0.6640625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 124 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 54.44921875, "completions/mean_terminated_length": 54.44921875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.2, "grad_norm": 0.008286594413220882, "learning_rate": 5e-06, "loss": -0.0016, "num_tokens": 45965436.0, "reward": 1.1406259536743164, "reward_std": 0.04635842889547348, "rewards/accuracy_reward": 0.2890625, "rewards/brier_reward": 0.70703125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 125 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 56.96484375, "completions/mean_terminated_length": 56.96484375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2016, "grad_norm": 0.0051973238587379456, "learning_rate": 5e-06, "loss": 0.002, "num_tokens": 46328467.0, "reward": 1.2343759536743164, "reward_std": 0.03998042270541191, "rewards/accuracy_reward": 0.46875, "rewards/brier_reward": 0.53125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 126 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 55.6640625, "completions/mean_terminated_length": 55.6640625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.2032, "grad_norm": 0.002425758633762598, "learning_rate": 5e-06, "loss": -0.001, "num_tokens": 46691533.0, "reward": 1.1601572036743164, "reward_std": 0.011048543266952038, "rewards/accuracy_reward": 0.3203125, "rewards/brier_reward": 0.6796875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 127 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 54.59765625, "completions/mean_terminated_length": 54.59765625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2048, "grad_norm": 0.014877156354486942, "learning_rate": 5e-06, "loss": -0.0003, "num_tokens": 47053270.0, "reward": 1.2460947036743164, "reward_std": 0.07285766303539276, "rewards/accuracy_reward": 0.4921875, "rewards/brier_reward": 0.5078125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 128 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 55.13671875, "completions/mean_terminated_length": 55.13671875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.2064, "grad_norm": 0.006909608840942383, "learning_rate": 5e-06, "loss": 0.0005, "num_tokens": 47414457.0, "reward": 1.2070322036743164, "reward_std": 0.036563023924827576, "rewards/accuracy_reward": 0.4140625, "rewards/brier_reward": 0.5859375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 129 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 53.84375, "completions/mean_terminated_length": 53.84375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.208, "grad_norm": 0.003339196555316448, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 47752457.0, "reward": 1.1464853286743164, "reward_std": 0.0191352479159832, "rewards/accuracy_reward": 0.29296875, "rewards/brier_reward": 0.70703125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 130 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 55.34765625, "completions/mean_terminated_length": 55.34765625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2096, "grad_norm": 0.004676603712141514, "learning_rate": 5e-06, "loss": 0.0018, "num_tokens": 48113242.0, "reward": 1.2128915786743164, "reward_std": 0.03597327321767807, "rewards/accuracy_reward": 0.42578125, "rewards/brier_reward": 0.57421875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 131 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 53.95703125, "completions/mean_terminated_length": 53.95703125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2112, "grad_norm": 0.003998136613518, "learning_rate": 5e-06, "loss": 0.0012, "num_tokens": 48471623.0, "reward": 1.1308603286743164, "reward_std": 0.0191352479159832, "rewards/accuracy_reward": 0.26171875, "rewards/brier_reward": 0.73828125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 132 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 52.5625, "completions/mean_terminated_length": 52.5625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.2128, "grad_norm": 0.0053713698871433735, "learning_rate": 5e-06, "loss": -0.002, "num_tokens": 48821807.0, "reward": 1.1210947036743164, "reward_std": 0.04024440422654152, "rewards/accuracy_reward": 0.2421875, "rewards/brier_reward": 0.7578125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 133 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 51.9765625, "completions/mean_terminated_length": 51.9765625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.2144, "grad_norm": 0.005026605445891619, "learning_rate": 5e-06, "loss": -0.0006, "num_tokens": 49169657.0, "reward": 1.1835947036743164, "reward_std": 0.020843947306275368, "rewards/accuracy_reward": 0.3671875, "rewards/brier_reward": 0.6328125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 134 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 53.65625, "completions/mean_terminated_length": 53.65625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.216, "grad_norm": 0.010881892405450344, "learning_rate": 5e-06, "loss": -0.0006, "num_tokens": 49516481.0, "reward": 1.2324228286743164, "reward_std": 0.03360118716955185, "rewards/accuracy_reward": 0.46484375, "rewards/brier_reward": 0.53515625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 135 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 56.70703125, "completions/mean_terminated_length": 56.70703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.2176, "grad_norm": 0.00886172242462635, "learning_rate": 5e-06, "loss": 0.0028, "num_tokens": 49888774.0, "reward": 1.2050790786743164, "reward_std": 0.030904557555913925, "rewards/accuracy_reward": 0.41015625, "rewards/brier_reward": 0.58984375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 136 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 52.28125, "completions/mean_terminated_length": 52.28125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2192, "grad_norm": 0.004887416493147612, "learning_rate": 5e-06, "loss": -0.0013, "num_tokens": 50247878.0, "reward": 1.2304697036743164, "reward_std": 0.03642883151769638, "rewards/accuracy_reward": 0.4609375, "rewards/brier_reward": 0.5390625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 137 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 55.6171875, "completions/mean_terminated_length": 55.6171875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.2208, "grad_norm": 0.00412516575306654, "learning_rate": 5e-06, "loss": 0.0006, "num_tokens": 50610956.0, "reward": 1.2421884536743164, "reward_std": 0.024659521877765656, "rewards/accuracy_reward": 0.484375, "rewards/brier_reward": 0.515625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 138 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 58.73046875, "completions/mean_terminated_length": 58.73046875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.2224, "grad_norm": 0.002713292371481657, "learning_rate": 5e-06, "loss": -0.0005, "num_tokens": 50967759.0, "reward": 1.1269540786743164, "reward_std": 0.01275724172592163, "rewards/accuracy_reward": 0.25390625, "rewards/brier_reward": 0.74609375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 139 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 54.40625, "completions/mean_terminated_length": 54.40625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.224, "grad_norm": 0.007535545155405998, "learning_rate": 5e-06, "loss": -0.002, "num_tokens": 51329615.0, "reward": 1.2421884536743164, "reward_std": 0.04379599541425705, "rewards/accuracy_reward": 0.4921875, "rewards/brier_reward": 0.50390625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 140 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 59.91796875, "completions/mean_terminated_length": 59.91796875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2256, "grad_norm": 0.005773982498794794, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 51693218.0, "reward": 1.1855478286743164, "reward_std": 0.05615260452032089, "rewards/accuracy_reward": 0.37109375, "rewards/brier_reward": 0.62890625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 141 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 62.546875, "completions/mean_terminated_length": 62.546875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.2272, "grad_norm": 0.007854162715375423, "learning_rate": 5e-06, "loss": 0.0023, "num_tokens": 52067630.0, "reward": 1.2011728286743164, "reward_std": 0.08844131231307983, "rewards/accuracy_reward": 0.41796875, "rewards/brier_reward": 0.57421875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 142 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 58.87890625, "completions/mean_terminated_length": 58.87890625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.2288, "grad_norm": 0.006240553688257933, "learning_rate": 5e-06, "loss": -0.0005, "num_tokens": 52440703.0, "reward": 1.1914072036743164, "reward_std": 0.05740697309374809, "rewards/accuracy_reward": 0.390625, "rewards/brier_reward": 0.60546875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 143 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 53.40234375, "completions/mean_terminated_length": 53.40234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2304, "grad_norm": 0.004978107754141092, "learning_rate": 5e-06, "loss": 0.0008, "num_tokens": 52792606.0, "reward": 1.1933603286743164, "reward_std": 0.04208730161190033, "rewards/accuracy_reward": 0.38671875, "rewards/brier_reward": 0.61328125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 144 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 53.58984375, "completions/mean_terminated_length": 53.58984375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.232, "grad_norm": 0.003977401647716761, "learning_rate": 5e-06, "loss": 0.0014, "num_tokens": 53163565.0, "reward": 1.1914072036743164, "reward_std": 0.03445492684841156, "rewards/accuracy_reward": 0.3828125, "rewards/brier_reward": 0.6171875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 145 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 51.46484375, "completions/mean_terminated_length": 51.46484375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.2336, "grad_norm": 0.003525414504110813, "learning_rate": 5e-06, "loss": -0.0004, "num_tokens": 53525860.0, "reward": 1.2343759536743164, "reward_std": 0.020843947306275368, "rewards/accuracy_reward": 0.46875, "rewards/brier_reward": 0.53125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 146 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 49.3125, "completions/mean_terminated_length": 49.3125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2352, "grad_norm": 0.0049398792907595634, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 53873428.0, "reward": 1.2675790786743164, "reward_std": 0.04662363603711128, "rewards/accuracy_reward": 0.53515625, "rewards/brier_reward": 0.46484375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 147 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 50.09375, "completions/mean_terminated_length": 50.09375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2368, "grad_norm": 0.0061876606196165085, "learning_rate": 5e-06, "loss": -0.0018, "num_tokens": 54229540.0, "reward": 1.2695322036743164, "reward_std": 0.0388009138405323, "rewards/accuracy_reward": 0.5390625, "rewards/brier_reward": 0.4609375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 148 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 50.296875, "completions/mean_terminated_length": 50.296875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.2384, "grad_norm": 0.009590424597263336, "learning_rate": 5e-06, "loss": 0.0004, "num_tokens": 54591136.0, "reward": 1.1875009536743164, "reward_std": 0.05582928657531738, "rewards/accuracy_reward": 0.375, "rewards/brier_reward": 0.625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 149 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 50.74609375, "completions/mean_terminated_length": 50.74609375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.24, "grad_norm": 0.006619809195399284, "learning_rate": 5e-06, "loss": -0.0013, "num_tokens": 54930119.0, "reward": 1.1210947036743164, "reward_std": 0.04024440422654152, "rewards/accuracy_reward": 0.2421875, "rewards/brier_reward": 0.7578125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 150 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 49.984375, "completions/mean_terminated_length": 49.984375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2416, "grad_norm": 0.005861316341906786, "learning_rate": 5e-06, "loss": -0.0006, "num_tokens": 55283315.0, "reward": 1.2558603286743164, "reward_std": 0.03616362065076828, "rewards/accuracy_reward": 0.51171875, "rewards/brier_reward": 0.48828125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 151 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 49.63671875, "completions/mean_terminated_length": 49.63671875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2432, "grad_norm": 0.00885644182562828, "learning_rate": 5e-06, "loss": 0.0005, "num_tokens": 55623406.0, "reward": 1.1523447036743164, "reward_std": 0.04294103384017944, "rewards/accuracy_reward": 0.3125, "rewards/brier_reward": 0.68359375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 152 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2448, "grad_norm": 0.007674382999539375, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 55979030.0, "reward": 1.2070322036743164, "reward_std": 0.047477371990680695, "rewards/accuracy_reward": 0.4140625, "rewards/brier_reward": 0.5859375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 153 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 50.91796875, "completions/mean_terminated_length": 50.91796875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.2464, "grad_norm": 0.005602710880339146, "learning_rate": 5e-06, "loss": 0.0005, "num_tokens": 56334569.0, "reward": 1.2578134536743164, "reward_std": 0.04037860035896301, "rewards/accuracy_reward": 0.5234375, "rewards/brier_reward": 0.47265625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 154 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 50.79296875, "completions/mean_terminated_length": 50.99216079711914, "completions/min_length": 0.0, "completions/min_terminated_length": 37.0, "epoch": 0.248, "grad_norm": 0.007848069071769714, "learning_rate": 5e-06, "loss": -0.0042, "num_tokens": 56695340.0, "reward": 1.1523447036743164, "reward_std": 0.04892086982727051, "rewards/accuracy_reward": 0.3203125, "rewards/brier_reward": 0.671875, "rewards/confidence_one_or_zero": 0.99609375, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 155 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 52.45703125, "completions/mean_terminated_length": 52.45703125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2496, "grad_norm": 0.005280786659568548, "learning_rate": 5e-06, "loss": -0.0015, "num_tokens": 57050241.0, "reward": 1.2246103286743164, "reward_std": 0.03741675987839699, "rewards/accuracy_reward": 0.44921875, "rewards/brier_reward": 0.55078125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 156 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 50.06640625, "completions/mean_terminated_length": 50.06640625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2512, "grad_norm": 0.006250562611967325, "learning_rate": 5e-06, "loss": 0.0014, "num_tokens": 57398938.0, "reward": 1.2070322036743164, "reward_std": 0.04747737571597099, "rewards/accuracy_reward": 0.4140625, "rewards/brier_reward": 0.5859375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 157 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 51.06640625, "completions/mean_terminated_length": 51.06640625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.2528, "grad_norm": 0.011759904213249683, "learning_rate": 5e-06, "loss": 0.0034, "num_tokens": 57770347.0, "reward": 1.2226572036743164, "reward_std": 0.06490392982959747, "rewards/accuracy_reward": 0.4453125, "rewards/brier_reward": 0.5546875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 158 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 51.4765625, "completions/mean_terminated_length": 51.4765625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.2544, "grad_norm": 0.006999201141297817, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 58126709.0, "reward": 1.1386728286743164, "reward_std": 0.037416763603687286, "rewards/accuracy_reward": 0.28515625, "rewards/brier_reward": 0.7109375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 159 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 47.21484375, "completions/mean_terminated_length": 47.21484375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.256, "grad_norm": 0.0068825832568109035, "learning_rate": 5e-06, "loss": 0.0004, "num_tokens": 58472412.0, "reward": 1.2031259536743164, "reward_std": 0.038136303424835205, "rewards/accuracy_reward": 0.40625, "rewards/brier_reward": 0.59375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 160 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 48.1953125, "completions/mean_terminated_length": 48.1953125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2576, "grad_norm": 0.006445925682783127, "learning_rate": 5e-06, "loss": 0.0005, "num_tokens": 58823382.0, "reward": 1.2050790786743164, "reward_std": 0.0236715879291296, "rewards/accuracy_reward": 0.41015625, "rewards/brier_reward": 0.58984375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 161 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 46.9453125, "completions/mean_terminated_length": 46.9453125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2592, "grad_norm": 0.005022743251174688, "learning_rate": 5e-06, "loss": 0.0006, "num_tokens": 59166184.0, "reward": 1.1855478286743164, "reward_std": 0.03103875368833542, "rewards/accuracy_reward": 0.37109375, "rewards/brier_reward": 0.62890625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 162 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 46.54296875, "completions/mean_terminated_length": 46.54296875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.2608, "grad_norm": 0.008230634965002537, "learning_rate": 5e-06, "loss": -0.0005, "num_tokens": 59516459.0, "reward": 1.2578134536743164, "reward_std": 0.05529887229204178, "rewards/accuracy_reward": 0.515625, "rewards/brier_reward": 0.484375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 163 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 45.5859375, "completions/mean_terminated_length": 45.5859375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.2624, "grad_norm": 0.00862080603837967, "learning_rate": 5e-06, "loss": 0.0006, "num_tokens": 59875753.0, "reward": 1.2792978286743164, "reward_std": 0.03360118716955185, "rewards/accuracy_reward": 0.55859375, "rewards/brier_reward": 0.44140625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 164 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 45.5390625, "completions/mean_terminated_length": 45.5390625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.264, "grad_norm": 0.0062230536714196205, "learning_rate": 5e-06, "loss": 0.0013, "num_tokens": 60217323.0, "reward": 1.1796884536743164, "reward_std": 0.05273643881082535, "rewards/accuracy_reward": 0.359375, "rewards/brier_reward": 0.640625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 165 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 44.92578125, "completions/mean_terminated_length": 44.92578125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2656, "grad_norm": 0.00558104133233428, "learning_rate": 5e-06, "loss": 0.0004, "num_tokens": 60570864.0, "reward": 1.1738290786743164, "reward_std": 0.022228099405765533, "rewards/accuracy_reward": 0.34765625, "rewards/brier_reward": 0.65234375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 166 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 43.72265625, "completions/mean_terminated_length": 43.72265625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.2672, "grad_norm": 0.009653366170823574, "learning_rate": 5e-06, "loss": -0.0004, "num_tokens": 60938281.0, "reward": 1.1933603286743164, "reward_std": 0.04254163056612015, "rewards/accuracy_reward": 0.38671875, "rewards/brier_reward": 0.61328125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 167 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 44.7421875, "completions/mean_terminated_length": 44.7421875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2688, "grad_norm": 0.005720548797398806, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 61289079.0, "reward": 1.1308603286743164, "reward_std": 0.0191352479159832, "rewards/accuracy_reward": 0.26171875, "rewards/brier_reward": 0.73828125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 168 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 45.4453125, "completions/mean_terminated_length": 45.4453125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.2704, "grad_norm": 0.007109296508133411, "learning_rate": 5e-06, "loss": 0.0009, "num_tokens": 61647617.0, "reward": 1.1582040786743164, "reward_std": 0.04451553523540497, "rewards/accuracy_reward": 0.31640625, "rewards/brier_reward": 0.68359375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 169 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 45.26171875, "completions/mean_terminated_length": 45.26171875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.272, "grad_norm": 0.008910843171179295, "learning_rate": 5e-06, "loss": -0.001, "num_tokens": 61980084.0, "reward": 1.1621103286743164, "reward_std": 0.019990211352705956, "rewards/accuracy_reward": 0.32421875, "rewards/brier_reward": 0.67578125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 170 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 44.41796875, "completions/mean_terminated_length": 44.41796875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2736, "grad_norm": 0.0, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 62333263.0, "reward": 1.2500009536743164, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/brier_reward": 0.5, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 171 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 43.0859375, "completions/mean_terminated_length": 43.0859375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2752, "grad_norm": 0.0074557592160999775, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 62696221.0, "reward": 1.1191415786743164, "reward_std": 0.03360118716955185, "rewards/accuracy_reward": 0.23828125, "rewards/brier_reward": 0.76171875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 172 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 42.8203125, "completions/mean_terminated_length": 42.8203125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.2768, "grad_norm": 0.005095581524074078, "learning_rate": 5e-06, "loss": 0.001, "num_tokens": 63049559.0, "reward": 1.1328134536743164, "reward_std": 0.039125461131334305, "rewards/accuracy_reward": 0.265625, "rewards/brier_reward": 0.734375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 173 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 42.484375, "completions/mean_terminated_length": 42.484375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.2784, "grad_norm": 0.010611972771584988, "learning_rate": 5e-06, "loss": 0.0007, "num_tokens": 63408899.0, "reward": 1.1875009536743164, "reward_std": 0.02551448345184326, "rewards/accuracy_reward": 0.375, "rewards/brier_reward": 0.625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 174 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 41.0546875, "completions/mean_terminated_length": 41.0546875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.28, "grad_norm": 0.006971416063606739, "learning_rate": 5e-06, "loss": 0.0005, "num_tokens": 63749625.0, "reward": 1.2402353286743164, "reward_std": 0.048331111669540405, "rewards/accuracy_reward": 0.48046875, "rewards/brier_reward": 0.51953125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 175 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 40.8515625, "completions/mean_terminated_length": 40.8515625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.2816, "grad_norm": 0.008722545579075813, "learning_rate": 5e-06, "loss": -0.0005, "num_tokens": 64088787.0, "reward": 1.1914072036743164, "reward_std": 0.036563027650117874, "rewards/accuracy_reward": 0.3828125, "rewards/brier_reward": 0.6171875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 176 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 40.27734375, "completions/mean_terminated_length": 40.27734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.2832, "grad_norm": 0.010323431342840195, "learning_rate": 5e-06, "loss": 0.0013, "num_tokens": 64443402.0, "reward": 1.2148447036743164, "reward_std": 0.0384027361869812, "rewards/accuracy_reward": 0.4296875, "rewards/brier_reward": 0.5703125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 177 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 39.2734375, "completions/mean_terminated_length": 39.2734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.2848, "grad_norm": 0.005445533897727728, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 64787016.0, "reward": 1.2011728286743164, "reward_std": 0.026368219405412674, "rewards/accuracy_reward": 0.40234375, "rewards/brier_reward": 0.59765625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 178 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 38.3203125, "completions/mean_terminated_length": 38.3203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2864, "grad_norm": 0.005081367213279009, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 65137202.0, "reward": 1.2402353286743164, "reward_std": 0.023805785924196243, "rewards/accuracy_reward": 0.48046875, "rewards/brier_reward": 0.51953125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 179 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 36.71875, "completions/mean_terminated_length": 36.71875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.288, "grad_norm": 0.009812582284212112, "learning_rate": 5e-06, "loss": 0.0015, "num_tokens": 65473690.0, "reward": 1.2636728286743164, "reward_std": 0.06135355681180954, "rewards/accuracy_reward": 0.52734375, "rewards/brier_reward": 0.47265625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 180 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 36.46875, "completions/mean_terminated_length": 36.46875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2896, "grad_norm": 0.004110858775675297, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 65823274.0, "reward": 1.1855478286743164, "reward_std": 0.005524271633476019, "rewards/accuracy_reward": 0.37109375, "rewards/brier_reward": 0.62890625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 181 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 35.16015625, "completions/mean_terminated_length": 35.16015625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.2912, "grad_norm": 0.013062373735010624, "learning_rate": 5e-06, "loss": -0.0003, "num_tokens": 66166363.0, "reward": 1.1464853286743164, "reward_std": 0.04254163056612015, "rewards/accuracy_reward": 0.30078125, "rewards/brier_reward": 0.6953125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 182 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 35.27734375, "completions/mean_terminated_length": 35.27734375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2928, "grad_norm": 0.004537950269877911, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 66505130.0, "reward": 1.2167978286743164, "reward_std": 0.015319675207138062, "rewards/accuracy_reward": 0.43359375, "rewards/brier_reward": 0.56640625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 183 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 35.56640625, "completions/mean_terminated_length": 35.56640625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2944, "grad_norm": 0.013359584845602512, "learning_rate": 5e-06, "loss": 0.0004, "num_tokens": 66854059.0, "reward": 1.2207040786743164, "reward_std": 0.03103875368833542, "rewards/accuracy_reward": 0.44140625, "rewards/brier_reward": 0.55859375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 184 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 35.4140625, "completions/mean_terminated_length": 35.4140625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.296, "grad_norm": 0.004776604939252138, "learning_rate": 5e-06, "loss": 0.0002, "num_tokens": 67204125.0, "reward": 1.2128915786743164, "reward_std": 0.01657281443476677, "rewards/accuracy_reward": 0.42578125, "rewards/brier_reward": 0.57421875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 185 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 34.16015625, "completions/mean_terminated_length": 34.16015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2976, "grad_norm": 0.004593758378177881, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 67545518.0, "reward": 1.2753915786743164, "reward_std": 0.01657281443476677, "rewards/accuracy_reward": 0.55078125, "rewards/brier_reward": 0.44921875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 186 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 34.578125, "completions/mean_terminated_length": 34.578125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2992, "grad_norm": 0.004609708674252033, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 67891474.0, "reward": 1.1855478286743164, "reward_std": 0.02110915631055832, "rewards/accuracy_reward": 0.37109375, "rewards/brier_reward": 0.62890625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 187 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 34.76171875, "completions/mean_terminated_length": 34.76171875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3008, "grad_norm": 0.005076521076261997, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 68255949.0, "reward": 1.2167978286743164, "reward_std": 0.019990211352705956, "rewards/accuracy_reward": 0.43359375, "rewards/brier_reward": 0.56640625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 188 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 34.65234375, "completions/mean_terminated_length": 34.65234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3024, "grad_norm": 0.015296814031898975, "learning_rate": 5e-06, "loss": 0.0007, "num_tokens": 68597300.0, "reward": 1.1777353286743164, "reward_std": 0.028930652886629105, "rewards/accuracy_reward": 0.35546875, "rewards/brier_reward": 0.64453125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 189 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 34.6015625, "completions/mean_terminated_length": 34.6015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.304, "grad_norm": 0.007346505299210548, "learning_rate": 5e-06, "loss": 0.0007, "num_tokens": 68945222.0, "reward": 1.1308603286743164, "reward_std": 0.01913524977862835, "rewards/accuracy_reward": 0.26171875, "rewards/brier_reward": 0.73828125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 190 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 34.1875, "completions/mean_terminated_length": 34.1875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3056, "grad_norm": 0.005888568237423897, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 69298710.0, "reward": 1.1992197036743164, "reward_std": 0.018281511962413788, "rewards/accuracy_reward": 0.3984375, "rewards/brier_reward": 0.6015625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 191 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 33.23828125, "completions/mean_terminated_length": 33.23828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3072, "grad_norm": 0.004927255213260651, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 69655875.0, "reward": 1.1074228286743164, "reward_std": 0.01275724172592163, "rewards/accuracy_reward": 0.21484375, "rewards/brier_reward": 0.78515625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 192 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 33.46484375, "completions/mean_terminated_length": 33.46484375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3088, "grad_norm": 0.002895692829042673, "learning_rate": 5e-06, "loss": -0.0001, "num_tokens": 70013034.0, "reward": 1.2148447036743164, "reward_std": 0.011048543266952038, "rewards/accuracy_reward": 0.4296875, "rewards/brier_reward": 0.5703125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 193 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 32.80078125, "completions/mean_terminated_length": 32.80078125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3104, "grad_norm": 0.01532782893627882, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 70367679.0, "reward": 1.1679697036743164, "reward_std": 0.06292805820703506, "rewards/accuracy_reward": 0.3359375, "rewards/brier_reward": 0.6640625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 194 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 33.89453125, "completions/mean_terminated_length": 33.89453125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.312, "grad_norm": 0.014220437966287136, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 70717156.0, "reward": 1.1386728286743164, "reward_std": 0.048331111669540405, "rewards/accuracy_reward": 0.27734375, "rewards/brier_reward": 0.72265625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 195 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 33.41015625, "completions/mean_terminated_length": 33.41015625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3136, "grad_norm": 0.05932420864701271, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 71069909.0, "reward": 1.3300790786743164, "reward_std": 0.023805784061551094, "rewards/accuracy_reward": 0.66015625, "rewards/brier_reward": 0.33984375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 196 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 32.6875, "completions/mean_terminated_length": 32.6875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3152, "grad_norm": 0.014617173001170158, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 71421901.0, "reward": 1.2246103286743164, "reward_std": 0.03616362065076828, "rewards/accuracy_reward": 0.44921875, "rewards/brier_reward": 0.55078125, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 197 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 32.6875, "completions/mean_terminated_length": 32.6875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3168, "grad_norm": 0.010734383016824722, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 71774381.0, "reward": 1.1953134536743164, "reward_std": 0.058980248868465424, "rewards/accuracy_reward": 0.390625, "rewards/brier_reward": 0.609375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 198 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 33.453125, "completions/mean_terminated_length": 33.453125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3184, "grad_norm": 0.007351653650403023, "learning_rate": 5e-06, "loss": 0.0003, "num_tokens": 72128153.0, "reward": 1.2285165786743164, "reward_std": 0.030183792114257812, "rewards/accuracy_reward": 0.45703125, "rewards/brier_reward": 0.54296875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 199 }, { "calib/answer_extract_rate": 0.0, "calib/avg_num_step_conf": 0.0, "calib/final_conf_rate": 0.0, "calib/format_rate": 0.0, "calib/nonempty_final_conf_rate": 0.0, "calib/nonempty_reasoning_rate": 0.0, "calib/nonempty_step_conf_rate": 0.0, "calib/step_conf_rate": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 32.83984375, "completions/mean_terminated_length": 32.83984375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.32, "grad_norm": 0.011516711674630642, "learning_rate": 5e-06, "loss": -0.0002, "num_tokens": 72451840.0, "reward": 1.1875009536743164, "reward_std": 0.020843947306275368, "rewards/accuracy_reward": 0.375, "rewards/brier_reward": 0.625, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 200 }, { "epoch": 0.32, "step": 200, "total_flos": 0.0, "train_loss": -0.0015666725028131623, "train_runtime": 6054.4549, "train_samples_per_second": 8.457, "train_steps_per_second": 0.033 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 72451840, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }